diff --git a/CMakeLists.txt b/CMakeLists.txt index 2856a20606..3c8827805b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,97 +64,118 @@ else() include_directories(${LLVM_INCLUDE_DIRS}) include_directories(${CLANG_INCLUDE_DIRS}) set(EMBEDDED_LLD_LIB_SOURCES - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Driver/DarwinLdDriver.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Config/Version.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/YAML/ReaderWriterYAML.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/LayoutPass.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler_arm64.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ObjCPass.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileBinaryReader.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/CompactUnwindPass.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileToAtoms.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/TLVPass.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileYAML.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/GOTPass.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler_x86.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileBinaryWriter.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler_x86_64.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachOLinkingContext.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ShimPass.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/WriterMachO.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/StubsPass.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler_arm.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileFromAtoms.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/FileArchive.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/TargetOptionsCommandFlags.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/File.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Error.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/SymbolTable.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Reader.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Reproduce.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Writer.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/LinkingContext.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Resolver.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/Common/Args.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/Common/ErrorHandler.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/Common/Memory.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/Common/Reproduce.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/Common/Strings.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/Common/TargetOptionsCommandFlags.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/Common/Threads.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/Common/Version.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/DefinedAtom.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Error.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/File.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/LinkingContext.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Reader.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Resolver.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/SymbolTable.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/Core/Writer.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/Driver/DarwinLdDriver.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/FileArchive.cpp" + 
"${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler_arm.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler_arm64.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler_x86.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ArchHandler_x86_64.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/CompactUnwindPass.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/GOTPass.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/LayoutPass.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachOLinkingContext.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileBinaryReader.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileBinaryWriter.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileFromAtoms.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileToAtoms.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/MachONormalizedFileYAML.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ObjCPass.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/ShimPass.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/StubsPass.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/TLVPass.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/MachO/WriterMachO.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/lib/ReaderWriter/YAML/ReaderWriterYAML.cpp" ) set(EMBEDDED_LLD_ELF_SOURCES - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/ScriptLexer.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/AArch64ErrataFix.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/AArch64.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/AMDGPU.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/PPC.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/ARM.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/AVR.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/SPARCV9.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/Mips.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/AArch64.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/X86_64.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/PPC64.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/MipsArchTree.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/PPC.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/PPC64.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/SPARCV9.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/X86.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/GdbIndex.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Arch/X86_64.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Driver.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Relocations.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Error.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/LTO.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Strings.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/ScriptParser.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/MarkLive.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/SyntheticSections.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/SymbolTable.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/LinkerScript.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/DriverUtils.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/EhFrame.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Target.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Filesystem.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/OutputSections.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Symbols.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/GdbIndex.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/ICF.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/InputFiles.cpp" - 
"${CMAKE_SOURCE_DIR}/deps/lld/ELF/Thunks.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/DriverUtils.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Writer.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/InputSection.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/LTO.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/LinkerScript.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/ELF/MapFile.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/MarkLive.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/OutputSections.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Relocations.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/ScriptLexer.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/ScriptParser.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Strings.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/SymbolTable.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Symbols.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/SyntheticSections.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Target.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Thunks.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/ELF/Writer.cpp" ) + set(EMBEDDED_LLD_COFF_SOURCES + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Chunks.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/COFF/DLL.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Driver.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Chunks.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/PDB.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Error.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/LTO.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Strings.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/MarkLive.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/SymbolTable.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Symbols.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/DriverUtils.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/COFF/ICF.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/COFF/InputFiles.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/DriverUtils.cpp" - "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Writer.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/LTO.cpp" "${CMAKE_SOURCE_DIR}/deps/lld/COFF/MapFile.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/MarkLive.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/MinGW.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/PDB.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Strings.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/SymbolTable.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Symbols.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/COFF/Writer.cpp" + ) + set(EMBEDDED_LLD_MINGW_SOURCES + "${CMAKE_SOURCE_DIR}/deps/lld/MinGW/Driver.cpp" + ) + set(EMBEDDED_LLD_WASM_SOURCES + "${CMAKE_SOURCE_DIR}/deps/lld/wasm/Driver.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/wasm/InputFiles.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/wasm/InputSegment.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/wasm/OutputSections.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/wasm/Symbols.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/wasm/SymbolTable.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/wasm/Writer.cpp" + "${CMAKE_SOURCE_DIR}/deps/lld/wasm/WriterUtils.cpp" ) add_library(embedded_lld_lib ${EMBEDDED_LLD_LIB_SOURCES}) add_library(embedded_lld_elf ${EMBEDDED_LLD_ELF_SOURCES}) add_library(embedded_lld_coff ${EMBEDDED_LLD_COFF_SOURCES}) + add_library(embedded_lld_mingw ${EMBEDDED_LLD_MINGW_SOURCES}) + add_library(embedded_lld_wasm ${EMBEDDED_LLD_WASM_SOURCES}) if(MSVC) set(ZIG_LLD_COMPILE_FLAGS "-std=c++11 -D_CRT_SECURE_NO_WARNINGS /w") else() @@ -172,6 +193,14 @@ else() COMPILE_FLAGS ${ZIG_LLD_COMPILE_FLAGS} LINK_FLAGS " " ) + set_target_properties(embedded_lld_mingw PROPERTIES + COMPILE_FLAGS ${ZIG_LLD_COMPILE_FLAGS} + LINK_FLAGS " " + ) + set_target_properties(embedded_lld_wasm PROPERTIES + COMPILE_FLAGS ${ZIG_LLD_COMPILE_FLAGS} + LINK_FLAGS " " + ) target_include_directories(embedded_lld_lib PRIVATE 
"${CMAKE_SOURCE_DIR}/deps/lld/include" "${CMAKE_SOURCE_DIR}/deps/lld-prebuilt" @@ -188,13 +217,27 @@ else() "${CMAKE_SOURCE_DIR}/deps/lld-prebuilt/COFF" "${CMAKE_SOURCE_DIR}/deps/lld-prebuilt" ) + target_include_directories(embedded_lld_mingw PRIVATE + "${CMAKE_SOURCE_DIR}/deps/lld/MinGW" + "${CMAKE_SOURCE_DIR}/deps/lld/include" + "${CMAKE_SOURCE_DIR}/deps/lld-prebuilt/MinGW" + "${CMAKE_SOURCE_DIR}/deps/lld-prebuilt" + ) + target_include_directories(embedded_lld_wasm PRIVATE + "${CMAKE_SOURCE_DIR}/deps/lld/wasm" + "${CMAKE_SOURCE_DIR}/deps/lld/include" + "${CMAKE_SOURCE_DIR}/deps/lld-prebuilt/wasm" + "${CMAKE_SOURCE_DIR}/deps/lld-prebuilt" + ) set(LLD_INCLUDE_DIRS "") set(LLD_LIBRARIES embedded_lld_elf embedded_lld_coff + embedded_lld_mingw + embedded_lld_wasm embedded_lld_lib ) - install(TARGETS embedded_lld_elf embedded_lld_coff embedded_lld_lib DESTINATION "${ZIG_CPP_LIB_DIR}") + install(TARGETS embedded_lld_elf embedded_lld_coff embedded_lld_mingw embedded_lld_wasm embedded_lld_lib DESTINATION "${ZIG_CPP_LIB_DIR}") endif() # No patches have been applied to SoftFloat-3d @@ -498,10 +541,12 @@ set(ZIG_C_HEADER_FILES "adxintrin.h" "altivec.h" "ammintrin.h" + "arm64intr.h" "arm_acle.h" "arm_neon.h" "armintr.h" "avx2intrin.h" + "avx512bitalgintrin.h" "avx512bwintrin.h" "avx512cdintrin.h" "avx512dqintrin.h" @@ -510,17 +555,25 @@ set(ZIG_C_HEADER_FILES "avx512ifmaintrin.h" "avx512ifmavlintrin.h" "avx512pfintrin.h" + "avx512vbmi2intrin.h" "avx512vbmiintrin.h" "avx512vbmivlintrin.h" + "avx512vlbitalgintrin.h" "avx512vlbwintrin.h" "avx512vlcdintrin.h" "avx512vldqintrin.h" "avx512vlintrin.h" + "avx512vlvbmi2intrin.h" + "avx512vlvnniintrin.h" + "avx512vnniintrin.h" "avx512vpopcntdqintrin.h" + "avx512vpopcntdqvlintrin.h" "avxintrin.h" "bmi2intrin.h" "bmiintrin.h" + "cetintrin.h" "clflushoptintrin.h" + "clwbintrin.h" "clzerointrin.h" "cpuid.h" "cuda_wrappers/algorithm" @@ -532,6 +585,7 @@ set(ZIG_C_HEADER_FILES "fma4intrin.h" "fmaintrin.h" "fxsrintrin.h" + "gfniintrin.h" "htmintrin.h" "htmxlintrin.h" "ia32intrin.h" @@ -571,8 +625,10 @@ set(ZIG_C_HEADER_FILES "tmmintrin.h" "unwind.h" "vadefs.h" + "vaesintrin.h" "varargs.h" "vecintrin.h" + "vpclmulqdqintrin.h" "wmmintrin.h" "x86intrin.h" "xmmintrin.h" diff --git a/README.md b/README.md index 252a73e9d1..e58b6d564b 100644 --- a/README.md +++ b/README.md @@ -124,14 +124,14 @@ libc. Create demo games using Zig. ##### POSIX * cmake >= 2.8.5 - * g++ >= 5.0.0 or clang >= 3.6.0 - * LLVM, Clang, LLD development libraries == 5.0.1, compiled with the same gcc or clang version above + * gcc >= 5.0.0 or clang >= 3.6.0 + * LLVM, Clang, LLD development libraries == 6.x, compiled with the same gcc or clang version above ##### Windows * cmake >= 2.8.5 * Microsoft Visual Studio 2015 - * LLVM, Clang, LLD development libraries == 5.0.1, compiled with the same MSVC version above + * LLVM, Clang, LLD development libraries == 6.x, compiled with the same MSVC version above #### Instructions @@ -155,11 +155,11 @@ make install `ZIG_LIBC_LIB_DIR` and `ZIG_LIBC_STATIC_LIB_DIR` are unused. ``` -brew install cmake llvm@5 -brew outdated llvm@5 || brew upgrade llvm@5 +brew install cmake llvm@6 +brew outdated llvm@6 || brew upgrade llvm@6 mkdir build cd build -cmake .. -DCMAKE_PREFIX_PATH=/usr/local/opt/llvm@5/ -DCMAKE_INSTALL_PREFIX=$(pwd) +cmake .. 
-DCMAKE_PREFIX_PATH=/usr/local/opt/llvm@6/ -DCMAKE_INSTALL_PREFIX=$(pwd) make install ./zig build --build-file ../build.zig test ``` diff --git a/c_headers/__clang_cuda_cmath.h b/c_headers/__clang_cuda_cmath.h index 9bef82611a..5331ba401a 100644 --- a/c_headers/__clang_cuda_cmath.h +++ b/c_headers/__clang_cuda_cmath.h @@ -131,15 +131,6 @@ __DEVICE__ float ldexp(float __arg, int __exp) { __DEVICE__ float log(float __x) { return ::logf(__x); } __DEVICE__ float log10(float __x) { return ::log10f(__x); } __DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); } -__DEVICE__ float nexttoward(float __from, double __to) { - return __builtin_nexttowardf(__from, __to); -} -__DEVICE__ double nexttoward(double __from, double __to) { - return __builtin_nexttoward(__from, __to); -} -__DEVICE__ float nexttowardf(float __from, double __to) { - return __builtin_nexttowardf(__from, __to); -} __DEVICE__ float pow(float __base, float __exp) { return ::powf(__base, __exp); } @@ -157,6 +148,10 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); } __DEVICE__ float tan(float __x) { return ::tanf(__x); } __DEVICE__ float tanh(float __x) { return ::tanhf(__x); } +// Notably missing above is nexttoward. We omit it because +// libdevice doesn't provide an implementation, and we don't want to be in the +// business of implementing tricky libm functions in this header. + // Now we've defined everything we promised we'd define in // __clang_cuda_math_forward_declares.h. We need to do two additional things to // fix up our math functions. @@ -295,13 +290,6 @@ ldexp(__T __x, int __exp) { return std::ldexp((double)__x, __exp); } -template -__DEVICE__ typename __clang_cuda_enable_if::is_integer, - double>::type -nexttoward(__T __from, double __to) { - return std::nexttoward((double)__from, __to); -} - template __DEVICE__ typename __clang_cuda_enable_if< std::numeric_limits<__T1>::is_specialized && @@ -388,7 +376,6 @@ using ::lrint; using ::lround; using ::nearbyint; using ::nextafter; -using ::nexttoward; using ::pow; using ::remainder; using ::remquo; @@ -456,8 +443,6 @@ using ::lroundf; using ::modff; using ::nearbyintf; using ::nextafterf; -using ::nexttowardf; -using ::nexttowardf; using ::powf; using ::remainderf; using ::remquof; diff --git a/c_headers/__clang_cuda_intrinsics.h b/c_headers/__clang_cuda_intrinsics.h index b43ce21d0b..1794eb3dc1 100644 --- a/c_headers/__clang_cuda_intrinsics.h +++ b/c_headers/__clang_cuda_intrinsics.h @@ -34,23 +34,24 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 #pragma push_macro("__MAKE_SHUFFLES") -#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \ - inline __device__ int __FnName(int __val, int __offset, \ +#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \ + __Type) \ + inline __device__ int __FnName(int __val, __Type __offset, \ int __width = warpSize) { \ return __IntIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ float __FnName(float __val, int __offset, \ + inline __device__ float __FnName(float __val, __Type __offset, \ int __width = warpSize) { \ return __FloatIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \ + inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \ int __width = warpSize) { \ return static_cast( \ ::__FnName(static_cast(__val), __offset, __width)); \ } \ - inline __device__ long long 
__FnName(long long __val, int __offset, \
+  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                        int __width = warpSize) { \
     struct __Bits { \
       int __a, __b; \
@@ -65,12 +66,29 @@
     memcpy(&__ret, &__tmp, sizeof(__tmp)); \
     return __ret; \
   } \
+  inline __device__ long __FnName(long __val, __Type __offset, \
+                                  int __width = warpSize) { \
+    _Static_assert(sizeof(long) == sizeof(long long) || \
+                   sizeof(long) == sizeof(int)); \
+    if (sizeof(long) == sizeof(long long)) { \
+      return static_cast<long>( \
+          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
+    } else if (sizeof(long) == sizeof(int)) { \
+      return static_cast<long>( \
+          ::__FnName(static_cast<int>(__val), __offset, __width)); \
+    } \
+  } \
+  inline __device__ unsigned long __FnName( \
+      unsigned long __val, __Type __offset, int __width = warpSize) { \
+    return static_cast<unsigned long>( \
+        ::__FnName(static_cast<long>(__val), __offset, __width)); \
+  } \
   inline __device__ unsigned long long __FnName( \
-      unsigned long long __val, int __offset, int __width = warpSize) { \
+      unsigned long long __val, __Type __offset, int __width = warpSize) { \
     return static_cast<unsigned long long>(::__FnName( \
         static_cast<long long>(__val), __offset, __width)); \
   } \
-  inline __device__ double __FnName(double __val, int __offset, \
+  inline __device__ double __FnName(double __val, __Type __offset, \
                                     int __width = warpSize) { \
     long long __tmp; \
     _Static_assert(sizeof(__tmp) == sizeof(__val)); \
@@ -81,17 +99,166 @@
     return __ret; \
   }
-__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f);
+__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
 // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
 // maxLane.
-__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0);
-__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f);
-__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
-
+__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
+                unsigned int);
+__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
+                unsigned int);
+__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
+                int);
 #pragma pop_macro("__MAKE_SHUFFLES")
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+#if CUDA_VERSION >= 9000
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+// __shfl_sync_* variants available in CUDA-9
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
+                             __Mask, __Type) \
+  inline __device__ int __FnName(unsigned int __mask, int __val, \
+                                 __Type __offset, int __width = warpSize) { \
+    return __IntIntrinsic(__mask, __val, __offset, \
+                          ((warpSize - __width) << 8) | (__Mask)); \
+  } \
+  inline __device__ float __FnName(unsigned int __mask, float __val, \
+                                   __Type __offset, int __width = warpSize) { \
+    return __FloatIntrinsic(__mask, __val, __offset, \
+                            ((warpSize - __width) << 8) | (__Mask)); \
+  } \
+  inline __device__ unsigned int __FnName(unsigned int __mask, \
+                                          unsigned int __val, __Type __offset, \
+                                          int __width = warpSize) { \
+    return static_cast<unsigned int>( \
+        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
+  } \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
+                                       __Type __offset, \
+                                       int __width = warpSize) { \
+    struct __Bits { \
+      int __a, __b; \
+    }; \
+    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
+    __Bits __tmp; \
+    memcpy(&__tmp, &__val, sizeof(__val)); \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
+    long long __ret; \
+    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
+    return __ret; \
+  } \
+  inline __device__ unsigned long long __FnName( \
+      unsigned int __mask, unsigned long long __val, __Type __offset, \
+      int __width = warpSize) { \
+    return static_cast<unsigned long long>(::__FnName( \
+        __mask, static_cast<long long>(__val), __offset, __width)); \
+  } \
+  inline __device__ long __FnName(unsigned int __mask, long __val, \
+                                  __Type __offset, int __width = warpSize) { \
+    _Static_assert(sizeof(long) == sizeof(long long) || \
+                   sizeof(long) == sizeof(int)); \
+    if (sizeof(long) == sizeof(long long)) { \
+      return static_cast<long>(::__FnName( \
+          __mask, static_cast<long long>(__val), __offset, __width)); \
+    } else if (sizeof(long) == sizeof(int)) { \
+      return static_cast<long>( \
+          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
+    } \
+  } \
+  inline __device__ unsigned long __FnName( \
+      unsigned int __mask, unsigned long __val, __Type __offset, \
+      int __width = warpSize) { \
+    return static_cast<unsigned long>( \
+        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
+  } \
+  inline __device__ double __FnName(unsigned int __mask, double __val, \
+                                    __Type __offset, int __width = warpSize) { \
+    long long __tmp; \
+    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
+    memcpy(&__tmp, &__val, sizeof(__val)); \
+    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
+    double __ret; \
+    memcpy(&__ret, &__tmp, sizeof(__ret)); \
+    return __ret; \
+  }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
+                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
+__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
+                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+
+inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
+  return __nvvm_bar_warp_sync(mask);
+}
+
+inline __device__ void __barrier_sync(unsigned int id) {
+  __nvvm_barrier_sync(id);
+}
+
+inline __device__ void __barrier_sync_count(unsigned int id,
+                                            unsigned int count) {
+  __nvvm_barrier_sync_cnt(id, count);
+}
+
+inline __device__ int __all_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_all_sync(mask, pred);
+}
+
+inline __device__ int __any_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_any_sync(mask, pred);
+}
+
+inline __device__ int __uni_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_uni_sync(mask, pred);
+}
+
+inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+
+inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
+  return __nvvm_fns(mask, base, offset);
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// Define __match* builtins CUDA-9 headers expect to see.
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +inline __device__ unsigned int __match32_any_sync(unsigned int mask, + unsigned int value) { + return __nvvm_match_any_sync_i32(mask, value); +} + +inline __device__ unsigned long long +__match64_any_sync(unsigned int mask, unsigned long long value) { + return __nvvm_match_any_sync_i64(mask, value); +} + +inline __device__ unsigned int +__match32_all_sync(unsigned int mask, unsigned int value, int *pred) { + return __nvvm_match_all_sync_i32p(mask, value, pred); +} + +inline __device__ unsigned long long +__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) { + return __nvvm_match_all_sync_i64p(mask, value, pred); +} +#include "crt/sm_70_rt.hpp" + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#endif // __CUDA_VERSION >= 9000 + // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}. // Prevent the vanilla sm_32 intrinsics header from being included. diff --git a/c_headers/__clang_cuda_math_forward_declares.h b/c_headers/__clang_cuda_math_forward_declares.h index 49c805151d..c31b1f4cda 100644 --- a/c_headers/__clang_cuda_math_forward_declares.h +++ b/c_headers/__clang_cuda_math_forward_declares.h @@ -149,9 +149,6 @@ __DEVICE__ double nearbyint(double); __DEVICE__ float nearbyint(float); __DEVICE__ double nextafter(double, double); __DEVICE__ float nextafter(float, float); -__DEVICE__ double nexttoward(double, double); -__DEVICE__ float nexttoward(float, double); -__DEVICE__ float nexttowardf(float, double); __DEVICE__ double pow(double, double); __DEVICE__ double pow(double, int); __DEVICE__ float pow(float, float); @@ -185,6 +182,10 @@ __DEVICE__ float tgamma(float); __DEVICE__ double trunc(double); __DEVICE__ float trunc(float); +// Notably missing above is nexttoward, which we don't define on +// the device side because libdevice doesn't give us an implementation, and we +// don't want to be in the business of writing one ourselves. + // We need to define these overloads in exactly the namespace our standard // library uses (including the right inline namespace), otherwise they won't be // picked up by other functions in the standard library (e.g. functions in @@ -255,7 +256,6 @@ using ::nan; using ::nanf; using ::nearbyint; using ::nextafter; -using ::nexttoward; using ::pow; using ::remainder; using ::remquo; diff --git a/c_headers/__clang_cuda_runtime_wrapper.h b/c_headers/__clang_cuda_runtime_wrapper.h index 931d44b696..a82a8490f3 100644 --- a/c_headers/__clang_cuda_runtime_wrapper.h +++ b/c_headers/__clang_cuda_runtime_wrapper.h @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000 #error "Unsupported CUDA version!" #endif @@ -86,7 +86,11 @@ #define __COMMON_FUNCTIONS_H__ #undef __CUDACC__ +#if CUDA_VERSION < 9000 #define __CUDABE__ +#else +#define __CUDA_LIBDEVICE__ +#endif // Disables definitions of device-side runtime support stubs in // cuda_device_runtime_api.h #include "driver_types.h" @@ -94,6 +98,7 @@ #include "host_defines.h" #undef __CUDABE__ +#undef __CUDA_LIBDEVICE__ #define __CUDACC__ #include "cuda_runtime.h" @@ -105,7 +110,9 @@ #define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n) #define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n) +#if CUDA_VERSION < 9000 #include "crt/device_runtime.h" +#endif #include "crt/host_runtime.h" // device_runtime.h defines __cxa_* macros that will conflict with // cxxabi.h. 
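// Illustrative aside, not part of the patch: a minimal sketch of how the
// __shfl_down_sync wrapper added to __clang_cuda_intrinsics.h above is
// typically called from CUDA-9 device code. The helper name warp_sum and the
// full-warp mask constant below are assumptions for this example only.
__device__ int warp_sum(int v) {
  const unsigned int mask = 0xffffffffu; // all 32 lanes of the warp participate
  // Tree reduction: after log2(warpSize) steps, lane 0 holds the warp total.
  for (int delta = warpSize / 2; delta > 0; delta /= 2)
    v += __shfl_down_sync(mask, v, delta);
  return v;
}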
@@ -166,7 +173,18 @@ inline __host__ double __signbitd(double x) { // __device__. #pragma push_macro("__forceinline__") #define __forceinline__ __device__ __inline__ __attribute__((always_inline)) + +#pragma push_macro("__float2half_rn") +#if CUDA_VERSION >= 9000 +// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in +// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in +// device_functions.hpp out of the way. +#define __float2half_rn __float2half_rn_disabled +#endif + #include "device_functions.hpp" +#pragma pop_macro("__float2half_rn") + // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we // get the slow-but-accurate or fast-but-inaccurate versions of functions like @@ -247,7 +265,23 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); } #pragma push_macro("__GNUC__") #undef __GNUC__ #define signbit __ignored_cuda_signbit + +// CUDA-9 omits device-side definitions of some math functions if it sees +// include guard from math.h wrapper from libstdc++. We have to undo the header +// guard temporarily to get the definitions we need. +#pragma push_macro("_GLIBCXX_MATH_H") +#pragma push_macro("_LIBCPP_VERSION") +#if CUDA_VERSION >= 9000 +#undef _GLIBCXX_MATH_H +// We also need to undo another guard that checks for libc++ 3.8+ +#ifdef _LIBCPP_VERSION +#define _LIBCPP_VERSION 3700 +#endif +#endif + #include "math_functions.hpp" +#pragma pop_macro("_GLIBCXX_MATH_H") +#pragma pop_macro("_LIBCPP_VERSION") #pragma pop_macro("__GNUC__") #pragma pop_macro("signbit") diff --git a/c_headers/arm64intr.h b/c_headers/arm64intr.h new file mode 100644 index 0000000000..be52283618 --- /dev/null +++ b/c_headers/arm64intr.h @@ -0,0 +1,49 @@ +/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +/* Only include this if we're compiling for the windows platform. 
*/ +#ifndef _MSC_VER +#include_next +#else + +#ifndef __ARM64INTR_H +#define __ARM64INTR_H + +typedef enum +{ + _ARM64_BARRIER_SY = 0xF, + _ARM64_BARRIER_ST = 0xE, + _ARM64_BARRIER_LD = 0xD, + _ARM64_BARRIER_ISH = 0xB, + _ARM64_BARRIER_ISHST = 0xA, + _ARM64_BARRIER_ISHLD = 0x9, + _ARM64_BARRIER_NSH = 0x7, + _ARM64_BARRIER_NSHST = 0x6, + _ARM64_BARRIER_NSHLD = 0x5, + _ARM64_BARRIER_OSH = 0x3, + _ARM64_BARRIER_OSHST = 0x2, + _ARM64_BARRIER_OSHLD = 0x1 +} _ARM64INTR_BARRIER_TYPE; + +#endif /* __ARM64INTR_H */ +#endif /* _MSC_VER */ diff --git a/c_headers/arm_neon.h b/c_headers/arm_neon.h index f5ca59bb32..3da63d994d 100644 --- a/c_headers/arm_neon.h +++ b/c_headers/arm_neon.h @@ -40460,6 +40460,3318 @@ __ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) } #endif +#endif +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__) +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vabd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vabd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vabsq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vabsq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vabs_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vabs_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vabs_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vaddq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 + __p1; + return __ret; +} +#else +__ai float16x8_t vaddq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + 
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 + __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 + __p1; + return __ret; +} +#else +__ai float16x4_t vadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 + __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vbslq_f16(uint16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vbslq_f16(uint16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vbsl_f16(uint16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vbsl_f16(uint16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcageq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); + return __ret; +} +#else +__ai uint16x8_t vcageq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcage_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 17); + return __ret; +} +#else +__ai uint16x4_t vcage_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + 
__ret = (uint16x4_t) __builtin_neon_vcage_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcagtq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); + return __ret; +} +#else +__ai uint16x8_t vcagtq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcagt_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 17); + return __ret; +} +#else +__ai uint16x4_t vcagt_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcagt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcaleq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); + return __ret; +} +#else +__ai uint16x8_t vcaleq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcale_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 17); + return __ret; +} +#else +__ai uint16x4_t vcale_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcale_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcaltq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); + return __ret; +} +#else +__ai uint16x8_t vcaltq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai 
uint16x4_t vcalt_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 17); + return __ret; +} +#else +__ai uint16x4_t vcalt_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcalt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vceqq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 == __p1); + return __ret; +} +#else +__ai uint16x8_t vceqq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 == __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vceq_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 == __p1); + return __ret; +} +#else +__ai uint16x4_t vceq_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 == __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vceqzq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vceqzq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vceqz_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vceqz_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcgeq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 >= __p1); + return __ret; +} +#else +__ai uint16x8_t vcgeq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 >= __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcge_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 >= __p1); + return __ret; +} +#else +__ai uint16x4_t 
vcge_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 >= __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcgezq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcgezq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcgez_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcgez_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcgtq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 > __p1); + return __ret; +} +#else +__ai uint16x8_t vcgtq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 > __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcgt_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 > __p1); + return __ret; +} +#else +__ai uint16x4_t vcgt_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 > __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcgtzq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcgtzq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcgtz_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcgtz_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai 
uint16x8_t vcleq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 <= __p1); + return __ret; +} +#else +__ai uint16x8_t vcleq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 <= __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcle_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 <= __p1); + return __ret; +} +#else +__ai uint16x4_t vcle_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 <= __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vclezq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vclezq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vclez_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vclez_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcltq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 < __p1); + return __ret; +} +#else +__ai uint16x8_t vcltq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 < __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vclt_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 < __p1); + return __ret; +} +#else +__ai uint16x4_t vclt_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 < __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcltzq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcltzq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, 
__p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcltz_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcltz_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcvtq_f16_u16(uint16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai float16x8_t vcvtq_f16_u16(uint16x8_t __p0) { + uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcvtq_f16_s16(int16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai float16x8_t vcvtq_f16_s16(int16x8_t __p0) { + int16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcvt_f16_u16(uint16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai float16x4_t vcvt_f16_u16(uint16x4_t __p0) { + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcvt_f16_s16(int16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai float16x4_t vcvt_f16_s16(int16x4_t __p0) { + int16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtq_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16x8_t __s0 = __p0; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__s0, __p1, 49); \ + __ret; \ +}) +#else +#define vcvtq_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16x8_t __s0 = __p0; \ + uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__rev0, __p1, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtq_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16x8_t __s0 = __p0; \ + float16x8_t __ret; \ + 
__ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__s0, __p1, 33); \ + __ret; \ +}) +#else +#define vcvtq_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16x8_t __s0 = __p0; \ + int16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__rev0, __p1, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvt_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16x4_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__s0, __p1, 17); \ + __ret; \ +}) +#else +#define vcvt_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16x4_t __s0 = __p0; \ + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__rev0, __p1, 17); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvt_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16x4_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__s0, __p1, 1); \ + __ret; \ +}) +#else +#define vcvt_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16x4_t __s0 = __p0; \ + int16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__rev0, __p1, 1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtq_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + int16x8_t __ret; \ + __ret = (int16x8_t) __builtin_neon_vcvtq_n_s16_v((int8x16_t)__s0, __p1, 33); \ + __ret; \ +}) +#else +#define vcvtq_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret; \ + __ret = (int16x8_t) __builtin_neon_vcvtq_n_s16_v((int8x16_t)__rev0, __p1, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvt_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + int16x4_t __ret; \ + __ret = (int16x4_t) __builtin_neon_vcvt_n_s16_v((int8x8_t)__s0, __p1, 1); \ + __ret; \ +}) +#else +#define vcvt_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + int16x4_t __ret; \ + __ret = (int16x4_t) __builtin_neon_vcvt_n_s16_v((int8x8_t)__rev0, __p1, 1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtq_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + uint16x8_t __ret; \ + __ret = (uint16x8_t) __builtin_neon_vcvtq_n_u16_v((int8x16_t)__s0, __p1, 49); \ + __ret; \ +}) +#else +#define vcvtq_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret; \ + __ret = (uint16x8_t) __builtin_neon_vcvtq_n_u16_v((int8x16_t)__rev0, __p1, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvt_n_u16_f16(__p0, __p1) 
__extension__ ({ \ + float16x4_t __s0 = __p0; \ + uint16x4_t __ret; \ + __ret = (uint16x4_t) __builtin_neon_vcvt_n_u16_v((int8x8_t)__s0, __p1, 17); \ + __ret; \ +}) +#else +#define vcvt_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint16x4_t __ret; \ + __ret = (uint16x4_t) __builtin_neon_vcvt_n_u16_v((int8x8_t)__rev0, __p1, 17); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvt_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvt_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvt_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvt_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvt_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvt_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvt_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvt_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtaq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtaq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtaq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtaq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvta_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvta_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvta_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvta_s16_v((int8x8_t)__rev0, 1); + __ret = 
__builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtaq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtaq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtaq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtaq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvta_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvta_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvta_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvta_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtmq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtmq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtmq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtmq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvtm_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtm_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvtm_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtm_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtmq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtmq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtmq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtmq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvtm_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtm_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvtm_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtm_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtnq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtnq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtnq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t 
__ret; + __ret = (int16x8_t) __builtin_neon_vcvtnq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvtn_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtn_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvtn_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtn_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtnq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtnq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtnq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtnq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvtn_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtn_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvtn_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtn_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtpq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtpq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtpq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtpq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvtp_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtp_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvtp_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtp_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtpq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtpq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtpq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtpq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvtp_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtp_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvtp_u16_f16(float16x4_t __p0) { + 
float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtp_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 / __p1; + return __ret; +} +#else +__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 / __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 / __p1; + return __ret; +} +#else +__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 / __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vduph_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__s0, __p1); \ + __ret; \ +}) +#else +#define vduph_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__rev0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__s0, __p1); \ + __ret; \ +}) +#else +#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__rev0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vext_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 8); \ + __ret; \ +}) +#else +#define vext_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t 
__s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 8); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__s2, __p3); \ + __ret; \ +}) +#else +#define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__rev2, __p3); \ + __ret; \ +}) +#define __noswap_vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__s2, __p3); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmaq_lane_f16(__p0, __p1, __p2, 
__p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 40); \ + __ret; \ +}) +#else +#define vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, __p3, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#define __noswap_vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 40); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 8); \ + __ret; \ +}) +#else +#define vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, __p3, 8); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#define __noswap_vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 8); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__s2, __p3); \ + __ret; \ +}) +#else +#define vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__rev2, __p3); \ + __ret; \ +}) +#define __noswap_vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__s2, __p3); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define 
vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 40); \ + __ret; \ +}) +#else +#define vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#define __noswap_vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 40); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 8); \ + __ret; \ +}) +#else +#define vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x16_t)__rev2, __p3, 8); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#define __noswap_vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 8); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmaq_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = vfmaq_f16(__s0, __s1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \ + __ret; \ +}) +#else +#define vfmaq_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vfmaq_f16(__rev0, __rev1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfma_n_f16(__p0, __p1, __p2) 
__extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = vfma_f16(__s0, __s1, (float16x4_t) {__s2, __s2, __s2, __s2}); \ + __ret; \ +}) +#else +#define vfma_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vfma_f16(__rev0, __rev1, (float16x4_t) {__s2, __s2, __s2, __s2}); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = vfmaq_f16(__p0, -__p1, __p2); + return __ret; +} +#else +__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __noswap_vfmaq_f16(__rev0, -__rev1, __rev2); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = vfma_f16(__p0, -__p1, __p2); + return __ret; +} +#else +__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __noswap_vfma_f16(__rev0, -__rev1, __rev2); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsh_lane_f16(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \ + float16_t __s0_0 = __p0_0; \ + float16_t __s1_0 = __p1_0; \ + float16x4_t __s2_0 = __p2_0; \ + float16_t __ret_0; \ + __ret_0 = vfmah_lane_f16(__s0_0, -__s1_0, __s2_0, __p3_0); \ + __ret_0; \ +}) +#else +#define vfmsh_lane_f16(__p0_1, __p1_1, __p2_1, __p3_1) __extension__ ({ \ + float16_t __s0_1 = __p0_1; \ + float16_t __s1_1 = __p1_1; \ + float16x4_t __s2_1 = __p2_1; \ + float16x4_t __rev2_1; __rev2_1 = __builtin_shufflevector(__s2_1, __s2_1, 3, 2, 1, 0); \ + float16_t __ret_1; \ + __ret_1 = __noswap_vfmah_lane_f16(__s0_1, -__s1_1, __rev2_1, __p3_1); \ + __ret_1; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_lane_f16(__p0_2, __p1_2, __p2_2, __p3_2) __extension__ ({ \ + float16x8_t __s0_2 = __p0_2; \ + float16x8_t __s1_2 = __p1_2; \ + float16x4_t __s2_2 = __p2_2; \ + float16x8_t __ret_2; \ + __ret_2 = vfmaq_lane_f16(__s0_2, -__s1_2, __s2_2, __p3_2); \ + __ret_2; \ +}) +#else +#define vfmsq_lane_f16(__p0_3, __p1_3, __p2_3, __p3_3) __extension__ ({ \ + float16x8_t __s0_3 = __p0_3; \ + float16x8_t __s1_3 = __p1_3; \ + float16x4_t __s2_3 = __p2_3; \ + float16x8_t __rev0_3; __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_3; __rev1_3 = __builtin_shufflevector(__s1_3, __s1_3, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_3; 
__rev2_3 = __builtin_shufflevector(__s2_3, __s2_3, 3, 2, 1, 0); \ + float16x8_t __ret_3; \ + __ret_3 = __noswap_vfmaq_lane_f16(__rev0_3, -__rev1_3, __rev2_3, __p3_3); \ + __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_3; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_lane_f16(__p0_4, __p1_4, __p2_4, __p3_4) __extension__ ({ \ + float16x4_t __s0_4 = __p0_4; \ + float16x4_t __s1_4 = __p1_4; \ + float16x4_t __s2_4 = __p2_4; \ + float16x4_t __ret_4; \ + __ret_4 = vfma_lane_f16(__s0_4, -__s1_4, __s2_4, __p3_4); \ + __ret_4; \ +}) +#else +#define vfms_lane_f16(__p0_5, __p1_5, __p2_5, __p3_5) __extension__ ({ \ + float16x4_t __s0_5 = __p0_5; \ + float16x4_t __s1_5 = __p1_5; \ + float16x4_t __s2_5 = __p2_5; \ + float16x4_t __rev0_5; __rev0_5 = __builtin_shufflevector(__s0_5, __s0_5, 3, 2, 1, 0); \ + float16x4_t __rev1_5; __rev1_5 = __builtin_shufflevector(__s1_5, __s1_5, 3, 2, 1, 0); \ + float16x4_t __rev2_5; __rev2_5 = __builtin_shufflevector(__s2_5, __s2_5, 3, 2, 1, 0); \ + float16x4_t __ret_5; \ + __ret_5 = __noswap_vfma_lane_f16(__rev0_5, -__rev1_5, __rev2_5, __p3_5); \ + __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 3, 2, 1, 0); \ + __ret_5; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsh_laneq_f16(__p0_6, __p1_6, __p2_6, __p3_6) __extension__ ({ \ + float16_t __s0_6 = __p0_6; \ + float16_t __s1_6 = __p1_6; \ + float16x8_t __s2_6 = __p2_6; \ + float16_t __ret_6; \ + __ret_6 = vfmah_laneq_f16(__s0_6, -__s1_6, __s2_6, __p3_6); \ + __ret_6; \ +}) +#else +#define vfmsh_laneq_f16(__p0_7, __p1_7, __p2_7, __p3_7) __extension__ ({ \ + float16_t __s0_7 = __p0_7; \ + float16_t __s1_7 = __p1_7; \ + float16x8_t __s2_7 = __p2_7; \ + float16x8_t __rev2_7; __rev2_7 = __builtin_shufflevector(__s2_7, __s2_7, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_7; \ + __ret_7 = __noswap_vfmah_laneq_f16(__s0_7, -__s1_7, __rev2_7, __p3_7); \ + __ret_7; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_laneq_f16(__p0_8, __p1_8, __p2_8, __p3_8) __extension__ ({ \ + float16x8_t __s0_8 = __p0_8; \ + float16x8_t __s1_8 = __p1_8; \ + float16x8_t __s2_8 = __p2_8; \ + float16x8_t __ret_8; \ + __ret_8 = vfmaq_laneq_f16(__s0_8, -__s1_8, __s2_8, __p3_8); \ + __ret_8; \ +}) +#else +#define vfmsq_laneq_f16(__p0_9, __p1_9, __p2_9, __p3_9) __extension__ ({ \ + float16x8_t __s0_9 = __p0_9; \ + float16x8_t __s1_9 = __p1_9; \ + float16x8_t __s2_9 = __p2_9; \ + float16x8_t __rev0_9; __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_9; __rev1_9 = __builtin_shufflevector(__s1_9, __s1_9, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_9; __rev2_9 = __builtin_shufflevector(__s2_9, __s2_9, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_9; \ + __ret_9 = __noswap_vfmaq_laneq_f16(__rev0_9, -__rev1_9, __rev2_9, __p3_9); \ + __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_9; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_laneq_f16(__p0_10, __p1_10, __p2_10, __p3_10) __extension__ ({ \ + float16x4_t __s0_10 = __p0_10; \ + float16x4_t __s1_10 = __p1_10; \ + float16x8_t __s2_10 = __p2_10; \ + float16x4_t __ret_10; \ + __ret_10 = vfma_laneq_f16(__s0_10, -__s1_10, __s2_10, __p3_10); \ + __ret_10; \ +}) +#else +#define vfms_laneq_f16(__p0_11, __p1_11, __p2_11, __p3_11) __extension__ ({ \ + float16x4_t __s0_11 = __p0_11; \ + float16x4_t __s1_11 = __p1_11; \ + float16x8_t __s2_11 = __p2_11; \ + float16x4_t __rev0_11; __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 3, 2, 1, 0); \ + 
float16x4_t __rev1_11; __rev1_11 = __builtin_shufflevector(__s1_11, __s1_11, 3, 2, 1, 0); \ + float16x8_t __rev2_11; __rev2_11 = __builtin_shufflevector(__s2_11, __s2_11, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_11; \ + __ret_11 = __noswap_vfma_laneq_f16(__rev0_11, -__rev1_11, __rev2_11, __p3_11); \ + __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 3, 2, 1, 0); \ + __ret_11; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = vfmaq_f16(__s0, -__s1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \ + __ret; \ +}) +#else +#define vfmsq_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vfmaq_f16(__rev0, -__rev1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = vfma_f16(__s0, -__s1, (float16x4_t) {__s2, __s2, __s2, __s2}); \ + __ret; \ +}) +#else +#define vfms_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vfma_f16(__rev0, -__rev1, (float16x4_t) {__s2, __s2, __s2, __s2}); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) 
__builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxnmvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmvq_f16((int8x16_t)__s0); \ + __ret; \ +}) +#else +#define vmaxnmvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmvq_f16((int8x16_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxnmv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmv_f16((int8x8_t)__s0); \ + __ret; \ +}) +#else +#define vmaxnmv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmv_f16((int8x8_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxvq_f16((int8x16_t)__s0); \ + __ret; \ +}) +#else +#define vmaxvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxvq_f16((int8x16_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxv_f16((int8x8_t)__s0); \ + __ret; \ +}) +#else +#define vmaxv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxv_f16((int8x8_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + 
float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminnmvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmvq_f16((int8x16_t)__s0); \ + __ret; \ +}) +#else +#define vminnmvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmvq_f16((int8x16_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminnmv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmv_f16((int8x8_t)__s0); \ + __ret; \ +}) +#else +#define vminnmv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmv_f16((int8x8_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminvq_f16((int8x16_t)__s0); \ + __ret; \ +}) +#else +#define vminvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 
= __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminvq_f16((int8x16_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminv_f16((int8x8_t)__s0); \ + __ret; \ +}) +#else +#define vminv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminv_f16((int8x8_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 * __p1; + return __ret; +} +#else +__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 * __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 * __p1; + return __ret; +} +#else +__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 * __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret; \ +}) +#else +#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \ + __ret; \ +}) +#else +#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = __s0 * 
__builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret; \ +}) +#else +#define vmulq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmul_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \ + __ret; \ +}) +#else +#define vmul_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulq_n_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = __s0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \ + __ret; \ +}) +#else +#define vmulq_n_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __rev0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmul_n_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = __s0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \ + __ret; \ +}) +#else +#define vmul_n_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __rev0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vmulxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + 
return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmulx_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmulx_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulxq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = vmulxq_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \ + __ret; \ +}) +#else +#define vmulxq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vmulxq_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulx_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = vmulx_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \ + __ret; \ +}) +#else +#define vmulx_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vmulx_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulxq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = vmulxq_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \ + __ret; \ +}) +#else +#define vmulxq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vmulxq_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulx_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = 
vmulx_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \ + __ret; \ +}) +#else +#define vmulx_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vmulx_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulxq_n_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = vmulxq_f16(__s0, (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}); \ + __ret; \ +}) +#else +#define vmulxq_n_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vmulxq_f16(__rev0, (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulx_n_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = vmulx_f16(__s0, (float16x4_t) {__s1, __s1, __s1, __s1}); \ + __ret; \ +}) +#else +#define vmulx_n_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vmulx_f16(__rev0, (float16x4_t) {__s1, __s1, __s1, __s1}); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vnegq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = -__p0; + return __ret; +} +#else +__ai float16x8_t vnegq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = -__rev0; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vneg_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = -__p0; + return __ret; +} +#else +__ai float16x4_t vneg_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = -__rev0; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) { + 
float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t 
__rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrecpeq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrecpeq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrecpe_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrecpe_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) 
__builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrev64q_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4); + return __ret; +} +#else +__ai float16x8_t vrev64q_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrev64_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + return __ret; +} +#else +__ai float16x4_t vrev64_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrnd_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrnd_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndaq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndaq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 
5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrnda_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrnda_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndiq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndiq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndi_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndi_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndmq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndmq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndm_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndm_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndnq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndnq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndn_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndn_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = 
__builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndpq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndpq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndp_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndp_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndxq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndxq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndx_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndx_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrsqrte_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrsqrte_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai 
float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vsqrtq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vsqrtq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vsqrt_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vsqrt_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vsqrt_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 - __p1; + return __ret; +} +#else +__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 - __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 - __p1; + return __ret; +} +#else +__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 - __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = 
__builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14); + return __ret; +} +#else +__ai float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vtrn1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6); + return __ret; +} +#else +__ai float16x4_t vtrn1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vtrn2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15); + return __ret; +} +#else +__ai float16x8_t vtrn2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7); + return __ret; +} +#else +__ai float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + 
float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 3, 7); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14); + return __ret; +} +#else +__ai float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vuzp1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6); + return __ret; +} +#else +__ai float16x4_t vuzp1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vuzp2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15); + return __ret; +} +#else +__ai float16x8_t vuzp2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15); + __ret = __builtin_shufflevector(__ret, 
__ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7); + return __ret; +} +#else +__ai float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11); + return __ret; +} +#else +__ai float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vzip1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5); + return __ret; +} +#else +__ai float16x4_t vzip1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vzip2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret 
= __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15); + return __ret; +} +#else +__ai float16x8_t vzip2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vzip2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7); + return __ret; +} +#else +__ai float16x4_t vzip2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + #endif #if defined(__ARM_FEATURE_QRDMX) #ifdef __LITTLE_ENDIAN__ @@ -44220,917 +47532,917 @@ __ai float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p8(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \ - poly8x16_t __s0_0 = __p0_0; \ - poly8x8_t __s2_0 = __p2_0; \ - poly8x16_t __ret_0; \ - __ret_0 = vsetq_lane_p8(vget_lane_p8(__s2_0, __p3_0), __s0_0, __p1_0); \ - __ret_0; \ -}) -#else -#define vcopyq_lane_p8(__p0_1, __p1_1, __p2_1, __p3_1) __extension__ ({ \ - poly8x16_t __s0_1 = __p0_1; \ - poly8x8_t __s2_1 = __p2_1; \ - poly8x16_t __rev0_1; __rev0_1 = __builtin_shufflevector(__s0_1, __s0_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __rev2_1; __rev2_1 = __builtin_shufflevector(__s2_1, __s2_1, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __ret_1; \ - __ret_1 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_1, __p3_1), __rev0_1, __p1_1); \ - __ret_1 = __builtin_shufflevector(__ret_1, __ret_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_1; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p16(__p0_2, __p1_2, __p2_2, __p3_2) __extension__ ({ \ - poly16x8_t __s0_2 = __p0_2; \ - poly16x4_t __s2_2 = __p2_2; \ - poly16x8_t __ret_2; \ - __ret_2 = vsetq_lane_p16(vget_lane_p16(__s2_2, __p3_2), __s0_2, __p1_2); \ - __ret_2; \ -}) -#else -#define vcopyq_lane_p16(__p0_3, __p1_3, __p2_3, __p3_3) __extension__ ({ \ - poly16x8_t __s0_3 = __p0_3; \ - poly16x4_t __s2_3 = __p2_3; \ - poly16x8_t __rev0_3; __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x4_t __rev2_3; __rev2_3 = __builtin_shufflevector(__s2_3, __s2_3, 3, 2, 1, 0); \ - poly16x8_t __ret_3; \ - __ret_3 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_3, __p3_3), __rev0_3, __p1_3); \ - __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_3; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u8(__p0_4, __p1_4, __p2_4, __p3_4) __extension__ ({ \ - uint8x16_t __s0_4 = __p0_4; \ - uint8x8_t __s2_4 = __p2_4; \ - uint8x16_t __ret_4; \ - __ret_4 = vsetq_lane_u8(vget_lane_u8(__s2_4, __p3_4), __s0_4, __p1_4); \ - __ret_4; \ -}) -#else -#define vcopyq_lane_u8(__p0_5, __p1_5, __p2_5, __p3_5) __extension__ ({ \ - uint8x16_t __s0_5 = __p0_5; \ - uint8x8_t __s2_5 = __p2_5; \ - uint8x16_t __rev0_5; __rev0_5 = 
__builtin_shufflevector(__s0_5, __s0_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_5; __rev2_5 = __builtin_shufflevector(__s2_5, __s2_5, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_5; \ - __ret_5 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_5, __p3_5), __rev0_5, __p1_5); \ - __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_5; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u32(__p0_6, __p1_6, __p2_6, __p3_6) __extension__ ({ \ - uint32x4_t __s0_6 = __p0_6; \ - uint32x2_t __s2_6 = __p2_6; \ - uint32x4_t __ret_6; \ - __ret_6 = vsetq_lane_u32(vget_lane_u32(__s2_6, __p3_6), __s0_6, __p1_6); \ - __ret_6; \ -}) -#else -#define vcopyq_lane_u32(__p0_7, __p1_7, __p2_7, __p3_7) __extension__ ({ \ - uint32x4_t __s0_7 = __p0_7; \ - uint32x2_t __s2_7 = __p2_7; \ - uint32x4_t __rev0_7; __rev0_7 = __builtin_shufflevector(__s0_7, __s0_7, 3, 2, 1, 0); \ - uint32x2_t __rev2_7; __rev2_7 = __builtin_shufflevector(__s2_7, __s2_7, 1, 0); \ - uint32x4_t __ret_7; \ - __ret_7 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_7, __p3_7), __rev0_7, __p1_7); \ - __ret_7 = __builtin_shufflevector(__ret_7, __ret_7, 3, 2, 1, 0); \ - __ret_7; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u64(__p0_8, __p1_8, __p2_8, __p3_8) __extension__ ({ \ - uint64x2_t __s0_8 = __p0_8; \ - uint64x1_t __s2_8 = __p2_8; \ - uint64x2_t __ret_8; \ - __ret_8 = vsetq_lane_u64(vget_lane_u64(__s2_8, __p3_8), __s0_8, __p1_8); \ - __ret_8; \ -}) -#else -#define vcopyq_lane_u64(__p0_9, __p1_9, __p2_9, __p3_9) __extension__ ({ \ - uint64x2_t __s0_9 = __p0_9; \ - uint64x1_t __s2_9 = __p2_9; \ - uint64x2_t __rev0_9; __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 1, 0); \ - uint64x2_t __ret_9; \ - __ret_9 = __noswap_vsetq_lane_u64(__noswap_vget_lane_u64(__s2_9, __p3_9), __rev0_9, __p1_9); \ - __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 1, 0); \ - __ret_9; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u16(__p0_10, __p1_10, __p2_10, __p3_10) __extension__ ({ \ - uint16x8_t __s0_10 = __p0_10; \ - uint16x4_t __s2_10 = __p2_10; \ - uint16x8_t __ret_10; \ - __ret_10 = vsetq_lane_u16(vget_lane_u16(__s2_10, __p3_10), __s0_10, __p1_10); \ - __ret_10; \ -}) -#else -#define vcopyq_lane_u16(__p0_11, __p1_11, __p2_11, __p3_11) __extension__ ({ \ - uint16x8_t __s0_11 = __p0_11; \ - uint16x4_t __s2_11 = __p2_11; \ - uint16x8_t __rev0_11; __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __rev2_11; __rev2_11 = __builtin_shufflevector(__s2_11, __s2_11, 3, 2, 1, 0); \ - uint16x8_t __ret_11; \ - __ret_11 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_11, __p3_11), __rev0_11, __p1_11); \ - __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_11; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s8(__p0_12, __p1_12, __p2_12, __p3_12) __extension__ ({ \ - int8x16_t __s0_12 = __p0_12; \ - int8x8_t __s2_12 = __p2_12; \ - int8x16_t __ret_12; \ - __ret_12 = vsetq_lane_s8(vget_lane_s8(__s2_12, __p3_12), __s0_12, __p1_12); \ +#define vcopyq_lane_p8(__p0_12, __p1_12, __p2_12, __p3_12) __extension__ ({ \ + poly8x16_t __s0_12 = __p0_12; \ + poly8x8_t __s2_12 = __p2_12; \ + poly8x16_t __ret_12; \ + __ret_12 = vsetq_lane_p8(vget_lane_p8(__s2_12, __p3_12), __s0_12, __p1_12); \ __ret_12; \ }) #else -#define vcopyq_lane_s8(__p0_13, __p1_13, __p2_13, __p3_13) __extension__ ({ \ - int8x16_t __s0_13 = __p0_13; \ 
- int8x8_t __s2_13 = __p2_13; \ - int8x16_t __rev0_13; __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_13; __rev2_13 = __builtin_shufflevector(__s2_13, __s2_13, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_13; \ - __ret_13 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_13, __p3_13), __rev0_13, __p1_13); \ +#define vcopyq_lane_p8(__p0_13, __p1_13, __p2_13, __p3_13) __extension__ ({ \ + poly8x16_t __s0_13 = __p0_13; \ + poly8x8_t __s2_13 = __p2_13; \ + poly8x16_t __rev0_13; __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __rev2_13; __rev2_13 = __builtin_shufflevector(__s2_13, __s2_13, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __ret_13; \ + __ret_13 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_13, __p3_13), __rev0_13, __p1_13); \ __ret_13 = __builtin_shufflevector(__ret_13, __ret_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_13; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_f32(__p0_14, __p1_14, __p2_14, __p3_14) __extension__ ({ \ - float32x4_t __s0_14 = __p0_14; \ - float32x2_t __s2_14 = __p2_14; \ - float32x4_t __ret_14; \ - __ret_14 = vsetq_lane_f32(vget_lane_f32(__s2_14, __p3_14), __s0_14, __p1_14); \ +#define vcopyq_lane_p16(__p0_14, __p1_14, __p2_14, __p3_14) __extension__ ({ \ + poly16x8_t __s0_14 = __p0_14; \ + poly16x4_t __s2_14 = __p2_14; \ + poly16x8_t __ret_14; \ + __ret_14 = vsetq_lane_p16(vget_lane_p16(__s2_14, __p3_14), __s0_14, __p1_14); \ __ret_14; \ }) #else -#define vcopyq_lane_f32(__p0_15, __p1_15, __p2_15, __p3_15) __extension__ ({ \ - float32x4_t __s0_15 = __p0_15; \ - float32x2_t __s2_15 = __p2_15; \ - float32x4_t __rev0_15; __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 3, 2, 1, 0); \ - float32x2_t __rev2_15; __rev2_15 = __builtin_shufflevector(__s2_15, __s2_15, 1, 0); \ - float32x4_t __ret_15; \ - __ret_15 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_15, __p3_15), __rev0_15, __p1_15); \ - __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 3, 2, 1, 0); \ +#define vcopyq_lane_p16(__p0_15, __p1_15, __p2_15, __p3_15) __extension__ ({ \ + poly16x8_t __s0_15 = __p0_15; \ + poly16x4_t __s2_15 = __p2_15; \ + poly16x8_t __rev0_15; __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x4_t __rev2_15; __rev2_15 = __builtin_shufflevector(__s2_15, __s2_15, 3, 2, 1, 0); \ + poly16x8_t __ret_15; \ + __ret_15 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_15, __p3_15), __rev0_15, __p1_15); \ + __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_15; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s32(__p0_16, __p1_16, __p2_16, __p3_16) __extension__ ({ \ - int32x4_t __s0_16 = __p0_16; \ - int32x2_t __s2_16 = __p2_16; \ - int32x4_t __ret_16; \ - __ret_16 = vsetq_lane_s32(vget_lane_s32(__s2_16, __p3_16), __s0_16, __p1_16); \ +#define vcopyq_lane_u8(__p0_16, __p1_16, __p2_16, __p3_16) __extension__ ({ \ + uint8x16_t __s0_16 = __p0_16; \ + uint8x8_t __s2_16 = __p2_16; \ + uint8x16_t __ret_16; \ + __ret_16 = vsetq_lane_u8(vget_lane_u8(__s2_16, __p3_16), __s0_16, __p1_16); \ __ret_16; \ }) #else -#define vcopyq_lane_s32(__p0_17, __p1_17, __p2_17, __p3_17) __extension__ ({ \ - int32x4_t __s0_17 = __p0_17; \ - int32x2_t __s2_17 = __p2_17; \ - int32x4_t __rev0_17; __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 3, 2, 1, 0); \ - int32x2_t __rev2_17; __rev2_17 = 
__builtin_shufflevector(__s2_17, __s2_17, 1, 0); \ - int32x4_t __ret_17; \ - __ret_17 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_17, __p3_17), __rev0_17, __p1_17); \ - __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 3, 2, 1, 0); \ +#define vcopyq_lane_u8(__p0_17, __p1_17, __p2_17, __p3_17) __extension__ ({ \ + uint8x16_t __s0_17 = __p0_17; \ + uint8x8_t __s2_17 = __p2_17; \ + uint8x16_t __rev0_17; __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_17; __rev2_17 = __builtin_shufflevector(__s2_17, __s2_17, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_17; \ + __ret_17 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_17, __p3_17), __rev0_17, __p1_17); \ + __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_17; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s64(__p0_18, __p1_18, __p2_18, __p3_18) __extension__ ({ \ - int64x2_t __s0_18 = __p0_18; \ - int64x1_t __s2_18 = __p2_18; \ - int64x2_t __ret_18; \ - __ret_18 = vsetq_lane_s64(vget_lane_s64(__s2_18, __p3_18), __s0_18, __p1_18); \ +#define vcopyq_lane_u32(__p0_18, __p1_18, __p2_18, __p3_18) __extension__ ({ \ + uint32x4_t __s0_18 = __p0_18; \ + uint32x2_t __s2_18 = __p2_18; \ + uint32x4_t __ret_18; \ + __ret_18 = vsetq_lane_u32(vget_lane_u32(__s2_18, __p3_18), __s0_18, __p1_18); \ __ret_18; \ }) #else -#define vcopyq_lane_s64(__p0_19, __p1_19, __p2_19, __p3_19) __extension__ ({ \ - int64x2_t __s0_19 = __p0_19; \ - int64x1_t __s2_19 = __p2_19; \ - int64x2_t __rev0_19; __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 1, 0); \ - int64x2_t __ret_19; \ - __ret_19 = __noswap_vsetq_lane_s64(__noswap_vget_lane_s64(__s2_19, __p3_19), __rev0_19, __p1_19); \ - __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 1, 0); \ +#define vcopyq_lane_u32(__p0_19, __p1_19, __p2_19, __p3_19) __extension__ ({ \ + uint32x4_t __s0_19 = __p0_19; \ + uint32x2_t __s2_19 = __p2_19; \ + uint32x4_t __rev0_19; __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 3, 2, 1, 0); \ + uint32x2_t __rev2_19; __rev2_19 = __builtin_shufflevector(__s2_19, __s2_19, 1, 0); \ + uint32x4_t __ret_19; \ + __ret_19 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_19, __p3_19), __rev0_19, __p1_19); \ + __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 3, 2, 1, 0); \ __ret_19; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s16(__p0_20, __p1_20, __p2_20, __p3_20) __extension__ ({ \ - int16x8_t __s0_20 = __p0_20; \ - int16x4_t __s2_20 = __p2_20; \ - int16x8_t __ret_20; \ - __ret_20 = vsetq_lane_s16(vget_lane_s16(__s2_20, __p3_20), __s0_20, __p1_20); \ +#define vcopyq_lane_u64(__p0_20, __p1_20, __p2_20, __p3_20) __extension__ ({ \ + uint64x2_t __s0_20 = __p0_20; \ + uint64x1_t __s2_20 = __p2_20; \ + uint64x2_t __ret_20; \ + __ret_20 = vsetq_lane_u64(vget_lane_u64(__s2_20, __p3_20), __s0_20, __p1_20); \ __ret_20; \ }) #else -#define vcopyq_lane_s16(__p0_21, __p1_21, __p2_21, __p3_21) __extension__ ({ \ - int16x8_t __s0_21 = __p0_21; \ - int16x4_t __s2_21 = __p2_21; \ - int16x8_t __rev0_21; __rev0_21 = __builtin_shufflevector(__s0_21, __s0_21, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_21; __rev2_21 = __builtin_shufflevector(__s2_21, __s2_21, 3, 2, 1, 0); \ - int16x8_t __ret_21; \ - __ret_21 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_21, __p3_21), __rev0_21, __p1_21); \ - __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define 
vcopyq_lane_u64(__p0_21, __p1_21, __p2_21, __p3_21) __extension__ ({ \ + uint64x2_t __s0_21 = __p0_21; \ + uint64x1_t __s2_21 = __p2_21; \ + uint64x2_t __rev0_21; __rev0_21 = __builtin_shufflevector(__s0_21, __s0_21, 1, 0); \ + uint64x2_t __ret_21; \ + __ret_21 = __noswap_vsetq_lane_u64(__noswap_vget_lane_u64(__s2_21, __p3_21), __rev0_21, __p1_21); \ + __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 1, 0); \ __ret_21; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_p8(__p0_22, __p1_22, __p2_22, __p3_22) __extension__ ({ \ - poly8x8_t __s0_22 = __p0_22; \ - poly8x8_t __s2_22 = __p2_22; \ - poly8x8_t __ret_22; \ - __ret_22 = vset_lane_p8(vget_lane_p8(__s2_22, __p3_22), __s0_22, __p1_22); \ +#define vcopyq_lane_u16(__p0_22, __p1_22, __p2_22, __p3_22) __extension__ ({ \ + uint16x8_t __s0_22 = __p0_22; \ + uint16x4_t __s2_22 = __p2_22; \ + uint16x8_t __ret_22; \ + __ret_22 = vsetq_lane_u16(vget_lane_u16(__s2_22, __p3_22), __s0_22, __p1_22); \ __ret_22; \ }) #else -#define vcopy_lane_p8(__p0_23, __p1_23, __p2_23, __p3_23) __extension__ ({ \ - poly8x8_t __s0_23 = __p0_23; \ - poly8x8_t __s2_23 = __p2_23; \ - poly8x8_t __rev0_23; __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __rev2_23; __rev2_23 = __builtin_shufflevector(__s2_23, __s2_23, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __ret_23; \ - __ret_23 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_23, __p3_23), __rev0_23, __p1_23); \ +#define vcopyq_lane_u16(__p0_23, __p1_23, __p2_23, __p3_23) __extension__ ({ \ + uint16x8_t __s0_23 = __p0_23; \ + uint16x4_t __s2_23 = __p2_23; \ + uint16x8_t __rev0_23; __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __rev2_23; __rev2_23 = __builtin_shufflevector(__s2_23, __s2_23, 3, 2, 1, 0); \ + uint16x8_t __ret_23; \ + __ret_23 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_23, __p3_23), __rev0_23, __p1_23); \ __ret_23 = __builtin_shufflevector(__ret_23, __ret_23, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_23; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_p16(__p0_24, __p1_24, __p2_24, __p3_24) __extension__ ({ \ - poly16x4_t __s0_24 = __p0_24; \ - poly16x4_t __s2_24 = __p2_24; \ - poly16x4_t __ret_24; \ - __ret_24 = vset_lane_p16(vget_lane_p16(__s2_24, __p3_24), __s0_24, __p1_24); \ +#define vcopyq_lane_s8(__p0_24, __p1_24, __p2_24, __p3_24) __extension__ ({ \ + int8x16_t __s0_24 = __p0_24; \ + int8x8_t __s2_24 = __p2_24; \ + int8x16_t __ret_24; \ + __ret_24 = vsetq_lane_s8(vget_lane_s8(__s2_24, __p3_24), __s0_24, __p1_24); \ __ret_24; \ }) #else -#define vcopy_lane_p16(__p0_25, __p1_25, __p2_25, __p3_25) __extension__ ({ \ - poly16x4_t __s0_25 = __p0_25; \ - poly16x4_t __s2_25 = __p2_25; \ - poly16x4_t __rev0_25; __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 3, 2, 1, 0); \ - poly16x4_t __rev2_25; __rev2_25 = __builtin_shufflevector(__s2_25, __s2_25, 3, 2, 1, 0); \ - poly16x4_t __ret_25; \ - __ret_25 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_25, __p3_25), __rev0_25, __p1_25); \ - __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 3, 2, 1, 0); \ +#define vcopyq_lane_s8(__p0_25, __p1_25, __p2_25, __p3_25) __extension__ ({ \ + int8x16_t __s0_25 = __p0_25; \ + int8x8_t __s2_25 = __p2_25; \ + int8x16_t __rev0_25; __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_25; __rev2_25 = __builtin_shufflevector(__s2_25, __s2_25, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_25; \ + __ret_25 = 
__noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_25, __p3_25), __rev0_25, __p1_25); \ + __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_25; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u8(__p0_26, __p1_26, __p2_26, __p3_26) __extension__ ({ \ - uint8x8_t __s0_26 = __p0_26; \ - uint8x8_t __s2_26 = __p2_26; \ - uint8x8_t __ret_26; \ - __ret_26 = vset_lane_u8(vget_lane_u8(__s2_26, __p3_26), __s0_26, __p1_26); \ +#define vcopyq_lane_f32(__p0_26, __p1_26, __p2_26, __p3_26) __extension__ ({ \ + float32x4_t __s0_26 = __p0_26; \ + float32x2_t __s2_26 = __p2_26; \ + float32x4_t __ret_26; \ + __ret_26 = vsetq_lane_f32(vget_lane_f32(__s2_26, __p3_26), __s0_26, __p1_26); \ __ret_26; \ }) #else -#define vcopy_lane_u8(__p0_27, __p1_27, __p2_27, __p3_27) __extension__ ({ \ - uint8x8_t __s0_27 = __p0_27; \ - uint8x8_t __s2_27 = __p2_27; \ - uint8x8_t __rev0_27; __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_27; __rev2_27 = __builtin_shufflevector(__s2_27, __s2_27, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __ret_27; \ - __ret_27 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_27, __p3_27), __rev0_27, __p1_27); \ - __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopyq_lane_f32(__p0_27, __p1_27, __p2_27, __p3_27) __extension__ ({ \ + float32x4_t __s0_27 = __p0_27; \ + float32x2_t __s2_27 = __p2_27; \ + float32x4_t __rev0_27; __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 3, 2, 1, 0); \ + float32x2_t __rev2_27; __rev2_27 = __builtin_shufflevector(__s2_27, __s2_27, 1, 0); \ + float32x4_t __ret_27; \ + __ret_27 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_27, __p3_27), __rev0_27, __p1_27); \ + __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 3, 2, 1, 0); \ __ret_27; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u32(__p0_28, __p1_28, __p2_28, __p3_28) __extension__ ({ \ - uint32x2_t __s0_28 = __p0_28; \ - uint32x2_t __s2_28 = __p2_28; \ - uint32x2_t __ret_28; \ - __ret_28 = vset_lane_u32(vget_lane_u32(__s2_28, __p3_28), __s0_28, __p1_28); \ +#define vcopyq_lane_s32(__p0_28, __p1_28, __p2_28, __p3_28) __extension__ ({ \ + int32x4_t __s0_28 = __p0_28; \ + int32x2_t __s2_28 = __p2_28; \ + int32x4_t __ret_28; \ + __ret_28 = vsetq_lane_s32(vget_lane_s32(__s2_28, __p3_28), __s0_28, __p1_28); \ __ret_28; \ }) #else -#define vcopy_lane_u32(__p0_29, __p1_29, __p2_29, __p3_29) __extension__ ({ \ - uint32x2_t __s0_29 = __p0_29; \ - uint32x2_t __s2_29 = __p2_29; \ - uint32x2_t __rev0_29; __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 1, 0); \ - uint32x2_t __rev2_29; __rev2_29 = __builtin_shufflevector(__s2_29, __s2_29, 1, 0); \ - uint32x2_t __ret_29; \ - __ret_29 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_29, __p3_29), __rev0_29, __p1_29); \ - __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 1, 0); \ +#define vcopyq_lane_s32(__p0_29, __p1_29, __p2_29, __p3_29) __extension__ ({ \ + int32x4_t __s0_29 = __p0_29; \ + int32x2_t __s2_29 = __p2_29; \ + int32x4_t __rev0_29; __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 3, 2, 1, 0); \ + int32x2_t __rev2_29; __rev2_29 = __builtin_shufflevector(__s2_29, __s2_29, 1, 0); \ + int32x4_t __ret_29; \ + __ret_29 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_29, __p3_29), __rev0_29, __p1_29); \ + __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 3, 2, 1, 0); \ __ret_29; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define 
vcopy_lane_u64(__p0_30, __p1_30, __p2_30, __p3_30) __extension__ ({ \ - uint64x1_t __s0_30 = __p0_30; \ - uint64x1_t __s2_30 = __p2_30; \ - uint64x1_t __ret_30; \ - __ret_30 = vset_lane_u64(vget_lane_u64(__s2_30, __p3_30), __s0_30, __p1_30); \ +#define vcopyq_lane_s64(__p0_30, __p1_30, __p2_30, __p3_30) __extension__ ({ \ + int64x2_t __s0_30 = __p0_30; \ + int64x1_t __s2_30 = __p2_30; \ + int64x2_t __ret_30; \ + __ret_30 = vsetq_lane_s64(vget_lane_s64(__s2_30, __p3_30), __s0_30, __p1_30); \ __ret_30; \ }) #else -#define vcopy_lane_u64(__p0_31, __p1_31, __p2_31, __p3_31) __extension__ ({ \ - uint64x1_t __s0_31 = __p0_31; \ - uint64x1_t __s2_31 = __p2_31; \ - uint64x1_t __ret_31; \ - __ret_31 = __noswap_vset_lane_u64(__noswap_vget_lane_u64(__s2_31, __p3_31), __s0_31, __p1_31); \ +#define vcopyq_lane_s64(__p0_31, __p1_31, __p2_31, __p3_31) __extension__ ({ \ + int64x2_t __s0_31 = __p0_31; \ + int64x1_t __s2_31 = __p2_31; \ + int64x2_t __rev0_31; __rev0_31 = __builtin_shufflevector(__s0_31, __s0_31, 1, 0); \ + int64x2_t __ret_31; \ + __ret_31 = __noswap_vsetq_lane_s64(__noswap_vget_lane_s64(__s2_31, __p3_31), __rev0_31, __p1_31); \ + __ret_31 = __builtin_shufflevector(__ret_31, __ret_31, 1, 0); \ __ret_31; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u16(__p0_32, __p1_32, __p2_32, __p3_32) __extension__ ({ \ - uint16x4_t __s0_32 = __p0_32; \ - uint16x4_t __s2_32 = __p2_32; \ - uint16x4_t __ret_32; \ - __ret_32 = vset_lane_u16(vget_lane_u16(__s2_32, __p3_32), __s0_32, __p1_32); \ +#define vcopyq_lane_s16(__p0_32, __p1_32, __p2_32, __p3_32) __extension__ ({ \ + int16x8_t __s0_32 = __p0_32; \ + int16x4_t __s2_32 = __p2_32; \ + int16x8_t __ret_32; \ + __ret_32 = vsetq_lane_s16(vget_lane_s16(__s2_32, __p3_32), __s0_32, __p1_32); \ __ret_32; \ }) #else -#define vcopy_lane_u16(__p0_33, __p1_33, __p2_33, __p3_33) __extension__ ({ \ - uint16x4_t __s0_33 = __p0_33; \ - uint16x4_t __s2_33 = __p2_33; \ - uint16x4_t __rev0_33; __rev0_33 = __builtin_shufflevector(__s0_33, __s0_33, 3, 2, 1, 0); \ - uint16x4_t __rev2_33; __rev2_33 = __builtin_shufflevector(__s2_33, __s2_33, 3, 2, 1, 0); \ - uint16x4_t __ret_33; \ - __ret_33 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_33, __p3_33), __rev0_33, __p1_33); \ - __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 3, 2, 1, 0); \ +#define vcopyq_lane_s16(__p0_33, __p1_33, __p2_33, __p3_33) __extension__ ({ \ + int16x8_t __s0_33 = __p0_33; \ + int16x4_t __s2_33 = __p2_33; \ + int16x8_t __rev0_33; __rev0_33 = __builtin_shufflevector(__s0_33, __s0_33, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_33; __rev2_33 = __builtin_shufflevector(__s2_33, __s2_33, 3, 2, 1, 0); \ + int16x8_t __ret_33; \ + __ret_33 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_33, __p3_33), __rev0_33, __p1_33); \ + __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_33; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s8(__p0_34, __p1_34, __p2_34, __p3_34) __extension__ ({ \ - int8x8_t __s0_34 = __p0_34; \ - int8x8_t __s2_34 = __p2_34; \ - int8x8_t __ret_34; \ - __ret_34 = vset_lane_s8(vget_lane_s8(__s2_34, __p3_34), __s0_34, __p1_34); \ +#define vcopy_lane_p8(__p0_34, __p1_34, __p2_34, __p3_34) __extension__ ({ \ + poly8x8_t __s0_34 = __p0_34; \ + poly8x8_t __s2_34 = __p2_34; \ + poly8x8_t __ret_34; \ + __ret_34 = vset_lane_p8(vget_lane_p8(__s2_34, __p3_34), __s0_34, __p1_34); \ __ret_34; \ }) #else -#define vcopy_lane_s8(__p0_35, __p1_35, __p2_35, __p3_35) __extension__ ({ \ - int8x8_t __s0_35 = __p0_35; \ - int8x8_t 
__s2_35 = __p2_35; \ - int8x8_t __rev0_35; __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_35; __rev2_35 = __builtin_shufflevector(__s2_35, __s2_35, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __ret_35; \ - __ret_35 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_35, __p3_35), __rev0_35, __p1_35); \ +#define vcopy_lane_p8(__p0_35, __p1_35, __p2_35, __p3_35) __extension__ ({ \ + poly8x8_t __s0_35 = __p0_35; \ + poly8x8_t __s2_35 = __p2_35; \ + poly8x8_t __rev0_35; __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __rev2_35; __rev2_35 = __builtin_shufflevector(__s2_35, __s2_35, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __ret_35; \ + __ret_35 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_35, __p3_35), __rev0_35, __p1_35); \ __ret_35 = __builtin_shufflevector(__ret_35, __ret_35, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_35; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_f32(__p0_36, __p1_36, __p2_36, __p3_36) __extension__ ({ \ - float32x2_t __s0_36 = __p0_36; \ - float32x2_t __s2_36 = __p2_36; \ - float32x2_t __ret_36; \ - __ret_36 = vset_lane_f32(vget_lane_f32(__s2_36, __p3_36), __s0_36, __p1_36); \ +#define vcopy_lane_p16(__p0_36, __p1_36, __p2_36, __p3_36) __extension__ ({ \ + poly16x4_t __s0_36 = __p0_36; \ + poly16x4_t __s2_36 = __p2_36; \ + poly16x4_t __ret_36; \ + __ret_36 = vset_lane_p16(vget_lane_p16(__s2_36, __p3_36), __s0_36, __p1_36); \ __ret_36; \ }) #else -#define vcopy_lane_f32(__p0_37, __p1_37, __p2_37, __p3_37) __extension__ ({ \ - float32x2_t __s0_37 = __p0_37; \ - float32x2_t __s2_37 = __p2_37; \ - float32x2_t __rev0_37; __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 1, 0); \ - float32x2_t __rev2_37; __rev2_37 = __builtin_shufflevector(__s2_37, __s2_37, 1, 0); \ - float32x2_t __ret_37; \ - __ret_37 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_37, __p3_37), __rev0_37, __p1_37); \ - __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 1, 0); \ +#define vcopy_lane_p16(__p0_37, __p1_37, __p2_37, __p3_37) __extension__ ({ \ + poly16x4_t __s0_37 = __p0_37; \ + poly16x4_t __s2_37 = __p2_37; \ + poly16x4_t __rev0_37; __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 3, 2, 1, 0); \ + poly16x4_t __rev2_37; __rev2_37 = __builtin_shufflevector(__s2_37, __s2_37, 3, 2, 1, 0); \ + poly16x4_t __ret_37; \ + __ret_37 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_37, __p3_37), __rev0_37, __p1_37); \ + __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 3, 2, 1, 0); \ __ret_37; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s32(__p0_38, __p1_38, __p2_38, __p3_38) __extension__ ({ \ - int32x2_t __s0_38 = __p0_38; \ - int32x2_t __s2_38 = __p2_38; \ - int32x2_t __ret_38; \ - __ret_38 = vset_lane_s32(vget_lane_s32(__s2_38, __p3_38), __s0_38, __p1_38); \ +#define vcopy_lane_u8(__p0_38, __p1_38, __p2_38, __p3_38) __extension__ ({ \ + uint8x8_t __s0_38 = __p0_38; \ + uint8x8_t __s2_38 = __p2_38; \ + uint8x8_t __ret_38; \ + __ret_38 = vset_lane_u8(vget_lane_u8(__s2_38, __p3_38), __s0_38, __p1_38); \ __ret_38; \ }) #else -#define vcopy_lane_s32(__p0_39, __p1_39, __p2_39, __p3_39) __extension__ ({ \ - int32x2_t __s0_39 = __p0_39; \ - int32x2_t __s2_39 = __p2_39; \ - int32x2_t __rev0_39; __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 1, 0); \ - int32x2_t __rev2_39; __rev2_39 = __builtin_shufflevector(__s2_39, __s2_39, 1, 0); \ - int32x2_t __ret_39; \ - __ret_39 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_39, __p3_39), __rev0_39, __p1_39); \ - 
__ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 1, 0); \ +#define vcopy_lane_u8(__p0_39, __p1_39, __p2_39, __p3_39) __extension__ ({ \ + uint8x8_t __s0_39 = __p0_39; \ + uint8x8_t __s2_39 = __p2_39; \ + uint8x8_t __rev0_39; __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_39; __rev2_39 = __builtin_shufflevector(__s2_39, __s2_39, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __ret_39; \ + __ret_39 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_39, __p3_39), __rev0_39, __p1_39); \ + __ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_39; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s64(__p0_40, __p1_40, __p2_40, __p3_40) __extension__ ({ \ - int64x1_t __s0_40 = __p0_40; \ - int64x1_t __s2_40 = __p2_40; \ - int64x1_t __ret_40; \ - __ret_40 = vset_lane_s64(vget_lane_s64(__s2_40, __p3_40), __s0_40, __p1_40); \ +#define vcopy_lane_u32(__p0_40, __p1_40, __p2_40, __p3_40) __extension__ ({ \ + uint32x2_t __s0_40 = __p0_40; \ + uint32x2_t __s2_40 = __p2_40; \ + uint32x2_t __ret_40; \ + __ret_40 = vset_lane_u32(vget_lane_u32(__s2_40, __p3_40), __s0_40, __p1_40); \ __ret_40; \ }) #else -#define vcopy_lane_s64(__p0_41, __p1_41, __p2_41, __p3_41) __extension__ ({ \ - int64x1_t __s0_41 = __p0_41; \ - int64x1_t __s2_41 = __p2_41; \ - int64x1_t __ret_41; \ - __ret_41 = __noswap_vset_lane_s64(__noswap_vget_lane_s64(__s2_41, __p3_41), __s0_41, __p1_41); \ +#define vcopy_lane_u32(__p0_41, __p1_41, __p2_41, __p3_41) __extension__ ({ \ + uint32x2_t __s0_41 = __p0_41; \ + uint32x2_t __s2_41 = __p2_41; \ + uint32x2_t __rev0_41; __rev0_41 = __builtin_shufflevector(__s0_41, __s0_41, 1, 0); \ + uint32x2_t __rev2_41; __rev2_41 = __builtin_shufflevector(__s2_41, __s2_41, 1, 0); \ + uint32x2_t __ret_41; \ + __ret_41 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_41, __p3_41), __rev0_41, __p1_41); \ + __ret_41 = __builtin_shufflevector(__ret_41, __ret_41, 1, 0); \ __ret_41; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s16(__p0_42, __p1_42, __p2_42, __p3_42) __extension__ ({ \ - int16x4_t __s0_42 = __p0_42; \ - int16x4_t __s2_42 = __p2_42; \ - int16x4_t __ret_42; \ - __ret_42 = vset_lane_s16(vget_lane_s16(__s2_42, __p3_42), __s0_42, __p1_42); \ +#define vcopy_lane_u64(__p0_42, __p1_42, __p2_42, __p3_42) __extension__ ({ \ + uint64x1_t __s0_42 = __p0_42; \ + uint64x1_t __s2_42 = __p2_42; \ + uint64x1_t __ret_42; \ + __ret_42 = vset_lane_u64(vget_lane_u64(__s2_42, __p3_42), __s0_42, __p1_42); \ __ret_42; \ }) #else -#define vcopy_lane_s16(__p0_43, __p1_43, __p2_43, __p3_43) __extension__ ({ \ - int16x4_t __s0_43 = __p0_43; \ - int16x4_t __s2_43 = __p2_43; \ - int16x4_t __rev0_43; __rev0_43 = __builtin_shufflevector(__s0_43, __s0_43, 3, 2, 1, 0); \ - int16x4_t __rev2_43; __rev2_43 = __builtin_shufflevector(__s2_43, __s2_43, 3, 2, 1, 0); \ - int16x4_t __ret_43; \ - __ret_43 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_43, __p3_43), __rev0_43, __p1_43); \ - __ret_43 = __builtin_shufflevector(__ret_43, __ret_43, 3, 2, 1, 0); \ +#define vcopy_lane_u64(__p0_43, __p1_43, __p2_43, __p3_43) __extension__ ({ \ + uint64x1_t __s0_43 = __p0_43; \ + uint64x1_t __s2_43 = __p2_43; \ + uint64x1_t __ret_43; \ + __ret_43 = __noswap_vset_lane_u64(__noswap_vget_lane_u64(__s2_43, __p3_43), __s0_43, __p1_43); \ __ret_43; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p8(__p0_44, __p1_44, __p2_44, __p3_44) __extension__ ({ \ - poly8x16_t __s0_44 = __p0_44; \ - poly8x16_t __s2_44 = __p2_44; \ - 
poly8x16_t __ret_44; \ - __ret_44 = vsetq_lane_p8(vgetq_lane_p8(__s2_44, __p3_44), __s0_44, __p1_44); \ +#define vcopy_lane_u16(__p0_44, __p1_44, __p2_44, __p3_44) __extension__ ({ \ + uint16x4_t __s0_44 = __p0_44; \ + uint16x4_t __s2_44 = __p2_44; \ + uint16x4_t __ret_44; \ + __ret_44 = vset_lane_u16(vget_lane_u16(__s2_44, __p3_44), __s0_44, __p1_44); \ __ret_44; \ }) #else -#define vcopyq_laneq_p8(__p0_45, __p1_45, __p2_45, __p3_45) __extension__ ({ \ - poly8x16_t __s0_45 = __p0_45; \ - poly8x16_t __s2_45 = __p2_45; \ - poly8x16_t __rev0_45; __rev0_45 = __builtin_shufflevector(__s0_45, __s0_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __rev2_45; __rev2_45 = __builtin_shufflevector(__s2_45, __s2_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __ret_45; \ - __ret_45 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_45, __p3_45), __rev0_45, __p1_45); \ - __ret_45 = __builtin_shufflevector(__ret_45, __ret_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopy_lane_u16(__p0_45, __p1_45, __p2_45, __p3_45) __extension__ ({ \ + uint16x4_t __s0_45 = __p0_45; \ + uint16x4_t __s2_45 = __p2_45; \ + uint16x4_t __rev0_45; __rev0_45 = __builtin_shufflevector(__s0_45, __s0_45, 3, 2, 1, 0); \ + uint16x4_t __rev2_45; __rev2_45 = __builtin_shufflevector(__s2_45, __s2_45, 3, 2, 1, 0); \ + uint16x4_t __ret_45; \ + __ret_45 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_45, __p3_45), __rev0_45, __p1_45); \ + __ret_45 = __builtin_shufflevector(__ret_45, __ret_45, 3, 2, 1, 0); \ __ret_45; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p16(__p0_46, __p1_46, __p2_46, __p3_46) __extension__ ({ \ - poly16x8_t __s0_46 = __p0_46; \ - poly16x8_t __s2_46 = __p2_46; \ - poly16x8_t __ret_46; \ - __ret_46 = vsetq_lane_p16(vgetq_lane_p16(__s2_46, __p3_46), __s0_46, __p1_46); \ +#define vcopy_lane_s8(__p0_46, __p1_46, __p2_46, __p3_46) __extension__ ({ \ + int8x8_t __s0_46 = __p0_46; \ + int8x8_t __s2_46 = __p2_46; \ + int8x8_t __ret_46; \ + __ret_46 = vset_lane_s8(vget_lane_s8(__s2_46, __p3_46), __s0_46, __p1_46); \ __ret_46; \ }) #else -#define vcopyq_laneq_p16(__p0_47, __p1_47, __p2_47, __p3_47) __extension__ ({ \ - poly16x8_t __s0_47 = __p0_47; \ - poly16x8_t __s2_47 = __p2_47; \ - poly16x8_t __rev0_47; __rev0_47 = __builtin_shufflevector(__s0_47, __s0_47, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x8_t __rev2_47; __rev2_47 = __builtin_shufflevector(__s2_47, __s2_47, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x8_t __ret_47; \ - __ret_47 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_47, __p3_47), __rev0_47, __p1_47); \ +#define vcopy_lane_s8(__p0_47, __p1_47, __p2_47, __p3_47) __extension__ ({ \ + int8x8_t __s0_47 = __p0_47; \ + int8x8_t __s2_47 = __p2_47; \ + int8x8_t __rev0_47; __rev0_47 = __builtin_shufflevector(__s0_47, __s0_47, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_47; __rev2_47 = __builtin_shufflevector(__s2_47, __s2_47, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __ret_47; \ + __ret_47 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_47, __p3_47), __rev0_47, __p1_47); \ __ret_47 = __builtin_shufflevector(__ret_47, __ret_47, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_47; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u8(__p0_48, __p1_48, __p2_48, __p3_48) __extension__ ({ \ - uint8x16_t __s0_48 = __p0_48; \ - uint8x16_t __s2_48 = __p2_48; \ - uint8x16_t __ret_48; \ - __ret_48 = vsetq_lane_u8(vgetq_lane_u8(__s2_48, __p3_48), __s0_48, __p1_48); \ +#define vcopy_lane_f32(__p0_48, __p1_48, __p2_48, __p3_48) 
__extension__ ({ \ + float32x2_t __s0_48 = __p0_48; \ + float32x2_t __s2_48 = __p2_48; \ + float32x2_t __ret_48; \ + __ret_48 = vset_lane_f32(vget_lane_f32(__s2_48, __p3_48), __s0_48, __p1_48); \ __ret_48; \ }) #else -#define vcopyq_laneq_u8(__p0_49, __p1_49, __p2_49, __p3_49) __extension__ ({ \ - uint8x16_t __s0_49 = __p0_49; \ - uint8x16_t __s2_49 = __p2_49; \ - uint8x16_t __rev0_49; __rev0_49 = __builtin_shufflevector(__s0_49, __s0_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_49; __rev2_49 = __builtin_shufflevector(__s2_49, __s2_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_49; \ - __ret_49 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_49, __p3_49), __rev0_49, __p1_49); \ - __ret_49 = __builtin_shufflevector(__ret_49, __ret_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopy_lane_f32(__p0_49, __p1_49, __p2_49, __p3_49) __extension__ ({ \ + float32x2_t __s0_49 = __p0_49; \ + float32x2_t __s2_49 = __p2_49; \ + float32x2_t __rev0_49; __rev0_49 = __builtin_shufflevector(__s0_49, __s0_49, 1, 0); \ + float32x2_t __rev2_49; __rev2_49 = __builtin_shufflevector(__s2_49, __s2_49, 1, 0); \ + float32x2_t __ret_49; \ + __ret_49 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_49, __p3_49), __rev0_49, __p1_49); \ + __ret_49 = __builtin_shufflevector(__ret_49, __ret_49, 1, 0); \ __ret_49; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u32(__p0_50, __p1_50, __p2_50, __p3_50) __extension__ ({ \ - uint32x4_t __s0_50 = __p0_50; \ - uint32x4_t __s2_50 = __p2_50; \ - uint32x4_t __ret_50; \ - __ret_50 = vsetq_lane_u32(vgetq_lane_u32(__s2_50, __p3_50), __s0_50, __p1_50); \ +#define vcopy_lane_s32(__p0_50, __p1_50, __p2_50, __p3_50) __extension__ ({ \ + int32x2_t __s0_50 = __p0_50; \ + int32x2_t __s2_50 = __p2_50; \ + int32x2_t __ret_50; \ + __ret_50 = vset_lane_s32(vget_lane_s32(__s2_50, __p3_50), __s0_50, __p1_50); \ __ret_50; \ }) #else -#define vcopyq_laneq_u32(__p0_51, __p1_51, __p2_51, __p3_51) __extension__ ({ \ - uint32x4_t __s0_51 = __p0_51; \ - uint32x4_t __s2_51 = __p2_51; \ - uint32x4_t __rev0_51; __rev0_51 = __builtin_shufflevector(__s0_51, __s0_51, 3, 2, 1, 0); \ - uint32x4_t __rev2_51; __rev2_51 = __builtin_shufflevector(__s2_51, __s2_51, 3, 2, 1, 0); \ - uint32x4_t __ret_51; \ - __ret_51 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_51, __p3_51), __rev0_51, __p1_51); \ - __ret_51 = __builtin_shufflevector(__ret_51, __ret_51, 3, 2, 1, 0); \ +#define vcopy_lane_s32(__p0_51, __p1_51, __p2_51, __p3_51) __extension__ ({ \ + int32x2_t __s0_51 = __p0_51; \ + int32x2_t __s2_51 = __p2_51; \ + int32x2_t __rev0_51; __rev0_51 = __builtin_shufflevector(__s0_51, __s0_51, 1, 0); \ + int32x2_t __rev2_51; __rev2_51 = __builtin_shufflevector(__s2_51, __s2_51, 1, 0); \ + int32x2_t __ret_51; \ + __ret_51 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_51, __p3_51), __rev0_51, __p1_51); \ + __ret_51 = __builtin_shufflevector(__ret_51, __ret_51, 1, 0); \ __ret_51; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u64(__p0_52, __p1_52, __p2_52, __p3_52) __extension__ ({ \ - uint64x2_t __s0_52 = __p0_52; \ - uint64x2_t __s2_52 = __p2_52; \ - uint64x2_t __ret_52; \ - __ret_52 = vsetq_lane_u64(vgetq_lane_u64(__s2_52, __p3_52), __s0_52, __p1_52); \ +#define vcopy_lane_s64(__p0_52, __p1_52, __p2_52, __p3_52) __extension__ ({ \ + int64x1_t __s0_52 = __p0_52; \ + int64x1_t __s2_52 = __p2_52; \ + int64x1_t __ret_52; \ + __ret_52 = vset_lane_s64(vget_lane_s64(__s2_52, __p3_52), 
__s0_52, __p1_52); \ __ret_52; \ }) #else -#define vcopyq_laneq_u64(__p0_53, __p1_53, __p2_53, __p3_53) __extension__ ({ \ - uint64x2_t __s0_53 = __p0_53; \ - uint64x2_t __s2_53 = __p2_53; \ - uint64x2_t __rev0_53; __rev0_53 = __builtin_shufflevector(__s0_53, __s0_53, 1, 0); \ - uint64x2_t __rev2_53; __rev2_53 = __builtin_shufflevector(__s2_53, __s2_53, 1, 0); \ - uint64x2_t __ret_53; \ - __ret_53 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_53, __p3_53), __rev0_53, __p1_53); \ - __ret_53 = __builtin_shufflevector(__ret_53, __ret_53, 1, 0); \ +#define vcopy_lane_s64(__p0_53, __p1_53, __p2_53, __p3_53) __extension__ ({ \ + int64x1_t __s0_53 = __p0_53; \ + int64x1_t __s2_53 = __p2_53; \ + int64x1_t __ret_53; \ + __ret_53 = __noswap_vset_lane_s64(__noswap_vget_lane_s64(__s2_53, __p3_53), __s0_53, __p1_53); \ __ret_53; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u16(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \ - uint16x8_t __s0_54 = __p0_54; \ - uint16x8_t __s2_54 = __p2_54; \ - uint16x8_t __ret_54; \ - __ret_54 = vsetq_lane_u16(vgetq_lane_u16(__s2_54, __p3_54), __s0_54, __p1_54); \ +#define vcopy_lane_s16(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \ + int16x4_t __s0_54 = __p0_54; \ + int16x4_t __s2_54 = __p2_54; \ + int16x4_t __ret_54; \ + __ret_54 = vset_lane_s16(vget_lane_s16(__s2_54, __p3_54), __s0_54, __p1_54); \ __ret_54; \ }) #else -#define vcopyq_laneq_u16(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \ - uint16x8_t __s0_55 = __p0_55; \ - uint16x8_t __s2_55 = __p2_55; \ - uint16x8_t __rev0_55; __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev2_55; __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_55; \ - __ret_55 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_55, __p3_55), __rev0_55, __p1_55); \ - __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopy_lane_s16(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \ + int16x4_t __s0_55 = __p0_55; \ + int16x4_t __s2_55 = __p2_55; \ + int16x4_t __rev0_55; __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 3, 2, 1, 0); \ + int16x4_t __rev2_55; __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 3, 2, 1, 0); \ + int16x4_t __ret_55; \ + __ret_55 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_55, __p3_55), __rev0_55, __p1_55); \ + __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 3, 2, 1, 0); \ __ret_55; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s8(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \ - int8x16_t __s0_56 = __p0_56; \ - int8x16_t __s2_56 = __p2_56; \ - int8x16_t __ret_56; \ - __ret_56 = vsetq_lane_s8(vgetq_lane_s8(__s2_56, __p3_56), __s0_56, __p1_56); \ +#define vcopyq_laneq_p8(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \ + poly8x16_t __s0_56 = __p0_56; \ + poly8x16_t __s2_56 = __p2_56; \ + poly8x16_t __ret_56; \ + __ret_56 = vsetq_lane_p8(vgetq_lane_p8(__s2_56, __p3_56), __s0_56, __p1_56); \ __ret_56; \ }) #else -#define vcopyq_laneq_s8(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \ - int8x16_t __s0_57 = __p0_57; \ - int8x16_t __s2_57 = __p2_57; \ - int8x16_t __rev0_57; __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_57; __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_57; \ - __ret_57 = 
__noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_57, __p3_57), __rev0_57, __p1_57); \ +#define vcopyq_laneq_p8(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \ + poly8x16_t __s0_57 = __p0_57; \ + poly8x16_t __s2_57 = __p2_57; \ + poly8x16_t __rev0_57; __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __rev2_57; __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __ret_57; \ + __ret_57 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_57, __p3_57), __rev0_57, __p1_57); \ __ret_57 = __builtin_shufflevector(__ret_57, __ret_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_57; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_f32(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \ - float32x4_t __s0_58 = __p0_58; \ - float32x4_t __s2_58 = __p2_58; \ - float32x4_t __ret_58; \ - __ret_58 = vsetq_lane_f32(vgetq_lane_f32(__s2_58, __p3_58), __s0_58, __p1_58); \ +#define vcopyq_laneq_p16(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \ + poly16x8_t __s0_58 = __p0_58; \ + poly16x8_t __s2_58 = __p2_58; \ + poly16x8_t __ret_58; \ + __ret_58 = vsetq_lane_p16(vgetq_lane_p16(__s2_58, __p3_58), __s0_58, __p1_58); \ __ret_58; \ }) #else -#define vcopyq_laneq_f32(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \ - float32x4_t __s0_59 = __p0_59; \ - float32x4_t __s2_59 = __p2_59; \ - float32x4_t __rev0_59; __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 3, 2, 1, 0); \ - float32x4_t __rev2_59; __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 3, 2, 1, 0); \ - float32x4_t __ret_59; \ - __ret_59 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_59, __p3_59), __rev0_59, __p1_59); \ - __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 3, 2, 1, 0); \ +#define vcopyq_laneq_p16(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \ + poly16x8_t __s0_59 = __p0_59; \ + poly16x8_t __s2_59 = __p2_59; \ + poly16x8_t __rev0_59; __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x8_t __rev2_59; __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x8_t __ret_59; \ + __ret_59 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_59, __p3_59), __rev0_59, __p1_59); \ + __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_59; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s32(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \ - int32x4_t __s0_60 = __p0_60; \ - int32x4_t __s2_60 = __p2_60; \ - int32x4_t __ret_60; \ - __ret_60 = vsetq_lane_s32(vgetq_lane_s32(__s2_60, __p3_60), __s0_60, __p1_60); \ +#define vcopyq_laneq_u8(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \ + uint8x16_t __s0_60 = __p0_60; \ + uint8x16_t __s2_60 = __p2_60; \ + uint8x16_t __ret_60; \ + __ret_60 = vsetq_lane_u8(vgetq_lane_u8(__s2_60, __p3_60), __s0_60, __p1_60); \ __ret_60; \ }) #else -#define vcopyq_laneq_s32(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \ - int32x4_t __s0_61 = __p0_61; \ - int32x4_t __s2_61 = __p2_61; \ - int32x4_t __rev0_61; __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 3, 2, 1, 0); \ - int32x4_t __rev2_61; __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 3, 2, 1, 0); \ - int32x4_t __ret_61; \ - __ret_61 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_61, __p3_61), __rev0_61, __p1_61); \ - __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 3, 2, 1, 0); 
\ +#define vcopyq_laneq_u8(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \ + uint8x16_t __s0_61 = __p0_61; \ + uint8x16_t __s2_61 = __p2_61; \ + uint8x16_t __rev0_61; __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_61; __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_61; \ + __ret_61 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_61, __p3_61), __rev0_61, __p1_61); \ + __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_61; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s64(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \ - int64x2_t __s0_62 = __p0_62; \ - int64x2_t __s2_62 = __p2_62; \ - int64x2_t __ret_62; \ - __ret_62 = vsetq_lane_s64(vgetq_lane_s64(__s2_62, __p3_62), __s0_62, __p1_62); \ +#define vcopyq_laneq_u32(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \ + uint32x4_t __s0_62 = __p0_62; \ + uint32x4_t __s2_62 = __p2_62; \ + uint32x4_t __ret_62; \ + __ret_62 = vsetq_lane_u32(vgetq_lane_u32(__s2_62, __p3_62), __s0_62, __p1_62); \ __ret_62; \ }) #else -#define vcopyq_laneq_s64(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \ - int64x2_t __s0_63 = __p0_63; \ - int64x2_t __s2_63 = __p2_63; \ - int64x2_t __rev0_63; __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 1, 0); \ - int64x2_t __rev2_63; __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 1, 0); \ - int64x2_t __ret_63; \ - __ret_63 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_63, __p3_63), __rev0_63, __p1_63); \ - __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 1, 0); \ +#define vcopyq_laneq_u32(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \ + uint32x4_t __s0_63 = __p0_63; \ + uint32x4_t __s2_63 = __p2_63; \ + uint32x4_t __rev0_63; __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 3, 2, 1, 0); \ + uint32x4_t __rev2_63; __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 3, 2, 1, 0); \ + uint32x4_t __ret_63; \ + __ret_63 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_63, __p3_63), __rev0_63, __p1_63); \ + __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 3, 2, 1, 0); \ __ret_63; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s16(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \ - int16x8_t __s0_64 = __p0_64; \ - int16x8_t __s2_64 = __p2_64; \ - int16x8_t __ret_64; \ - __ret_64 = vsetq_lane_s16(vgetq_lane_s16(__s2_64, __p3_64), __s0_64, __p1_64); \ +#define vcopyq_laneq_u64(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \ + uint64x2_t __s0_64 = __p0_64; \ + uint64x2_t __s2_64 = __p2_64; \ + uint64x2_t __ret_64; \ + __ret_64 = vsetq_lane_u64(vgetq_lane_u64(__s2_64, __p3_64), __s0_64, __p1_64); \ __ret_64; \ }) #else -#define vcopyq_laneq_s16(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \ - int16x8_t __s0_65 = __p0_65; \ - int16x8_t __s2_65 = __p2_65; \ - int16x8_t __rev0_65; __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_65; __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_65; \ - __ret_65 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_65, __p3_65), __rev0_65, __p1_65); \ - __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopyq_laneq_u64(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \ + uint64x2_t __s0_65 = __p0_65; \ 
+ uint64x2_t __s2_65 = __p2_65; \ + uint64x2_t __rev0_65; __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 1, 0); \ + uint64x2_t __rev2_65; __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 1, 0); \ + uint64x2_t __ret_65; \ + __ret_65 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_65, __p3_65), __rev0_65, __p1_65); \ + __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 1, 0); \ __ret_65; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p8(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \ - poly8x8_t __s0_66 = __p0_66; \ - poly8x16_t __s2_66 = __p2_66; \ - poly8x8_t __ret_66; \ - __ret_66 = vset_lane_p8(vgetq_lane_p8(__s2_66, __p3_66), __s0_66, __p1_66); \ +#define vcopyq_laneq_u16(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \ + uint16x8_t __s0_66 = __p0_66; \ + uint16x8_t __s2_66 = __p2_66; \ + uint16x8_t __ret_66; \ + __ret_66 = vsetq_lane_u16(vgetq_lane_u16(__s2_66, __p3_66), __s0_66, __p1_66); \ __ret_66; \ }) #else -#define vcopy_laneq_p8(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \ - poly8x8_t __s0_67 = __p0_67; \ - poly8x16_t __s2_67 = __p2_67; \ - poly8x8_t __rev0_67; __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __rev2_67; __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __ret_67; \ - __ret_67 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_67, __p3_67), __rev0_67, __p1_67); \ +#define vcopyq_laneq_u16(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \ + uint16x8_t __s0_67 = __p0_67; \ + uint16x8_t __s2_67 = __p2_67; \ + uint16x8_t __rev0_67; __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev2_67; __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_67; \ + __ret_67 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_67, __p3_67), __rev0_67, __p1_67); \ __ret_67 = __builtin_shufflevector(__ret_67, __ret_67, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_67; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p16(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \ - poly16x4_t __s0_68 = __p0_68; \ - poly16x8_t __s2_68 = __p2_68; \ - poly16x4_t __ret_68; \ - __ret_68 = vset_lane_p16(vgetq_lane_p16(__s2_68, __p3_68), __s0_68, __p1_68); \ +#define vcopyq_laneq_s8(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \ + int8x16_t __s0_68 = __p0_68; \ + int8x16_t __s2_68 = __p2_68; \ + int8x16_t __ret_68; \ + __ret_68 = vsetq_lane_s8(vgetq_lane_s8(__s2_68, __p3_68), __s0_68, __p1_68); \ __ret_68; \ }) #else -#define vcopy_laneq_p16(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \ - poly16x4_t __s0_69 = __p0_69; \ - poly16x8_t __s2_69 = __p2_69; \ - poly16x4_t __rev0_69; __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 3, 2, 1, 0); \ - poly16x8_t __rev2_69; __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x4_t __ret_69; \ - __ret_69 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_69, __p3_69), __rev0_69, __p1_69); \ - __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 3, 2, 1, 0); \ +#define vcopyq_laneq_s8(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \ + int8x16_t __s0_69 = __p0_69; \ + int8x16_t __s2_69 = __p2_69; \ + int8x16_t __rev0_69; __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_69; __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 15, 14, 13, 12, 
11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_69; \ + __ret_69 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_69, __p3_69), __rev0_69, __p1_69); \ + __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_69; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u8(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \ - uint8x8_t __s0_70 = __p0_70; \ - uint8x16_t __s2_70 = __p2_70; \ - uint8x8_t __ret_70; \ - __ret_70 = vset_lane_u8(vgetq_lane_u8(__s2_70, __p3_70), __s0_70, __p1_70); \ +#define vcopyq_laneq_f32(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \ + float32x4_t __s0_70 = __p0_70; \ + float32x4_t __s2_70 = __p2_70; \ + float32x4_t __ret_70; \ + __ret_70 = vsetq_lane_f32(vgetq_lane_f32(__s2_70, __p3_70), __s0_70, __p1_70); \ __ret_70; \ }) #else -#define vcopy_laneq_u8(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \ - uint8x8_t __s0_71 = __p0_71; \ - uint8x16_t __s2_71 = __p2_71; \ - uint8x8_t __rev0_71; __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_71; __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __ret_71; \ - __ret_71 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_71, __p3_71), __rev0_71, __p1_71); \ - __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopyq_laneq_f32(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \ + float32x4_t __s0_71 = __p0_71; \ + float32x4_t __s2_71 = __p2_71; \ + float32x4_t __rev0_71; __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 3, 2, 1, 0); \ + float32x4_t __rev2_71; __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 3, 2, 1, 0); \ + float32x4_t __ret_71; \ + __ret_71 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_71, __p3_71), __rev0_71, __p1_71); \ + __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 3, 2, 1, 0); \ __ret_71; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u32(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \ - uint32x2_t __s0_72 = __p0_72; \ - uint32x4_t __s2_72 = __p2_72; \ - uint32x2_t __ret_72; \ - __ret_72 = vset_lane_u32(vgetq_lane_u32(__s2_72, __p3_72), __s0_72, __p1_72); \ +#define vcopyq_laneq_s32(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \ + int32x4_t __s0_72 = __p0_72; \ + int32x4_t __s2_72 = __p2_72; \ + int32x4_t __ret_72; \ + __ret_72 = vsetq_lane_s32(vgetq_lane_s32(__s2_72, __p3_72), __s0_72, __p1_72); \ __ret_72; \ }) #else -#define vcopy_laneq_u32(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \ - uint32x2_t __s0_73 = __p0_73; \ - uint32x4_t __s2_73 = __p2_73; \ - uint32x2_t __rev0_73; __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 1, 0); \ - uint32x4_t __rev2_73; __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \ - uint32x2_t __ret_73; \ - __ret_73 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_73, __p3_73), __rev0_73, __p1_73); \ - __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 1, 0); \ +#define vcopyq_laneq_s32(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \ + int32x4_t __s0_73 = __p0_73; \ + int32x4_t __s2_73 = __p2_73; \ + int32x4_t __rev0_73; __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 3, 2, 1, 0); \ + int32x4_t __rev2_73; __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \ + int32x4_t __ret_73; \ + __ret_73 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_73, __p3_73), __rev0_73, __p1_73); 
\ + __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 3, 2, 1, 0); \ __ret_73; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u64(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \ - uint64x1_t __s0_74 = __p0_74; \ - uint64x2_t __s2_74 = __p2_74; \ - uint64x1_t __ret_74; \ - __ret_74 = vset_lane_u64(vgetq_lane_u64(__s2_74, __p3_74), __s0_74, __p1_74); \ +#define vcopyq_laneq_s64(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \ + int64x2_t __s0_74 = __p0_74; \ + int64x2_t __s2_74 = __p2_74; \ + int64x2_t __ret_74; \ + __ret_74 = vsetq_lane_s64(vgetq_lane_s64(__s2_74, __p3_74), __s0_74, __p1_74); \ __ret_74; \ }) #else -#define vcopy_laneq_u64(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \ - uint64x1_t __s0_75 = __p0_75; \ - uint64x2_t __s2_75 = __p2_75; \ - uint64x2_t __rev2_75; __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \ - uint64x1_t __ret_75; \ - __ret_75 = __noswap_vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_75, __p3_75), __s0_75, __p1_75); \ +#define vcopyq_laneq_s64(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \ + int64x2_t __s0_75 = __p0_75; \ + int64x2_t __s2_75 = __p2_75; \ + int64x2_t __rev0_75; __rev0_75 = __builtin_shufflevector(__s0_75, __s0_75, 1, 0); \ + int64x2_t __rev2_75; __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \ + int64x2_t __ret_75; \ + __ret_75 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_75, __p3_75), __rev0_75, __p1_75); \ + __ret_75 = __builtin_shufflevector(__ret_75, __ret_75, 1, 0); \ __ret_75; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \ - uint16x4_t __s0_76 = __p0_76; \ - uint16x8_t __s2_76 = __p2_76; \ - uint16x4_t __ret_76; \ - __ret_76 = vset_lane_u16(vgetq_lane_u16(__s2_76, __p3_76), __s0_76, __p1_76); \ +#define vcopyq_laneq_s16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \ + int16x8_t __s0_76 = __p0_76; \ + int16x8_t __s2_76 = __p2_76; \ + int16x8_t __ret_76; \ + __ret_76 = vsetq_lane_s16(vgetq_lane_s16(__s2_76, __p3_76), __s0_76, __p1_76); \ __ret_76; \ }) #else -#define vcopy_laneq_u16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \ - uint16x4_t __s0_77 = __p0_77; \ - uint16x8_t __s2_77 = __p2_77; \ - uint16x4_t __rev0_77; __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 3, 2, 1, 0); \ - uint16x8_t __rev2_77; __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __ret_77; \ - __ret_77 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_77, __p3_77), __rev0_77, __p1_77); \ - __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 3, 2, 1, 0); \ +#define vcopyq_laneq_s16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \ + int16x8_t __s0_77 = __p0_77; \ + int16x8_t __s2_77 = __p2_77; \ + int16x8_t __rev0_77; __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_77; __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_77; \ + __ret_77 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_77, __p3_77), __rev0_77, __p1_77); \ + __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_77; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s8(__p0_78, __p1_78, __p2_78, __p3_78) __extension__ ({ \ - int8x8_t __s0_78 = __p0_78; \ - int8x16_t __s2_78 = __p2_78; \ - int8x8_t __ret_78; \ - __ret_78 = vset_lane_s8(vgetq_lane_s8(__s2_78, __p3_78), __s0_78, __p1_78); \ +#define vcopy_laneq_p8(__p0_78, __p1_78, 
__p2_78, __p3_78) __extension__ ({ \ + poly8x8_t __s0_78 = __p0_78; \ + poly8x16_t __s2_78 = __p2_78; \ + poly8x8_t __ret_78; \ + __ret_78 = vset_lane_p8(vgetq_lane_p8(__s2_78, __p3_78), __s0_78, __p1_78); \ __ret_78; \ }) #else -#define vcopy_laneq_s8(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \ - int8x8_t __s0_79 = __p0_79; \ - int8x16_t __s2_79 = __p2_79; \ - int8x8_t __rev0_79; __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_79; __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __ret_79; \ - __ret_79 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_79, __p3_79), __rev0_79, __p1_79); \ +#define vcopy_laneq_p8(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \ + poly8x8_t __s0_79 = __p0_79; \ + poly8x16_t __s2_79 = __p2_79; \ + poly8x8_t __rev0_79; __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __rev2_79; __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __ret_79; \ + __ret_79 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_79, __p3_79), __rev0_79, __p1_79); \ __ret_79 = __builtin_shufflevector(__ret_79, __ret_79, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_79; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_f32(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \ - float32x2_t __s0_80 = __p0_80; \ - float32x4_t __s2_80 = __p2_80; \ - float32x2_t __ret_80; \ - __ret_80 = vset_lane_f32(vgetq_lane_f32(__s2_80, __p3_80), __s0_80, __p1_80); \ +#define vcopy_laneq_p16(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \ + poly16x4_t __s0_80 = __p0_80; \ + poly16x8_t __s2_80 = __p2_80; \ + poly16x4_t __ret_80; \ + __ret_80 = vset_lane_p16(vgetq_lane_p16(__s2_80, __p3_80), __s0_80, __p1_80); \ __ret_80; \ }) #else -#define vcopy_laneq_f32(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \ - float32x2_t __s0_81 = __p0_81; \ - float32x4_t __s2_81 = __p2_81; \ - float32x2_t __rev0_81; __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 1, 0); \ - float32x4_t __rev2_81; __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 3, 2, 1, 0); \ - float32x2_t __ret_81; \ - __ret_81 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_81, __p3_81), __rev0_81, __p1_81); \ - __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 1, 0); \ +#define vcopy_laneq_p16(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \ + poly16x4_t __s0_81 = __p0_81; \ + poly16x8_t __s2_81 = __p2_81; \ + poly16x4_t __rev0_81; __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 3, 2, 1, 0); \ + poly16x8_t __rev2_81; __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x4_t __ret_81; \ + __ret_81 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_81, __p3_81), __rev0_81, __p1_81); \ + __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 3, 2, 1, 0); \ __ret_81; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s32(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \ - int32x2_t __s0_82 = __p0_82; \ - int32x4_t __s2_82 = __p2_82; \ - int32x2_t __ret_82; \ - __ret_82 = vset_lane_s32(vgetq_lane_s32(__s2_82, __p3_82), __s0_82, __p1_82); \ +#define vcopy_laneq_u8(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \ + uint8x8_t __s0_82 = __p0_82; \ + uint8x16_t __s2_82 = __p2_82; \ + uint8x8_t __ret_82; \ + __ret_82 = vset_lane_u8(vgetq_lane_u8(__s2_82, __p3_82), __s0_82, __p1_82); \ __ret_82; \ }) #else 
-#define vcopy_laneq_s32(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \ - int32x2_t __s0_83 = __p0_83; \ - int32x4_t __s2_83 = __p2_83; \ - int32x2_t __rev0_83; __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 1, 0); \ - int32x4_t __rev2_83; __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 3, 2, 1, 0); \ - int32x2_t __ret_83; \ - __ret_83 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_83, __p3_83), __rev0_83, __p1_83); \ - __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 1, 0); \ +#define vcopy_laneq_u8(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \ + uint8x8_t __s0_83 = __p0_83; \ + uint8x16_t __s2_83 = __p2_83; \ + uint8x8_t __rev0_83; __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_83; __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __ret_83; \ + __ret_83 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_83, __p3_83), __rev0_83, __p1_83); \ + __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_83; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s64(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \ - int64x1_t __s0_84 = __p0_84; \ - int64x2_t __s2_84 = __p2_84; \ - int64x1_t __ret_84; \ - __ret_84 = vset_lane_s64(vgetq_lane_s64(__s2_84, __p3_84), __s0_84, __p1_84); \ +#define vcopy_laneq_u32(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \ + uint32x2_t __s0_84 = __p0_84; \ + uint32x4_t __s2_84 = __p2_84; \ + uint32x2_t __ret_84; \ + __ret_84 = vset_lane_u32(vgetq_lane_u32(__s2_84, __p3_84), __s0_84, __p1_84); \ __ret_84; \ }) #else -#define vcopy_laneq_s64(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \ - int64x1_t __s0_85 = __p0_85; \ - int64x2_t __s2_85 = __p2_85; \ - int64x2_t __rev2_85; __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 1, 0); \ - int64x1_t __ret_85; \ - __ret_85 = __noswap_vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_85, __p3_85), __s0_85, __p1_85); \ +#define vcopy_laneq_u32(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \ + uint32x2_t __s0_85 = __p0_85; \ + uint32x4_t __s2_85 = __p2_85; \ + uint32x2_t __rev0_85; __rev0_85 = __builtin_shufflevector(__s0_85, __s0_85, 1, 0); \ + uint32x4_t __rev2_85; __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 3, 2, 1, 0); \ + uint32x2_t __ret_85; \ + __ret_85 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_85, __p3_85), __rev0_85, __p1_85); \ + __ret_85 = __builtin_shufflevector(__ret_85, __ret_85, 1, 0); \ __ret_85; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s16(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \ - int16x4_t __s0_86 = __p0_86; \ - int16x8_t __s2_86 = __p2_86; \ - int16x4_t __ret_86; \ - __ret_86 = vset_lane_s16(vgetq_lane_s16(__s2_86, __p3_86), __s0_86, __p1_86); \ +#define vcopy_laneq_u64(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \ + uint64x1_t __s0_86 = __p0_86; \ + uint64x2_t __s2_86 = __p2_86; \ + uint64x1_t __ret_86; \ + __ret_86 = vset_lane_u64(vgetq_lane_u64(__s2_86, __p3_86), __s0_86, __p1_86); \ __ret_86; \ }) #else -#define vcopy_laneq_s16(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \ - int16x4_t __s0_87 = __p0_87; \ - int16x8_t __s2_87 = __p2_87; \ - int16x4_t __rev0_87; __rev0_87 = __builtin_shufflevector(__s0_87, __s0_87, 3, 2, 1, 0); \ - int16x8_t __rev2_87; __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_87; \ - __ret_87 = 
__noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_87, __p3_87), __rev0_87, __p1_87); \ - __ret_87 = __builtin_shufflevector(__ret_87, __ret_87, 3, 2, 1, 0); \ +#define vcopy_laneq_u64(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \ + uint64x1_t __s0_87 = __p0_87; \ + uint64x2_t __s2_87 = __p2_87; \ + uint64x2_t __rev2_87; __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 1, 0); \ + uint64x1_t __ret_87; \ + __ret_87 = __noswap_vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_87, __p3_87), __s0_87, __p1_87); \ __ret_87; \ }) #endif +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_u16(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \ + uint16x4_t __s0_88 = __p0_88; \ + uint16x8_t __s2_88 = __p2_88; \ + uint16x4_t __ret_88; \ + __ret_88 = vset_lane_u16(vgetq_lane_u16(__s2_88, __p3_88), __s0_88, __p1_88); \ + __ret_88; \ +}) +#else +#define vcopy_laneq_u16(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \ + uint16x4_t __s0_89 = __p0_89; \ + uint16x8_t __s2_89 = __p2_89; \ + uint16x4_t __rev0_89; __rev0_89 = __builtin_shufflevector(__s0_89, __s0_89, 3, 2, 1, 0); \ + uint16x8_t __rev2_89; __rev2_89 = __builtin_shufflevector(__s2_89, __s2_89, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __ret_89; \ + __ret_89 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_89, __p3_89), __rev0_89, __p1_89); \ + __ret_89 = __builtin_shufflevector(__ret_89, __ret_89, 3, 2, 1, 0); \ + __ret_89; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_s8(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \ + int8x8_t __s0_90 = __p0_90; \ + int8x16_t __s2_90 = __p2_90; \ + int8x8_t __ret_90; \ + __ret_90 = vset_lane_s8(vgetq_lane_s8(__s2_90, __p3_90), __s0_90, __p1_90); \ + __ret_90; \ +}) +#else +#define vcopy_laneq_s8(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \ + int8x8_t __s0_91 = __p0_91; \ + int8x16_t __s2_91 = __p2_91; \ + int8x8_t __rev0_91; __rev0_91 = __builtin_shufflevector(__s0_91, __s0_91, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_91; __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __ret_91; \ + __ret_91 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_91, __p3_91), __rev0_91, __p1_91); \ + __ret_91 = __builtin_shufflevector(__ret_91, __ret_91, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_91; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_f32(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \ + float32x2_t __s0_92 = __p0_92; \ + float32x4_t __s2_92 = __p2_92; \ + float32x2_t __ret_92; \ + __ret_92 = vset_lane_f32(vgetq_lane_f32(__s2_92, __p3_92), __s0_92, __p1_92); \ + __ret_92; \ +}) +#else +#define vcopy_laneq_f32(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \ + float32x2_t __s0_93 = __p0_93; \ + float32x4_t __s2_93 = __p2_93; \ + float32x2_t __rev0_93; __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \ + float32x4_t __rev2_93; __rev2_93 = __builtin_shufflevector(__s2_93, __s2_93, 3, 2, 1, 0); \ + float32x2_t __ret_93; \ + __ret_93 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_93, __p3_93), __rev0_93, __p1_93); \ + __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \ + __ret_93; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_s32(__p0_94, __p1_94, __p2_94, __p3_94) __extension__ ({ \ + int32x2_t __s0_94 = __p0_94; \ + int32x4_t __s2_94 = __p2_94; \ + int32x2_t __ret_94; \ + __ret_94 = vset_lane_s32(vgetq_lane_s32(__s2_94, __p3_94), __s0_94, __p1_94); \ + __ret_94; \ +}) +#else +#define vcopy_laneq_s32(__p0_95, __p1_95, 
__p2_95, __p3_95) __extension__ ({ \ + int32x2_t __s0_95 = __p0_95; \ + int32x4_t __s2_95 = __p2_95; \ + int32x2_t __rev0_95; __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 1, 0); \ + int32x4_t __rev2_95; __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 3, 2, 1, 0); \ + int32x2_t __ret_95; \ + __ret_95 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_95, __p3_95), __rev0_95, __p1_95); \ + __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 1, 0); \ + __ret_95; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_s64(__p0_96, __p1_96, __p2_96, __p3_96) __extension__ ({ \ + int64x1_t __s0_96 = __p0_96; \ + int64x2_t __s2_96 = __p2_96; \ + int64x1_t __ret_96; \ + __ret_96 = vset_lane_s64(vgetq_lane_s64(__s2_96, __p3_96), __s0_96, __p1_96); \ + __ret_96; \ +}) +#else +#define vcopy_laneq_s64(__p0_97, __p1_97, __p2_97, __p3_97) __extension__ ({ \ + int64x1_t __s0_97 = __p0_97; \ + int64x2_t __s2_97 = __p2_97; \ + int64x2_t __rev2_97; __rev2_97 = __builtin_shufflevector(__s2_97, __s2_97, 1, 0); \ + int64x1_t __ret_97; \ + __ret_97 = __noswap_vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_97, __p3_97), __s0_97, __p1_97); \ + __ret_97; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_s16(__p0_98, __p1_98, __p2_98, __p3_98) __extension__ ({ \ + int16x4_t __s0_98 = __p0_98; \ + int16x8_t __s2_98 = __p2_98; \ + int16x4_t __ret_98; \ + __ret_98 = vset_lane_s16(vgetq_lane_s16(__s2_98, __p3_98), __s0_98, __p1_98); \ + __ret_98; \ +}) +#else +#define vcopy_laneq_s16(__p0_99, __p1_99, __p2_99, __p3_99) __extension__ ({ \ + int16x4_t __s0_99 = __p0_99; \ + int16x8_t __s2_99 = __p2_99; \ + int16x4_t __rev0_99; __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 3, 2, 1, 0); \ + int16x8_t __rev2_99; __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_99; \ + __ret_99 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_99, __p3_99), __rev0_99, __p1_99); \ + __ret_99 = __builtin_shufflevector(__ret_99, __ret_99, 3, 2, 1, 0); \ + __ret_99; \ +}) +#endif + #ifdef __LITTLE_ENDIAN__ __ai poly64x1_t vcreate_p64(uint64_t __p0) { poly64x1_t __ret; @@ -47736,272 +51048,272 @@ __ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsd_lane_f64(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \ - float64_t __s0_88 = __p0_88; \ - float64_t __s1_88 = __p1_88; \ - float64x1_t __s2_88 = __p2_88; \ - float64_t __ret_88; \ - __ret_88 = vfmad_lane_f64(__s0_88, -__s1_88, __s2_88, __p3_88); \ - __ret_88; \ -}) -#else -#define vfmsd_lane_f64(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \ - float64_t __s0_89 = __p0_89; \ - float64_t __s1_89 = __p1_89; \ - float64x1_t __s2_89 = __p2_89; \ - float64_t __ret_89; \ - __ret_89 = __noswap_vfmad_lane_f64(__s0_89, -__s1_89, __s2_89, __p3_89); \ - __ret_89; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmss_lane_f32(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \ - float32_t __s0_90 = __p0_90; \ - float32_t __s1_90 = __p1_90; \ - float32x2_t __s2_90 = __p2_90; \ - float32_t __ret_90; \ - __ret_90 = vfmas_lane_f32(__s0_90, -__s1_90, __s2_90, __p3_90); \ - __ret_90; \ -}) -#else -#define vfmss_lane_f32(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \ - float32_t __s0_91 = __p0_91; \ - float32_t __s1_91 = __p1_91; \ - float32x2_t __s2_91 = __p2_91; \ - float32x2_t __rev2_91; __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 1, 0); \ - float32_t __ret_91; \ - __ret_91 = 
__noswap_vfmas_lane_f32(__s0_91, -__s1_91, __rev2_91, __p3_91); \ - __ret_91; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmsq_lane_f64(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \ - float64x2_t __s0_92 = __p0_92; \ - float64x2_t __s1_92 = __p1_92; \ - float64x1_t __s2_92 = __p2_92; \ - float64x2_t __ret_92; \ - __ret_92 = vfmaq_lane_f64(__s0_92, -__s1_92, __s2_92, __p3_92); \ - __ret_92; \ -}) -#else -#define vfmsq_lane_f64(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \ - float64x2_t __s0_93 = __p0_93; \ - float64x2_t __s1_93 = __p1_93; \ - float64x1_t __s2_93 = __p2_93; \ - float64x2_t __rev0_93; __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \ - float64x2_t __rev1_93; __rev1_93 = __builtin_shufflevector(__s1_93, __s1_93, 1, 0); \ - float64x2_t __ret_93; \ - __ret_93 = __noswap_vfmaq_lane_f64(__rev0_93, -__rev1_93, __s2_93, __p3_93); \ - __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \ - __ret_93; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmsq_lane_f32(__p0_94, __p1_94, __p2_94, __p3_94) __extension__ ({ \ - float32x4_t __s0_94 = __p0_94; \ - float32x4_t __s1_94 = __p1_94; \ - float32x2_t __s2_94 = __p2_94; \ - float32x4_t __ret_94; \ - __ret_94 = vfmaq_lane_f32(__s0_94, -__s1_94, __s2_94, __p3_94); \ - __ret_94; \ -}) -#else -#define vfmsq_lane_f32(__p0_95, __p1_95, __p2_95, __p3_95) __extension__ ({ \ - float32x4_t __s0_95 = __p0_95; \ - float32x4_t __s1_95 = __p1_95; \ - float32x2_t __s2_95 = __p2_95; \ - float32x4_t __rev0_95; __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 3, 2, 1, 0); \ - float32x4_t __rev1_95; __rev1_95 = __builtin_shufflevector(__s1_95, __s1_95, 3, 2, 1, 0); \ - float32x2_t __rev2_95; __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 1, 0); \ - float32x4_t __ret_95; \ - __ret_95 = __noswap_vfmaq_lane_f32(__rev0_95, -__rev1_95, __rev2_95, __p3_95); \ - __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 3, 2, 1, 0); \ - __ret_95; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfms_lane_f64(__p0_96, __p1_96, __p2_96, __p3_96) __extension__ ({ \ - float64x1_t __s0_96 = __p0_96; \ - float64x1_t __s1_96 = __p1_96; \ - float64x1_t __s2_96 = __p2_96; \ - float64x1_t __ret_96; \ - __ret_96 = vfma_lane_f64(__s0_96, -__s1_96, __s2_96, __p3_96); \ - __ret_96; \ -}) -#else -#define vfms_lane_f64(__p0_97, __p1_97, __p2_97, __p3_97) __extension__ ({ \ - float64x1_t __s0_97 = __p0_97; \ - float64x1_t __s1_97 = __p1_97; \ - float64x1_t __s2_97 = __p2_97; \ - float64x1_t __ret_97; \ - __ret_97 = __noswap_vfma_lane_f64(__s0_97, -__s1_97, __s2_97, __p3_97); \ - __ret_97; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfms_lane_f32(__p0_98, __p1_98, __p2_98, __p3_98) __extension__ ({ \ - float32x2_t __s0_98 = __p0_98; \ - float32x2_t __s1_98 = __p1_98; \ - float32x2_t __s2_98 = __p2_98; \ - float32x2_t __ret_98; \ - __ret_98 = vfma_lane_f32(__s0_98, -__s1_98, __s2_98, __p3_98); \ - __ret_98; \ -}) -#else -#define vfms_lane_f32(__p0_99, __p1_99, __p2_99, __p3_99) __extension__ ({ \ - float32x2_t __s0_99 = __p0_99; \ - float32x2_t __s1_99 = __p1_99; \ - float32x2_t __s2_99 = __p2_99; \ - float32x2_t __rev0_99; __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 1, 0); \ - float32x2_t __rev1_99; __rev1_99 = __builtin_shufflevector(__s1_99, __s1_99, 1, 0); \ - float32x2_t __rev2_99; __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 1, 0); \ - float32x2_t __ret_99; \ - __ret_99 = __noswap_vfma_lane_f32(__rev0_99, -__rev1_99, __rev2_99, __p3_99); \ - __ret_99 = __builtin_shufflevector(__ret_99, 
__ret_99, 1, 0); \ - __ret_99; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmsd_laneq_f64(__p0_100, __p1_100, __p2_100, __p3_100) __extension__ ({ \ +#define vfmsd_lane_f64(__p0_100, __p1_100, __p2_100, __p3_100) __extension__ ({ \ float64_t __s0_100 = __p0_100; \ float64_t __s1_100 = __p1_100; \ - float64x2_t __s2_100 = __p2_100; \ + float64x1_t __s2_100 = __p2_100; \ float64_t __ret_100; \ - __ret_100 = vfmad_laneq_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \ + __ret_100 = vfmad_lane_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \ __ret_100; \ }) #else -#define vfmsd_laneq_f64(__p0_101, __p1_101, __p2_101, __p3_101) __extension__ ({ \ +#define vfmsd_lane_f64(__p0_101, __p1_101, __p2_101, __p3_101) __extension__ ({ \ float64_t __s0_101 = __p0_101; \ float64_t __s1_101 = __p1_101; \ - float64x2_t __s2_101 = __p2_101; \ - float64x2_t __rev2_101; __rev2_101 = __builtin_shufflevector(__s2_101, __s2_101, 1, 0); \ + float64x1_t __s2_101 = __p2_101; \ float64_t __ret_101; \ - __ret_101 = __noswap_vfmad_laneq_f64(__s0_101, -__s1_101, __rev2_101, __p3_101); \ + __ret_101 = __noswap_vfmad_lane_f64(__s0_101, -__s1_101, __s2_101, __p3_101); \ __ret_101; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmss_laneq_f32(__p0_102, __p1_102, __p2_102, __p3_102) __extension__ ({ \ +#define vfmss_lane_f32(__p0_102, __p1_102, __p2_102, __p3_102) __extension__ ({ \ float32_t __s0_102 = __p0_102; \ float32_t __s1_102 = __p1_102; \ - float32x4_t __s2_102 = __p2_102; \ + float32x2_t __s2_102 = __p2_102; \ float32_t __ret_102; \ - __ret_102 = vfmas_laneq_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \ + __ret_102 = vfmas_lane_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \ __ret_102; \ }) #else -#define vfmss_laneq_f32(__p0_103, __p1_103, __p2_103, __p3_103) __extension__ ({ \ +#define vfmss_lane_f32(__p0_103, __p1_103, __p2_103, __p3_103) __extension__ ({ \ float32_t __s0_103 = __p0_103; \ float32_t __s1_103 = __p1_103; \ - float32x4_t __s2_103 = __p2_103; \ - float32x4_t __rev2_103; __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 3, 2, 1, 0); \ + float32x2_t __s2_103 = __p2_103; \ + float32x2_t __rev2_103; __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 1, 0); \ float32_t __ret_103; \ - __ret_103 = __noswap_vfmas_laneq_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \ + __ret_103 = __noswap_vfmas_lane_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \ __ret_103; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_laneq_f64(__p0_104, __p1_104, __p2_104, __p3_104) __extension__ ({ \ +#define vfmsq_lane_f64(__p0_104, __p1_104, __p2_104, __p3_104) __extension__ ({ \ float64x2_t __s0_104 = __p0_104; \ float64x2_t __s1_104 = __p1_104; \ - float64x2_t __s2_104 = __p2_104; \ + float64x1_t __s2_104 = __p2_104; \ float64x2_t __ret_104; \ - __ret_104 = vfmaq_laneq_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \ + __ret_104 = vfmaq_lane_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \ __ret_104; \ }) #else -#define vfmsq_laneq_f64(__p0_105, __p1_105, __p2_105, __p3_105) __extension__ ({ \ +#define vfmsq_lane_f64(__p0_105, __p1_105, __p2_105, __p3_105) __extension__ ({ \ float64x2_t __s0_105 = __p0_105; \ float64x2_t __s1_105 = __p1_105; \ - float64x2_t __s2_105 = __p2_105; \ + float64x1_t __s2_105 = __p2_105; \ float64x2_t __rev0_105; __rev0_105 = __builtin_shufflevector(__s0_105, __s0_105, 1, 0); \ float64x2_t __rev1_105; __rev1_105 = __builtin_shufflevector(__s1_105, __s1_105, 1, 0); \ - float64x2_t __rev2_105; __rev2_105 = __builtin_shufflevector(__s2_105, __s2_105, 1, 0); \ float64x2_t 
__ret_105; \ - __ret_105 = __noswap_vfmaq_laneq_f64(__rev0_105, -__rev1_105, __rev2_105, __p3_105); \ + __ret_105 = __noswap_vfmaq_lane_f64(__rev0_105, -__rev1_105, __s2_105, __p3_105); \ __ret_105 = __builtin_shufflevector(__ret_105, __ret_105, 1, 0); \ __ret_105; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_laneq_f32(__p0_106, __p1_106, __p2_106, __p3_106) __extension__ ({ \ +#define vfmsq_lane_f32(__p0_106, __p1_106, __p2_106, __p3_106) __extension__ ({ \ float32x4_t __s0_106 = __p0_106; \ float32x4_t __s1_106 = __p1_106; \ - float32x4_t __s2_106 = __p2_106; \ + float32x2_t __s2_106 = __p2_106; \ float32x4_t __ret_106; \ - __ret_106 = vfmaq_laneq_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \ + __ret_106 = vfmaq_lane_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \ __ret_106; \ }) #else -#define vfmsq_laneq_f32(__p0_107, __p1_107, __p2_107, __p3_107) __extension__ ({ \ +#define vfmsq_lane_f32(__p0_107, __p1_107, __p2_107, __p3_107) __extension__ ({ \ float32x4_t __s0_107 = __p0_107; \ float32x4_t __s1_107 = __p1_107; \ - float32x4_t __s2_107 = __p2_107; \ + float32x2_t __s2_107 = __p2_107; \ float32x4_t __rev0_107; __rev0_107 = __builtin_shufflevector(__s0_107, __s0_107, 3, 2, 1, 0); \ float32x4_t __rev1_107; __rev1_107 = __builtin_shufflevector(__s1_107, __s1_107, 3, 2, 1, 0); \ - float32x4_t __rev2_107; __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 3, 2, 1, 0); \ + float32x2_t __rev2_107; __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 1, 0); \ float32x4_t __ret_107; \ - __ret_107 = __noswap_vfmaq_laneq_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \ + __ret_107 = __noswap_vfmaq_lane_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \ __ret_107 = __builtin_shufflevector(__ret_107, __ret_107, 3, 2, 1, 0); \ __ret_107; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfms_laneq_f64(__p0_108, __p1_108, __p2_108, __p3_108) __extension__ ({ \ +#define vfms_lane_f64(__p0_108, __p1_108, __p2_108, __p3_108) __extension__ ({ \ float64x1_t __s0_108 = __p0_108; \ float64x1_t __s1_108 = __p1_108; \ - float64x2_t __s2_108 = __p2_108; \ + float64x1_t __s2_108 = __p2_108; \ float64x1_t __ret_108; \ - __ret_108 = vfma_laneq_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \ + __ret_108 = vfma_lane_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \ __ret_108; \ }) #else -#define vfms_laneq_f64(__p0_109, __p1_109, __p2_109, __p3_109) __extension__ ({ \ +#define vfms_lane_f64(__p0_109, __p1_109, __p2_109, __p3_109) __extension__ ({ \ float64x1_t __s0_109 = __p0_109; \ float64x1_t __s1_109 = __p1_109; \ - float64x2_t __s2_109 = __p2_109; \ - float64x2_t __rev2_109; __rev2_109 = __builtin_shufflevector(__s2_109, __s2_109, 1, 0); \ + float64x1_t __s2_109 = __p2_109; \ float64x1_t __ret_109; \ - __ret_109 = __noswap_vfma_laneq_f64(__s0_109, -__s1_109, __rev2_109, __p3_109); \ + __ret_109 = __noswap_vfma_lane_f64(__s0_109, -__s1_109, __s2_109, __p3_109); \ __ret_109; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfms_laneq_f32(__p0_110, __p1_110, __p2_110, __p3_110) __extension__ ({ \ +#define vfms_lane_f32(__p0_110, __p1_110, __p2_110, __p3_110) __extension__ ({ \ float32x2_t __s0_110 = __p0_110; \ float32x2_t __s1_110 = __p1_110; \ - float32x4_t __s2_110 = __p2_110; \ + float32x2_t __s2_110 = __p2_110; \ float32x2_t __ret_110; \ - __ret_110 = vfma_laneq_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \ + __ret_110 = vfma_lane_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \ __ret_110; \ }) #else -#define vfms_laneq_f32(__p0_111, __p1_111, __p2_111, __p3_111) __extension__ ({ \ 
+#define vfms_lane_f32(__p0_111, __p1_111, __p2_111, __p3_111) __extension__ ({ \ float32x2_t __s0_111 = __p0_111; \ float32x2_t __s1_111 = __p1_111; \ - float32x4_t __s2_111 = __p2_111; \ + float32x2_t __s2_111 = __p2_111; \ float32x2_t __rev0_111; __rev0_111 = __builtin_shufflevector(__s0_111, __s0_111, 1, 0); \ float32x2_t __rev1_111; __rev1_111 = __builtin_shufflevector(__s1_111, __s1_111, 1, 0); \ - float32x4_t __rev2_111; __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 3, 2, 1, 0); \ + float32x2_t __rev2_111; __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 1, 0); \ float32x2_t __ret_111; \ - __ret_111 = __noswap_vfma_laneq_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \ + __ret_111 = __noswap_vfma_lane_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \ __ret_111 = __builtin_shufflevector(__ret_111, __ret_111, 1, 0); \ __ret_111; \ }) #endif +#ifdef __LITTLE_ENDIAN__ +#define vfmsd_laneq_f64(__p0_112, __p1_112, __p2_112, __p3_112) __extension__ ({ \ + float64_t __s0_112 = __p0_112; \ + float64_t __s1_112 = __p1_112; \ + float64x2_t __s2_112 = __p2_112; \ + float64_t __ret_112; \ + __ret_112 = vfmad_laneq_f64(__s0_112, -__s1_112, __s2_112, __p3_112); \ + __ret_112; \ +}) +#else +#define vfmsd_laneq_f64(__p0_113, __p1_113, __p2_113, __p3_113) __extension__ ({ \ + float64_t __s0_113 = __p0_113; \ + float64_t __s1_113 = __p1_113; \ + float64x2_t __s2_113 = __p2_113; \ + float64x2_t __rev2_113; __rev2_113 = __builtin_shufflevector(__s2_113, __s2_113, 1, 0); \ + float64_t __ret_113; \ + __ret_113 = __noswap_vfmad_laneq_f64(__s0_113, -__s1_113, __rev2_113, __p3_113); \ + __ret_113; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmss_laneq_f32(__p0_114, __p1_114, __p2_114, __p3_114) __extension__ ({ \ + float32_t __s0_114 = __p0_114; \ + float32_t __s1_114 = __p1_114; \ + float32x4_t __s2_114 = __p2_114; \ + float32_t __ret_114; \ + __ret_114 = vfmas_laneq_f32(__s0_114, -__s1_114, __s2_114, __p3_114); \ + __ret_114; \ +}) +#else +#define vfmss_laneq_f32(__p0_115, __p1_115, __p2_115, __p3_115) __extension__ ({ \ + float32_t __s0_115 = __p0_115; \ + float32_t __s1_115 = __p1_115; \ + float32x4_t __s2_115 = __p2_115; \ + float32x4_t __rev2_115; __rev2_115 = __builtin_shufflevector(__s2_115, __s2_115, 3, 2, 1, 0); \ + float32_t __ret_115; \ + __ret_115 = __noswap_vfmas_laneq_f32(__s0_115, -__s1_115, __rev2_115, __p3_115); \ + __ret_115; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_laneq_f64(__p0_116, __p1_116, __p2_116, __p3_116) __extension__ ({ \ + float64x2_t __s0_116 = __p0_116; \ + float64x2_t __s1_116 = __p1_116; \ + float64x2_t __s2_116 = __p2_116; \ + float64x2_t __ret_116; \ + __ret_116 = vfmaq_laneq_f64(__s0_116, -__s1_116, __s2_116, __p3_116); \ + __ret_116; \ +}) +#else +#define vfmsq_laneq_f64(__p0_117, __p1_117, __p2_117, __p3_117) __extension__ ({ \ + float64x2_t __s0_117 = __p0_117; \ + float64x2_t __s1_117 = __p1_117; \ + float64x2_t __s2_117 = __p2_117; \ + float64x2_t __rev0_117; __rev0_117 = __builtin_shufflevector(__s0_117, __s0_117, 1, 0); \ + float64x2_t __rev1_117; __rev1_117 = __builtin_shufflevector(__s1_117, __s1_117, 1, 0); \ + float64x2_t __rev2_117; __rev2_117 = __builtin_shufflevector(__s2_117, __s2_117, 1, 0); \ + float64x2_t __ret_117; \ + __ret_117 = __noswap_vfmaq_laneq_f64(__rev0_117, -__rev1_117, __rev2_117, __p3_117); \ + __ret_117 = __builtin_shufflevector(__ret_117, __ret_117, 1, 0); \ + __ret_117; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_laneq_f32(__p0_118, __p1_118, __p2_118, __p3_118) 
__extension__ ({ \ + float32x4_t __s0_118 = __p0_118; \ + float32x4_t __s1_118 = __p1_118; \ + float32x4_t __s2_118 = __p2_118; \ + float32x4_t __ret_118; \ + __ret_118 = vfmaq_laneq_f32(__s0_118, -__s1_118, __s2_118, __p3_118); \ + __ret_118; \ +}) +#else +#define vfmsq_laneq_f32(__p0_119, __p1_119, __p2_119, __p3_119) __extension__ ({ \ + float32x4_t __s0_119 = __p0_119; \ + float32x4_t __s1_119 = __p1_119; \ + float32x4_t __s2_119 = __p2_119; \ + float32x4_t __rev0_119; __rev0_119 = __builtin_shufflevector(__s0_119, __s0_119, 3, 2, 1, 0); \ + float32x4_t __rev1_119; __rev1_119 = __builtin_shufflevector(__s1_119, __s1_119, 3, 2, 1, 0); \ + float32x4_t __rev2_119; __rev2_119 = __builtin_shufflevector(__s2_119, __s2_119, 3, 2, 1, 0); \ + float32x4_t __ret_119; \ + __ret_119 = __noswap_vfmaq_laneq_f32(__rev0_119, -__rev1_119, __rev2_119, __p3_119); \ + __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 3, 2, 1, 0); \ + __ret_119; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_laneq_f64(__p0_120, __p1_120, __p2_120, __p3_120) __extension__ ({ \ + float64x1_t __s0_120 = __p0_120; \ + float64x1_t __s1_120 = __p1_120; \ + float64x2_t __s2_120 = __p2_120; \ + float64x1_t __ret_120; \ + __ret_120 = vfma_laneq_f64(__s0_120, -__s1_120, __s2_120, __p3_120); \ + __ret_120; \ +}) +#else +#define vfms_laneq_f64(__p0_121, __p1_121, __p2_121, __p3_121) __extension__ ({ \ + float64x1_t __s0_121 = __p0_121; \ + float64x1_t __s1_121 = __p1_121; \ + float64x2_t __s2_121 = __p2_121; \ + float64x2_t __rev2_121; __rev2_121 = __builtin_shufflevector(__s2_121, __s2_121, 1, 0); \ + float64x1_t __ret_121; \ + __ret_121 = __noswap_vfma_laneq_f64(__s0_121, -__s1_121, __rev2_121, __p3_121); \ + __ret_121; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_laneq_f32(__p0_122, __p1_122, __p2_122, __p3_122) __extension__ ({ \ + float32x2_t __s0_122 = __p0_122; \ + float32x2_t __s1_122 = __p1_122; \ + float32x4_t __s2_122 = __p2_122; \ + float32x2_t __ret_122; \ + __ret_122 = vfma_laneq_f32(__s0_122, -__s1_122, __s2_122, __p3_122); \ + __ret_122; \ +}) +#else +#define vfms_laneq_f32(__p0_123, __p1_123, __p2_123, __p3_123) __extension__ ({ \ + float32x2_t __s0_123 = __p0_123; \ + float32x2_t __s1_123 = __p1_123; \ + float32x4_t __s2_123 = __p2_123; \ + float32x2_t __rev0_123; __rev0_123 = __builtin_shufflevector(__s0_123, __s0_123, 1, 0); \ + float32x2_t __rev1_123; __rev1_123 = __builtin_shufflevector(__s1_123, __s1_123, 1, 0); \ + float32x4_t __rev2_123; __rev2_123 = __builtin_shufflevector(__s2_123, __s2_123, 3, 2, 1, 0); \ + float32x2_t __ret_123; \ + __ret_123 = __noswap_vfma_laneq_f32(__rev0_123, -__rev1_123, __rev2_123, __p3_123); \ + __ret_123 = __builtin_shufflevector(__ret_123, __ret_123, 1, 0); \ + __ret_123; \ +}) +#endif + #ifdef __LITTLE_ENDIAN__ __ai float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { float64x2_t __ret; @@ -53521,149 +56833,149 @@ __ai float64x1_t vmov_n_f64(float64_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_112) { - uint16x8_t __ret_112; - uint8x8_t __a1_112 = vget_high_u8(__p0_112); - __ret_112 = (uint16x8_t)(vshll_n_u8(__a1_112, 0)); - return __ret_112; -} -#else -__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_113) { - uint8x16_t __rev0_113; __rev0_113 = __builtin_shufflevector(__p0_113, __p0_113, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint16x8_t __ret_113; - uint8x8_t __a1_113 = __noswap_vget_high_u8(__rev0_113); - __ret_113 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_113, 0)); - 
__ret_113 = __builtin_shufflevector(__ret_113, __ret_113, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret_113; -} -__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_114) { - uint16x8_t __ret_114; - uint8x8_t __a1_114 = __noswap_vget_high_u8(__p0_114); - __ret_114 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_114, 0)); - return __ret_114; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_115) { - uint64x2_t __ret_115; - uint32x2_t __a1_115 = vget_high_u32(__p0_115); - __ret_115 = (uint64x2_t)(vshll_n_u32(__a1_115, 0)); - return __ret_115; -} -#else -__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_116) { - uint32x4_t __rev0_116; __rev0_116 = __builtin_shufflevector(__p0_116, __p0_116, 3, 2, 1, 0); - uint64x2_t __ret_116; - uint32x2_t __a1_116 = __noswap_vget_high_u32(__rev0_116); - __ret_116 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_116, 0)); - __ret_116 = __builtin_shufflevector(__ret_116, __ret_116, 1, 0); - return __ret_116; -} -__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_117) { - uint64x2_t __ret_117; - uint32x2_t __a1_117 = __noswap_vget_high_u32(__p0_117); - __ret_117 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_117, 0)); - return __ret_117; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_118) { - uint32x4_t __ret_118; - uint16x4_t __a1_118 = vget_high_u16(__p0_118); - __ret_118 = (uint32x4_t)(vshll_n_u16(__a1_118, 0)); - return __ret_118; -} -#else -__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_119) { - uint16x8_t __rev0_119; __rev0_119 = __builtin_shufflevector(__p0_119, __p0_119, 7, 6, 5, 4, 3, 2, 1, 0); - uint32x4_t __ret_119; - uint16x4_t __a1_119 = __noswap_vget_high_u16(__rev0_119); - __ret_119 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_119, 0)); - __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 3, 2, 1, 0); - return __ret_119; -} -__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_120) { - uint32x4_t __ret_120; - uint16x4_t __a1_120 = __noswap_vget_high_u16(__p0_120); - __ret_120 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_120, 0)); - return __ret_120; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai int16x8_t vmovl_high_s8(int8x16_t __p0_121) { - int16x8_t __ret_121; - int8x8_t __a1_121 = vget_high_s8(__p0_121); - __ret_121 = (int16x8_t)(vshll_n_s8(__a1_121, 0)); - return __ret_121; -} -#else -__ai int16x8_t vmovl_high_s8(int8x16_t __p0_122) { - int8x16_t __rev0_122; __rev0_122 = __builtin_shufflevector(__p0_122, __p0_122, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int16x8_t __ret_122; - int8x8_t __a1_122 = __noswap_vget_high_s8(__rev0_122); - __ret_122 = (int16x8_t)(__noswap_vshll_n_s8(__a1_122, 0)); - __ret_122 = __builtin_shufflevector(__ret_122, __ret_122, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret_122; -} -__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_123) { - int16x8_t __ret_123; - int8x8_t __a1_123 = __noswap_vget_high_s8(__p0_123); - __ret_123 = (int16x8_t)(__noswap_vshll_n_s8(__a1_123, 0)); - return __ret_123; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai int64x2_t vmovl_high_s32(int32x4_t __p0_124) { - int64x2_t __ret_124; - int32x2_t __a1_124 = vget_high_s32(__p0_124); - __ret_124 = (int64x2_t)(vshll_n_s32(__a1_124, 0)); +__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_124) { + uint16x8_t __ret_124; + uint8x8_t __a1_124 = vget_high_u8(__p0_124); + __ret_124 = (uint16x8_t)(vshll_n_u8(__a1_124, 0)); return __ret_124; } #else -__ai int64x2_t vmovl_high_s32(int32x4_t __p0_125) { - int32x4_t __rev0_125; __rev0_125 = __builtin_shufflevector(__p0_125, __p0_125, 3, 2, 1, 0); - 
int64x2_t __ret_125; - int32x2_t __a1_125 = __noswap_vget_high_s32(__rev0_125); - __ret_125 = (int64x2_t)(__noswap_vshll_n_s32(__a1_125, 0)); - __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 1, 0); +__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_125) { + uint8x16_t __rev0_125; __rev0_125 = __builtin_shufflevector(__p0_125, __p0_125, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret_125; + uint8x8_t __a1_125 = __noswap_vget_high_u8(__rev0_125); + __ret_125 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_125, 0)); + __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 7, 6, 5, 4, 3, 2, 1, 0); return __ret_125; } -__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_126) { - int64x2_t __ret_126; - int32x2_t __a1_126 = __noswap_vget_high_s32(__p0_126); - __ret_126 = (int64x2_t)(__noswap_vshll_n_s32(__a1_126, 0)); +__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_126) { + uint16x8_t __ret_126; + uint8x8_t __a1_126 = __noswap_vget_high_u8(__p0_126); + __ret_126 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_126, 0)); return __ret_126; } #endif #ifdef __LITTLE_ENDIAN__ -__ai int32x4_t vmovl_high_s16(int16x8_t __p0_127) { - int32x4_t __ret_127; - int16x4_t __a1_127 = vget_high_s16(__p0_127); - __ret_127 = (int32x4_t)(vshll_n_s16(__a1_127, 0)); +__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_127) { + uint64x2_t __ret_127; + uint32x2_t __a1_127 = vget_high_u32(__p0_127); + __ret_127 = (uint64x2_t)(vshll_n_u32(__a1_127, 0)); return __ret_127; } #else -__ai int32x4_t vmovl_high_s16(int16x8_t __p0_128) { - int16x8_t __rev0_128; __rev0_128 = __builtin_shufflevector(__p0_128, __p0_128, 7, 6, 5, 4, 3, 2, 1, 0); - int32x4_t __ret_128; - int16x4_t __a1_128 = __noswap_vget_high_s16(__rev0_128); - __ret_128 = (int32x4_t)(__noswap_vshll_n_s16(__a1_128, 0)); - __ret_128 = __builtin_shufflevector(__ret_128, __ret_128, 3, 2, 1, 0); +__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_128) { + uint32x4_t __rev0_128; __rev0_128 = __builtin_shufflevector(__p0_128, __p0_128, 3, 2, 1, 0); + uint64x2_t __ret_128; + uint32x2_t __a1_128 = __noswap_vget_high_u32(__rev0_128); + __ret_128 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_128, 0)); + __ret_128 = __builtin_shufflevector(__ret_128, __ret_128, 1, 0); return __ret_128; } -__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_129) { - int32x4_t __ret_129; - int16x4_t __a1_129 = __noswap_vget_high_s16(__p0_129); - __ret_129 = (int32x4_t)(__noswap_vshll_n_s16(__a1_129, 0)); +__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_129) { + uint64x2_t __ret_129; + uint32x2_t __a1_129 = __noswap_vget_high_u32(__p0_129); + __ret_129 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_129, 0)); return __ret_129; } #endif +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_130) { + uint32x4_t __ret_130; + uint16x4_t __a1_130 = vget_high_u16(__p0_130); + __ret_130 = (uint32x4_t)(vshll_n_u16(__a1_130, 0)); + return __ret_130; +} +#else +__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_131) { + uint16x8_t __rev0_131; __rev0_131 = __builtin_shufflevector(__p0_131, __p0_131, 7, 6, 5, 4, 3, 2, 1, 0); + uint32x4_t __ret_131; + uint16x4_t __a1_131 = __noswap_vget_high_u16(__rev0_131); + __ret_131 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_131, 0)); + __ret_131 = __builtin_shufflevector(__ret_131, __ret_131, 3, 2, 1, 0); + return __ret_131; +} +__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_132) { + uint32x4_t __ret_132; + uint16x4_t __a1_132 = __noswap_vget_high_u16(__p0_132); + __ret_132 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_132, 0)); + 
return __ret_132; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vmovl_high_s8(int8x16_t __p0_133) { + int16x8_t __ret_133; + int8x8_t __a1_133 = vget_high_s8(__p0_133); + __ret_133 = (int16x8_t)(vshll_n_s8(__a1_133, 0)); + return __ret_133; +} +#else +__ai int16x8_t vmovl_high_s8(int8x16_t __p0_134) { + int8x16_t __rev0_134; __rev0_134 = __builtin_shufflevector(__p0_134, __p0_134, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret_134; + int8x8_t __a1_134 = __noswap_vget_high_s8(__rev0_134); + __ret_134 = (int16x8_t)(__noswap_vshll_n_s8(__a1_134, 0)); + __ret_134 = __builtin_shufflevector(__ret_134, __ret_134, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret_134; +} +__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_135) { + int16x8_t __ret_135; + int8x8_t __a1_135 = __noswap_vget_high_s8(__p0_135); + __ret_135 = (int16x8_t)(__noswap_vshll_n_s8(__a1_135, 0)); + return __ret_135; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int64x2_t vmovl_high_s32(int32x4_t __p0_136) { + int64x2_t __ret_136; + int32x2_t __a1_136 = vget_high_s32(__p0_136); + __ret_136 = (int64x2_t)(vshll_n_s32(__a1_136, 0)); + return __ret_136; +} +#else +__ai int64x2_t vmovl_high_s32(int32x4_t __p0_137) { + int32x4_t __rev0_137; __rev0_137 = __builtin_shufflevector(__p0_137, __p0_137, 3, 2, 1, 0); + int64x2_t __ret_137; + int32x2_t __a1_137 = __noswap_vget_high_s32(__rev0_137); + __ret_137 = (int64x2_t)(__noswap_vshll_n_s32(__a1_137, 0)); + __ret_137 = __builtin_shufflevector(__ret_137, __ret_137, 1, 0); + return __ret_137; +} +__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_138) { + int64x2_t __ret_138; + int32x2_t __a1_138 = __noswap_vget_high_s32(__p0_138); + __ret_138 = (int64x2_t)(__noswap_vshll_n_s32(__a1_138, 0)); + return __ret_138; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int32x4_t vmovl_high_s16(int16x8_t __p0_139) { + int32x4_t __ret_139; + int16x4_t __a1_139 = vget_high_s16(__p0_139); + __ret_139 = (int32x4_t)(vshll_n_s16(__a1_139, 0)); + return __ret_139; +} +#else +__ai int32x4_t vmovl_high_s16(int16x8_t __p0_140) { + int16x8_t __rev0_140; __rev0_140 = __builtin_shufflevector(__p0_140, __p0_140, 7, 6, 5, 4, 3, 2, 1, 0); + int32x4_t __ret_140; + int16x4_t __a1_140 = __noswap_vget_high_s16(__rev0_140); + __ret_140 = (int32x4_t)(__noswap_vshll_n_s16(__a1_140, 0)); + __ret_140 = __builtin_shufflevector(__ret_140, __ret_140, 3, 2, 1, 0); + return __ret_140; +} +__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_141) { + int32x4_t __ret_141; + int16x4_t __a1_141 = __noswap_vget_high_s16(__p0_141); + __ret_141 = (int32x4_t)(__noswap_vshll_n_s16(__a1_141, 0)); + return __ret_141; +} +#endif + #ifdef __LITTLE_ENDIAN__ __ai uint16x8_t vmovn_high_u32(uint16x4_t __p0, uint32x4_t __p1) { uint16x8_t __ret; @@ -53798,39 +57110,39 @@ __ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmuld_lane_f64(__p0_130, __p1_130, __p2_130) __extension__ ({ \ - float64_t __s0_130 = __p0_130; \ - float64x1_t __s1_130 = __p1_130; \ - float64_t __ret_130; \ - __ret_130 = __s0_130 * vget_lane_f64(__s1_130, __p2_130); \ - __ret_130; \ +#define vmuld_lane_f64(__p0_142, __p1_142, __p2_142) __extension__ ({ \ + float64_t __s0_142 = __p0_142; \ + float64x1_t __s1_142 = __p1_142; \ + float64_t __ret_142; \ + __ret_142 = __s0_142 * vget_lane_f64(__s1_142, __p2_142); \ + __ret_142; \ }) #else -#define vmuld_lane_f64(__p0_131, __p1_131, __p2_131) __extension__ ({ \ - float64_t __s0_131 = __p0_131; \ - float64x1_t __s1_131 = __p1_131; \ - float64_t __ret_131; \ 
- __ret_131 = __s0_131 * __noswap_vget_lane_f64(__s1_131, __p2_131); \ - __ret_131; \ +#define vmuld_lane_f64(__p0_143, __p1_143, __p2_143) __extension__ ({ \ + float64_t __s0_143 = __p0_143; \ + float64x1_t __s1_143 = __p1_143; \ + float64_t __ret_143; \ + __ret_143 = __s0_143 * __noswap_vget_lane_f64(__s1_143, __p2_143); \ + __ret_143; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmuls_lane_f32(__p0_132, __p1_132, __p2_132) __extension__ ({ \ - float32_t __s0_132 = __p0_132; \ - float32x2_t __s1_132 = __p1_132; \ - float32_t __ret_132; \ - __ret_132 = __s0_132 * vget_lane_f32(__s1_132, __p2_132); \ - __ret_132; \ +#define vmuls_lane_f32(__p0_144, __p1_144, __p2_144) __extension__ ({ \ + float32_t __s0_144 = __p0_144; \ + float32x2_t __s1_144 = __p1_144; \ + float32_t __ret_144; \ + __ret_144 = __s0_144 * vget_lane_f32(__s1_144, __p2_144); \ + __ret_144; \ }) #else -#define vmuls_lane_f32(__p0_133, __p1_133, __p2_133) __extension__ ({ \ - float32_t __s0_133 = __p0_133; \ - float32x2_t __s1_133 = __p1_133; \ - float32x2_t __rev1_133; __rev1_133 = __builtin_shufflevector(__s1_133, __s1_133, 1, 0); \ - float32_t __ret_133; \ - __ret_133 = __s0_133 * __noswap_vget_lane_f32(__rev1_133, __p2_133); \ - __ret_133; \ +#define vmuls_lane_f32(__p0_145, __p1_145, __p2_145) __extension__ ({ \ + float32_t __s0_145 = __p0_145; \ + float32x2_t __s1_145 = __p1_145; \ + float32x2_t __rev1_145; __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 1, 0); \ + float32_t __ret_145; \ + __ret_145 = __s0_145 * __noswap_vget_lane_f32(__rev1_145, __p2_145); \ + __ret_145; \ }) #endif @@ -53873,40 +57185,40 @@ __ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmuld_laneq_f64(__p0_134, __p1_134, __p2_134) __extension__ ({ \ - float64_t __s0_134 = __p0_134; \ - float64x2_t __s1_134 = __p1_134; \ - float64_t __ret_134; \ - __ret_134 = __s0_134 * vgetq_lane_f64(__s1_134, __p2_134); \ - __ret_134; \ +#define vmuld_laneq_f64(__p0_146, __p1_146, __p2_146) __extension__ ({ \ + float64_t __s0_146 = __p0_146; \ + float64x2_t __s1_146 = __p1_146; \ + float64_t __ret_146; \ + __ret_146 = __s0_146 * vgetq_lane_f64(__s1_146, __p2_146); \ + __ret_146; \ }) #else -#define vmuld_laneq_f64(__p0_135, __p1_135, __p2_135) __extension__ ({ \ - float64_t __s0_135 = __p0_135; \ - float64x2_t __s1_135 = __p1_135; \ - float64x2_t __rev1_135; __rev1_135 = __builtin_shufflevector(__s1_135, __s1_135, 1, 0); \ - float64_t __ret_135; \ - __ret_135 = __s0_135 * __noswap_vgetq_lane_f64(__rev1_135, __p2_135); \ - __ret_135; \ +#define vmuld_laneq_f64(__p0_147, __p1_147, __p2_147) __extension__ ({ \ + float64_t __s0_147 = __p0_147; \ + float64x2_t __s1_147 = __p1_147; \ + float64x2_t __rev1_147; __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \ + float64_t __ret_147; \ + __ret_147 = __s0_147 * __noswap_vgetq_lane_f64(__rev1_147, __p2_147); \ + __ret_147; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmuls_laneq_f32(__p0_136, __p1_136, __p2_136) __extension__ ({ \ - float32_t __s0_136 = __p0_136; \ - float32x4_t __s1_136 = __p1_136; \ - float32_t __ret_136; \ - __ret_136 = __s0_136 * vgetq_lane_f32(__s1_136, __p2_136); \ - __ret_136; \ +#define vmuls_laneq_f32(__p0_148, __p1_148, __p2_148) __extension__ ({ \ + float32_t __s0_148 = __p0_148; \ + float32x4_t __s1_148 = __p1_148; \ + float32_t __ret_148; \ + __ret_148 = __s0_148 * vgetq_lane_f32(__s1_148, __p2_148); \ + __ret_148; \ }) #else -#define vmuls_laneq_f32(__p0_137, __p1_137, __p2_137) __extension__ ({ \ - float32_t 
__s0_137 = __p0_137; \ - float32x4_t __s1_137 = __p1_137; \ - float32x4_t __rev1_137; __rev1_137 = __builtin_shufflevector(__s1_137, __s1_137, 3, 2, 1, 0); \ - float32_t __ret_137; \ - __ret_137 = __s0_137 * __noswap_vgetq_lane_f32(__rev1_137, __p2_137); \ - __ret_137; \ +#define vmuls_laneq_f32(__p0_149, __p1_149, __p2_149) __extension__ ({ \ + float32_t __s0_149 = __p0_149; \ + float32x4_t __s1_149 = __p1_149; \ + float32x4_t __rev1_149; __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \ + float32_t __ret_149; \ + __ret_149 = __s0_149 * __noswap_vgetq_lane_f32(__rev1_149, __p2_149); \ + __ret_149; \ }) #endif @@ -54779,39 +58091,39 @@ __ai float32_t __noswap_vmulxs_f32(float32_t __p0, float32_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxd_lane_f64(__p0_138, __p1_138, __p2_138) __extension__ ({ \ - float64_t __s0_138 = __p0_138; \ - float64x1_t __s1_138 = __p1_138; \ - float64_t __ret_138; \ - __ret_138 = vmulxd_f64(__s0_138, vget_lane_f64(__s1_138, __p2_138)); \ - __ret_138; \ +#define vmulxd_lane_f64(__p0_150, __p1_150, __p2_150) __extension__ ({ \ + float64_t __s0_150 = __p0_150; \ + float64x1_t __s1_150 = __p1_150; \ + float64_t __ret_150; \ + __ret_150 = vmulxd_f64(__s0_150, vget_lane_f64(__s1_150, __p2_150)); \ + __ret_150; \ }) #else -#define vmulxd_lane_f64(__p0_139, __p1_139, __p2_139) __extension__ ({ \ - float64_t __s0_139 = __p0_139; \ - float64x1_t __s1_139 = __p1_139; \ - float64_t __ret_139; \ - __ret_139 = __noswap_vmulxd_f64(__s0_139, __noswap_vget_lane_f64(__s1_139, __p2_139)); \ - __ret_139; \ +#define vmulxd_lane_f64(__p0_151, __p1_151, __p2_151) __extension__ ({ \ + float64_t __s0_151 = __p0_151; \ + float64x1_t __s1_151 = __p1_151; \ + float64_t __ret_151; \ + __ret_151 = __noswap_vmulxd_f64(__s0_151, __noswap_vget_lane_f64(__s1_151, __p2_151)); \ + __ret_151; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxs_lane_f32(__p0_140, __p1_140, __p2_140) __extension__ ({ \ - float32_t __s0_140 = __p0_140; \ - float32x2_t __s1_140 = __p1_140; \ - float32_t __ret_140; \ - __ret_140 = vmulxs_f32(__s0_140, vget_lane_f32(__s1_140, __p2_140)); \ - __ret_140; \ +#define vmulxs_lane_f32(__p0_152, __p1_152, __p2_152) __extension__ ({ \ + float32_t __s0_152 = __p0_152; \ + float32x2_t __s1_152 = __p1_152; \ + float32_t __ret_152; \ + __ret_152 = vmulxs_f32(__s0_152, vget_lane_f32(__s1_152, __p2_152)); \ + __ret_152; \ }) #else -#define vmulxs_lane_f32(__p0_141, __p1_141, __p2_141) __extension__ ({ \ - float32_t __s0_141 = __p0_141; \ - float32x2_t __s1_141 = __p1_141; \ - float32x2_t __rev1_141; __rev1_141 = __builtin_shufflevector(__s1_141, __s1_141, 1, 0); \ - float32_t __ret_141; \ - __ret_141 = __noswap_vmulxs_f32(__s0_141, __noswap_vget_lane_f32(__rev1_141, __p2_141)); \ - __ret_141; \ +#define vmulxs_lane_f32(__p0_153, __p1_153, __p2_153) __extension__ ({ \ + float32_t __s0_153 = __p0_153; \ + float32x2_t __s1_153 = __p1_153; \ + float32x2_t __rev1_153; __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 1, 0); \ + float32_t __ret_153; \ + __ret_153 = __noswap_vmulxs_f32(__s0_153, __noswap_vget_lane_f32(__rev1_153, __p2_153)); \ + __ret_153; \ }) #endif @@ -54878,40 +58190,40 @@ __ai float32_t __noswap_vmulxs_f32(float32_t __p0, float32_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxd_laneq_f64(__p0_142, __p1_142, __p2_142) __extension__ ({ \ - float64_t __s0_142 = __p0_142; \ - float64x2_t __s1_142 = __p1_142; \ - float64_t __ret_142; \ - __ret_142 = vmulxd_f64(__s0_142, vgetq_lane_f64(__s1_142, __p2_142)); \ - __ret_142; \ 
+#define vmulxd_laneq_f64(__p0_154, __p1_154, __p2_154) __extension__ ({ \ + float64_t __s0_154 = __p0_154; \ + float64x2_t __s1_154 = __p1_154; \ + float64_t __ret_154; \ + __ret_154 = vmulxd_f64(__s0_154, vgetq_lane_f64(__s1_154, __p2_154)); \ + __ret_154; \ }) #else -#define vmulxd_laneq_f64(__p0_143, __p1_143, __p2_143) __extension__ ({ \ - float64_t __s0_143 = __p0_143; \ - float64x2_t __s1_143 = __p1_143; \ - float64x2_t __rev1_143; __rev1_143 = __builtin_shufflevector(__s1_143, __s1_143, 1, 0); \ - float64_t __ret_143; \ - __ret_143 = __noswap_vmulxd_f64(__s0_143, __noswap_vgetq_lane_f64(__rev1_143, __p2_143)); \ - __ret_143; \ +#define vmulxd_laneq_f64(__p0_155, __p1_155, __p2_155) __extension__ ({ \ + float64_t __s0_155 = __p0_155; \ + float64x2_t __s1_155 = __p1_155; \ + float64x2_t __rev1_155; __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 1, 0); \ + float64_t __ret_155; \ + __ret_155 = __noswap_vmulxd_f64(__s0_155, __noswap_vgetq_lane_f64(__rev1_155, __p2_155)); \ + __ret_155; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxs_laneq_f32(__p0_144, __p1_144, __p2_144) __extension__ ({ \ - float32_t __s0_144 = __p0_144; \ - float32x4_t __s1_144 = __p1_144; \ - float32_t __ret_144; \ - __ret_144 = vmulxs_f32(__s0_144, vgetq_lane_f32(__s1_144, __p2_144)); \ - __ret_144; \ +#define vmulxs_laneq_f32(__p0_156, __p1_156, __p2_156) __extension__ ({ \ + float32_t __s0_156 = __p0_156; \ + float32x4_t __s1_156 = __p1_156; \ + float32_t __ret_156; \ + __ret_156 = vmulxs_f32(__s0_156, vgetq_lane_f32(__s1_156, __p2_156)); \ + __ret_156; \ }) #else -#define vmulxs_laneq_f32(__p0_145, __p1_145, __p2_145) __extension__ ({ \ - float32_t __s0_145 = __p0_145; \ - float32x4_t __s1_145 = __p1_145; \ - float32x4_t __rev1_145; __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 3, 2, 1, 0); \ - float32_t __ret_145; \ - __ret_145 = __noswap_vmulxs_f32(__s0_145, __noswap_vgetq_lane_f32(__rev1_145, __p2_145)); \ - __ret_145; \ +#define vmulxs_laneq_f32(__p0_157, __p1_157, __p2_157) __extension__ ({ \ + float32_t __s0_157 = __p0_157; \ + float32x4_t __s1_157 = __p1_157; \ + float32x4_t __rev1_157; __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 3, 2, 1, 0); \ + float32_t __ret_157; \ + __ret_157 = __noswap_vmulxs_f32(__s0_157, __noswap_vgetq_lane_f32(__rev1_157, __p2_157)); \ + __ret_157; \ }) #endif @@ -56675,78 +59987,78 @@ __ai int16_t __noswap_vqdmulhh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhs_lane_s32(__p0_146, __p1_146, __p2_146) __extension__ ({ \ - int32_t __s0_146 = __p0_146; \ - int32x2_t __s1_146 = __p1_146; \ - int32_t __ret_146; \ - __ret_146 = vqdmulhs_s32(__s0_146, vget_lane_s32(__s1_146, __p2_146)); \ - __ret_146; \ +#define vqdmulhs_lane_s32(__p0_158, __p1_158, __p2_158) __extension__ ({ \ + int32_t __s0_158 = __p0_158; \ + int32x2_t __s1_158 = __p1_158; \ + int32_t __ret_158; \ + __ret_158 = vqdmulhs_s32(__s0_158, vget_lane_s32(__s1_158, __p2_158)); \ + __ret_158; \ }) #else -#define vqdmulhs_lane_s32(__p0_147, __p1_147, __p2_147) __extension__ ({ \ - int32_t __s0_147 = __p0_147; \ - int32x2_t __s1_147 = __p1_147; \ - int32x2_t __rev1_147; __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \ - int32_t __ret_147; \ - __ret_147 = __noswap_vqdmulhs_s32(__s0_147, __noswap_vget_lane_s32(__rev1_147, __p2_147)); \ - __ret_147; \ +#define vqdmulhs_lane_s32(__p0_159, __p1_159, __p2_159) __extension__ ({ \ + int32_t __s0_159 = __p0_159; \ + int32x2_t __s1_159 = __p1_159; \ + int32x2_t __rev1_159; __rev1_159 = 
__builtin_shufflevector(__s1_159, __s1_159, 1, 0); \ + int32_t __ret_159; \ + __ret_159 = __noswap_vqdmulhs_s32(__s0_159, __noswap_vget_lane_s32(__rev1_159, __p2_159)); \ + __ret_159; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhh_lane_s16(__p0_148, __p1_148, __p2_148) __extension__ ({ \ - int16_t __s0_148 = __p0_148; \ - int16x4_t __s1_148 = __p1_148; \ - int16_t __ret_148; \ - __ret_148 = vqdmulhh_s16(__s0_148, vget_lane_s16(__s1_148, __p2_148)); \ - __ret_148; \ +#define vqdmulhh_lane_s16(__p0_160, __p1_160, __p2_160) __extension__ ({ \ + int16_t __s0_160 = __p0_160; \ + int16x4_t __s1_160 = __p1_160; \ + int16_t __ret_160; \ + __ret_160 = vqdmulhh_s16(__s0_160, vget_lane_s16(__s1_160, __p2_160)); \ + __ret_160; \ }) #else -#define vqdmulhh_lane_s16(__p0_149, __p1_149, __p2_149) __extension__ ({ \ - int16_t __s0_149 = __p0_149; \ - int16x4_t __s1_149 = __p1_149; \ - int16x4_t __rev1_149; __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \ - int16_t __ret_149; \ - __ret_149 = __noswap_vqdmulhh_s16(__s0_149, __noswap_vget_lane_s16(__rev1_149, __p2_149)); \ - __ret_149; \ +#define vqdmulhh_lane_s16(__p0_161, __p1_161, __p2_161) __extension__ ({ \ + int16_t __s0_161 = __p0_161; \ + int16x4_t __s1_161 = __p1_161; \ + int16x4_t __rev1_161; __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 3, 2, 1, 0); \ + int16_t __ret_161; \ + __ret_161 = __noswap_vqdmulhh_s16(__s0_161, __noswap_vget_lane_s16(__rev1_161, __p2_161)); \ + __ret_161; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhs_laneq_s32(__p0_150, __p1_150, __p2_150) __extension__ ({ \ - int32_t __s0_150 = __p0_150; \ - int32x4_t __s1_150 = __p1_150; \ - int32_t __ret_150; \ - __ret_150 = vqdmulhs_s32(__s0_150, vgetq_lane_s32(__s1_150, __p2_150)); \ - __ret_150; \ +#define vqdmulhs_laneq_s32(__p0_162, __p1_162, __p2_162) __extension__ ({ \ + int32_t __s0_162 = __p0_162; \ + int32x4_t __s1_162 = __p1_162; \ + int32_t __ret_162; \ + __ret_162 = vqdmulhs_s32(__s0_162, vgetq_lane_s32(__s1_162, __p2_162)); \ + __ret_162; \ }) #else -#define vqdmulhs_laneq_s32(__p0_151, __p1_151, __p2_151) __extension__ ({ \ - int32_t __s0_151 = __p0_151; \ - int32x4_t __s1_151 = __p1_151; \ - int32x4_t __rev1_151; __rev1_151 = __builtin_shufflevector(__s1_151, __s1_151, 3, 2, 1, 0); \ - int32_t __ret_151; \ - __ret_151 = __noswap_vqdmulhs_s32(__s0_151, __noswap_vgetq_lane_s32(__rev1_151, __p2_151)); \ - __ret_151; \ +#define vqdmulhs_laneq_s32(__p0_163, __p1_163, __p2_163) __extension__ ({ \ + int32_t __s0_163 = __p0_163; \ + int32x4_t __s1_163 = __p1_163; \ + int32x4_t __rev1_163; __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 3, 2, 1, 0); \ + int32_t __ret_163; \ + __ret_163 = __noswap_vqdmulhs_s32(__s0_163, __noswap_vgetq_lane_s32(__rev1_163, __p2_163)); \ + __ret_163; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhh_laneq_s16(__p0_152, __p1_152, __p2_152) __extension__ ({ \ - int16_t __s0_152 = __p0_152; \ - int16x8_t __s1_152 = __p1_152; \ - int16_t __ret_152; \ - __ret_152 = vqdmulhh_s16(__s0_152, vgetq_lane_s16(__s1_152, __p2_152)); \ - __ret_152; \ +#define vqdmulhh_laneq_s16(__p0_164, __p1_164, __p2_164) __extension__ ({ \ + int16_t __s0_164 = __p0_164; \ + int16x8_t __s1_164 = __p1_164; \ + int16_t __ret_164; \ + __ret_164 = vqdmulhh_s16(__s0_164, vgetq_lane_s16(__s1_164, __p2_164)); \ + __ret_164; \ }) #else -#define vqdmulhh_laneq_s16(__p0_153, __p1_153, __p2_153) __extension__ ({ \ - int16_t __s0_153 = __p0_153; \ - int16x8_t __s1_153 = __p1_153; \ - int16x8_t __rev1_153; __rev1_153 = 
__builtin_shufflevector(__s1_153, __s1_153, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_153; \ - __ret_153 = __noswap_vqdmulhh_s16(__s0_153, __noswap_vgetq_lane_s16(__rev1_153, __p2_153)); \ - __ret_153; \ +#define vqdmulhh_laneq_s16(__p0_165, __p1_165, __p2_165) __extension__ ({ \ + int16_t __s0_165 = __p0_165; \ + int16x8_t __s1_165 = __p1_165; \ + int16x8_t __rev1_165; __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_165; \ + __ret_165 = __noswap_vqdmulhh_s16(__s0_165, __noswap_vgetq_lane_s16(__rev1_165, __p2_165)); \ + __ret_165; \ }) #endif @@ -57023,78 +60335,78 @@ __ai int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulls_lane_s32(__p0_154, __p1_154, __p2_154) __extension__ ({ \ - int32_t __s0_154 = __p0_154; \ - int32x2_t __s1_154 = __p1_154; \ - int64_t __ret_154; \ - __ret_154 = vqdmulls_s32(__s0_154, vget_lane_s32(__s1_154, __p2_154)); \ - __ret_154; \ +#define vqdmulls_lane_s32(__p0_166, __p1_166, __p2_166) __extension__ ({ \ + int32_t __s0_166 = __p0_166; \ + int32x2_t __s1_166 = __p1_166; \ + int64_t __ret_166; \ + __ret_166 = vqdmulls_s32(__s0_166, vget_lane_s32(__s1_166, __p2_166)); \ + __ret_166; \ }) #else -#define vqdmulls_lane_s32(__p0_155, __p1_155, __p2_155) __extension__ ({ \ - int32_t __s0_155 = __p0_155; \ - int32x2_t __s1_155 = __p1_155; \ - int32x2_t __rev1_155; __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 1, 0); \ - int64_t __ret_155; \ - __ret_155 = __noswap_vqdmulls_s32(__s0_155, __noswap_vget_lane_s32(__rev1_155, __p2_155)); \ - __ret_155; \ +#define vqdmulls_lane_s32(__p0_167, __p1_167, __p2_167) __extension__ ({ \ + int32_t __s0_167 = __p0_167; \ + int32x2_t __s1_167 = __p1_167; \ + int32x2_t __rev1_167; __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 1, 0); \ + int64_t __ret_167; \ + __ret_167 = __noswap_vqdmulls_s32(__s0_167, __noswap_vget_lane_s32(__rev1_167, __p2_167)); \ + __ret_167; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmullh_lane_s16(__p0_156, __p1_156, __p2_156) __extension__ ({ \ - int16_t __s0_156 = __p0_156; \ - int16x4_t __s1_156 = __p1_156; \ - int32_t __ret_156; \ - __ret_156 = vqdmullh_s16(__s0_156, vget_lane_s16(__s1_156, __p2_156)); \ - __ret_156; \ +#define vqdmullh_lane_s16(__p0_168, __p1_168, __p2_168) __extension__ ({ \ + int16_t __s0_168 = __p0_168; \ + int16x4_t __s1_168 = __p1_168; \ + int32_t __ret_168; \ + __ret_168 = vqdmullh_s16(__s0_168, vget_lane_s16(__s1_168, __p2_168)); \ + __ret_168; \ }) #else -#define vqdmullh_lane_s16(__p0_157, __p1_157, __p2_157) __extension__ ({ \ - int16_t __s0_157 = __p0_157; \ - int16x4_t __s1_157 = __p1_157; \ - int16x4_t __rev1_157; __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 3, 2, 1, 0); \ - int32_t __ret_157; \ - __ret_157 = __noswap_vqdmullh_s16(__s0_157, __noswap_vget_lane_s16(__rev1_157, __p2_157)); \ - __ret_157; \ +#define vqdmullh_lane_s16(__p0_169, __p1_169, __p2_169) __extension__ ({ \ + int16_t __s0_169 = __p0_169; \ + int16x4_t __s1_169 = __p1_169; \ + int16x4_t __rev1_169; __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 3, 2, 1, 0); \ + int32_t __ret_169; \ + __ret_169 = __noswap_vqdmullh_s16(__s0_169, __noswap_vget_lane_s16(__rev1_169, __p2_169)); \ + __ret_169; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulls_laneq_s32(__p0_158, __p1_158, __p2_158) __extension__ ({ \ - int32_t __s0_158 = __p0_158; \ - int32x4_t __s1_158 = __p1_158; \ - int64_t __ret_158; \ - __ret_158 = vqdmulls_s32(__s0_158, vgetq_lane_s32(__s1_158, 
__p2_158)); \ - __ret_158; \ +#define vqdmulls_laneq_s32(__p0_170, __p1_170, __p2_170) __extension__ ({ \ + int32_t __s0_170 = __p0_170; \ + int32x4_t __s1_170 = __p1_170; \ + int64_t __ret_170; \ + __ret_170 = vqdmulls_s32(__s0_170, vgetq_lane_s32(__s1_170, __p2_170)); \ + __ret_170; \ }) #else -#define vqdmulls_laneq_s32(__p0_159, __p1_159, __p2_159) __extension__ ({ \ - int32_t __s0_159 = __p0_159; \ - int32x4_t __s1_159 = __p1_159; \ - int32x4_t __rev1_159; __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 3, 2, 1, 0); \ - int64_t __ret_159; \ - __ret_159 = __noswap_vqdmulls_s32(__s0_159, __noswap_vgetq_lane_s32(__rev1_159, __p2_159)); \ - __ret_159; \ +#define vqdmulls_laneq_s32(__p0_171, __p1_171, __p2_171) __extension__ ({ \ + int32_t __s0_171 = __p0_171; \ + int32x4_t __s1_171 = __p1_171; \ + int32x4_t __rev1_171; __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \ + int64_t __ret_171; \ + __ret_171 = __noswap_vqdmulls_s32(__s0_171, __noswap_vgetq_lane_s32(__rev1_171, __p2_171)); \ + __ret_171; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmullh_laneq_s16(__p0_160, __p1_160, __p2_160) __extension__ ({ \ - int16_t __s0_160 = __p0_160; \ - int16x8_t __s1_160 = __p1_160; \ - int32_t __ret_160; \ - __ret_160 = vqdmullh_s16(__s0_160, vgetq_lane_s16(__s1_160, __p2_160)); \ - __ret_160; \ +#define vqdmullh_laneq_s16(__p0_172, __p1_172, __p2_172) __extension__ ({ \ + int16_t __s0_172 = __p0_172; \ + int16x8_t __s1_172 = __p1_172; \ + int32_t __ret_172; \ + __ret_172 = vqdmullh_s16(__s0_172, vgetq_lane_s16(__s1_172, __p2_172)); \ + __ret_172; \ }) #else -#define vqdmullh_laneq_s16(__p0_161, __p1_161, __p2_161) __extension__ ({ \ - int16_t __s0_161 = __p0_161; \ - int16x8_t __s1_161 = __p1_161; \ - int16x8_t __rev1_161; __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32_t __ret_161; \ - __ret_161 = __noswap_vqdmullh_s16(__s0_161, __noswap_vgetq_lane_s16(__rev1_161, __p2_161)); \ - __ret_161; \ +#define vqdmullh_laneq_s16(__p0_173, __p1_173, __p2_173) __extension__ ({ \ + int16_t __s0_173 = __p0_173; \ + int16x8_t __s1_173 = __p1_173; \ + int16x8_t __rev1_173; __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32_t __ret_173; \ + __ret_173 = __noswap_vqdmullh_s16(__s0_173, __noswap_vgetq_lane_s16(__rev1_173, __p2_173)); \ + __ret_173; \ }) #endif @@ -57544,78 +60856,78 @@ __ai int16_t __noswap_vqrdmulhh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhs_lane_s32(__p0_162, __p1_162, __p2_162) __extension__ ({ \ - int32_t __s0_162 = __p0_162; \ - int32x2_t __s1_162 = __p1_162; \ - int32_t __ret_162; \ - __ret_162 = vqrdmulhs_s32(__s0_162, vget_lane_s32(__s1_162, __p2_162)); \ - __ret_162; \ +#define vqrdmulhs_lane_s32(__p0_174, __p1_174, __p2_174) __extension__ ({ \ + int32_t __s0_174 = __p0_174; \ + int32x2_t __s1_174 = __p1_174; \ + int32_t __ret_174; \ + __ret_174 = vqrdmulhs_s32(__s0_174, vget_lane_s32(__s1_174, __p2_174)); \ + __ret_174; \ }) #else -#define vqrdmulhs_lane_s32(__p0_163, __p1_163, __p2_163) __extension__ ({ \ - int32_t __s0_163 = __p0_163; \ - int32x2_t __s1_163 = __p1_163; \ - int32x2_t __rev1_163; __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 1, 0); \ - int32_t __ret_163; \ - __ret_163 = __noswap_vqrdmulhs_s32(__s0_163, __noswap_vget_lane_s32(__rev1_163, __p2_163)); \ - __ret_163; \ +#define vqrdmulhs_lane_s32(__p0_175, __p1_175, __p2_175) __extension__ ({ \ + int32_t __s0_175 = __p0_175; \ + int32x2_t __s1_175 = 
__p1_175; \ + int32x2_t __rev1_175; __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 1, 0); \ + int32_t __ret_175; \ + __ret_175 = __noswap_vqrdmulhs_s32(__s0_175, __noswap_vget_lane_s32(__rev1_175, __p2_175)); \ + __ret_175; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhh_lane_s16(__p0_164, __p1_164, __p2_164) __extension__ ({ \ - int16_t __s0_164 = __p0_164; \ - int16x4_t __s1_164 = __p1_164; \ - int16_t __ret_164; \ - __ret_164 = vqrdmulhh_s16(__s0_164, vget_lane_s16(__s1_164, __p2_164)); \ - __ret_164; \ +#define vqrdmulhh_lane_s16(__p0_176, __p1_176, __p2_176) __extension__ ({ \ + int16_t __s0_176 = __p0_176; \ + int16x4_t __s1_176 = __p1_176; \ + int16_t __ret_176; \ + __ret_176 = vqrdmulhh_s16(__s0_176, vget_lane_s16(__s1_176, __p2_176)); \ + __ret_176; \ }) #else -#define vqrdmulhh_lane_s16(__p0_165, __p1_165, __p2_165) __extension__ ({ \ - int16_t __s0_165 = __p0_165; \ - int16x4_t __s1_165 = __p1_165; \ - int16x4_t __rev1_165; __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 3, 2, 1, 0); \ - int16_t __ret_165; \ - __ret_165 = __noswap_vqrdmulhh_s16(__s0_165, __noswap_vget_lane_s16(__rev1_165, __p2_165)); \ - __ret_165; \ +#define vqrdmulhh_lane_s16(__p0_177, __p1_177, __p2_177) __extension__ ({ \ + int16_t __s0_177 = __p0_177; \ + int16x4_t __s1_177 = __p1_177; \ + int16x4_t __rev1_177; __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 3, 2, 1, 0); \ + int16_t __ret_177; \ + __ret_177 = __noswap_vqrdmulhh_s16(__s0_177, __noswap_vget_lane_s16(__rev1_177, __p2_177)); \ + __ret_177; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhs_laneq_s32(__p0_166, __p1_166, __p2_166) __extension__ ({ \ - int32_t __s0_166 = __p0_166; \ - int32x4_t __s1_166 = __p1_166; \ - int32_t __ret_166; \ - __ret_166 = vqrdmulhs_s32(__s0_166, vgetq_lane_s32(__s1_166, __p2_166)); \ - __ret_166; \ +#define vqrdmulhs_laneq_s32(__p0_178, __p1_178, __p2_178) __extension__ ({ \ + int32_t __s0_178 = __p0_178; \ + int32x4_t __s1_178 = __p1_178; \ + int32_t __ret_178; \ + __ret_178 = vqrdmulhs_s32(__s0_178, vgetq_lane_s32(__s1_178, __p2_178)); \ + __ret_178; \ }) #else -#define vqrdmulhs_laneq_s32(__p0_167, __p1_167, __p2_167) __extension__ ({ \ - int32_t __s0_167 = __p0_167; \ - int32x4_t __s1_167 = __p1_167; \ - int32x4_t __rev1_167; __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 3, 2, 1, 0); \ - int32_t __ret_167; \ - __ret_167 = __noswap_vqrdmulhs_s32(__s0_167, __noswap_vgetq_lane_s32(__rev1_167, __p2_167)); \ - __ret_167; \ +#define vqrdmulhs_laneq_s32(__p0_179, __p1_179, __p2_179) __extension__ ({ \ + int32_t __s0_179 = __p0_179; \ + int32x4_t __s1_179 = __p1_179; \ + int32x4_t __rev1_179; __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 3, 2, 1, 0); \ + int32_t __ret_179; \ + __ret_179 = __noswap_vqrdmulhs_s32(__s0_179, __noswap_vgetq_lane_s32(__rev1_179, __p2_179)); \ + __ret_179; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhh_laneq_s16(__p0_168, __p1_168, __p2_168) __extension__ ({ \ - int16_t __s0_168 = __p0_168; \ - int16x8_t __s1_168 = __p1_168; \ - int16_t __ret_168; \ - __ret_168 = vqrdmulhh_s16(__s0_168, vgetq_lane_s16(__s1_168, __p2_168)); \ - __ret_168; \ +#define vqrdmulhh_laneq_s16(__p0_180, __p1_180, __p2_180) __extension__ ({ \ + int16_t __s0_180 = __p0_180; \ + int16x8_t __s1_180 = __p1_180; \ + int16_t __ret_180; \ + __ret_180 = vqrdmulhh_s16(__s0_180, vgetq_lane_s16(__s1_180, __p2_180)); \ + __ret_180; \ }) #else -#define vqrdmulhh_laneq_s16(__p0_169, __p1_169, __p2_169) __extension__ ({ \ - int16_t __s0_169 = __p0_169; \ - 
int16x8_t __s1_169 = __p1_169; \ - int16x8_t __rev1_169; __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_169; \ - __ret_169 = __noswap_vqrdmulhh_s16(__s0_169, __noswap_vgetq_lane_s16(__rev1_169, __p2_169)); \ - __ret_169; \ +#define vqrdmulhh_laneq_s16(__p0_181, __p1_181, __p2_181) __extension__ ({ \ + int16_t __s0_181 = __p0_181; \ + int16x8_t __s1_181 = __p1_181; \ + int16x8_t __rev1_181; __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_181; \ + __ret_181 = __noswap_vqrdmulhh_s16(__s0_181, __noswap_vgetq_lane_s16(__rev1_181, __p2_181)); \ + __ret_181; \ }) #endif @@ -57816,927 +61128,927 @@ __ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u32(__p0_170, __p1_170, __p2_170) __extension__ ({ \ - uint16x4_t __s0_170 = __p0_170; \ - uint32x4_t __s1_170 = __p1_170; \ - uint16x8_t __ret_170; \ - __ret_170 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_170), (uint16x4_t)(vqrshrn_n_u32(__s1_170, __p2_170)))); \ - __ret_170; \ -}) -#else -#define vqrshrn_high_n_u32(__p0_171, __p1_171, __p2_171) __extension__ ({ \ - uint16x4_t __s0_171 = __p0_171; \ - uint32x4_t __s1_171 = __p1_171; \ - uint16x4_t __rev0_171; __rev0_171 = __builtin_shufflevector(__s0_171, __s0_171, 3, 2, 1, 0); \ - uint32x4_t __rev1_171; __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \ - uint16x8_t __ret_171; \ - __ret_171 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_171), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_171, __p2_171)))); \ - __ret_171 = __builtin_shufflevector(__ret_171, __ret_171, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_171; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u64(__p0_172, __p1_172, __p2_172) __extension__ ({ \ - uint32x2_t __s0_172 = __p0_172; \ - uint64x2_t __s1_172 = __p1_172; \ - uint32x4_t __ret_172; \ - __ret_172 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_172), (uint32x2_t)(vqrshrn_n_u64(__s1_172, __p2_172)))); \ - __ret_172; \ -}) -#else -#define vqrshrn_high_n_u64(__p0_173, __p1_173, __p2_173) __extension__ ({ \ - uint32x2_t __s0_173 = __p0_173; \ - uint64x2_t __s1_173 = __p1_173; \ - uint32x2_t __rev0_173; __rev0_173 = __builtin_shufflevector(__s0_173, __s0_173, 1, 0); \ - uint64x2_t __rev1_173; __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 1, 0); \ - uint32x4_t __ret_173; \ - __ret_173 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_173), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_173, __p2_173)))); \ - __ret_173 = __builtin_shufflevector(__ret_173, __ret_173, 3, 2, 1, 0); \ - __ret_173; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u16(__p0_174, __p1_174, __p2_174) __extension__ ({ \ - uint8x8_t __s0_174 = __p0_174; \ - uint16x8_t __s1_174 = __p1_174; \ - uint8x16_t __ret_174; \ - __ret_174 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_174), (uint8x8_t)(vqrshrn_n_u16(__s1_174, __p2_174)))); \ - __ret_174; \ -}) -#else -#define vqrshrn_high_n_u16(__p0_175, __p1_175, __p2_175) __extension__ ({ \ - uint8x8_t __s0_175 = __p0_175; \ - uint16x8_t __s1_175 = __p1_175; \ - uint8x8_t __rev0_175; __rev0_175 = __builtin_shufflevector(__s0_175, __s0_175, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_175; __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_175; \ - __ret_175 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_175), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_175, __p2_175)))); \ - __ret_175 
= __builtin_shufflevector(__ret_175, __ret_175, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_175; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s32(__p0_176, __p1_176, __p2_176) __extension__ ({ \ - int16x4_t __s0_176 = __p0_176; \ - int32x4_t __s1_176 = __p1_176; \ - int16x8_t __ret_176; \ - __ret_176 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_176), (int16x4_t)(vqrshrn_n_s32(__s1_176, __p2_176)))); \ - __ret_176; \ -}) -#else -#define vqrshrn_high_n_s32(__p0_177, __p1_177, __p2_177) __extension__ ({ \ - int16x4_t __s0_177 = __p0_177; \ - int32x4_t __s1_177 = __p1_177; \ - int16x4_t __rev0_177; __rev0_177 = __builtin_shufflevector(__s0_177, __s0_177, 3, 2, 1, 0); \ - int32x4_t __rev1_177; __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 3, 2, 1, 0); \ - int16x8_t __ret_177; \ - __ret_177 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_177), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_177, __p2_177)))); \ - __ret_177 = __builtin_shufflevector(__ret_177, __ret_177, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_177; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s64(__p0_178, __p1_178, __p2_178) __extension__ ({ \ - int32x2_t __s0_178 = __p0_178; \ - int64x2_t __s1_178 = __p1_178; \ - int32x4_t __ret_178; \ - __ret_178 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_178), (int32x2_t)(vqrshrn_n_s64(__s1_178, __p2_178)))); \ - __ret_178; \ -}) -#else -#define vqrshrn_high_n_s64(__p0_179, __p1_179, __p2_179) __extension__ ({ \ - int32x2_t __s0_179 = __p0_179; \ - int64x2_t __s1_179 = __p1_179; \ - int32x2_t __rev0_179; __rev0_179 = __builtin_shufflevector(__s0_179, __s0_179, 1, 0); \ - int64x2_t __rev1_179; __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 1, 0); \ - int32x4_t __ret_179; \ - __ret_179 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_179), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_179, __p2_179)))); \ - __ret_179 = __builtin_shufflevector(__ret_179, __ret_179, 3, 2, 1, 0); \ - __ret_179; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s16(__p0_180, __p1_180, __p2_180) __extension__ ({ \ - int8x8_t __s0_180 = __p0_180; \ - int16x8_t __s1_180 = __p1_180; \ - int8x16_t __ret_180; \ - __ret_180 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_180), (int8x8_t)(vqrshrn_n_s16(__s1_180, __p2_180)))); \ - __ret_180; \ -}) -#else -#define vqrshrn_high_n_s16(__p0_181, __p1_181, __p2_181) __extension__ ({ \ - int8x8_t __s0_181 = __p0_181; \ - int16x8_t __s1_181 = __p1_181; \ - int8x8_t __rev0_181; __rev0_181 = __builtin_shufflevector(__s0_181, __s0_181, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_181; __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_181; \ - __ret_181 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_181), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_181, __p2_181)))); \ - __ret_181 = __builtin_shufflevector(__ret_181, __ret_181, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_181; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \ - uint32_t __s0 = __p0; \ - uint16_t __ret; \ - __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \ - uint32_t __s0 = __p0; \ - uint16_t __ret; \ - __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \ - uint64_t __s0 = __p0; \ - uint32_t __ret; \ - 
__ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \ - uint64_t __s0 = __p0; \ - uint32_t __ret; \ - __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \ - uint16_t __s0 = __p0; \ - uint8_t __ret; \ - __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \ - uint16_t __s0 = __p0; \ - uint8_t __ret; \ - __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \ - int32_t __s0 = __p0; \ - int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \ - int32_t __s0 = __p0; \ - int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \ - int16_t __s0 = __p0; \ - int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \ - int16_t __s0 = __p0; \ - int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s32(__p0_182, __p1_182, __p2_182) __extension__ ({ \ - int16x4_t __s0_182 = __p0_182; \ - int32x4_t __s1_182 = __p1_182; \ - int16x8_t __ret_182; \ - __ret_182 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_182), (int16x4_t)(vqrshrun_n_s32(__s1_182, __p2_182)))); \ +#define vqrshrn_high_n_u32(__p0_182, __p1_182, __p2_182) __extension__ ({ \ + uint16x4_t __s0_182 = __p0_182; \ + uint32x4_t __s1_182 = __p1_182; \ + uint16x8_t __ret_182; \ + __ret_182 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_182), (uint16x4_t)(vqrshrn_n_u32(__s1_182, __p2_182)))); \ __ret_182; \ }) #else -#define vqrshrun_high_n_s32(__p0_183, __p1_183, __p2_183) __extension__ ({ \ - int16x4_t __s0_183 = __p0_183; \ - int32x4_t __s1_183 = __p1_183; \ - int16x4_t __rev0_183; __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 3, 2, 1, 0); \ - int32x4_t __rev1_183; __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 3, 2, 1, 0); \ - int16x8_t __ret_183; \ - __ret_183 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_183), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_183, __p2_183)))); \ +#define vqrshrn_high_n_u32(__p0_183, __p1_183, __p2_183) __extension__ ({ \ + uint16x4_t __s0_183 = __p0_183; \ + uint32x4_t __s1_183 = __p1_183; \ + uint16x4_t __rev0_183; __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 3, 2, 1, 0); \ + uint32x4_t __rev1_183; __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 3, 2, 1, 0); \ + uint16x8_t __ret_183; \ + __ret_183 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_183), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_183, __p2_183)))); \ __ret_183 = 
__builtin_shufflevector(__ret_183, __ret_183, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_183; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s64(__p0_184, __p1_184, __p2_184) __extension__ ({ \ - int32x2_t __s0_184 = __p0_184; \ - int64x2_t __s1_184 = __p1_184; \ - int32x4_t __ret_184; \ - __ret_184 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_184), (int32x2_t)(vqrshrun_n_s64(__s1_184, __p2_184)))); \ +#define vqrshrn_high_n_u64(__p0_184, __p1_184, __p2_184) __extension__ ({ \ + uint32x2_t __s0_184 = __p0_184; \ + uint64x2_t __s1_184 = __p1_184; \ + uint32x4_t __ret_184; \ + __ret_184 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_184), (uint32x2_t)(vqrshrn_n_u64(__s1_184, __p2_184)))); \ __ret_184; \ }) #else -#define vqrshrun_high_n_s64(__p0_185, __p1_185, __p2_185) __extension__ ({ \ - int32x2_t __s0_185 = __p0_185; \ - int64x2_t __s1_185 = __p1_185; \ - int32x2_t __rev0_185; __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 1, 0); \ - int64x2_t __rev1_185; __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 1, 0); \ - int32x4_t __ret_185; \ - __ret_185 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_185), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_185, __p2_185)))); \ +#define vqrshrn_high_n_u64(__p0_185, __p1_185, __p2_185) __extension__ ({ \ + uint32x2_t __s0_185 = __p0_185; \ + uint64x2_t __s1_185 = __p1_185; \ + uint32x2_t __rev0_185; __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 1, 0); \ + uint64x2_t __rev1_185; __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 1, 0); \ + uint32x4_t __ret_185; \ + __ret_185 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_185), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_185, __p2_185)))); \ __ret_185 = __builtin_shufflevector(__ret_185, __ret_185, 3, 2, 1, 0); \ __ret_185; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s16(__p0_186, __p1_186, __p2_186) __extension__ ({ \ - int8x8_t __s0_186 = __p0_186; \ - int16x8_t __s1_186 = __p1_186; \ - int8x16_t __ret_186; \ - __ret_186 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_186), (int8x8_t)(vqrshrun_n_s16(__s1_186, __p2_186)))); \ +#define vqrshrn_high_n_u16(__p0_186, __p1_186, __p2_186) __extension__ ({ \ + uint8x8_t __s0_186 = __p0_186; \ + uint16x8_t __s1_186 = __p1_186; \ + uint8x16_t __ret_186; \ + __ret_186 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_186), (uint8x8_t)(vqrshrn_n_u16(__s1_186, __p2_186)))); \ __ret_186; \ }) #else -#define vqrshrun_high_n_s16(__p0_187, __p1_187, __p2_187) __extension__ ({ \ - int8x8_t __s0_187 = __p0_187; \ - int16x8_t __s1_187 = __p1_187; \ - int8x8_t __rev0_187; __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_187; __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_187; \ - __ret_187 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_187), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_187, __p2_187)))); \ +#define vqrshrn_high_n_u16(__p0_187, __p1_187, __p2_187) __extension__ ({ \ + uint8x8_t __s0_187 = __p0_187; \ + uint16x8_t __s1_187 = __p1_187; \ + uint8x8_t __rev0_187; __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_187; __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_187; \ + __ret_187 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_187), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_187, __p2_187)))); \ __ret_187 = __builtin_shufflevector(__ret_187, __ret_187, 15, 14, 13, 12, 11, 10, 
9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_187; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \ - int32_t __s0 = __p0; \ - int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \ - int32_t __s0 = __p0; \ - int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \ - int16_t __s0 = __p0; \ - int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \ - __ret; \ -}) -#else -#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \ - int16_t __s0 = __p0; \ - int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) { - uint8_t __ret; - __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1); - return __ret; -} -#else -__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) { - uint8_t __ret; - __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) { - uint32_t __ret; - __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1); - return __ret; -} -#else -__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) { - uint32_t __ret; - __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) { - uint64_t __ret; - __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1); - return __ret; -} -#else -__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) { - uint64_t __ret; - __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) { - uint16_t __ret; - __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1); - return __ret; -} -#else -__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) { - uint16_t __ret; - __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) { - int8_t __ret; - __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1); - return __ret; -} -#else -__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) { - int8_t __ret; - __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) { - int32_t __ret; - __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1); - return __ret; -} -#else -__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) { - int32_t __ret; - __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1); - return __ret; -} -#else -__ai int64_t vqshld_s64(int64_t 
__p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) { - int16_t __ret; - __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1); - return __ret; -} -#else -__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) { - int16_t __ret; - __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \ - uint8_t __s0 = __p0; \ - uint8_t __ret; \ - __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \ - uint8_t __s0 = __p0; \ - uint8_t __ret; \ - __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshls_n_u32(__p0, __p1) __extension__ ({ \ - uint32_t __s0 = __p0; \ - uint32_t __ret; \ - __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshls_n_u32(__p0, __p1) __extension__ ({ \ - uint32_t __s0 = __p0; \ - uint32_t __ret; \ - __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshld_n_u64(__p0, __p1) __extension__ ({ \ - uint64_t __s0 = __p0; \ - uint64_t __ret; \ - __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshld_n_u64(__p0, __p1) __extension__ ({ \ - uint64_t __s0 = __p0; \ - uint64_t __ret; \ - __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \ - uint16_t __s0 = __p0; \ - uint16_t __ret; \ - __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \ - uint16_t __s0 = __p0; \ - uint16_t __ret; \ - __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \ - int8_t __s0 = __p0; \ - int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \ - int8_t __s0 = __p0; \ - int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshls_n_s32(__p0, __p1) __extension__ ({ \ - int32_t __s0 = __p0; \ - int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshls_n_s32(__p0, __p1) __extension__ ({ \ - int32_t __s0 = __p0; \ - int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshld_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int64_t __ret; \ - __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshld_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int64_t __ret; \ - __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \ - int16_t __s0 = __p0; \ - int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \ - int16_t __s0 = __p0; \ - int16_t __ret; \ - __ret = (int16_t) 
__builtin_neon_vqshlh_n_s16(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \ - int8_t __s0 = __p0; \ - int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \ - int8_t __s0 = __p0; \ - int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \ - int32_t __s0 = __p0; \ - int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \ - int32_t __s0 = __p0; \ - int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int64_t __ret; \ - __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int64_t __ret; \ - __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \ - int16_t __s0 = __p0; \ - int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \ - int16_t __s0 = __p0; \ - int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u32(__p0_188, __p1_188, __p2_188) __extension__ ({ \ - uint16x4_t __s0_188 = __p0_188; \ - uint32x4_t __s1_188 = __p1_188; \ - uint16x8_t __ret_188; \ - __ret_188 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_188), (uint16x4_t)(vqshrn_n_u32(__s1_188, __p2_188)))); \ +#define vqrshrn_high_n_s32(__p0_188, __p1_188, __p2_188) __extension__ ({ \ + int16x4_t __s0_188 = __p0_188; \ + int32x4_t __s1_188 = __p1_188; \ + int16x8_t __ret_188; \ + __ret_188 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_188), (int16x4_t)(vqrshrn_n_s32(__s1_188, __p2_188)))); \ __ret_188; \ }) #else -#define vqshrn_high_n_u32(__p0_189, __p1_189, __p2_189) __extension__ ({ \ - uint16x4_t __s0_189 = __p0_189; \ - uint32x4_t __s1_189 = __p1_189; \ - uint16x4_t __rev0_189; __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \ - uint32x4_t __rev1_189; __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \ - uint16x8_t __ret_189; \ - __ret_189 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_189), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_189, __p2_189)))); \ +#define vqrshrn_high_n_s32(__p0_189, __p1_189, __p2_189) __extension__ ({ \ + int16x4_t __s0_189 = __p0_189; \ + int32x4_t __s1_189 = __p1_189; \ + int16x4_t __rev0_189; __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \ + int32x4_t __rev1_189; __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \ + int16x8_t __ret_189; \ + __ret_189 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_189), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_189, __p2_189)))); \ __ret_189 = __builtin_shufflevector(__ret_189, __ret_189, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_189; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u64(__p0_190, __p1_190, __p2_190) __extension__ ({ \ - uint32x2_t __s0_190 = __p0_190; \ - uint64x2_t __s1_190 = __p1_190; 
\ - uint32x4_t __ret_190; \ - __ret_190 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_190), (uint32x2_t)(vqshrn_n_u64(__s1_190, __p2_190)))); \ +#define vqrshrn_high_n_s64(__p0_190, __p1_190, __p2_190) __extension__ ({ \ + int32x2_t __s0_190 = __p0_190; \ + int64x2_t __s1_190 = __p1_190; \ + int32x4_t __ret_190; \ + __ret_190 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_190), (int32x2_t)(vqrshrn_n_s64(__s1_190, __p2_190)))); \ __ret_190; \ }) #else -#define vqshrn_high_n_u64(__p0_191, __p1_191, __p2_191) __extension__ ({ \ - uint32x2_t __s0_191 = __p0_191; \ - uint64x2_t __s1_191 = __p1_191; \ - uint32x2_t __rev0_191; __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \ - uint64x2_t __rev1_191; __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \ - uint32x4_t __ret_191; \ - __ret_191 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_191), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_191, __p2_191)))); \ +#define vqrshrn_high_n_s64(__p0_191, __p1_191, __p2_191) __extension__ ({ \ + int32x2_t __s0_191 = __p0_191; \ + int64x2_t __s1_191 = __p1_191; \ + int32x2_t __rev0_191; __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \ + int64x2_t __rev1_191; __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \ + int32x4_t __ret_191; \ + __ret_191 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_191), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_191, __p2_191)))); \ __ret_191 = __builtin_shufflevector(__ret_191, __ret_191, 3, 2, 1, 0); \ __ret_191; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u16(__p0_192, __p1_192, __p2_192) __extension__ ({ \ - uint8x8_t __s0_192 = __p0_192; \ - uint16x8_t __s1_192 = __p1_192; \ - uint8x16_t __ret_192; \ - __ret_192 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_192), (uint8x8_t)(vqshrn_n_u16(__s1_192, __p2_192)))); \ +#define vqrshrn_high_n_s16(__p0_192, __p1_192, __p2_192) __extension__ ({ \ + int8x8_t __s0_192 = __p0_192; \ + int16x8_t __s1_192 = __p1_192; \ + int8x16_t __ret_192; \ + __ret_192 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_192), (int8x8_t)(vqrshrn_n_s16(__s1_192, __p2_192)))); \ __ret_192; \ }) #else -#define vqshrn_high_n_u16(__p0_193, __p1_193, __p2_193) __extension__ ({ \ - uint8x8_t __s0_193 = __p0_193; \ - uint16x8_t __s1_193 = __p1_193; \ - uint8x8_t __rev0_193; __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_193; __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_193; \ - __ret_193 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_193), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_193, __p2_193)))); \ +#define vqrshrn_high_n_s16(__p0_193, __p1_193, __p2_193) __extension__ ({ \ + int8x8_t __s0_193 = __p0_193; \ + int16x8_t __s1_193 = __p1_193; \ + int8x8_t __rev0_193; __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_193; __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_193; \ + __ret_193 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_193), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_193, __p2_193)))); \ __ret_193 = __builtin_shufflevector(__ret_193, __ret_193, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_193; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_s32(__p0_194, __p1_194, __p2_194) __extension__ ({ \ +#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \ + uint32_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) 
__builtin_neon_vqrshrns_n_u32(__s0, __p1); \ + __ret; \ +}) +#else +#define vqrshrns_n_u32(__p0, __p1) __extension__ ({ \ + uint32_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vqrshrns_n_u32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \ + __ret; \ +}) +#else +#define vqrshrnd_n_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vqrshrnd_n_u64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \ + uint16_t __s0 = __p0; \ + uint8_t __ret; \ + __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \ + __ret; \ +}) +#else +#define vqrshrnh_n_u16(__p0, __p1) __extension__ ({ \ + uint16_t __s0 = __p0; \ + uint8_t __ret; \ + __ret = (uint8_t) __builtin_neon_vqrshrnh_n_u16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \ + __ret; \ +}) +#else +#define vqrshrns_n_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vqrshrns_n_s32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \ + __ret; \ +}) +#else +#define vqrshrnd_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vqrshrnd_n_s64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + int8_t __ret; \ + __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \ + __ret; \ +}) +#else +#define vqrshrnh_n_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + int8_t __ret; \ + __ret = (int8_t) __builtin_neon_vqrshrnh_n_s16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqrshrun_high_n_s32(__p0_194, __p1_194, __p2_194) __extension__ ({ \ int16x4_t __s0_194 = __p0_194; \ int32x4_t __s1_194 = __p1_194; \ int16x8_t __ret_194; \ - __ret_194 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_194), (int16x4_t)(vqshrn_n_s32(__s1_194, __p2_194)))); \ + __ret_194 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_194), (int16x4_t)(vqrshrun_n_s32(__s1_194, __p2_194)))); \ __ret_194; \ }) #else -#define vqshrn_high_n_s32(__p0_195, __p1_195, __p2_195) __extension__ ({ \ +#define vqrshrun_high_n_s32(__p0_195, __p1_195, __p2_195) __extension__ ({ \ int16x4_t __s0_195 = __p0_195; \ int32x4_t __s1_195 = __p1_195; \ int16x4_t __rev0_195; __rev0_195 = __builtin_shufflevector(__s0_195, __s0_195, 3, 2, 1, 0); \ int32x4_t __rev1_195; __rev1_195 = __builtin_shufflevector(__s1_195, __s1_195, 3, 2, 1, 0); \ int16x8_t __ret_195; \ - __ret_195 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_195), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_195, __p2_195)))); \ + __ret_195 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_195), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_195, __p2_195)))); \ __ret_195 = __builtin_shufflevector(__ret_195, __ret_195, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_195; \ }) #endif #ifdef __LITTLE_ENDIAN__ 
-#define vqshrn_high_n_s64(__p0_196, __p1_196, __p2_196) __extension__ ({ \ +#define vqrshrun_high_n_s64(__p0_196, __p1_196, __p2_196) __extension__ ({ \ int32x2_t __s0_196 = __p0_196; \ int64x2_t __s1_196 = __p1_196; \ int32x4_t __ret_196; \ - __ret_196 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_196), (int32x2_t)(vqshrn_n_s64(__s1_196, __p2_196)))); \ + __ret_196 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_196), (int32x2_t)(vqrshrun_n_s64(__s1_196, __p2_196)))); \ __ret_196; \ }) #else -#define vqshrn_high_n_s64(__p0_197, __p1_197, __p2_197) __extension__ ({ \ +#define vqrshrun_high_n_s64(__p0_197, __p1_197, __p2_197) __extension__ ({ \ int32x2_t __s0_197 = __p0_197; \ int64x2_t __s1_197 = __p1_197; \ int32x2_t __rev0_197; __rev0_197 = __builtin_shufflevector(__s0_197, __s0_197, 1, 0); \ int64x2_t __rev1_197; __rev1_197 = __builtin_shufflevector(__s1_197, __s1_197, 1, 0); \ int32x4_t __ret_197; \ - __ret_197 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_197), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_197, __p2_197)))); \ + __ret_197 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_197), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_197, __p2_197)))); \ __ret_197 = __builtin_shufflevector(__ret_197, __ret_197, 3, 2, 1, 0); \ __ret_197; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_s16(__p0_198, __p1_198, __p2_198) __extension__ ({ \ +#define vqrshrun_high_n_s16(__p0_198, __p1_198, __p2_198) __extension__ ({ \ int8x8_t __s0_198 = __p0_198; \ int16x8_t __s1_198 = __p1_198; \ int8x16_t __ret_198; \ - __ret_198 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_198), (int8x8_t)(vqshrn_n_s16(__s1_198, __p2_198)))); \ + __ret_198 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_198), (int8x8_t)(vqrshrun_n_s16(__s1_198, __p2_198)))); \ __ret_198; \ }) #else -#define vqshrn_high_n_s16(__p0_199, __p1_199, __p2_199) __extension__ ({ \ +#define vqrshrun_high_n_s16(__p0_199, __p1_199, __p2_199) __extension__ ({ \ int8x8_t __s0_199 = __p0_199; \ int16x8_t __s1_199 = __p1_199; \ int8x8_t __rev0_199; __rev0_199 = __builtin_shufflevector(__s0_199, __s0_199, 7, 6, 5, 4, 3, 2, 1, 0); \ int16x8_t __rev1_199; __rev1_199 = __builtin_shufflevector(__s1_199, __s1_199, 7, 6, 5, 4, 3, 2, 1, 0); \ int8x16_t __ret_199; \ - __ret_199 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_199), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_199, __p2_199)))); \ + __ret_199 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_199), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_199, __p2_199)))); \ __ret_199 = __builtin_shufflevector(__ret_199, __ret_199, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_199; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \ - uint32_t __s0 = __p0; \ - uint16_t __ret; \ - __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \ - uint32_t __s0 = __p0; \ - uint16_t __ret; \ - __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \ - uint64_t __s0 = __p0; \ - uint32_t __ret; \ - __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \ - uint64_t __s0 = __p0; \ - uint32_t __ret; \ - __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \ - uint16_t __s0 = 
__p0; \ - uint8_t __ret; \ - __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \ - __ret; \ -}) -#else -#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \ - uint16_t __s0 = __p0; \ - uint8_t __ret; \ - __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \ +#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \ int32_t __s0 = __p0; \ int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \ + __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \ __ret; \ }) #else -#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \ +#define vqrshruns_n_s32(__p0, __p1) __extension__ ({ \ int32_t __s0 = __p0; \ int16_t __ret; \ - __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \ + __ret = (int16_t) __builtin_neon_vqrshruns_n_s32(__s0, __p1); \ __ret; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \ +#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \ int64_t __s0 = __p0; \ int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \ + __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \ __ret; \ }) #else -#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \ +#define vqrshrund_n_s64(__p0, __p1) __extension__ ({ \ int64_t __s0 = __p0; \ int32_t __ret; \ - __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \ + __ret = (int32_t) __builtin_neon_vqrshrund_n_s64(__s0, __p1); \ __ret; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \ +#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \ int16_t __s0 = __p0; \ int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \ + __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \ __ret; \ }) #else -#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \ +#define vqrshrunh_n_s16(__p0, __p1) __extension__ ({ \ int16_t __s0 = __p0; \ int8_t __ret; \ - __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \ + __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \ __ret; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s32(__p0_200, __p1_200, __p2_200) __extension__ ({ \ - int16x4_t __s0_200 = __p0_200; \ - int32x4_t __s1_200 = __p1_200; \ - int16x8_t __ret_200; \ - __ret_200 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_200), (int16x4_t)(vqshrun_n_s32(__s1_200, __p2_200)))); \ +__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) { + uint8_t __ret; + __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1); + return __ret; +} +#else +__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) { + uint8_t __ret; + __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) { + uint32_t __ret; + __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1); + return __ret; +} +#else +__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) { + uint32_t __ret; + __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1); + return __ret; +} +#else +__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16_t vqshlh_u16(uint16_t 
__p0, uint16_t __p1) { + uint16_t __ret; + __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1); + return __ret; +} +#else +__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) { + uint16_t __ret; + __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) { + int8_t __ret; + __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1); + return __ret; +} +#else +__ai int8_t vqshlb_s8(int8_t __p0, int8_t __p1) { + int8_t __ret; + __ret = (int8_t) __builtin_neon_vqshlb_s8(__p0, __p1); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) { + int32_t __ret; + __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1); + return __ret; +} +#else +__ai int32_t vqshls_s32(int32_t __p0, int32_t __p1) { + int32_t __ret; + __ret = (int32_t) __builtin_neon_vqshls_s32(__p0, __p1); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) { + int64_t __ret; + __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1); + return __ret; +} +#else +__ai int64_t vqshld_s64(int64_t __p0, int64_t __p1) { + int64_t __ret; + __ret = (int64_t) __builtin_neon_vqshld_s64(__p0, __p1); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) { + int16_t __ret; + __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1); + return __ret; +} +#else +__ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) { + int16_t __ret; + __ret = (int16_t) __builtin_neon_vqshlh_s16(__p0, __p1); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \ + uint8_t __s0 = __p0; \ + uint8_t __ret; \ + __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshlb_n_u8(__p0, __p1) __extension__ ({ \ + uint8_t __s0 = __p0; \ + uint8_t __ret; \ + __ret = (uint8_t) __builtin_neon_vqshlb_n_u8(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshls_n_u32(__p0, __p1) __extension__ ({ \ + uint32_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshls_n_u32(__p0, __p1) __extension__ ({ \ + uint32_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vqshls_n_u32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshld_n_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshld_n_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vqshld_n_u64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \ + uint16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshlh_n_u16(__p0, __p1) __extension__ ({ \ + uint16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vqshlh_n_u16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \ + int8_t __s0 = __p0; \ + int8_t __ret; \ + __ret = (int8_t) __builtin_neon_vqshlb_n_s8(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshlb_n_s8(__p0, __p1) __extension__ ({ \ + int8_t __s0 = __p0; \ + int8_t __ret; \ + __ret = (int8_t) 
__builtin_neon_vqshlb_n_s8(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshls_n_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshls_n_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vqshls_n_s32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshld_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshld_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vqshld_n_s64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshlh_n_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vqshlh_n_s16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \ + int8_t __s0 = __p0; \ + int8_t __ret; \ + __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshlub_n_s8(__p0, __p1) __extension__ ({ \ + int8_t __s0 = __p0; \ + int8_t __ret; \ + __ret = (int8_t) __builtin_neon_vqshlub_n_s8(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshlus_n_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vqshlus_n_s32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshlud_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vqshlud_n_s64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshluh_n_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vqshluh_n_s16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrn_high_n_u32(__p0_200, __p1_200, __p2_200) __extension__ ({ \ + uint16x4_t __s0_200 = __p0_200; \ + uint32x4_t __s1_200 = __p1_200; \ + uint16x8_t __ret_200; \ + __ret_200 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_200), (uint16x4_t)(vqshrn_n_u32(__s1_200, __p2_200)))); \ __ret_200; \ }) #else -#define vqshrun_high_n_s32(__p0_201, __p1_201, __p2_201) __extension__ ({ \ - int16x4_t __s0_201 = __p0_201; \ - int32x4_t __s1_201 = __p1_201; \ - int16x4_t __rev0_201; __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \ - int32x4_t __rev1_201; __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \ - int16x8_t __ret_201; \ - __ret_201 = 
(int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_201), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_201, __p2_201)))); \ +#define vqshrn_high_n_u32(__p0_201, __p1_201, __p2_201) __extension__ ({ \ + uint16x4_t __s0_201 = __p0_201; \ + uint32x4_t __s1_201 = __p1_201; \ + uint16x4_t __rev0_201; __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \ + uint32x4_t __rev1_201; __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \ + uint16x8_t __ret_201; \ + __ret_201 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_201), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_201, __p2_201)))); \ __ret_201 = __builtin_shufflevector(__ret_201, __ret_201, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_201; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s64(__p0_202, __p1_202, __p2_202) __extension__ ({ \ - int32x2_t __s0_202 = __p0_202; \ - int64x2_t __s1_202 = __p1_202; \ - int32x4_t __ret_202; \ - __ret_202 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_202), (int32x2_t)(vqshrun_n_s64(__s1_202, __p2_202)))); \ +#define vqshrn_high_n_u64(__p0_202, __p1_202, __p2_202) __extension__ ({ \ + uint32x2_t __s0_202 = __p0_202; \ + uint64x2_t __s1_202 = __p1_202; \ + uint32x4_t __ret_202; \ + __ret_202 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_202), (uint32x2_t)(vqshrn_n_u64(__s1_202, __p2_202)))); \ __ret_202; \ }) #else -#define vqshrun_high_n_s64(__p0_203, __p1_203, __p2_203) __extension__ ({ \ - int32x2_t __s0_203 = __p0_203; \ - int64x2_t __s1_203 = __p1_203; \ - int32x2_t __rev0_203; __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \ - int64x2_t __rev1_203; __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \ - int32x4_t __ret_203; \ - __ret_203 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_203), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_203, __p2_203)))); \ +#define vqshrn_high_n_u64(__p0_203, __p1_203, __p2_203) __extension__ ({ \ + uint32x2_t __s0_203 = __p0_203; \ + uint64x2_t __s1_203 = __p1_203; \ + uint32x2_t __rev0_203; __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \ + uint64x2_t __rev1_203; __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \ + uint32x4_t __ret_203; \ + __ret_203 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_203), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_203, __p2_203)))); \ __ret_203 = __builtin_shufflevector(__ret_203, __ret_203, 3, 2, 1, 0); \ __ret_203; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s16(__p0_204, __p1_204, __p2_204) __extension__ ({ \ - int8x8_t __s0_204 = __p0_204; \ - int16x8_t __s1_204 = __p1_204; \ - int8x16_t __ret_204; \ - __ret_204 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_204), (int8x8_t)(vqshrun_n_s16(__s1_204, __p2_204)))); \ +#define vqshrn_high_n_u16(__p0_204, __p1_204, __p2_204) __extension__ ({ \ + uint8x8_t __s0_204 = __p0_204; \ + uint16x8_t __s1_204 = __p1_204; \ + uint8x16_t __ret_204; \ + __ret_204 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_204), (uint8x8_t)(vqshrn_n_u16(__s1_204, __p2_204)))); \ __ret_204; \ }) #else -#define vqshrun_high_n_s16(__p0_205, __p1_205, __p2_205) __extension__ ({ \ - int8x8_t __s0_205 = __p0_205; \ - int16x8_t __s1_205 = __p1_205; \ - int8x8_t __rev0_205; __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_205; __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_205; \ - __ret_205 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_205), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_205, 
__p2_205)))); \ +#define vqshrn_high_n_u16(__p0_205, __p1_205, __p2_205) __extension__ ({ \ + uint8x8_t __s0_205 = __p0_205; \ + uint16x8_t __s1_205 = __p1_205; \ + uint8x8_t __rev0_205; __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_205; __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_205; \ + __ret_205 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_205), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_205, __p2_205)))); \ __ret_205 = __builtin_shufflevector(__ret_205, __ret_205, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_205; \ }) #endif +#ifdef __LITTLE_ENDIAN__ +#define vqshrn_high_n_s32(__p0_206, __p1_206, __p2_206) __extension__ ({ \ + int16x4_t __s0_206 = __p0_206; \ + int32x4_t __s1_206 = __p1_206; \ + int16x8_t __ret_206; \ + __ret_206 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_206), (int16x4_t)(vqshrn_n_s32(__s1_206, __p2_206)))); \ + __ret_206; \ +}) +#else +#define vqshrn_high_n_s32(__p0_207, __p1_207, __p2_207) __extension__ ({ \ + int16x4_t __s0_207 = __p0_207; \ + int32x4_t __s1_207 = __p1_207; \ + int16x4_t __rev0_207; __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 3, 2, 1, 0); \ + int32x4_t __rev1_207; __rev1_207 = __builtin_shufflevector(__s1_207, __s1_207, 3, 2, 1, 0); \ + int16x8_t __ret_207; \ + __ret_207 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_207), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_207, __p2_207)))); \ + __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_207; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrn_high_n_s64(__p0_208, __p1_208, __p2_208) __extension__ ({ \ + int32x2_t __s0_208 = __p0_208; \ + int64x2_t __s1_208 = __p1_208; \ + int32x4_t __ret_208; \ + __ret_208 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_208), (int32x2_t)(vqshrn_n_s64(__s1_208, __p2_208)))); \ + __ret_208; \ +}) +#else +#define vqshrn_high_n_s64(__p0_209, __p1_209, __p2_209) __extension__ ({ \ + int32x2_t __s0_209 = __p0_209; \ + int64x2_t __s1_209 = __p1_209; \ + int32x2_t __rev0_209; __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 1, 0); \ + int64x2_t __rev1_209; __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 1, 0); \ + int32x4_t __ret_209; \ + __ret_209 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_209), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_209, __p2_209)))); \ + __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \ + __ret_209; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrn_high_n_s16(__p0_210, __p1_210, __p2_210) __extension__ ({ \ + int8x8_t __s0_210 = __p0_210; \ + int16x8_t __s1_210 = __p1_210; \ + int8x16_t __ret_210; \ + __ret_210 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_210), (int8x8_t)(vqshrn_n_s16(__s1_210, __p2_210)))); \ + __ret_210; \ +}) +#else +#define vqshrn_high_n_s16(__p0_211, __p1_211, __p2_211) __extension__ ({ \ + int8x8_t __s0_211 = __p0_211; \ + int16x8_t __s1_211 = __p1_211; \ + int8x8_t __rev0_211; __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_211; __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_211; \ + __ret_211 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_211), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_211, __p2_211)))); \ + __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_211; \ +}) +#endif + 
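The vqshrn_high_n_* and vqshrun_high_n_* definitions above all follow the same generated pattern: the little-endian variant narrows the wide vector and combines it with the existing low half, while the big-endian variant first reverses lane order with __builtin_shufflevector, calls the __noswap_ helpers, and reverses the result back. A minimal usage sketch of one of these intrinsics, assuming an AArch64 target and a compiler that ships this header (function name is illustrative):

#include <arm_neon.h>

/* Narrow two int32x4_t vectors into one int16x8_t with saturation,
 * shifting right by 8 bits: vqshrn_n_s32 produces the low half and
 * vqshrn_high_n_s32 (defined above) appends the high half. */
int16x8_t narrow_pair(int32x4_t lo, int32x4_t hi) {
    int16x4_t low_half = vqshrn_n_s32(lo, 8);
    return vqshrn_high_n_s32(low_half, hi, 8);
}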
+#ifdef __LITTLE_ENDIAN__ +#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \ + uint32_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshrns_n_u32(__p0, __p1) __extension__ ({ \ + uint32_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vqshrns_n_u32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshrnd_n_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vqshrnd_n_u64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \ + uint16_t __s0 = __p0; \ + uint8_t __ret; \ + __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshrnh_n_u16(__p0, __p1) __extension__ ({ \ + uint16_t __s0 = __p0; \ + uint8_t __ret; \ + __ret = (uint8_t) __builtin_neon_vqshrnh_n_u16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshrns_n_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vqshrns_n_s32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshrnd_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vqshrnd_n_s64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + int8_t __ret; \ + __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \ + __ret; \ +}) +#else +#define vqshrnh_n_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + int8_t __ret; \ + __ret = (int8_t) __builtin_neon_vqshrnh_n_s16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrun_high_n_s32(__p0_212, __p1_212, __p2_212) __extension__ ({ \ + int16x4_t __s0_212 = __p0_212; \ + int32x4_t __s1_212 = __p1_212; \ + int16x8_t __ret_212; \ + __ret_212 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_212), (int16x4_t)(vqshrun_n_s32(__s1_212, __p2_212)))); \ + __ret_212; \ +}) +#else +#define vqshrun_high_n_s32(__p0_213, __p1_213, __p2_213) __extension__ ({ \ + int16x4_t __s0_213 = __p0_213; \ + int32x4_t __s1_213 = __p1_213; \ + int16x4_t __rev0_213; __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \ + int32x4_t __rev1_213; __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \ + int16x8_t __ret_213; \ + __ret_213 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_213), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_213, __p2_213)))); \ + __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_213; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrun_high_n_s64(__p0_214, __p1_214, __p2_214) __extension__ ({ \ + int32x2_t __s0_214 = __p0_214; \ + int64x2_t __s1_214 = __p1_214; \ + int32x4_t __ret_214; \ + 
__ret_214 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_214), (int32x2_t)(vqshrun_n_s64(__s1_214, __p2_214)))); \ + __ret_214; \ +}) +#else +#define vqshrun_high_n_s64(__p0_215, __p1_215, __p2_215) __extension__ ({ \ + int32x2_t __s0_215 = __p0_215; \ + int64x2_t __s1_215 = __p1_215; \ + int32x2_t __rev0_215; __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 1, 0); \ + int64x2_t __rev1_215; __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \ + int32x4_t __ret_215; \ + __ret_215 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_215), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_215, __p2_215)))); \ + __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \ + __ret_215; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vqshrun_high_n_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \ + int8x8_t __s0_216 = __p0_216; \ + int16x8_t __s1_216 = __p1_216; \ + int8x16_t __ret_216; \ + __ret_216 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_216), (int8x8_t)(vqshrun_n_s16(__s1_216, __p2_216)))); \ + __ret_216; \ +}) +#else +#define vqshrun_high_n_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \ + int8x8_t __s0_217 = __p0_217; \ + int16x8_t __s1_217 = __p1_217; \ + int8x8_t __rev0_217; __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_217; __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_217; \ + __ret_217 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_217), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_217, __p2_217)))); \ + __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_217; \ +}) +#endif + #ifdef __LITTLE_ENDIAN__ #define vqshruns_n_s32(__p0, __p1) __extension__ ({ \ int32_t __s0 = __p0; \ @@ -60265,128 +63577,128 @@ __ai int64_t vrshld_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u32(__p0_206, __p1_206, __p2_206) __extension__ ({ \ - uint16x4_t __s0_206 = __p0_206; \ - uint32x4_t __s1_206 = __p1_206; \ - uint16x8_t __ret_206; \ - __ret_206 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_206), (uint16x4_t)(vrshrn_n_u32(__s1_206, __p2_206)))); \ - __ret_206; \ +#define vrshrn_high_n_u32(__p0_218, __p1_218, __p2_218) __extension__ ({ \ + uint16x4_t __s0_218 = __p0_218; \ + uint32x4_t __s1_218 = __p1_218; \ + uint16x8_t __ret_218; \ + __ret_218 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_218), (uint16x4_t)(vrshrn_n_u32(__s1_218, __p2_218)))); \ + __ret_218; \ }) #else -#define vrshrn_high_n_u32(__p0_207, __p1_207, __p2_207) __extension__ ({ \ - uint16x4_t __s0_207 = __p0_207; \ - uint32x4_t __s1_207 = __p1_207; \ - uint16x4_t __rev0_207; __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 3, 2, 1, 0); \ - uint32x4_t __rev1_207; __rev1_207 = __builtin_shufflevector(__s1_207, __s1_207, 3, 2, 1, 0); \ - uint16x8_t __ret_207; \ - __ret_207 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_207), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_207, __p2_207)))); \ - __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_207; \ +#define vrshrn_high_n_u32(__p0_219, __p1_219, __p2_219) __extension__ ({ \ + uint16x4_t __s0_219 = __p0_219; \ + uint32x4_t __s1_219 = __p1_219; \ + uint16x4_t __rev0_219; __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 3, 2, 1, 0); \ + uint32x4_t __rev1_219; __rev1_219 = __builtin_shufflevector(__s1_219, __s1_219, 3, 2, 1, 0); \ + uint16x8_t __ret_219; \ + __ret_219 = 
(uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_219), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_219, __p2_219)))); \ + __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_219; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u64(__p0_208, __p1_208, __p2_208) __extension__ ({ \ - uint32x2_t __s0_208 = __p0_208; \ - uint64x2_t __s1_208 = __p1_208; \ - uint32x4_t __ret_208; \ - __ret_208 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_208), (uint32x2_t)(vrshrn_n_u64(__s1_208, __p2_208)))); \ - __ret_208; \ +#define vrshrn_high_n_u64(__p0_220, __p1_220, __p2_220) __extension__ ({ \ + uint32x2_t __s0_220 = __p0_220; \ + uint64x2_t __s1_220 = __p1_220; \ + uint32x4_t __ret_220; \ + __ret_220 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_220), (uint32x2_t)(vrshrn_n_u64(__s1_220, __p2_220)))); \ + __ret_220; \ }) #else -#define vrshrn_high_n_u64(__p0_209, __p1_209, __p2_209) __extension__ ({ \ - uint32x2_t __s0_209 = __p0_209; \ - uint64x2_t __s1_209 = __p1_209; \ - uint32x2_t __rev0_209; __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 1, 0); \ - uint64x2_t __rev1_209; __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 1, 0); \ - uint32x4_t __ret_209; \ - __ret_209 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_209), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_209, __p2_209)))); \ - __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \ - __ret_209; \ +#define vrshrn_high_n_u64(__p0_221, __p1_221, __p2_221) __extension__ ({ \ + uint32x2_t __s0_221 = __p0_221; \ + uint64x2_t __s1_221 = __p1_221; \ + uint32x2_t __rev0_221; __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 1, 0); \ + uint64x2_t __rev1_221; __rev1_221 = __builtin_shufflevector(__s1_221, __s1_221, 1, 0); \ + uint32x4_t __ret_221; \ + __ret_221 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_221), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_221, __p2_221)))); \ + __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 3, 2, 1, 0); \ + __ret_221; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u16(__p0_210, __p1_210, __p2_210) __extension__ ({ \ - uint8x8_t __s0_210 = __p0_210; \ - uint16x8_t __s1_210 = __p1_210; \ - uint8x16_t __ret_210; \ - __ret_210 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_210), (uint8x8_t)(vrshrn_n_u16(__s1_210, __p2_210)))); \ - __ret_210; \ +#define vrshrn_high_n_u16(__p0_222, __p1_222, __p2_222) __extension__ ({ \ + uint8x8_t __s0_222 = __p0_222; \ + uint16x8_t __s1_222 = __p1_222; \ + uint8x16_t __ret_222; \ + __ret_222 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_222), (uint8x8_t)(vrshrn_n_u16(__s1_222, __p2_222)))); \ + __ret_222; \ }) #else -#define vrshrn_high_n_u16(__p0_211, __p1_211, __p2_211) __extension__ ({ \ - uint8x8_t __s0_211 = __p0_211; \ - uint16x8_t __s1_211 = __p1_211; \ - uint8x8_t __rev0_211; __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_211; __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_211; \ - __ret_211 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_211), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_211, __p2_211)))); \ - __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_211; \ +#define vrshrn_high_n_u16(__p0_223, __p1_223, __p2_223) __extension__ ({ \ + uint8x8_t __s0_223 = __p0_223; \ + uint16x8_t __s1_223 = __p1_223; \ + uint8x8_t __rev0_223; __rev0_223 = 
__builtin_shufflevector(__s0_223, __s0_223, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_223; __rev1_223 = __builtin_shufflevector(__s1_223, __s1_223, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_223; \ + __ret_223 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_223), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_223, __p2_223)))); \ + __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_223; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s32(__p0_212, __p1_212, __p2_212) __extension__ ({ \ - int16x4_t __s0_212 = __p0_212; \ - int32x4_t __s1_212 = __p1_212; \ - int16x8_t __ret_212; \ - __ret_212 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_212), (int16x4_t)(vrshrn_n_s32(__s1_212, __p2_212)))); \ - __ret_212; \ +#define vrshrn_high_n_s32(__p0_224, __p1_224, __p2_224) __extension__ ({ \ + int16x4_t __s0_224 = __p0_224; \ + int32x4_t __s1_224 = __p1_224; \ + int16x8_t __ret_224; \ + __ret_224 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_224), (int16x4_t)(vrshrn_n_s32(__s1_224, __p2_224)))); \ + __ret_224; \ }) #else -#define vrshrn_high_n_s32(__p0_213, __p1_213, __p2_213) __extension__ ({ \ - int16x4_t __s0_213 = __p0_213; \ - int32x4_t __s1_213 = __p1_213; \ - int16x4_t __rev0_213; __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \ - int32x4_t __rev1_213; __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \ - int16x8_t __ret_213; \ - __ret_213 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_213), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_213, __p2_213)))); \ - __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_213; \ +#define vrshrn_high_n_s32(__p0_225, __p1_225, __p2_225) __extension__ ({ \ + int16x4_t __s0_225 = __p0_225; \ + int32x4_t __s1_225 = __p1_225; \ + int16x4_t __rev0_225; __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 3, 2, 1, 0); \ + int32x4_t __rev1_225; __rev1_225 = __builtin_shufflevector(__s1_225, __s1_225, 3, 2, 1, 0); \ + int16x8_t __ret_225; \ + __ret_225 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_225), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_225, __p2_225)))); \ + __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_225; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s64(__p0_214, __p1_214, __p2_214) __extension__ ({ \ - int32x2_t __s0_214 = __p0_214; \ - int64x2_t __s1_214 = __p1_214; \ - int32x4_t __ret_214; \ - __ret_214 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_214), (int32x2_t)(vrshrn_n_s64(__s1_214, __p2_214)))); \ - __ret_214; \ +#define vrshrn_high_n_s64(__p0_226, __p1_226, __p2_226) __extension__ ({ \ + int32x2_t __s0_226 = __p0_226; \ + int64x2_t __s1_226 = __p1_226; \ + int32x4_t __ret_226; \ + __ret_226 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_226), (int32x2_t)(vrshrn_n_s64(__s1_226, __p2_226)))); \ + __ret_226; \ }) #else -#define vrshrn_high_n_s64(__p0_215, __p1_215, __p2_215) __extension__ ({ \ - int32x2_t __s0_215 = __p0_215; \ - int64x2_t __s1_215 = __p1_215; \ - int32x2_t __rev0_215; __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 1, 0); \ - int64x2_t __rev1_215; __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \ - int32x4_t __ret_215; \ - __ret_215 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_215), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_215, __p2_215)))); \ - __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \ - __ret_215; \ +#define 
vrshrn_high_n_s64(__p0_227, __p1_227, __p2_227) __extension__ ({ \ + int32x2_t __s0_227 = __p0_227; \ + int64x2_t __s1_227 = __p1_227; \ + int32x2_t __rev0_227; __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 1, 0); \ + int64x2_t __rev1_227; __rev1_227 = __builtin_shufflevector(__s1_227, __s1_227, 1, 0); \ + int32x4_t __ret_227; \ + __ret_227 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_227), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_227, __p2_227)))); \ + __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 3, 2, 1, 0); \ + __ret_227; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \ - int8x8_t __s0_216 = __p0_216; \ - int16x8_t __s1_216 = __p1_216; \ - int8x16_t __ret_216; \ - __ret_216 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_216), (int8x8_t)(vrshrn_n_s16(__s1_216, __p2_216)))); \ - __ret_216; \ +#define vrshrn_high_n_s16(__p0_228, __p1_228, __p2_228) __extension__ ({ \ + int8x8_t __s0_228 = __p0_228; \ + int16x8_t __s1_228 = __p1_228; \ + int8x16_t __ret_228; \ + __ret_228 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_228), (int8x8_t)(vrshrn_n_s16(__s1_228, __p2_228)))); \ + __ret_228; \ }) #else -#define vrshrn_high_n_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \ - int8x8_t __s0_217 = __p0_217; \ - int16x8_t __s1_217 = __p1_217; \ - int8x8_t __rev0_217; __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_217; __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_217; \ - __ret_217 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_217), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_217, __p2_217)))); \ - __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_217; \ +#define vrshrn_high_n_s16(__p0_229, __p1_229, __p2_229) __extension__ ({ \ + int8x8_t __s0_229 = __p0_229; \ + int16x8_t __s1_229 = __p1_229; \ + int8x8_t __rev0_229; __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_229; __rev1_229 = __builtin_shufflevector(__s1_229, __s1_229, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_229; \ + __ret_229 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_229), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_229, __p2_229)))); \ + __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_229; \ }) #endif @@ -60816,271 +64128,271 @@ __ai int64_t vshld_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u8(__p0_218, __p1_218) __extension__ ({ \ - uint8x16_t __s0_218 = __p0_218; \ - uint16x8_t __ret_218; \ - __ret_218 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_218), __p1_218)); \ - __ret_218; \ -}) -#else -#define vshll_high_n_u8(__p0_219, __p1_219) __extension__ ({ \ - uint8x16_t __s0_219 = __p0_219; \ - uint8x16_t __rev0_219; __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_219; \ - __ret_219 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_219), __p1_219)); \ - __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_219; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u32(__p0_220, __p1_220) __extension__ ({ \ - uint32x4_t __s0_220 = __p0_220; \ - uint64x2_t __ret_220; \ - __ret_220 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_220), 
__p1_220)); \ - __ret_220; \ -}) -#else -#define vshll_high_n_u32(__p0_221, __p1_221) __extension__ ({ \ - uint32x4_t __s0_221 = __p0_221; \ - uint32x4_t __rev0_221; __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 3, 2, 1, 0); \ - uint64x2_t __ret_221; \ - __ret_221 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_221), __p1_221)); \ - __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 1, 0); \ - __ret_221; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u16(__p0_222, __p1_222) __extension__ ({ \ - uint16x8_t __s0_222 = __p0_222; \ - uint32x4_t __ret_222; \ - __ret_222 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_222), __p1_222)); \ - __ret_222; \ -}) -#else -#define vshll_high_n_u16(__p0_223, __p1_223) __extension__ ({ \ - uint16x8_t __s0_223 = __p0_223; \ - uint16x8_t __rev0_223; __rev0_223 = __builtin_shufflevector(__s0_223, __s0_223, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_223; \ - __ret_223 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_223), __p1_223)); \ - __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 3, 2, 1, 0); \ - __ret_223; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s8(__p0_224, __p1_224) __extension__ ({ \ - int8x16_t __s0_224 = __p0_224; \ - int16x8_t __ret_224; \ - __ret_224 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_224), __p1_224)); \ - __ret_224; \ -}) -#else -#define vshll_high_n_s8(__p0_225, __p1_225) __extension__ ({ \ - int8x16_t __s0_225 = __p0_225; \ - int8x16_t __rev0_225; __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_225; \ - __ret_225 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_225), __p1_225)); \ - __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_225; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s32(__p0_226, __p1_226) __extension__ ({ \ - int32x4_t __s0_226 = __p0_226; \ - int64x2_t __ret_226; \ - __ret_226 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_226), __p1_226)); \ - __ret_226; \ -}) -#else -#define vshll_high_n_s32(__p0_227, __p1_227) __extension__ ({ \ - int32x4_t __s0_227 = __p0_227; \ - int32x4_t __rev0_227; __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 3, 2, 1, 0); \ - int64x2_t __ret_227; \ - __ret_227 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_227), __p1_227)); \ - __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 1, 0); \ - __ret_227; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s16(__p0_228, __p1_228) __extension__ ({ \ - int16x8_t __s0_228 = __p0_228; \ - int32x4_t __ret_228; \ - __ret_228 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_228), __p1_228)); \ - __ret_228; \ -}) -#else -#define vshll_high_n_s16(__p0_229, __p1_229) __extension__ ({ \ - int16x8_t __s0_229 = __p0_229; \ - int16x8_t __rev0_229; __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_229; \ - __ret_229 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_229), __p1_229)); \ - __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 3, 2, 1, 0); \ - __ret_229; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vshrd_n_u64(__p0, __p1) __extension__ ({ \ - uint64_t __s0 = __p0; \ - uint64_t __ret; \ - __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \ - __ret; \ -}) -#else -#define vshrd_n_u64(__p0, __p1) __extension__ ({ \ - uint64_t __s0 = __p0; \ - uint64_t __ret; \ - __ret = 
(uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vshrd_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int64_t __ret; \ - __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \ - __ret; \ -}) -#else -#define vshrd_n_s64(__p0, __p1) __extension__ ({ \ - int64_t __s0 = __p0; \ - int64_t __ret; \ - __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u32(__p0_230, __p1_230, __p2_230) __extension__ ({ \ - uint16x4_t __s0_230 = __p0_230; \ - uint32x4_t __s1_230 = __p1_230; \ +#define vshll_high_n_u8(__p0_230, __p1_230) __extension__ ({ \ + uint8x16_t __s0_230 = __p0_230; \ uint16x8_t __ret_230; \ - __ret_230 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_230), (uint16x4_t)(vshrn_n_u32(__s1_230, __p2_230)))); \ + __ret_230 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_230), __p1_230)); \ __ret_230; \ }) #else -#define vshrn_high_n_u32(__p0_231, __p1_231, __p2_231) __extension__ ({ \ - uint16x4_t __s0_231 = __p0_231; \ - uint32x4_t __s1_231 = __p1_231; \ - uint16x4_t __rev0_231; __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 3, 2, 1, 0); \ - uint32x4_t __rev1_231; __rev1_231 = __builtin_shufflevector(__s1_231, __s1_231, 3, 2, 1, 0); \ +#define vshll_high_n_u8(__p0_231, __p1_231) __extension__ ({ \ + uint8x16_t __s0_231 = __p0_231; \ + uint8x16_t __rev0_231; __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ uint16x8_t __ret_231; \ - __ret_231 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_231), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_231, __p2_231)))); \ + __ret_231 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_231), __p1_231)); \ __ret_231 = __builtin_shufflevector(__ret_231, __ret_231, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_231; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u64(__p0_232, __p1_232, __p2_232) __extension__ ({ \ - uint32x2_t __s0_232 = __p0_232; \ - uint64x2_t __s1_232 = __p1_232; \ - uint32x4_t __ret_232; \ - __ret_232 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_232), (uint32x2_t)(vshrn_n_u64(__s1_232, __p2_232)))); \ +#define vshll_high_n_u32(__p0_232, __p1_232) __extension__ ({ \ + uint32x4_t __s0_232 = __p0_232; \ + uint64x2_t __ret_232; \ + __ret_232 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_232), __p1_232)); \ __ret_232; \ }) #else -#define vshrn_high_n_u64(__p0_233, __p1_233, __p2_233) __extension__ ({ \ - uint32x2_t __s0_233 = __p0_233; \ - uint64x2_t __s1_233 = __p1_233; \ - uint32x2_t __rev0_233; __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 1, 0); \ - uint64x2_t __rev1_233; __rev1_233 = __builtin_shufflevector(__s1_233, __s1_233, 1, 0); \ - uint32x4_t __ret_233; \ - __ret_233 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_233), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_233, __p2_233)))); \ - __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 3, 2, 1, 0); \ +#define vshll_high_n_u32(__p0_233, __p1_233) __extension__ ({ \ + uint32x4_t __s0_233 = __p0_233; \ + uint32x4_t __rev0_233; __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 3, 2, 1, 0); \ + uint64x2_t __ret_233; \ + __ret_233 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_233), __p1_233)); \ + __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 1, 0); \ __ret_233; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u16(__p0_234, __p1_234, __p2_234) __extension__ ({ \ - uint8x8_t 
__s0_234 = __p0_234; \ - uint16x8_t __s1_234 = __p1_234; \ - uint8x16_t __ret_234; \ - __ret_234 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_234), (uint8x8_t)(vshrn_n_u16(__s1_234, __p2_234)))); \ +#define vshll_high_n_u16(__p0_234, __p1_234) __extension__ ({ \ + uint16x8_t __s0_234 = __p0_234; \ + uint32x4_t __ret_234; \ + __ret_234 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_234), __p1_234)); \ __ret_234; \ }) #else -#define vshrn_high_n_u16(__p0_235, __p1_235, __p2_235) __extension__ ({ \ - uint8x8_t __s0_235 = __p0_235; \ - uint16x8_t __s1_235 = __p1_235; \ - uint8x8_t __rev0_235; __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_235; __rev1_235 = __builtin_shufflevector(__s1_235, __s1_235, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_235; \ - __ret_235 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_235), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_235, __p2_235)))); \ - __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vshll_high_n_u16(__p0_235, __p1_235) __extension__ ({ \ + uint16x8_t __s0_235 = __p0_235; \ + uint16x8_t __rev0_235; __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_235; \ + __ret_235 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_235), __p1_235)); \ + __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 3, 2, 1, 0); \ __ret_235; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s32(__p0_236, __p1_236, __p2_236) __extension__ ({ \ - int16x4_t __s0_236 = __p0_236; \ - int32x4_t __s1_236 = __p1_236; \ +#define vshll_high_n_s8(__p0_236, __p1_236) __extension__ ({ \ + int8x16_t __s0_236 = __p0_236; \ int16x8_t __ret_236; \ - __ret_236 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_236), (int16x4_t)(vshrn_n_s32(__s1_236, __p2_236)))); \ + __ret_236 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_236), __p1_236)); \ __ret_236; \ }) #else -#define vshrn_high_n_s32(__p0_237, __p1_237, __p2_237) __extension__ ({ \ - int16x4_t __s0_237 = __p0_237; \ - int32x4_t __s1_237 = __p1_237; \ - int16x4_t __rev0_237; __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 3, 2, 1, 0); \ - int32x4_t __rev1_237; __rev1_237 = __builtin_shufflevector(__s1_237, __s1_237, 3, 2, 1, 0); \ +#define vshll_high_n_s8(__p0_237, __p1_237) __extension__ ({ \ + int8x16_t __s0_237 = __p0_237; \ + int8x16_t __rev0_237; __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ int16x8_t __ret_237; \ - __ret_237 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_237), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_237, __p2_237)))); \ + __ret_237 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_237), __p1_237)); \ __ret_237 = __builtin_shufflevector(__ret_237, __ret_237, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_237; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s64(__p0_238, __p1_238, __p2_238) __extension__ ({ \ - int32x2_t __s0_238 = __p0_238; \ - int64x2_t __s1_238 = __p1_238; \ - int32x4_t __ret_238; \ - __ret_238 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_238), (int32x2_t)(vshrn_n_s64(__s1_238, __p2_238)))); \ +#define vshll_high_n_s32(__p0_238, __p1_238) __extension__ ({ \ + int32x4_t __s0_238 = __p0_238; \ + int64x2_t __ret_238; \ + __ret_238 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_238), __p1_238)); \ __ret_238; \ }) #else -#define vshrn_high_n_s64(__p0_239, __p1_239, __p2_239) __extension__ ({ \ - int32x2_t 
__s0_239 = __p0_239; \ - int64x2_t __s1_239 = __p1_239; \ - int32x2_t __rev0_239; __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 1, 0); \ - int64x2_t __rev1_239; __rev1_239 = __builtin_shufflevector(__s1_239, __s1_239, 1, 0); \ - int32x4_t __ret_239; \ - __ret_239 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_239), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_239, __p2_239)))); \ - __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 3, 2, 1, 0); \ +#define vshll_high_n_s32(__p0_239, __p1_239) __extension__ ({ \ + int32x4_t __s0_239 = __p0_239; \ + int32x4_t __rev0_239; __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 3, 2, 1, 0); \ + int64x2_t __ret_239; \ + __ret_239 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_239), __p1_239)); \ + __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 1, 0); \ __ret_239; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s16(__p0_240, __p1_240, __p2_240) __extension__ ({ \ - int8x8_t __s0_240 = __p0_240; \ - int16x8_t __s1_240 = __p1_240; \ - int8x16_t __ret_240; \ - __ret_240 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_240), (int8x8_t)(vshrn_n_s16(__s1_240, __p2_240)))); \ +#define vshll_high_n_s16(__p0_240, __p1_240) __extension__ ({ \ + int16x8_t __s0_240 = __p0_240; \ + int32x4_t __ret_240; \ + __ret_240 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_240), __p1_240)); \ __ret_240; \ }) #else -#define vshrn_high_n_s16(__p0_241, __p1_241, __p2_241) __extension__ ({ \ - int8x8_t __s0_241 = __p0_241; \ - int16x8_t __s1_241 = __p1_241; \ - int8x8_t __rev0_241; __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_241; __rev1_241 = __builtin_shufflevector(__s1_241, __s1_241, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_241; \ - __ret_241 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_241), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_241, __p2_241)))); \ - __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vshll_high_n_s16(__p0_241, __p1_241) __extension__ ({ \ + int16x8_t __s0_241 = __p0_241; \ + int16x8_t __rev0_241; __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_241; \ + __ret_241 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_241), __p1_241)); \ + __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 3, 2, 1, 0); \ __ret_241; \ }) #endif +#ifdef __LITTLE_ENDIAN__ +#define vshrd_n_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \ + __ret; \ +}) +#else +#define vshrd_n_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vshrd_n_u64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vshrd_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \ + __ret; \ +}) +#else +#define vshrd_n_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vshrd_n_s64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vshrn_high_n_u32(__p0_242, __p1_242, __p2_242) __extension__ ({ \ + uint16x4_t __s0_242 = __p0_242; \ + uint32x4_t __s1_242 = __p1_242; \ + uint16x8_t __ret_242; \ + __ret_242 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_242), (uint16x4_t)(vshrn_n_u32(__s1_242, 
__p2_242)))); \ + __ret_242; \ +}) +#else +#define vshrn_high_n_u32(__p0_243, __p1_243, __p2_243) __extension__ ({ \ + uint16x4_t __s0_243 = __p0_243; \ + uint32x4_t __s1_243 = __p1_243; \ + uint16x4_t __rev0_243; __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 3, 2, 1, 0); \ + uint32x4_t __rev1_243; __rev1_243 = __builtin_shufflevector(__s1_243, __s1_243, 3, 2, 1, 0); \ + uint16x8_t __ret_243; \ + __ret_243 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_243), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_243, __p2_243)))); \ + __ret_243 = __builtin_shufflevector(__ret_243, __ret_243, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_243; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vshrn_high_n_u64(__p0_244, __p1_244, __p2_244) __extension__ ({ \ + uint32x2_t __s0_244 = __p0_244; \ + uint64x2_t __s1_244 = __p1_244; \ + uint32x4_t __ret_244; \ + __ret_244 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_244), (uint32x2_t)(vshrn_n_u64(__s1_244, __p2_244)))); \ + __ret_244; \ +}) +#else +#define vshrn_high_n_u64(__p0_245, __p1_245, __p2_245) __extension__ ({ \ + uint32x2_t __s0_245 = __p0_245; \ + uint64x2_t __s1_245 = __p1_245; \ + uint32x2_t __rev0_245; __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 1, 0); \ + uint64x2_t __rev1_245; __rev1_245 = __builtin_shufflevector(__s1_245, __s1_245, 1, 0); \ + uint32x4_t __ret_245; \ + __ret_245 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_245), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_245, __p2_245)))); \ + __ret_245 = __builtin_shufflevector(__ret_245, __ret_245, 3, 2, 1, 0); \ + __ret_245; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vshrn_high_n_u16(__p0_246, __p1_246, __p2_246) __extension__ ({ \ + uint8x8_t __s0_246 = __p0_246; \ + uint16x8_t __s1_246 = __p1_246; \ + uint8x16_t __ret_246; \ + __ret_246 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_246), (uint8x8_t)(vshrn_n_u16(__s1_246, __p2_246)))); \ + __ret_246; \ +}) +#else +#define vshrn_high_n_u16(__p0_247, __p1_247, __p2_247) __extension__ ({ \ + uint8x8_t __s0_247 = __p0_247; \ + uint16x8_t __s1_247 = __p1_247; \ + uint8x8_t __rev0_247; __rev0_247 = __builtin_shufflevector(__s0_247, __s0_247, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_247; __rev1_247 = __builtin_shufflevector(__s1_247, __s1_247, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_247; \ + __ret_247 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_247), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_247, __p2_247)))); \ + __ret_247 = __builtin_shufflevector(__ret_247, __ret_247, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_247; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vshrn_high_n_s32(__p0_248, __p1_248, __p2_248) __extension__ ({ \ + int16x4_t __s0_248 = __p0_248; \ + int32x4_t __s1_248 = __p1_248; \ + int16x8_t __ret_248; \ + __ret_248 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_248), (int16x4_t)(vshrn_n_s32(__s1_248, __p2_248)))); \ + __ret_248; \ +}) +#else +#define vshrn_high_n_s32(__p0_249, __p1_249, __p2_249) __extension__ ({ \ + int16x4_t __s0_249 = __p0_249; \ + int32x4_t __s1_249 = __p1_249; \ + int16x4_t __rev0_249; __rev0_249 = __builtin_shufflevector(__s0_249, __s0_249, 3, 2, 1, 0); \ + int32x4_t __rev1_249; __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 3, 2, 1, 0); \ + int16x8_t __ret_249; \ + __ret_249 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_249), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_249, __p2_249)))); \ + __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_249; \ +}) +#endif + 
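Note on the vshrn_high_n_* macros in this hunk: each little-endian variant expands to vcombine_*() of the caller-supplied low half with a plain vshrn_n_*() of the wide vector, while the big-endian variant first reverses the lanes with __builtin_shufflevector, calls the __noswap_ helpers, and reverses the result back so lane order matches the architecture. A minimal usage sketch, assuming an AArch64 target with arm_neon.h (the names narrow_both_halves, lo_wide, and hi_wide are illustrative only, not taken from the header):

#include <arm_neon.h>

/* Narrow two 32-bit vectors into one 16-bit vector: the low 64 bits come from
 * an ordinary vshrn_n_u32, the high 64 bits from vshrn_high_n_u32, which the
 * macro above expands to vcombine_u16(lo, vshrn_n_u32(hi_wide, 8)). */
uint16x8_t narrow_both_halves(uint32x4_t lo_wide, uint32x4_t hi_wide) {
    uint16x4_t lo = vshrn_n_u32(lo_wide, 8);   /* low half: shift right by 8, narrow */
    return vshrn_high_n_u32(lo, hi_wide, 8);   /* high half: same, packed on top of lo */
}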
+#ifdef __LITTLE_ENDIAN__ +#define vshrn_high_n_s64(__p0_250, __p1_250, __p2_250) __extension__ ({ \ + int32x2_t __s0_250 = __p0_250; \ + int64x2_t __s1_250 = __p1_250; \ + int32x4_t __ret_250; \ + __ret_250 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_250), (int32x2_t)(vshrn_n_s64(__s1_250, __p2_250)))); \ + __ret_250; \ +}) +#else +#define vshrn_high_n_s64(__p0_251, __p1_251, __p2_251) __extension__ ({ \ + int32x2_t __s0_251 = __p0_251; \ + int64x2_t __s1_251 = __p1_251; \ + int32x2_t __rev0_251; __rev0_251 = __builtin_shufflevector(__s0_251, __s0_251, 1, 0); \ + int64x2_t __rev1_251; __rev1_251 = __builtin_shufflevector(__s1_251, __s1_251, 1, 0); \ + int32x4_t __ret_251; \ + __ret_251 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_251), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_251, __p2_251)))); \ + __ret_251 = __builtin_shufflevector(__ret_251, __ret_251, 3, 2, 1, 0); \ + __ret_251; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vshrn_high_n_s16(__p0_252, __p1_252, __p2_252) __extension__ ({ \ + int8x8_t __s0_252 = __p0_252; \ + int16x8_t __s1_252 = __p1_252; \ + int8x16_t __ret_252; \ + __ret_252 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_252), (int8x8_t)(vshrn_n_s16(__s1_252, __p2_252)))); \ + __ret_252; \ +}) +#else +#define vshrn_high_n_s16(__p0_253, __p1_253, __p2_253) __extension__ ({ \ + int8x8_t __s0_253 = __p0_253; \ + int16x8_t __s1_253 = __p1_253; \ + int8x8_t __rev0_253; __rev0_253 = __builtin_shufflevector(__s0_253, __s0_253, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_253; __rev1_253 = __builtin_shufflevector(__s1_253, __s1_253, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_253; \ + __ret_253 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_253), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_253, __p2_253)))); \ + __ret_253 = __builtin_shufflevector(__ret_253, __ret_253, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_253; \ +}) +#endif + #ifdef __LITTLE_ENDIAN__ #define vslid_n_u64(__p0, __p1, __p2) __extension__ ({ \ uint64_t __s0 = __p0; \ @@ -67149,44 +70461,60 @@ __ai int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vget_lane_f16(__p0_242, __p1_242) __extension__ ({ \ - float16x4_t __s0_242 = __p0_242; \ - float16_t __ret_242; \ -float16x4_t __reint_242 = __s0_242; \ -int16_t __reint1_242 = vget_lane_s16(*(int16x4_t *) &__reint_242, __p1_242); \ - __ret_242 = *(float16_t *) &__reint1_242; \ - __ret_242; \ +#define vget_lane_f16(__p0_254, __p1_254) __extension__ ({ \ + float16x4_t __s0_254 = __p0_254; \ + float16_t __ret_254; \ +float16x4_t __reint_254 = __s0_254; \ +int16_t __reint1_254 = vget_lane_s16(*(int16x4_t *) &__reint_254, __p1_254); \ + __ret_254 = *(float16_t *) &__reint1_254; \ + __ret_254; \ }) #else -#define vget_lane_f16(__p0_243, __p1_243) __extension__ ({ \ - float16x4_t __s0_243 = __p0_243; \ - float16x4_t __rev0_243; __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 3, 2, 1, 0); \ - float16_t __ret_243; \ -float16x4_t __reint_243 = __rev0_243; \ -int16_t __reint1_243 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_243, __p1_243); \ - __ret_243 = *(float16_t *) &__reint1_243; \ - __ret_243; \ +#define vget_lane_f16(__p0_255, __p1_255) __extension__ ({ \ + float16x4_t __s0_255 = __p0_255; \ + float16x4_t __rev0_255; __rev0_255 = __builtin_shufflevector(__s0_255, __s0_255, 3, 2, 1, 0); \ + float16_t __ret_255; \ +float16x4_t __reint_255 = __rev0_255; \ +int16_t __reint1_255 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_255, __p1_255); \ + __ret_255 = *(float16_t *) 
&__reint1_255; \ + __ret_255; \ +}) +#define __noswap_vget_lane_f16(__p0_256, __p1_256) __extension__ ({ \ + float16x4_t __s0_256 = __p0_256; \ + float16_t __ret_256; \ +float16x4_t __reint_256 = __s0_256; \ +int16_t __reint1_256 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_256, __p1_256); \ + __ret_256 = *(float16_t *) &__reint1_256; \ + __ret_256; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vgetq_lane_f16(__p0_244, __p1_244) __extension__ ({ \ - float16x8_t __s0_244 = __p0_244; \ - float16_t __ret_244; \ -float16x8_t __reint_244 = __s0_244; \ -int16_t __reint1_244 = vgetq_lane_s16(*(int16x8_t *) &__reint_244, __p1_244); \ - __ret_244 = *(float16_t *) &__reint1_244; \ - __ret_244; \ +#define vgetq_lane_f16(__p0_257, __p1_257) __extension__ ({ \ + float16x8_t __s0_257 = __p0_257; \ + float16_t __ret_257; \ +float16x8_t __reint_257 = __s0_257; \ +int16_t __reint1_257 = vgetq_lane_s16(*(int16x8_t *) &__reint_257, __p1_257); \ + __ret_257 = *(float16_t *) &__reint1_257; \ + __ret_257; \ }) #else -#define vgetq_lane_f16(__p0_245, __p1_245) __extension__ ({ \ - float16x8_t __s0_245 = __p0_245; \ - float16x8_t __rev0_245; __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16_t __ret_245; \ -float16x8_t __reint_245 = __rev0_245; \ -int16_t __reint1_245 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_245, __p1_245); \ - __ret_245 = *(float16_t *) &__reint1_245; \ - __ret_245; \ +#define vgetq_lane_f16(__p0_258, __p1_258) __extension__ ({ \ + float16x8_t __s0_258 = __p0_258; \ + float16x8_t __rev0_258; __rev0_258 = __builtin_shufflevector(__s0_258, __s0_258, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_258; \ +float16x8_t __reint_258 = __rev0_258; \ +int16_t __reint1_258 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_258, __p1_258); \ + __ret_258 = *(float16_t *) &__reint1_258; \ + __ret_258; \ +}) +#define __noswap_vgetq_lane_f16(__p0_259, __p1_259) __extension__ ({ \ + float16x8_t __s0_259 = __p0_259; \ + float16_t __ret_259; \ +float16x8_t __reint_259 = __s0_259; \ +int16_t __reint1_259 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_259, __p1_259); \ + __ret_259 = *(float16_t *) &__reint1_259; \ + __ret_259; \ }) #endif @@ -67835,57 +71163,97 @@ __ai int32x4_t __noswap_vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2 #endif #ifdef __LITTLE_ENDIAN__ -#define vset_lane_f16(__p0_246, __p1_246, __p2_246) __extension__ ({ \ - float16_t __s0_246 = __p0_246; \ - float16x4_t __s1_246 = __p1_246; \ - float16x4_t __ret_246; \ -float16_t __reint_246 = __s0_246; \ -float16x4_t __reint1_246 = __s1_246; \ -int16x4_t __reint2_246 = vset_lane_s16(*(int16_t *) &__reint_246, *(int16x4_t *) &__reint1_246, __p2_246); \ - __ret_246 = *(float16x4_t *) &__reint2_246; \ - __ret_246; \ +#define vset_lane_f16(__p0_260, __p1_260, __p2_260) __extension__ ({ \ + float16_t __s0_260 = __p0_260; \ + float16x4_t __s1_260 = __p1_260; \ + float16x4_t __ret_260; \ +float16_t __reint_260 = __s0_260; \ +float16x4_t __reint1_260 = __s1_260; \ +int16x4_t __reint2_260 = vset_lane_s16(*(int16_t *) &__reint_260, *(int16x4_t *) &__reint1_260, __p2_260); \ + __ret_260 = *(float16x4_t *) &__reint2_260; \ + __ret_260; \ }) #else -#define vset_lane_f16(__p0_247, __p1_247, __p2_247) __extension__ ({ \ - float16_t __s0_247 = __p0_247; \ - float16x4_t __s1_247 = __p1_247; \ - float16x4_t __rev1_247; __rev1_247 = __builtin_shufflevector(__s1_247, __s1_247, 3, 2, 1, 0); \ - float16x4_t __ret_247; \ -float16_t __reint_247 = __s0_247; \ -float16x4_t __reint1_247 = __rev1_247; \ -int16x4_t 
__reint2_247 = __noswap_vset_lane_s16(*(int16_t *) &__reint_247, *(int16x4_t *) &__reint1_247, __p2_247); \ - __ret_247 = *(float16x4_t *) &__reint2_247; \ - __ret_247 = __builtin_shufflevector(__ret_247, __ret_247, 3, 2, 1, 0); \ - __ret_247; \ +#define vset_lane_f16(__p0_261, __p1_261, __p2_261) __extension__ ({ \ + float16_t __s0_261 = __p0_261; \ + float16x4_t __s1_261 = __p1_261; \ + float16x4_t __rev1_261; __rev1_261 = __builtin_shufflevector(__s1_261, __s1_261, 3, 2, 1, 0); \ + float16x4_t __ret_261; \ +float16_t __reint_261 = __s0_261; \ +float16x4_t __reint1_261 = __rev1_261; \ +int16x4_t __reint2_261 = __noswap_vset_lane_s16(*(int16_t *) &__reint_261, *(int16x4_t *) &__reint1_261, __p2_261); \ + __ret_261 = *(float16x4_t *) &__reint2_261; \ + __ret_261 = __builtin_shufflevector(__ret_261, __ret_261, 3, 2, 1, 0); \ + __ret_261; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vsetq_lane_f16(__p0_248, __p1_248, __p2_248) __extension__ ({ \ - float16_t __s0_248 = __p0_248; \ - float16x8_t __s1_248 = __p1_248; \ - float16x8_t __ret_248; \ -float16_t __reint_248 = __s0_248; \ -float16x8_t __reint1_248 = __s1_248; \ -int16x8_t __reint2_248 = vsetq_lane_s16(*(int16_t *) &__reint_248, *(int16x8_t *) &__reint1_248, __p2_248); \ - __ret_248 = *(float16x8_t *) &__reint2_248; \ - __ret_248; \ +#define vsetq_lane_f16(__p0_262, __p1_262, __p2_262) __extension__ ({ \ + float16_t __s0_262 = __p0_262; \ + float16x8_t __s1_262 = __p1_262; \ + float16x8_t __ret_262; \ +float16_t __reint_262 = __s0_262; \ +float16x8_t __reint1_262 = __s1_262; \ +int16x8_t __reint2_262 = vsetq_lane_s16(*(int16_t *) &__reint_262, *(int16x8_t *) &__reint1_262, __p2_262); \ + __ret_262 = *(float16x8_t *) &__reint2_262; \ + __ret_262; \ }) #else -#define vsetq_lane_f16(__p0_249, __p1_249, __p2_249) __extension__ ({ \ - float16_t __s0_249 = __p0_249; \ - float16x8_t __s1_249 = __p1_249; \ - float16x8_t __rev1_249; __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __ret_249; \ -float16_t __reint_249 = __s0_249; \ -float16x8_t __reint1_249 = __rev1_249; \ -int16x8_t __reint2_249 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_249, *(int16x8_t *) &__reint1_249, __p2_249); \ - __ret_249 = *(float16x8_t *) &__reint2_249; \ - __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_249; \ +#define vsetq_lane_f16(__p0_263, __p1_263, __p2_263) __extension__ ({ \ + float16_t __s0_263 = __p0_263; \ + float16x8_t __s1_263 = __p1_263; \ + float16x8_t __rev1_263; __rev1_263 = __builtin_shufflevector(__s1_263, __s1_263, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_263; \ +float16_t __reint_263 = __s0_263; \ +float16x8_t __reint1_263 = __rev1_263; \ +int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(int16x8_t *) &__reint1_263, __p2_263); \ + __ret_263 = *(float16x8_t *) &__reint2_263; \ + __ret_263 = __builtin_shufflevector(__ret_263, __ret_263, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_263; \ }) #endif +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__) +#ifdef __LITTLE_ENDIAN__ +#define vmulh_lane_f16(__p0_264, __p1_264, __p2_264) __extension__ ({ \ + float16_t __s0_264 = __p0_264; \ + float16x4_t __s1_264 = __p1_264; \ + float16_t __ret_264; \ + __ret_264 = __s0_264 * vget_lane_f16(__s1_264, __p2_264); \ + __ret_264; \ +}) +#else +#define vmulh_lane_f16(__p0_265, __p1_265, __p2_265) __extension__ ({ \ + float16_t __s0_265 = __p0_265; \ + float16x4_t __s1_265 = __p1_265; \ + float16x4_t __rev1_265; __rev1_265 
= __builtin_shufflevector(__s1_265, __s1_265, 3, 2, 1, 0); \ + float16_t __ret_265; \ + __ret_265 = __s0_265 * __noswap_vget_lane_f16(__rev1_265, __p2_265); \ + __ret_265; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulh_laneq_f16(__p0_266, __p1_266, __p2_266) __extension__ ({ \ + float16_t __s0_266 = __p0_266; \ + float16x8_t __s1_266 = __p1_266; \ + float16_t __ret_266; \ + __ret_266 = __s0_266 * vgetq_lane_f16(__s1_266, __p2_266); \ + __ret_266; \ +}) +#else +#define vmulh_laneq_f16(__p0_267, __p1_267, __p2_267) __extension__ ({ \ + float16_t __s0_267 = __p0_267; \ + float16x8_t __s1_267 = __p1_267; \ + float16x8_t __rev1_267; __rev1_267 = __builtin_shufflevector(__s1_267, __s1_267, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_267; \ + __ret_267 = __s0_267 * __noswap_vgetq_lane_f16(__rev1_267, __p2_267); \ + __ret_267; \ +}) +#endif + +#endif #if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ __ai int32_t vqrdmlahs_s32(int32_t __p0, int32_t __p1, int32_t __p2) { @@ -67916,86 +71284,86 @@ __ai int16_t vqrdmlahh_s16(int16_t __p0, int16_t __p1, int16_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahs_lane_s32(__p0_250, __p1_250, __p2_250, __p3_250) __extension__ ({ \ - int32_t __s0_250 = __p0_250; \ - int32_t __s1_250 = __p1_250; \ - int32x2_t __s2_250 = __p2_250; \ - int32_t __ret_250; \ - __ret_250 = vqadds_s32(__s0_250, vqrdmulhs_s32(__s1_250, vget_lane_s32(__s2_250, __p3_250))); \ - __ret_250; \ +#define vqrdmlahs_lane_s32(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ + int32_t __s0_268 = __p0_268; \ + int32_t __s1_268 = __p1_268; \ + int32x2_t __s2_268 = __p2_268; \ + int32_t __ret_268; \ + __ret_268 = vqadds_s32(__s0_268, vqrdmulhs_s32(__s1_268, vget_lane_s32(__s2_268, __p3_268))); \ + __ret_268; \ }) #else -#define vqrdmlahs_lane_s32(__p0_251, __p1_251, __p2_251, __p3_251) __extension__ ({ \ - int32_t __s0_251 = __p0_251; \ - int32_t __s1_251 = __p1_251; \ - int32x2_t __s2_251 = __p2_251; \ - int32x2_t __rev2_251; __rev2_251 = __builtin_shufflevector(__s2_251, __s2_251, 1, 0); \ - int32_t __ret_251; \ - __ret_251 = __noswap_vqadds_s32(__s0_251, __noswap_vqrdmulhs_s32(__s1_251, __noswap_vget_lane_s32(__rev2_251, __p3_251))); \ - __ret_251; \ +#define vqrdmlahs_lane_s32(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ + int32_t __s0_269 = __p0_269; \ + int32_t __s1_269 = __p1_269; \ + int32x2_t __s2_269 = __p2_269; \ + int32x2_t __rev2_269; __rev2_269 = __builtin_shufflevector(__s2_269, __s2_269, 1, 0); \ + int32_t __ret_269; \ + __ret_269 = __noswap_vqadds_s32(__s0_269, __noswap_vqrdmulhs_s32(__s1_269, __noswap_vget_lane_s32(__rev2_269, __p3_269))); \ + __ret_269; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahh_lane_s16(__p0_252, __p1_252, __p2_252, __p3_252) __extension__ ({ \ - int16_t __s0_252 = __p0_252; \ - int16_t __s1_252 = __p1_252; \ - int16x4_t __s2_252 = __p2_252; \ - int16_t __ret_252; \ - __ret_252 = vqaddh_s16(__s0_252, vqrdmulhh_s16(__s1_252, vget_lane_s16(__s2_252, __p3_252))); \ - __ret_252; \ +#define vqrdmlahh_lane_s16(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ + int16_t __s0_270 = __p0_270; \ + int16_t __s1_270 = __p1_270; \ + int16x4_t __s2_270 = __p2_270; \ + int16_t __ret_270; \ + __ret_270 = vqaddh_s16(__s0_270, vqrdmulhh_s16(__s1_270, vget_lane_s16(__s2_270, __p3_270))); \ + __ret_270; \ }) #else -#define vqrdmlahh_lane_s16(__p0_253, __p1_253, __p2_253, __p3_253) __extension__ ({ \ - int16_t __s0_253 = __p0_253; \ - int16_t __s1_253 = __p1_253; \ - int16x4_t 
__s2_253 = __p2_253; \ - int16x4_t __rev2_253; __rev2_253 = __builtin_shufflevector(__s2_253, __s2_253, 3, 2, 1, 0); \ - int16_t __ret_253; \ - __ret_253 = __noswap_vqaddh_s16(__s0_253, __noswap_vqrdmulhh_s16(__s1_253, __noswap_vget_lane_s16(__rev2_253, __p3_253))); \ - __ret_253; \ +#define vqrdmlahh_lane_s16(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ + int16_t __s0_271 = __p0_271; \ + int16_t __s1_271 = __p1_271; \ + int16x4_t __s2_271 = __p2_271; \ + int16x4_t __rev2_271; __rev2_271 = __builtin_shufflevector(__s2_271, __s2_271, 3, 2, 1, 0); \ + int16_t __ret_271; \ + __ret_271 = __noswap_vqaddh_s16(__s0_271, __noswap_vqrdmulhh_s16(__s1_271, __noswap_vget_lane_s16(__rev2_271, __p3_271))); \ + __ret_271; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahs_laneq_s32(__p0_254, __p1_254, __p2_254, __p3_254) __extension__ ({ \ - int32_t __s0_254 = __p0_254; \ - int32_t __s1_254 = __p1_254; \ - int32x4_t __s2_254 = __p2_254; \ - int32_t __ret_254; \ - __ret_254 = vqadds_s32(__s0_254, vqrdmulhs_s32(__s1_254, vgetq_lane_s32(__s2_254, __p3_254))); \ - __ret_254; \ +#define vqrdmlahs_laneq_s32(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ + int32_t __s0_272 = __p0_272; \ + int32_t __s1_272 = __p1_272; \ + int32x4_t __s2_272 = __p2_272; \ + int32_t __ret_272; \ + __ret_272 = vqadds_s32(__s0_272, vqrdmulhs_s32(__s1_272, vgetq_lane_s32(__s2_272, __p3_272))); \ + __ret_272; \ }) #else -#define vqrdmlahs_laneq_s32(__p0_255, __p1_255, __p2_255, __p3_255) __extension__ ({ \ - int32_t __s0_255 = __p0_255; \ - int32_t __s1_255 = __p1_255; \ - int32x4_t __s2_255 = __p2_255; \ - int32x4_t __rev2_255; __rev2_255 = __builtin_shufflevector(__s2_255, __s2_255, 3, 2, 1, 0); \ - int32_t __ret_255; \ - __ret_255 = __noswap_vqadds_s32(__s0_255, __noswap_vqrdmulhs_s32(__s1_255, __noswap_vgetq_lane_s32(__rev2_255, __p3_255))); \ - __ret_255; \ +#define vqrdmlahs_laneq_s32(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ + int32_t __s0_273 = __p0_273; \ + int32_t __s1_273 = __p1_273; \ + int32x4_t __s2_273 = __p2_273; \ + int32x4_t __rev2_273; __rev2_273 = __builtin_shufflevector(__s2_273, __s2_273, 3, 2, 1, 0); \ + int32_t __ret_273; \ + __ret_273 = __noswap_vqadds_s32(__s0_273, __noswap_vqrdmulhs_s32(__s1_273, __noswap_vgetq_lane_s32(__rev2_273, __p3_273))); \ + __ret_273; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahh_laneq_s16(__p0_256, __p1_256, __p2_256, __p3_256) __extension__ ({ \ - int16_t __s0_256 = __p0_256; \ - int16_t __s1_256 = __p1_256; \ - int16x8_t __s2_256 = __p2_256; \ - int16_t __ret_256; \ - __ret_256 = vqaddh_s16(__s0_256, vqrdmulhh_s16(__s1_256, vgetq_lane_s16(__s2_256, __p3_256))); \ - __ret_256; \ +#define vqrdmlahh_laneq_s16(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ + int16_t __s0_274 = __p0_274; \ + int16_t __s1_274 = __p1_274; \ + int16x8_t __s2_274 = __p2_274; \ + int16_t __ret_274; \ + __ret_274 = vqaddh_s16(__s0_274, vqrdmulhh_s16(__s1_274, vgetq_lane_s16(__s2_274, __p3_274))); \ + __ret_274; \ }) #else -#define vqrdmlahh_laneq_s16(__p0_257, __p1_257, __p2_257, __p3_257) __extension__ ({ \ - int16_t __s0_257 = __p0_257; \ - int16_t __s1_257 = __p1_257; \ - int16x8_t __s2_257 = __p2_257; \ - int16x8_t __rev2_257; __rev2_257 = __builtin_shufflevector(__s2_257, __s2_257, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_257; \ - __ret_257 = __noswap_vqaddh_s16(__s0_257, __noswap_vqrdmulhh_s16(__s1_257, __noswap_vgetq_lane_s16(__rev2_257, __p3_257))); \ - __ret_257; \ +#define vqrdmlahh_laneq_s16(__p0_275, __p1_275, __p2_275, 
__p3_275) __extension__ ({ \ + int16_t __s0_275 = __p0_275; \ + int16_t __s1_275 = __p1_275; \ + int16x8_t __s2_275 = __p2_275; \ + int16x8_t __rev2_275; __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_275; \ + __ret_275 = __noswap_vqaddh_s16(__s0_275, __noswap_vqrdmulhh_s16(__s1_275, __noswap_vgetq_lane_s16(__rev2_275, __p3_275))); \ + __ret_275; \ }) #endif @@ -68028,86 +71396,86 @@ __ai int16_t vqrdmlshh_s16(int16_t __p0, int16_t __p1, int16_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshs_lane_s32(__p0_258, __p1_258, __p2_258, __p3_258) __extension__ ({ \ - int32_t __s0_258 = __p0_258; \ - int32_t __s1_258 = __p1_258; \ - int32x2_t __s2_258 = __p2_258; \ - int32_t __ret_258; \ - __ret_258 = vqsubs_s32(__s0_258, vqrdmulhs_s32(__s1_258, vget_lane_s32(__s2_258, __p3_258))); \ - __ret_258; \ +#define vqrdmlshs_lane_s32(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ + int32_t __s0_276 = __p0_276; \ + int32_t __s1_276 = __p1_276; \ + int32x2_t __s2_276 = __p2_276; \ + int32_t __ret_276; \ + __ret_276 = vqsubs_s32(__s0_276, vqrdmulhs_s32(__s1_276, vget_lane_s32(__s2_276, __p3_276))); \ + __ret_276; \ }) #else -#define vqrdmlshs_lane_s32(__p0_259, __p1_259, __p2_259, __p3_259) __extension__ ({ \ - int32_t __s0_259 = __p0_259; \ - int32_t __s1_259 = __p1_259; \ - int32x2_t __s2_259 = __p2_259; \ - int32x2_t __rev2_259; __rev2_259 = __builtin_shufflevector(__s2_259, __s2_259, 1, 0); \ - int32_t __ret_259; \ - __ret_259 = __noswap_vqsubs_s32(__s0_259, __noswap_vqrdmulhs_s32(__s1_259, __noswap_vget_lane_s32(__rev2_259, __p3_259))); \ - __ret_259; \ +#define vqrdmlshs_lane_s32(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ + int32_t __s0_277 = __p0_277; \ + int32_t __s1_277 = __p1_277; \ + int32x2_t __s2_277 = __p2_277; \ + int32x2_t __rev2_277; __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 1, 0); \ + int32_t __ret_277; \ + __ret_277 = __noswap_vqsubs_s32(__s0_277, __noswap_vqrdmulhs_s32(__s1_277, __noswap_vget_lane_s32(__rev2_277, __p3_277))); \ + __ret_277; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshh_lane_s16(__p0_260, __p1_260, __p2_260, __p3_260) __extension__ ({ \ - int16_t __s0_260 = __p0_260; \ - int16_t __s1_260 = __p1_260; \ - int16x4_t __s2_260 = __p2_260; \ - int16_t __ret_260; \ - __ret_260 = vqsubh_s16(__s0_260, vqrdmulhh_s16(__s1_260, vget_lane_s16(__s2_260, __p3_260))); \ - __ret_260; \ +#define vqrdmlshh_lane_s16(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ + int16_t __s0_278 = __p0_278; \ + int16_t __s1_278 = __p1_278; \ + int16x4_t __s2_278 = __p2_278; \ + int16_t __ret_278; \ + __ret_278 = vqsubh_s16(__s0_278, vqrdmulhh_s16(__s1_278, vget_lane_s16(__s2_278, __p3_278))); \ + __ret_278; \ }) #else -#define vqrdmlshh_lane_s16(__p0_261, __p1_261, __p2_261, __p3_261) __extension__ ({ \ - int16_t __s0_261 = __p0_261; \ - int16_t __s1_261 = __p1_261; \ - int16x4_t __s2_261 = __p2_261; \ - int16x4_t __rev2_261; __rev2_261 = __builtin_shufflevector(__s2_261, __s2_261, 3, 2, 1, 0); \ - int16_t __ret_261; \ - __ret_261 = __noswap_vqsubh_s16(__s0_261, __noswap_vqrdmulhh_s16(__s1_261, __noswap_vget_lane_s16(__rev2_261, __p3_261))); \ - __ret_261; \ +#define vqrdmlshh_lane_s16(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ + int16_t __s0_279 = __p0_279; \ + int16_t __s1_279 = __p1_279; \ + int16x4_t __s2_279 = __p2_279; \ + int16x4_t __rev2_279; __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 3, 2, 1, 0); \ + int16_t __ret_279; \ + __ret_279 = 
__noswap_vqsubh_s16(__s0_279, __noswap_vqrdmulhh_s16(__s1_279, __noswap_vget_lane_s16(__rev2_279, __p3_279))); \ + __ret_279; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshs_laneq_s32(__p0_262, __p1_262, __p2_262, __p3_262) __extension__ ({ \ - int32_t __s0_262 = __p0_262; \ - int32_t __s1_262 = __p1_262; \ - int32x4_t __s2_262 = __p2_262; \ - int32_t __ret_262; \ - __ret_262 = vqsubs_s32(__s0_262, vqrdmulhs_s32(__s1_262, vgetq_lane_s32(__s2_262, __p3_262))); \ - __ret_262; \ +#define vqrdmlshs_laneq_s32(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ + int32_t __s0_280 = __p0_280; \ + int32_t __s1_280 = __p1_280; \ + int32x4_t __s2_280 = __p2_280; \ + int32_t __ret_280; \ + __ret_280 = vqsubs_s32(__s0_280, vqrdmulhs_s32(__s1_280, vgetq_lane_s32(__s2_280, __p3_280))); \ + __ret_280; \ }) #else -#define vqrdmlshs_laneq_s32(__p0_263, __p1_263, __p2_263, __p3_263) __extension__ ({ \ - int32_t __s0_263 = __p0_263; \ - int32_t __s1_263 = __p1_263; \ - int32x4_t __s2_263 = __p2_263; \ - int32x4_t __rev2_263; __rev2_263 = __builtin_shufflevector(__s2_263, __s2_263, 3, 2, 1, 0); \ - int32_t __ret_263; \ - __ret_263 = __noswap_vqsubs_s32(__s0_263, __noswap_vqrdmulhs_s32(__s1_263, __noswap_vgetq_lane_s32(__rev2_263, __p3_263))); \ - __ret_263; \ +#define vqrdmlshs_laneq_s32(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ + int32_t __s0_281 = __p0_281; \ + int32_t __s1_281 = __p1_281; \ + int32x4_t __s2_281 = __p2_281; \ + int32x4_t __rev2_281; __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 3, 2, 1, 0); \ + int32_t __ret_281; \ + __ret_281 = __noswap_vqsubs_s32(__s0_281, __noswap_vqrdmulhs_s32(__s1_281, __noswap_vgetq_lane_s32(__rev2_281, __p3_281))); \ + __ret_281; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshh_laneq_s16(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \ - int16_t __s0_264 = __p0_264; \ - int16_t __s1_264 = __p1_264; \ - int16x8_t __s2_264 = __p2_264; \ - int16_t __ret_264; \ - __ret_264 = vqsubh_s16(__s0_264, vqrdmulhh_s16(__s1_264, vgetq_lane_s16(__s2_264, __p3_264))); \ - __ret_264; \ +#define vqrdmlshh_laneq_s16(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \ + int16_t __s0_282 = __p0_282; \ + int16_t __s1_282 = __p1_282; \ + int16x8_t __s2_282 = __p2_282; \ + int16_t __ret_282; \ + __ret_282 = vqsubh_s16(__s0_282, vqrdmulhh_s16(__s1_282, vgetq_lane_s16(__s2_282, __p3_282))); \ + __ret_282; \ }) #else -#define vqrdmlshh_laneq_s16(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \ - int16_t __s0_265 = __p0_265; \ - int16_t __s1_265 = __p1_265; \ - int16x8_t __s2_265 = __p2_265; \ - int16x8_t __rev2_265; __rev2_265 = __builtin_shufflevector(__s2_265, __s2_265, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_265; \ - __ret_265 = __noswap_vqsubh_s16(__s0_265, __noswap_vqrdmulhh_s16(__s1_265, __noswap_vgetq_lane_s16(__rev2_265, __p3_265))); \ - __ret_265; \ +#define vqrdmlshh_laneq_s16(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \ + int16_t __s0_283 = __p0_283; \ + int16_t __s1_283 = __p1_283; \ + int16x8_t __s2_283 = __p2_283; \ + int16x8_t __rev2_283; __rev2_283 = __builtin_shufflevector(__s2_283, __s2_283, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_283; \ + __ret_283 = __noswap_vqsubh_s16(__s0_283, __noswap_vqrdmulhh_s16(__s1_283, __noswap_vgetq_lane_s16(__rev2_283, __p3_283))); \ + __ret_283; \ }) #endif @@ -68420,158 +71788,158 @@ __ai int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p64(__p0_266, __p1_266, __p2_266, __p3_266) 
__extension__ ({ \ - poly64x2_t __s0_266 = __p0_266; \ - poly64x1_t __s2_266 = __p2_266; \ - poly64x2_t __ret_266; \ - __ret_266 = vsetq_lane_p64(vget_lane_p64(__s2_266, __p3_266), __s0_266, __p1_266); \ - __ret_266; \ +#define vcopyq_lane_p64(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \ + poly64x2_t __s0_284 = __p0_284; \ + poly64x1_t __s2_284 = __p2_284; \ + poly64x2_t __ret_284; \ + __ret_284 = vsetq_lane_p64(vget_lane_p64(__s2_284, __p3_284), __s0_284, __p1_284); \ + __ret_284; \ }) #else -#define vcopyq_lane_p64(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ \ - poly64x2_t __s0_267 = __p0_267; \ - poly64x1_t __s2_267 = __p2_267; \ - poly64x2_t __rev0_267; __rev0_267 = __builtin_shufflevector(__s0_267, __s0_267, 1, 0); \ - poly64x2_t __ret_267; \ - __ret_267 = __noswap_vsetq_lane_p64(__noswap_vget_lane_p64(__s2_267, __p3_267), __rev0_267, __p1_267); \ - __ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 1, 0); \ - __ret_267; \ +#define vcopyq_lane_p64(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \ + poly64x2_t __s0_285 = __p0_285; \ + poly64x1_t __s2_285 = __p2_285; \ + poly64x2_t __rev0_285; __rev0_285 = __builtin_shufflevector(__s0_285, __s0_285, 1, 0); \ + poly64x2_t __ret_285; \ + __ret_285 = __noswap_vsetq_lane_p64(__noswap_vget_lane_p64(__s2_285, __p3_285), __rev0_285, __p1_285); \ + __ret_285 = __builtin_shufflevector(__ret_285, __ret_285, 1, 0); \ + __ret_285; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_f64(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ - float64x2_t __s0_268 = __p0_268; \ - float64x1_t __s2_268 = __p2_268; \ - float64x2_t __ret_268; \ - __ret_268 = vsetq_lane_f64(vget_lane_f64(__s2_268, __p3_268), __s0_268, __p1_268); \ - __ret_268; \ +#define vcopyq_lane_f64(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \ + float64x2_t __s0_286 = __p0_286; \ + float64x1_t __s2_286 = __p2_286; \ + float64x2_t __ret_286; \ + __ret_286 = vsetq_lane_f64(vget_lane_f64(__s2_286, __p3_286), __s0_286, __p1_286); \ + __ret_286; \ }) #else -#define vcopyq_lane_f64(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ - float64x2_t __s0_269 = __p0_269; \ - float64x1_t __s2_269 = __p2_269; \ - float64x2_t __rev0_269; __rev0_269 = __builtin_shufflevector(__s0_269, __s0_269, 1, 0); \ - float64x2_t __ret_269; \ - __ret_269 = __noswap_vsetq_lane_f64(__noswap_vget_lane_f64(__s2_269, __p3_269), __rev0_269, __p1_269); \ - __ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 1, 0); \ - __ret_269; \ +#define vcopyq_lane_f64(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \ + float64x2_t __s0_287 = __p0_287; \ + float64x1_t __s2_287 = __p2_287; \ + float64x2_t __rev0_287; __rev0_287 = __builtin_shufflevector(__s0_287, __s0_287, 1, 0); \ + float64x2_t __ret_287; \ + __ret_287 = __noswap_vsetq_lane_f64(__noswap_vget_lane_f64(__s2_287, __p3_287), __rev0_287, __p1_287); \ + __ret_287 = __builtin_shufflevector(__ret_287, __ret_287, 1, 0); \ + __ret_287; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_p64(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ - poly64x1_t __s0_270 = __p0_270; \ - poly64x1_t __s2_270 = __p2_270; \ - poly64x1_t __ret_270; \ - __ret_270 = vset_lane_p64(vget_lane_p64(__s2_270, __p3_270), __s0_270, __p1_270); \ - __ret_270; \ +#define vcopy_lane_p64(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \ + poly64x1_t __s0_288 = __p0_288; \ + poly64x1_t __s2_288 = __p2_288; \ + poly64x1_t __ret_288; \ + __ret_288 = vset_lane_p64(vget_lane_p64(__s2_288, __p3_288), 
__s0_288, __p1_288); \ + __ret_288; \ }) #else -#define vcopy_lane_p64(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ - poly64x1_t __s0_271 = __p0_271; \ - poly64x1_t __s2_271 = __p2_271; \ - poly64x1_t __ret_271; \ - __ret_271 = __noswap_vset_lane_p64(__noswap_vget_lane_p64(__s2_271, __p3_271), __s0_271, __p1_271); \ - __ret_271; \ +#define vcopy_lane_p64(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \ + poly64x1_t __s0_289 = __p0_289; \ + poly64x1_t __s2_289 = __p2_289; \ + poly64x1_t __ret_289; \ + __ret_289 = __noswap_vset_lane_p64(__noswap_vget_lane_p64(__s2_289, __p3_289), __s0_289, __p1_289); \ + __ret_289; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_f64(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ - float64x1_t __s0_272 = __p0_272; \ - float64x1_t __s2_272 = __p2_272; \ - float64x1_t __ret_272; \ - __ret_272 = vset_lane_f64(vget_lane_f64(__s2_272, __p3_272), __s0_272, __p1_272); \ - __ret_272; \ +#define vcopy_lane_f64(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \ + float64x1_t __s0_290 = __p0_290; \ + float64x1_t __s2_290 = __p2_290; \ + float64x1_t __ret_290; \ + __ret_290 = vset_lane_f64(vget_lane_f64(__s2_290, __p3_290), __s0_290, __p1_290); \ + __ret_290; \ }) #else -#define vcopy_lane_f64(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ - float64x1_t __s0_273 = __p0_273; \ - float64x1_t __s2_273 = __p2_273; \ - float64x1_t __ret_273; \ - __ret_273 = __noswap_vset_lane_f64(__noswap_vget_lane_f64(__s2_273, __p3_273), __s0_273, __p1_273); \ - __ret_273; \ +#define vcopy_lane_f64(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \ + float64x1_t __s0_291 = __p0_291; \ + float64x1_t __s2_291 = __p2_291; \ + float64x1_t __ret_291; \ + __ret_291 = __noswap_vset_lane_f64(__noswap_vget_lane_f64(__s2_291, __p3_291), __s0_291, __p1_291); \ + __ret_291; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p64(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ - poly64x2_t __s0_274 = __p0_274; \ - poly64x2_t __s2_274 = __p2_274; \ - poly64x2_t __ret_274; \ - __ret_274 = vsetq_lane_p64(vgetq_lane_p64(__s2_274, __p3_274), __s0_274, __p1_274); \ - __ret_274; \ +#define vcopyq_laneq_p64(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \ + poly64x2_t __s0_292 = __p0_292; \ + poly64x2_t __s2_292 = __p2_292; \ + poly64x2_t __ret_292; \ + __ret_292 = vsetq_lane_p64(vgetq_lane_p64(__s2_292, __p3_292), __s0_292, __p1_292); \ + __ret_292; \ }) #else -#define vcopyq_laneq_p64(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ - poly64x2_t __s0_275 = __p0_275; \ - poly64x2_t __s2_275 = __p2_275; \ - poly64x2_t __rev0_275; __rev0_275 = __builtin_shufflevector(__s0_275, __s0_275, 1, 0); \ - poly64x2_t __rev2_275; __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 1, 0); \ - poly64x2_t __ret_275; \ - __ret_275 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_275, __p3_275), __rev0_275, __p1_275); \ - __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 1, 0); \ - __ret_275; \ +#define vcopyq_laneq_p64(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \ + poly64x2_t __s0_293 = __p0_293; \ + poly64x2_t __s2_293 = __p2_293; \ + poly64x2_t __rev0_293; __rev0_293 = __builtin_shufflevector(__s0_293, __s0_293, 1, 0); \ + poly64x2_t __rev2_293; __rev2_293 = __builtin_shufflevector(__s2_293, __s2_293, 1, 0); \ + poly64x2_t __ret_293; \ + __ret_293 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_293, __p3_293), __rev0_293, __p1_293); \ + __ret_293 = 
__builtin_shufflevector(__ret_293, __ret_293, 1, 0); \ + __ret_293; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_f64(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ - float64x2_t __s0_276 = __p0_276; \ - float64x2_t __s2_276 = __p2_276; \ - float64x2_t __ret_276; \ - __ret_276 = vsetq_lane_f64(vgetq_lane_f64(__s2_276, __p3_276), __s0_276, __p1_276); \ - __ret_276; \ +#define vcopyq_laneq_f64(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \ + float64x2_t __s0_294 = __p0_294; \ + float64x2_t __s2_294 = __p2_294; \ + float64x2_t __ret_294; \ + __ret_294 = vsetq_lane_f64(vgetq_lane_f64(__s2_294, __p3_294), __s0_294, __p1_294); \ + __ret_294; \ }) #else -#define vcopyq_laneq_f64(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ - float64x2_t __s0_277 = __p0_277; \ - float64x2_t __s2_277 = __p2_277; \ - float64x2_t __rev0_277; __rev0_277 = __builtin_shufflevector(__s0_277, __s0_277, 1, 0); \ - float64x2_t __rev2_277; __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 1, 0); \ - float64x2_t __ret_277; \ - __ret_277 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_277, __p3_277), __rev0_277, __p1_277); \ - __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 1, 0); \ - __ret_277; \ +#define vcopyq_laneq_f64(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \ + float64x2_t __s0_295 = __p0_295; \ + float64x2_t __s2_295 = __p2_295; \ + float64x2_t __rev0_295; __rev0_295 = __builtin_shufflevector(__s0_295, __s0_295, 1, 0); \ + float64x2_t __rev2_295; __rev2_295 = __builtin_shufflevector(__s2_295, __s2_295, 1, 0); \ + float64x2_t __ret_295; \ + __ret_295 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_295, __p3_295), __rev0_295, __p1_295); \ + __ret_295 = __builtin_shufflevector(__ret_295, __ret_295, 1, 0); \ + __ret_295; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p64(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ - poly64x1_t __s0_278 = __p0_278; \ - poly64x2_t __s2_278 = __p2_278; \ - poly64x1_t __ret_278; \ - __ret_278 = vset_lane_p64(vgetq_lane_p64(__s2_278, __p3_278), __s0_278, __p1_278); \ - __ret_278; \ +#define vcopy_laneq_p64(__p0_296, __p1_296, __p2_296, __p3_296) __extension__ ({ \ + poly64x1_t __s0_296 = __p0_296; \ + poly64x2_t __s2_296 = __p2_296; \ + poly64x1_t __ret_296; \ + __ret_296 = vset_lane_p64(vgetq_lane_p64(__s2_296, __p3_296), __s0_296, __p1_296); \ + __ret_296; \ }) #else -#define vcopy_laneq_p64(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ - poly64x1_t __s0_279 = __p0_279; \ - poly64x2_t __s2_279 = __p2_279; \ - poly64x2_t __rev2_279; __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 1, 0); \ - poly64x1_t __ret_279; \ - __ret_279 = __noswap_vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_279, __p3_279), __s0_279, __p1_279); \ - __ret_279; \ +#define vcopy_laneq_p64(__p0_297, __p1_297, __p2_297, __p3_297) __extension__ ({ \ + poly64x1_t __s0_297 = __p0_297; \ + poly64x2_t __s2_297 = __p2_297; \ + poly64x2_t __rev2_297; __rev2_297 = __builtin_shufflevector(__s2_297, __s2_297, 1, 0); \ + poly64x1_t __ret_297; \ + __ret_297 = __noswap_vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_297, __p3_297), __s0_297, __p1_297); \ + __ret_297; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_f64(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ - float64x1_t __s0_280 = __p0_280; \ - float64x2_t __s2_280 = __p2_280; \ - float64x1_t __ret_280; \ - __ret_280 = vset_lane_f64(vgetq_lane_f64(__s2_280, __p3_280), __s0_280, __p1_280); \ - __ret_280; \ +#define 
vcopy_laneq_f64(__p0_298, __p1_298, __p2_298, __p3_298) __extension__ ({ \ + float64x1_t __s0_298 = __p0_298; \ + float64x2_t __s2_298 = __p2_298; \ + float64x1_t __ret_298; \ + __ret_298 = vset_lane_f64(vgetq_lane_f64(__s2_298, __p3_298), __s0_298, __p1_298); \ + __ret_298; \ }) #else -#define vcopy_laneq_f64(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ - float64x1_t __s0_281 = __p0_281; \ - float64x2_t __s2_281 = __p2_281; \ - float64x2_t __rev2_281; __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 1, 0); \ - float64x1_t __ret_281; \ - __ret_281 = __noswap_vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_281, __p3_281), __s0_281, __p1_281); \ - __ret_281; \ +#define vcopy_laneq_f64(__p0_299, __p1_299, __p2_299, __p3_299) __extension__ ({ \ + float64x1_t __s0_299 = __p0_299; \ + float64x2_t __s2_299 = __p2_299; \ + float64x2_t __rev2_299; __rev2_299 = __builtin_shufflevector(__s2_299, __s2_299, 1, 0); \ + float64x1_t __ret_299; \ + __ret_299 = __noswap_vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_299, __p3_299), __s0_299, __p1_299); \ + __ret_299; \ }) #endif @@ -68928,51 +72296,51 @@ __ai int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulx_lane_f64(__p0_282, __p1_282, __p2_282) __extension__ ({ \ - float64x1_t __s0_282 = __p0_282; \ - float64x1_t __s1_282 = __p1_282; \ - float64x1_t __ret_282; \ - float64_t __x_282 = vget_lane_f64(__s0_282, 0); \ - float64_t __y_282 = vget_lane_f64(__s1_282, __p2_282); \ - float64_t __z_282 = vmulxd_f64(__x_282, __y_282); \ - __ret_282 = vset_lane_f64(__z_282, __s0_282, __p2_282); \ - __ret_282; \ +#define vmulx_lane_f64(__p0_300, __p1_300, __p2_300) __extension__ ({ \ + float64x1_t __s0_300 = __p0_300; \ + float64x1_t __s1_300 = __p1_300; \ + float64x1_t __ret_300; \ + float64_t __x_300 = vget_lane_f64(__s0_300, 0); \ + float64_t __y_300 = vget_lane_f64(__s1_300, __p2_300); \ + float64_t __z_300 = vmulxd_f64(__x_300, __y_300); \ + __ret_300 = vset_lane_f64(__z_300, __s0_300, __p2_300); \ + __ret_300; \ }) #else -#define vmulx_lane_f64(__p0_283, __p1_283, __p2_283) __extension__ ({ \ - float64x1_t __s0_283 = __p0_283; \ - float64x1_t __s1_283 = __p1_283; \ - float64x1_t __ret_283; \ - float64_t __x_283 = __noswap_vget_lane_f64(__s0_283, 0); \ - float64_t __y_283 = __noswap_vget_lane_f64(__s1_283, __p2_283); \ - float64_t __z_283 = __noswap_vmulxd_f64(__x_283, __y_283); \ - __ret_283 = __noswap_vset_lane_f64(__z_283, __s0_283, __p2_283); \ - __ret_283; \ +#define vmulx_lane_f64(__p0_301, __p1_301, __p2_301) __extension__ ({ \ + float64x1_t __s0_301 = __p0_301; \ + float64x1_t __s1_301 = __p1_301; \ + float64x1_t __ret_301; \ + float64_t __x_301 = __noswap_vget_lane_f64(__s0_301, 0); \ + float64_t __y_301 = __noswap_vget_lane_f64(__s1_301, __p2_301); \ + float64_t __z_301 = __noswap_vmulxd_f64(__x_301, __y_301); \ + __ret_301 = __noswap_vset_lane_f64(__z_301, __s0_301, __p2_301); \ + __ret_301; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulx_laneq_f64(__p0_284, __p1_284, __p2_284) __extension__ ({ \ - float64x1_t __s0_284 = __p0_284; \ - float64x2_t __s1_284 = __p1_284; \ - float64x1_t __ret_284; \ - float64_t __x_284 = vget_lane_f64(__s0_284, 0); \ - float64_t __y_284 = vgetq_lane_f64(__s1_284, __p2_284); \ - float64_t __z_284 = vmulxd_f64(__x_284, __y_284); \ - __ret_284 = vset_lane_f64(__z_284, __s0_284, 0); \ - __ret_284; \ +#define vmulx_laneq_f64(__p0_302, __p1_302, __p2_302) __extension__ ({ \ + float64x1_t __s0_302 = __p0_302; \ + float64x2_t __s1_302 = 
__p1_302; \ + float64x1_t __ret_302; \ + float64_t __x_302 = vget_lane_f64(__s0_302, 0); \ + float64_t __y_302 = vgetq_lane_f64(__s1_302, __p2_302); \ + float64_t __z_302 = vmulxd_f64(__x_302, __y_302); \ + __ret_302 = vset_lane_f64(__z_302, __s0_302, 0); \ + __ret_302; \ }) #else -#define vmulx_laneq_f64(__p0_285, __p1_285, __p2_285) __extension__ ({ \ - float64x1_t __s0_285 = __p0_285; \ - float64x2_t __s1_285 = __p1_285; \ - float64x2_t __rev1_285; __rev1_285 = __builtin_shufflevector(__s1_285, __s1_285, 1, 0); \ - float64x1_t __ret_285; \ - float64_t __x_285 = __noswap_vget_lane_f64(__s0_285, 0); \ - float64_t __y_285 = __noswap_vgetq_lane_f64(__rev1_285, __p2_285); \ - float64_t __z_285 = __noswap_vmulxd_f64(__x_285, __y_285); \ - __ret_285 = __noswap_vset_lane_f64(__z_285, __s0_285, 0); \ - __ret_285; \ +#define vmulx_laneq_f64(__p0_303, __p1_303, __p2_303) __extension__ ({ \ + float64x1_t __s0_303 = __p0_303; \ + float64x2_t __s1_303 = __p1_303; \ + float64x2_t __rev1_303; __rev1_303 = __builtin_shufflevector(__s1_303, __s1_303, 1, 0); \ + float64x1_t __ret_303; \ + float64_t __x_303 = __noswap_vget_lane_f64(__s0_303, 0); \ + float64_t __y_303 = __noswap_vgetq_lane_f64(__rev1_303, __p2_303); \ + float64_t __z_303 = __noswap_vmulxd_f64(__x_303, __y_303); \ + __ret_303 = __noswap_vset_lane_f64(__z_303, __s0_303, 0); \ + __ret_303; \ }) #endif diff --git a/c_headers/avx2intrin.h b/c_headers/avx2intrin.h index 576f761b25..caf4ced920 100644 --- a/c_headers/avx2intrin.h +++ b/c_headers/avx2intrin.h @@ -145,13 +145,21 @@ _mm256_andnot_si256(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_avg_epu8(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); + typedef unsigned short __v32hu __attribute__((__vector_size__(64))); + return (__m256i)__builtin_convertvector( + ((__builtin_convertvector((__v32qu)__a, __v32hu) + + __builtin_convertvector((__v32qu)__b, __v32hu)) + 1) + >> 1, __v32qu); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_avg_epu16(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); + typedef unsigned int __v16su __attribute__((__vector_size__(64))); + return (__m256i)__builtin_convertvector( + ((__builtin_convertvector((__v16hu)__a, __v16su) + + __builtin_convertvector((__v16hu)__b, __v16su)) + 1) + >> 1, __v16hu); } static __inline__ __m256i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512bitalgintrin.h b/c_headers/avx512bitalgintrin.h new file mode 100644 index 0000000000..2dd1471d2f --- /dev/null +++ b/c_headers/avx512bitalgintrin.h @@ -0,0 +1,97 @@ +/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512BITALGINTRIN_H +#define __AVX512BITALGINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_popcnt_epi16(__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U, + (__v32hi) _mm512_popcnt_epi16(__B), + (__v32hi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) +{ + return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(), + __U, + __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_popcnt_epi8(__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U, + (__v64qi) _mm512_popcnt_epi8(__B), + (__v64qi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) +{ + return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(), + __U, + __B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A, + (__v64qi) __B, + __U); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) +{ + return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1, + __A, + __B); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512bwintrin.h b/c_headers/avx512bwintrin.h index 41958b7214..3ff0e3aafd 100644 --- a/c_headers/avx512bwintrin.h +++ b/c_headers/avx512bwintrin.h @@ -56,293 +56,145 @@ _mm512_setzero_hi(void) { /* Integer compare */ -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b, - (__mmask64)-1); -} +#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1); }) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b, - __u); -} +#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m)); }) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0, - (__mmask64)-1); -} +#define _mm512_cmp_epu8_mask(a, 
b, p) __extension__ ({ \ + (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1); }) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0, - __u); -} +#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m)); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b, - (__mmask32)-1); -} +#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b, - __u); -} +#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m)); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0, - (__mmask32)-1); -} +#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0, - __u); -} +#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m)); }) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, - (__mmask64)-1); -} +#define _mm512_cmpeq_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS 
-_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, - __u); -} +#define _mm512_cmpeq_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, - (__mmask64)-1); -} +#define _mm512_cmpeq_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, - (__mmask32)-1); -} - 
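Note on this hunk: the per-predicate inline functions being removed here (cmpeq/cmpge/cmpgt/cmple/cmplt/cmpneq over epi8/epu8/epi16/epu16) are replaced by the generic _mm512_cmp_*_mask macros that take an explicit _MM_CMPINT_* predicate, with the old names kept as thin #define wrappers. A minimal sketch of the resulting equivalence, assuming immintrin.h and an AVX-512BW target (the function name equal_bytes is illustrative only):

#include <immintrin.h>

/* Both calls yield the same 64-bit mask, one bit per byte lane where a == b:
 * the named wrapper simply forwards to the generic macro with _MM_CMPINT_EQ. */
static __mmask64 equal_bytes(__m512i a, __m512i b) {
    __mmask64 via_wrapper = _mm512_cmpeq_epi8_mask(a, b);
    __mmask64 via_generic = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
    return via_wrapper & via_generic;   /* identical masks, so this is just the mask */
}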
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, - __u); -} - -static 
__inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, - __u); -} +#define _mm512_cmpeq_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define 
_mm512_cmpge_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_add_epi8 (__m512i __A, __m512i __B) { @@ -706,57 +558,55 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_avg_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + typedef unsigned short __v64hu __attribute__((__vector_size__(128))); + return (__m512i)__builtin_convertvector( + ((__builtin_convertvector((__v64qu) __A, __v64hu) + + __builtin_convertvector((__v64qu) __B, __v64hu)) + 1) + >> 1, __v64qu); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_avg_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + typedef unsigned int __v32su __attribute__((__vector_size__(128))); + return (__m512i)__builtin_convertvector( + ((__builtin_convertvector((__v32hu) __A, __v32su) + + __builtin_convertvector((__v32hu) __B, __v32su)) + 1) + >> 1, __v32hu); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi) 
_mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1543,46 +1393,6 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) } -#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \ - (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1); }) - -#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ - (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)); }) - -#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \ - (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1); }) - -#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ - (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)); }) - -#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1); }) - -#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)); }) - -#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1); }) - -#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)); }) - #define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \ (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \ (__v32hi)_mm512_undefined_epi32(), \ @@ -2028,32 +1838,29 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) { - return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, - (__v64qi) __O, - __M); + return (__m512i) __builtin_ia32_selectb_512(__M, + (__v64qi)_mm512_set1_epi8(__A), + (__v64qi) __O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) { - return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, - (__v64qi) - _mm512_setzero_qi(), - __M); + return (__m512i) __builtin_ia32_selectb_512(__M, + (__v64qi) _mm512_set1_epi8(__A), + (__v64qi) _mm512_setzero_si512()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, - (__mmask64) __B); + return (__mmask64) (( __A & 0xFFFFFFFF) | ( __B << 32)); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, - (__mmask32) __B); +return (__mmask32) (( __A & 0xFFFF) | ( __B << 16)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -2108,61 +1915,56 @@ _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_test_epi8_mask (__m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return 
(__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, - (__v64qi) __B, __U); + return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_test_epi16_mask (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, - (__v32hi) __B, __U); + return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_testn_epi8_mask (__m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_qi()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, - (__v64qi) __B, __U); + return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_testn_epi16_mask (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, - (__v32hi) __B, __U); + return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS @@ -2219,17 +2021,17 @@ _mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) { - return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, - (__v32hi) __O, - __M); + return (__m512i) __builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_set1_epi16(__A), + (__v32hi) __O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_set1_epi16 (__mmask32 __M, short __A) { - return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, - (__v32hi) _mm512_setzero_hi(), - __M); + return (__m512i) __builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_set1_epi16(__A), + (__v32hi) _mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512cdintrin.h b/c_headers/avx512cdintrin.h index 23c423584a..ec7e0cd443 100644 --- a/c_headers/avx512cdintrin.h +++ b/c_headers/avx512cdintrin.h @@ -130,13 +130,14 @@ _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcastmb_epi64 (__mmask8 __A) { - return (__m512i) __builtin_ia32_broadcastmb512 (__A); + return (__m512i) _mm512_set1_epi64((long long) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcastmw_epi32 (__mmask16 __A) { - return (__m512i) __builtin_ia32_broadcastmw512 (__A); + return (__m512i) _mm512_set1_epi32((int) __A); + } #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512dqintrin.h b/c_headers/avx512dqintrin.h index 4fd1add773..2c431d9740 100644 --- 
a/c_headers/avx512dqintrin.h +++ b/c_headers/avx512dqintrin.h @@ -973,25 +973,26 @@ _mm512_movepi64_mask (__m512i __A) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_broadcast_f32x2 (__m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf)_mm512_undefined_ps(), - (__mmask16) -1); + return (__m512)__builtin_shufflevector((__v4sf)__A, + (__v4sf)_mm_undefined_ps(), + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf) - __O, __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x2(__A), + (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf)_mm512_setzero_ps (), - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x2(__A), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512 __DEFAULT_FN_ATTRS @@ -1044,25 +1045,26 @@ _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcast_i32x2 (__m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si)_mm512_setzero_si512(), - (__mmask16) -1); + return (__m512i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si) - __O, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si)_mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512fintrin.h b/c_headers/avx512fintrin.h index 4b66acc02f..d34f0b1327 100644 --- a/c_headers/avx512fintrin.h +++ b/c_headers/avx512fintrin.h @@ -258,25 +258,6 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) (__v8di) _mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_set1_epi32(__mmask16 __M, int __A) -{ - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, - (__v16si) - _mm512_setzero_si512 (), - __M); -} - -#ifdef __x86_64__ -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) -{ - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, - (__v8di) - _mm512_setzero_si512 (), - __M); -} -#endif static __inline __m512 __DEFAULT_FN_ATTRS _mm512_setzero_ps(void) @@ -335,12 +316,30 @@ _mm512_set1_epi32(int __s) __s, __s, __s, __s, __s, __s, __s, __s }; } +static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_set1_epi32(__mmask16 __M, int __A) +{ + return (__m512i)__builtin_ia32_selectd_512(__M, + (__v16si)_mm512_set1_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + static __inline __m512i __DEFAULT_FN_ATTRS _mm512_set1_epi64(long long 
__d) { return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; } +#ifdef __x86_64__ +static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_set1_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} +#endif + static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_broadcastss_ps(__m128 __A) { @@ -4544,37 +4543,6 @@ _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512()); } -/* Bit Test */ - -static __inline __mmask16 __DEFAULT_FN_ATTRS -_mm512_test_epi32_mask(__m512i __A, __m512i __B) -{ - return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, - (__v16si) __B, - (__mmask16) -1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, - (__v16si) __B, __U); -} - -static __inline __mmask8 __DEFAULT_FN_ATTRS -_mm512_test_epi64_mask(__m512i __A, __m512i __B) -{ - return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, - (__v8di) __B, - (__mmask8) -1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) -{ - return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U); -} - /* SIMD load ops */ @@ -4845,293 +4813,105 @@ _mm512_knot(__mmask16 __M) /* Integer compare */ -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b, - (__mmask16)-1); -} +#define _mm512_cmpeq_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__a, (__v16si)__b, - __u); -} +#define _mm512_cmpeq_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), 
_MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0, - (__mmask16)-1); -} +#define _mm512_cmpeq_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__a, (__v8di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 0, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 5, - __u); -} - 
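The removed dword/qword compare functions around this point hard-code the compare predicate as an immediate (0, 1, 2, 4, 5, 6); the replacement macros name the same values, with _MM_CMPINT_GE and _MM_CMPINT_GT being the headers' aliases for _MM_CMPINT_NLT (5) and _MM_CMPINT_NLE (6). A small equivalence sketch, assuming an AVX-512F target; the function name is illustrative only:

#include <immintrin.h>

/* Illustrative only: the old header body called
   __builtin_ia32_cmpd512_mask(..., 5, ...) directly; the macro form routes
   through _mm512_cmp_epi32_mask with the named predicate _MM_CMPINT_GE (== 5)
   and produces the same 16-bit lane mask. */
static inline __mmask16 dwords_ge_i32(__m512i a, __m512i b) {
    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_GE);
}
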
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__a, (__v16si)__b, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__a, (__v8di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 6, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS 
-_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__a, (__v16si)__b, 4, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) { - return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__a, (__v16si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epi64_mask(__m512i __a, 
__m512i __b) { - return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__a, (__v8di)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) { - return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__a, (__v8di)__b, 4, - __u); -} +#define _mm512_cmpeq_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvtepi8_epi32(__m128i __A) @@ -6798,35 +6578,6 @@ _mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I, (__mmask16) __U); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_testn_epi32_mask (__m512i __A, __m512i __B) -{ - return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, - (__v16si) __B, - (__mmask16) -1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, - (__v16si) __B, __U); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_testn_epi64_mask (__m512i __A, __m512i __B) -{ - return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, - (__v8di) __B, - (__mmask8) -1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) -{ - return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, - (__v8di) __B, __U); -} #define _mm512_cvtt_roundpd_epu32(A, R) __extension__ ({ \ (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ @@ -7195,76 +6946,100 @@ _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B) } #define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1); }) + (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + 0 + ((((imm) >> 0) & 0x3) * 4), \ + 1 + ((((imm) >> 0) & 0x3) * 4), \ + 2 + ((((imm) >> 0) 
& 0x3) * 4), \ + 3 + ((((imm) >> 0) & 0x3) * 4), \ + 0 + ((((imm) >> 2) & 0x3) * 4), \ + 1 + ((((imm) >> 2) & 0x3) * 4), \ + 2 + ((((imm) >> 2) & 0x3) * 4), \ + 3 + ((((imm) >> 2) & 0x3) * 4), \ + 16 + ((((imm) >> 4) & 0x3) * 4), \ + 17 + ((((imm) >> 4) & 0x3) * 4), \ + 18 + ((((imm) >> 4) & 0x3) * 4), \ + 19 + ((((imm) >> 4) & 0x3) * 4), \ + 16 + ((((imm) >> 6) & 0x3) * 4), \ + 17 + ((((imm) >> 6) & 0x3) * 4), \ + 18 + ((((imm) >> 6) & 0x3) * 4), \ + 19 + ((((imm) >> 6) & 0x3) * 4)); }) #define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)(__m512)(W)); }) #define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \ - (__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U)); }) + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps()); }) #define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1); }) + (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + 0 + ((((imm) >> 0) & 0x3) * 2), \ + 1 + ((((imm) >> 0) & 0x3) * 2), \ + 0 + ((((imm) >> 2) & 0x3) * 2), \ + 1 + ((((imm) >> 2) & 0x3) * 2), \ + 8 + ((((imm) >> 4) & 0x3) * 2), \ + 9 + ((((imm) >> 4) & 0x3) * 2), \ + 8 + ((((imm) >> 6) & 0x3) * 2), \ + 9 + ((((imm) >> 6) & 0x3) * 2)); }) #define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)(__m512d)(W)); }) #define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \ - (__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U)); }) + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd()); }) #define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) + (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + 0 + ((((imm) >> 0) & 0x3) * 2), \ + 1 + ((((imm) >> 0) & 0x3) * 2), \ + 0 + ((((imm) >> 2) & 0x3) * 2), \ + 1 + ((((imm) >> 2) & 0x3) * 2), \ + 8 + ((((imm) >> 4) & 0x3) * 2), \ + 9 + ((((imm) >> 4) & 0x3) * 2), \ + 8 + ((((imm) >> 6) & 0x3) * 2), \ + 9 + ((((imm) >> 6) & 0x3) * 2)); }) #define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)(__m512i)(W)); }) #define _mm512_maskz_shuffle_i32x4(U, A, B, imm) 
__extension__ ({ \ - (__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512()); }) #define _mm512_shuffle_i64x2(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) + (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + 0 + ((((imm) >> 0) & 0x3) * 2), \ + 1 + ((((imm) >> 0) & 0x3) * 2), \ + 0 + ((((imm) >> 2) & 0x3) * 2), \ + 1 + ((((imm) >> 2) & 0x3) * 2), \ + 8 + ((((imm) >> 4) & 0x3) * 2), \ + 9 + ((((imm) >> 4) & 0x3) * 2), \ + 8 + ((((imm) >> 6) & 0x3) * 2), \ + 9 + ((((imm) >> 6) & 0x3) * 2)); }) #define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)(__m512i)(W)); }) #define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512()); }) #define _mm512_shuffle_pd(A, B, M) __extension__ ({ \ (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ @@ -9012,7 +8787,7 @@ _mm512_kortestz (__mmask16 __A, __mmask16 __B) static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__mmask16) (( __A & 0xFF) | ( __B << 8)); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS @@ -9035,7 +8810,7 @@ _mm512_stream_si512 (__m512i * __P, __m512i __A) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_stream_load_si512 (void *__P) +_mm512_stream_load_si512 (void const *__P) { typedef __v8di __v8di_aligned __attribute__((aligned(64))); return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); @@ -9167,6 +8942,64 @@ _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) (__mmask8)(M), \ _MM_FROUND_CUR_DIRECTION); }) +/* Bit Test */ + +static __inline __mmask16 __DEFAULT_FN_ATTRS +_mm512_test_epi32_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B), + _mm512_setzero_epi32()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_epi32()); +} + +static __inline __mmask8 __DEFAULT_FN_ATTRS +_mm512_test_epi64_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_epi32()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_epi32()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_testn_epi32_mask (__m512i __A, __m512i __B) +{ + return 
_mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_epi32()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_epi32()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_mm512_testn_epi64_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_epi32()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_epi32()); +} + static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_movehdup_ps (__m512 __A) { @@ -9737,16 +9570,18 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) { - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O, - __M); + return (__m512i) __builtin_ia32_selectd_512(__M, + (__v16si) _mm512_set1_epi32(__A), + (__v16si) __O); } #ifdef __x86_64__ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, - __M); + return (__m512i) __builtin_ia32_selectq_512(__M, + (__v8di) _mm512_set1_epi64(__A), + (__v8di) __O); } #endif diff --git a/c_headers/avx512vbmi2intrin.h b/c_headers/avx512vbmi2intrin.h new file mode 100644 index 0000000000..43e97b40a0 --- /dev/null +++ b/c_headers/avx512vbmi2intrin.h @@ -0,0 +1,391 @@ +/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VBMI2INTRIN_H +#define __AVX512VBMI2INTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"))) + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, + (__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, + (__v32hi) _mm512_setzero_hi(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, + (__v64qi) _mm512_setzero_qi(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D) +{ + __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D) +{ + __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, + (__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, + (__v32hi) _mm512_setzero_hi(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, + (__v64qi) _mm512_setzero_qi(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, + (__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, + (__v32hi) _mm512_setzero_hi(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, + (__v64qi) _mm512_setzero_qi(), + __U); +} + +#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \ + (__v8di)(B), \ + (int)(I), \ + (__v8di)(S), \ + (__mmask8)(U)); }) + +#define _mm512_maskz_shldi_epi64(U, A, B, I) \ + _mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shldi_epi64(A, B, I) \ + 
_mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \ + (__v16si)(B), \ + (int)(I), \ + (__v16si)(S), \ + (__mmask16)(U)); }) + +#define _mm512_maskz_shldi_epi32(U, A, B, I) \ + _mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shldi_epi32(A, B, I) \ + _mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I)) + +#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \ + (__v32hi)(B), \ + (int)(I), \ + (__v32hi)(S), \ + (__mmask32)(U)); }) + +#define _mm512_maskz_shldi_epi16(U, A, B, I) \ + _mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shldi_epi16(A, B, I) \ + _mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I)) + +#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \ + (__v8di)(B), \ + (int)(I), \ + (__v8di)(S), \ + (__mmask8)(U)); }) + +#define _mm512_maskz_shrdi_epi64(U, A, B, I) \ + _mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shrdi_epi64(A, B, I) \ + _mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \ + (__v16si)(B), \ + (int)(I), \ + (__v16si)(S), \ + (__mmask16)(U)); }) + +#define _mm512_maskz_shrdi_epi32(U, A, B, I) \ + _mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shrdi_epi32(A, B, I) \ + _mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I)) + +#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \ + (__v32hi)(B), \ + (int)(I), \ + (__v32hi)(S), \ + (__mmask32)(U)); }) + +#define _mm512_maskz_shrdi_epi16(U, A, B, I) \ + _mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shrdi_epi16(A, B, I) \ + _mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I)) + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + 
return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif + diff --git a/c_headers/avx512vlbitalgintrin.h b/c_headers/avx512vlbitalgintrin.h new file mode 100644 index 0000000000..76eb87721b --- /dev/null +++ b/c_headers/avx512vlbitalgintrin.h @@ -0,0 +1,157 @@ +/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without 
limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLBITALGINTRIN_H +#define __AVX512VLBITALGINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_popcnt_epi16(__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U, + (__v16hi) _mm256_popcnt_epi16(__B), + (__v16hi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) +{ + return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(), + __U, + __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_popcnt_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U, + (__v8hi) _mm128_popcnt_epi16(__B), + (__v8hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) +{ + return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(), + __U, + __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_popcnt_epi8(__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U, + (__v32qi) _mm256_popcnt_epi8(__B), + (__v32qi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) +{ + return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(), + __U, + __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_popcnt_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U, + (__v16qi) _mm128_popcnt_epi8(__B), + (__v16qi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) +{ + return 
_mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(), + __U, + __B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A, + (__v32qi) __B, + __U); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B) +{ + return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1, + __A, + __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A, + (__v16qi) __B, + __U); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B) +{ + return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1, + __A, + __B); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512vlbwintrin.h b/c_headers/avx512vlbwintrin.h index 3b58d04339..e940e2b685 100644 --- a/c_headers/avx512vlbwintrin.h +++ b/c_headers/avx512vlbwintrin.h @@ -38,581 +38,285 @@ _mm_setzero_hi(void){ /* Integer compare */ -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - 
return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, - 
(__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b, - __u); -} - -static __inline__ __mmask16 
__DEFAULT_FN_ATTRS -_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmple_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmple_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epu16_mask(__mmask16 __u, 
__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) { - return 
(__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, - __u); -} +#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1); }) + +#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + 
(__mmask16)(m)); }) + +#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1); }) + +#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m)); }) + +#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1); }) + +#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m)); }) + +#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1); }) + +#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m)); }) + +#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1); }) + +#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m)); }) + +#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1); }) + +#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m)); }) + +#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1); }) + +#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m)); }) + +#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1); }) + +#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m)); }) + +#define _mm_cmpeq_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) 
+#define _mm_cmpneq_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu8_mask(k, A, 
B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu16_mask(A, B) \ + 
_mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ @@ -2146,86 +1850,6 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) } -#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1); }) - -#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)); }) - -#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1); }) - -#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)); }) - -#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1); }) - -#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)); }) - -#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1); }) - -#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)); }) - -#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \ - (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) - -#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ - (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) - -#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \ - (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) - -#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ - (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), 
(int)(p), \ - (__mmask8)(m)); }) - -#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1); }) - -#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)); }) - -#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1); }) - -#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)); }) - #define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ @@ -2660,35 +2284,33 @@ _mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) { - return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, - (__v16qi) __O, - __M); + return (__m128i) __builtin_ia32_selectb_128(__M, + (__v16qi) _mm_set1_epi8(__A), + (__v16qi) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi8 (__mmask16 __M, char __A) { - return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectb_128(__M, + (__v16qi) _mm_set1_epi8(__A), + (__v16qi) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) { - return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, - (__v32qi) __O, - __M); + return (__m256i) __builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_set1_epi8(__A), + (__v32qi) __O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) { - return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, - (__v32qi) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_set1_epi8(__A), + (__v32qi) _mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -2793,121 +2415,108 @@ _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm_test_epi8_mask (__m128i __A, __m128i __B) { - return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A, - (__v16qi) __B, - (__mmask16) -1); + return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_hi()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) { - return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A, - (__v16qi) __B, __U); + return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_hi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm256_test_epi8_mask (__m256i __A, __m256i __B) { - return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A, - (__v32qi) __B, - (__mmask32) -1); + return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) { - return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A, - (__v32qi) __B, __U); + return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B), + 
_mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_test_epi16_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A, - (__v8hi) __B, - (__mmask8) -1); + return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A, - (__v8hi) __B, __U); + return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_hi()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm256_test_epi16_mask (__m256i __A, __m256i __B) { - return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A, - (__v16hi) __B, - (__mmask16) -1); + return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256 ()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) { - return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A, - (__v16hi) __B, __U); + return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm_testn_epi8_mask (__m128i __A, __m128i __B) { - return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A, - (__v16qi) __B, - (__mmask16) -1); + return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) { - return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A, - (__v16qi) __B, __U); + return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_hi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm256_testn_epi8_mask (__m256i __A, __m256i __B) { - return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A, - (__v32qi) __B, - (__mmask32) -1); + return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) { - return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A, - (__v32qi) __B, __U); + return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_testn_epi16_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A, - (__v8hi) __B, - (__mmask8) -1); + return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A, - (__v8hi) __B, __U); + return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_hi()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm256_testn_epi16_mask (__m256i __A, __m256i __B) { - return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A, - (__v16hi) __B, - (__mmask16) -1); + return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) { - return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A, - (__v16hi) __B, __U); + return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS @@ 
-3025,33 +2634,33 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) { - return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, - (__v16hi) __O, - __M); + return (__m256i) __builtin_ia32_selectw_256 (__M, + (__v16hi) _mm256_set1_epi16(__A), + (__v16hi) __O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) { - return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, - (__v16hi) _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectw_256(__M, + (__v16hi)_mm256_set1_epi16(__A), + (__v16hi) _mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) { - return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, - (__v8hi) __O, - __M); + return (__m128i) __builtin_ia32_selectw_128(__M, + (__v8hi) _mm_set1_epi16(__A), + (__v8hi) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi16 (__mmask8 __M, short __A) { - return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, - (__v8hi) _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectw_128(__M, + (__v8hi) _mm_set1_epi16(__A), + (__v8hi) _mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vlcdintrin.h b/c_headers/avx512vlcdintrin.h index 7b02e2e1f9..8f1cd25f0b 100644 --- a/c_headers/avx512vlcdintrin.h +++ b/c_headers/avx512vlcdintrin.h @@ -33,26 +33,26 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_broadcastmb_epi64 (__mmask8 __A) -{ - return (__m128i) __builtin_ia32_broadcastmb128 (__A); +{ + return (__m128i) _mm_set1_epi64x((long long) __A); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcastmb_epi64 (__mmask8 __A) { - return (__m256i) __builtin_ia32_broadcastmb256 (__A); + return (__m256i) _mm256_set1_epi64x((long long)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_broadcastmw_epi32 (__mmask16 __A) { - return (__m128i) __builtin_ia32_broadcastmw128 (__A); + return (__m128i) _mm_set1_epi32((int)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcastmw_epi32 (__mmask16 __A) { - return (__m256i) __builtin_ia32_broadcastmw256 (__A); + return (__m256i) _mm256_set1_epi32((int)__A); } diff --git a/c_headers/avx512vldqintrin.h b/c_headers/avx512vldqintrin.h index aecd7df34d..d80df9eaff 100644 --- a/c_headers/avx512vldqintrin.h +++ b/c_headers/avx512vldqintrin.h @@ -978,25 +978,25 @@ _mm256_movepi64_mask (__m256i __A) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_f32x2 (__m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf)_mm256_undefined_ps(), - (__mmask8) -1); + return (__m256)__builtin_shufflevector((__v4sf)__A, + (__v4sf)_mm_undefined_ps(), + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf) __O, - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)__O); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf) _mm256_setzero_ps (), - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x2(__A), + 
(__v8sf)_mm256_setzero_ps()); } static __inline__ __m256d __DEFAULT_FN_ATTRS @@ -1025,49 +1025,49 @@ _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_broadcast_i32x2 (__m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si) __O, - __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_broadcast_i32x2(__A), + (__v4si)__O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si) _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_broadcast_i32x2(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcast_i32x2 (__m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si)_mm256_undefined_si256(), - (__mmask8) -1); + return (__m256i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si) __O, - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vlintrin.h b/c_headers/avx512vlintrin.h index 99bb050de4..fb8056e3f8 100644 --- a/c_headers/avx512vlintrin.h +++ b/c_headers/avx512vlintrin.h @@ -38,582 +38,205 @@ _mm_setzero_di(void) { /* Integer compare */ -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { 
- return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0, - __u); -} - - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 
__DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) { - return 
(__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2, - __u); -} - -static __inline__ __mmask8 
__DEFAULT_FN_ATTRS -_mm_cmple_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return 
(__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 
__DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4, - __u); -} +#define _mm_cmpeq_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) +#define 
_mm_mask_cmple_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), 
(B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) @@ -5723,59 +5346,72 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) (__v4df)_mm256_setzero_pd()); } +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)__O); +} -#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \ - (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \ - (__v4si)(__m128i)(O), \ - (__mmask8)(M)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)_mm_setzero_si128()); +} -#define 
_mm_maskz_set1_epi32(M, A) __extension__ ({ \ - (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(M)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)__O); +} -#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \ - (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \ - (__v8si)(__m256i)(O), \ - (__mmask8)(M)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} -#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \ - (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(M)); }) #ifdef __x86_64__ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O, - __M); + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi64x(__A), + (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi64x(__A), + (__v2di) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O, - __M); + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) __O) ; } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) _mm256_setzero_si256()); } + #endif #define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \ @@ -6490,125 +6126,111 @@ _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_test_epi32_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, - (__v4si) __B, __U); + return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_test_epi32_mask (__m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, - (__v8si) __B, __U); + return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ 
__mmask8 __DEFAULT_FN_ATTRS _mm_test_epi64_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, - (__v2di) __B, __U); + return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_test_epi64_mask (__m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, - (__v4di) __B, __U); + return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_testn_epi32_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, - (__v4si) __B, __U); + return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_testn_epi32_mask (__m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, - (__v8si) __B, __U); + return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_testn_epi64_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, - (__v2di) __B, __U); + return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_testn_epi64_mask (__m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, - (__v4di) __B, __U); + return _mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } - - static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) 
{ @@ -6964,85 +6586,81 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm) #define _mm256_shuffle_f32x4(A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1); }) + (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + 0 + ((((imm) >> 0) & 0x1) * 4), \ + 1 + ((((imm) >> 0) & 0x1) * 4), \ + 2 + ((((imm) >> 0) & 0x1) * 4), \ + 3 + ((((imm) >> 0) & 0x1) * 4), \ + 8 + ((((imm) >> 1) & 0x1) * 4), \ + 9 + ((((imm) >> 1) & 0x1) * 4), \ + 10 + ((((imm) >> 1) & 0x1) * 4), \ + 11 + ((((imm) >> 1) & 0x1) * 4)); }) #define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)); }) + (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)(__m256)(W)); }) #define _mm256_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)); }) + (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)_mm256_setzero_ps()); }) #define _mm256_shuffle_f64x2(A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1); }) + (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + 0 + ((((imm) >> 0) & 0x1) * 2), \ + 1 + ((((imm) >> 0) & 0x1) * 2), \ + 4 + ((((imm) >> 1) & 0x1) * 2), \ + 5 + ((((imm) >> 1) & 0x1) * 2)); }) #define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)(__m256)(W)); }) #define _mm256_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)_mm256_setzero_pd()); }) #define _mm256_shuffle_i32x4(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + 0 + ((((imm) >> 0) & 0x1) * 2), \ + 1 + ((((imm) >> 0) & 0x1) * 2), \ + 4 + ((((imm) >> 1) & 0x1) * 2), \ + 5 + ((((imm) >> 1) & 0x1) * 2)); }) #define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)(__m256)(W)); }) #define _mm256_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \ - 
(__v8si)(__m256i)(B), \ - (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256()); }) #define _mm256_shuffle_i64x2(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + 0 + ((((imm) >> 0) & 0x1) * 2), \ + 1 + ((((imm) >> 0) & 0x1) * 2), \ + 4 + ((((imm) >> 1) & 0x1) * 2), \ + 5 + ((((imm) >> 1) & 0x1) * 2)); }) #define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)(__m256)(W)); }) + #define _mm256_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256()); }) #define _mm_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ diff --git a/c_headers/avx512vlvbmi2intrin.h b/c_headers/avx512vlvbmi2intrin.h new file mode 100644 index 0000000000..d1ec4976f2 --- /dev/null +++ b/c_headers/avx512vlvbmi2intrin.h @@ -0,0 +1,748 @@ +/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VLVBMI2INTRIN_H +#define __AVX512VLVBMI2INTRIN_H + +/* Define the default attributes for the functions in this file.
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"))) + +static __inline __m128i __DEFAULT_FN_ATTRS +_mm128_setzero_hi(void) { + return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, + (__v8hi) _mm128_setzero_hi(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, + (__v16qi) _mm128_setzero_hi(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) +{ + __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) +{ + __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, + (__v8hi) _mm128_setzero_hi(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, + (__v16qi) _mm128_setzero_hi(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, + (__v8hi) _mm128_setzero_hi(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, + (__v16qi) _mm128_setzero_hi(), + __U); +} + +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_setzero_hi(void) { + return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_compress_epi16(__m256i __S, 
__mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, + (__v16hi) _mm256_setzero_hi(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, + (__v32qi) _mm256_setzero_hi(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) +{ + __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) +{ + __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, + (__v16hi) _mm256_setzero_hi(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, + (__v32qi) _mm256_setzero_hi(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, + (__v16hi) _mm256_setzero_hi(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, + (__v32qi) _mm256_setzero_hi(), + __U); +} + +#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \ + (__v4di)(B), \ + (int)(I), \ + (__v4di)(S), \ + (__mmask8)(U)); }) + +#define _mm256_maskz_shldi_epi64(U, A, B, I) \ + _mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shldi_epi64(A, B, I) \ + _mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \ 
+ (__v2di)(B), \ + (int)(I), \ + (__v2di)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shldi_epi64(U, A, B, I) \ + _mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shldi_epi64(A, B, I) \ + _mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \ + (__v8si)(B), \ + (int)(I), \ + (__v8si)(S), \ + (__mmask8)(U)); }) + +#define _mm256_maskz_shldi_epi32(U, A, B, I) \ + _mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shldi_epi32(A, B, I) \ + _mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \ + (__v4si)(B), \ + (int)(I), \ + (__v4si)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shldi_epi32(U, A, B, I) \ + _mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shldi_epi32(A, B, I) \ + _mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \ + (__v16hi)(B), \ + (int)(I), \ + (__v16hi)(S), \ + (__mmask16)(U)); }) + +#define _mm256_maskz_shldi_epi16(U, A, B, I) \ + _mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shldi_epi16(A, B, I) \ + _mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \ + (__v8hi)(B), \ + (int)(I), \ + (__v8hi)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shldi_epi16(U, A, B, I) \ + _mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shldi_epi16(A, B, I) \ + _mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \ + (__v4di)(B), \ + (int)(I), \ + (__v4di)(S), \ + (__mmask8)(U)); }) + +#define _mm256_maskz_shrdi_epi64(U, A, B, I) \ + _mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shrdi_epi64(A, B, I) \ + _mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \ + (__v2di)(B), \ + (int)(I), \ + (__v2di)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shrdi_epi64(U, A, B, I) \ + _mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shrdi_epi64(A, B, I) \ + _mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \ + (__v8si)(B), \ + (int)(I), \ + (__v8si)(S), \ + (__mmask8)(U)); }) + +#define _mm256_maskz_shrdi_epi32(U, A, B, I) \ + _mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shrdi_epi32(A, B, I) \ + _mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \ + (__v4si)(B), \ + (int)(I), \ + (__v4si)(S), \ + (__mmask8)(U)); }) + 
+#define _mm128_maskz_shrdi_epi32(U, A, B, I) \ + _mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shrdi_epi32(A, B, I) \ + _mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \ + (__v16hi)(B), \ + (int)(I), \ + (__v16hi)(S), \ + (__mmask16)(U)); }) + +#define _mm256_maskz_shrdi_epi16(U, A, B, I) \ + _mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shrdi_epi16(A, B, I) \ + _mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \ + (__v8hi)(B), \ + (int)(I), \ + (__v8hi)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shrdi_epi16(U, A, B, I) \ + _mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shrdi_epi16(A, B, I) \ + _mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shldv_epi32(__mmask8 
__U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shrdv_epi32(__mmask8 __U, 
__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512vlvnniintrin.h b/c_headers/avx512vlvnniintrin.h new file mode 100644 index 0000000000..745ae8b7ad --- /dev/null +++ b/c_headers/avx512vlvnniintrin.h @@ -0,0 +1,254 @@ +/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VLVNNIINTRIN_H +#define __AVX512VLVNNIINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"))) + + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssds256_maskz
((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512vnniintrin.h b/c_headers/avx512vnniintrin.h new file mode 100644 index 0000000000..0c6badd231 --- /dev/null +++ b/c_headers/avx512vnniintrin.h @@ -0,0 +1,146 @@ +/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------=== + * 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VNNIINTRIN_H +#define __AVX512VNNIINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"))) + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i 
__DEFAULT_FN_ATTRS +_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512vpopcntdqvlintrin.h b/c_headers/avx512vpopcntdqvlintrin.h new file mode 100644 index 0000000000..c2058a8f51 --- /dev/null +++ b/c_headers/avx512vpopcntdqvlintrin.h @@ -0,0 +1,99 @@ +/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics + *------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __AVX512VPOPCNTDQVLINTRIN_H +#define __AVX512VPOPCNTDQVLINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectq_128( + (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { + return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { + return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectq_256( + (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { + return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { + return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/cetintrin.h b/c_headers/cetintrin.h new file mode 100644 index 0000000000..1256a3f63a --- /dev/null +++ b/c_headers/cetintrin.h @@ -0,0 +1,93 @@ +/*===---- cetintrin.h - CET intrinsic ------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __CETINTRIN_H +#define __CETINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("shstk"))) + +static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) { + __builtin_ia32_incsspd(__a); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) { + __builtin_ia32_incsspq(__a); +} +#endif /* __x86_64__ */ + +static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) { + return __builtin_ia32_rdsspd(__a); +} + +#ifdef __x86_64__ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) { + return __builtin_ia32_rdsspq(__a); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() { + __builtin_ia32_saveprevssp(); +} + +static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) { + __builtin_ia32_rstorssp(__p); +} + +static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) { + __builtin_ia32_wrssd(__a, __p); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) { + __builtin_ia32_wrssq(__a, __p); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) { + __builtin_ia32_wrussd(__a, __p); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) { + __builtin_ia32_wrussq(__a, __p); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() { + __builtin_ia32_setssbsy(); +} + +static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) { + __builtin_ia32_clrssbsy(__p); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __CETINTRIN_H */ diff --git a/c_headers/clflushoptintrin.h b/c_headers/clflushoptintrin.h index 60e0ead762..f1f1330234 100644 --- a/c_headers/clflushoptintrin.h +++ b/c_headers/clflushoptintrin.h @@ -32,7 +32,7 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt"))) static __inline__ void __DEFAULT_FN_ATTRS -_mm_clflushopt(char * __m) { +_mm_clflushopt(void const * __m) { __builtin_ia32_clflushopt(__m); } diff --git a/c_headers/clwbintrin.h b/c_headers/clwbintrin.h new file mode 100644 index 0000000000..2594a6c387 --- /dev/null +++ b/c_headers/clwbintrin.h @@ -0,0 +1,52 @@ +/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __CLWBINTRIN_H +#define __CLWBINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb"))) + +/// \brief Writes back to memory the cache line (if modified) that contains the +/// linear address specified in \a __p from any level of the cache hierarchy in +/// the cache coherence domain +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CLWB instruction. +/// +/// \param __p +/// A pointer to the memory location used to identify the cache line to be +/// written back. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_clwb(void const *__p) { + __builtin_ia32_clwb(__p); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/cpuid.h b/c_headers/cpuid.h index 2dd0add236..3ae90de0b9 100644 --- a/c_headers/cpuid.h +++ b/c_headers/cpuid.h @@ -173,16 +173,24 @@ #define bit_AVX512VL 0x80000000 /* Features in %ecx for leaf 7 sub-leaf 0 */ -#define bit_PREFTCHWT1 0x00000001 -#define bit_AVX512VBMI 0x00000002 -#define bit_PKU 0x00000004 -#define bit_OSPKE 0x00000010 +#define bit_PREFTCHWT1 0x00000001 +#define bit_AVX512VBMI 0x00000002 +#define bit_PKU 0x00000004 +#define bit_OSPKE 0x00000010 +#define bit_AVX512VBMI2 0x00000040 +#define bit_SHSTK 0x00000080 +#define bit_GFNI 0x00000100 +#define bit_VAES 0x00000200 +#define bit_VPCLMULQDQ 0x00000400 +#define bit_AVX512VNNI 0x00000800 +#define bit_AVX512BITALG 0x00001000 #define bit_AVX512VPOPCNTDQ 0x00004000 -#define bit_RDPID 0x00400000 +#define bit_RDPID 0x00400000 /* Features in %edx for leaf 7 sub-leaf 0 */ #define bit_AVX5124VNNIW 0x00000004 #define bit_AVX5124FMAPS 0x00000008 +#define bit_IBT 0x00100000 /* Features in %eax for leaf 13 sub-leaf 1 */ #define bit_XSAVEOPT 0x00000001 @@ -192,6 +200,7 @@ /* Features in %ecx for leaf 0x80000001 */ #define bit_LAHF_LM 0x00000001 #define bit_ABM 0x00000020 +#define bit_LZCNT bit_ABM /* for gcc compat */ #define bit_SSE4a 0x00000040 #define bit_PRFCHW 0x00000100 #define bit_XOP 0x00000800 diff --git a/c_headers/cuda_wrappers/algorithm b/c_headers/cuda_wrappers/algorithm index 95d9beb73c..cedd70762c 100644 --- a/c_headers/cuda_wrappers/algorithm +++ b/c_headers/cuda_wrappers/algorithm @@ -80,7 +80,7 @@ min(const __T &__a, const __T &__b, __Cmp __cmp) { template inline __device__ const __T & min(const __T &__a, const __T &__b) { - return __a < __b ? __b : __a; + return __a < __b ? __a : __b; } #ifdef _LIBCPP_END_NAMESPACE_STD diff --git a/c_headers/cuda_wrappers/new b/c_headers/cuda_wrappers/new index b77131af0e..71b7a52363 100644 --- a/c_headers/cuda_wrappers/new +++ b/c_headers/cuda_wrappers/new @@ -26,7 +26,6 @@ #include_next -// Device overrides for placement new and delete. 
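The cpuid.h hunk above adds the leaf-7 sub-leaf-0 ECX bits for the new extensions (bit_AVX512VNNI, bit_SHSTK, bit_GFNI, bit_VAES, and so on), and the avx512vnniintrin.h file earlier in this diff supplies the matching intrinsics. Below is a minimal sketch of gating the unmasked _mm512_dpbusd_epi32 on that CPUID bit; the helper names (cpu_has_avx512vnni, dpbusd_lane0) and the constant inputs are ours, and it assumes the __get_cpuid_max/__cpuid_count helpers that cpuid.h already ships.

#include <cpuid.h>      /* bit_AVX512VNNI, __get_cpuid_max, __cpuid_count */
#include <immintrin.h>  /* _mm512_dpbusd_epi32 and the AVX-512 basics */
#include <stdio.h>

/* Illustrative helper: nonzero when leaf 7 sub-leaf 0 reports AVX512VNNI. */
static int cpu_has_avx512vnni(void) {
    unsigned int eax, ebx, ecx, edx;
    if (__get_cpuid_max(0, 0) < 7)
        return 0;
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    return (ecx & bit_AVX512VNNI) != 0;
}

/* Illustrative helper: each 32-bit lane of the accumulator gains the sum of
   four unsigned-byte * signed-byte products, so lane 0 is 4 * (2 * 3) = 24. */
__attribute__((__target__("avx512f,avx512vnni")))
static int dpbusd_lane0(void) {
    __m512i acc = _mm512_dpbusd_epi32(_mm512_setzero_si512(),
                                      _mm512_set1_epi8(2),   /* unsigned bytes */
                                      _mm512_set1_epi8(3));  /* signed bytes   */
    int lanes[16];
    _mm512_storeu_si512((__m512i *)lanes, acc);
    return lanes[0];
}

int main(void) {
    if (!cpu_has_avx512vnni()) {
        puts("AVX512VNNI not available; skipping");
        return 0;
    }
    printf("dpbusd lane 0 = %d\n", dpbusd_lane0());
    return 0;
}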
#pragma push_macro("CUDA_NOEXCEPT") #if __cplusplus >= 201103L #define CUDA_NOEXCEPT noexcept @@ -34,6 +33,55 @@ #define CUDA_NOEXCEPT #endif +// Device overrides for non-placement new and delete. +__device__ inline void *operator new(__SIZE_TYPE__ size) { + if (size == 0) { + size = 1; + } + return ::malloc(size); +} +__device__ inline void *operator new(__SIZE_TYPE__ size, + const std::nothrow_t &) CUDA_NOEXCEPT { + return ::operator new(size); +} + +__device__ inline void *operator new[](__SIZE_TYPE__ size) { + return ::operator new(size); +} +__device__ inline void *operator new[](__SIZE_TYPE__ size, + const std::nothrow_t &) { + return ::operator new(size); +} + +__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT { + if (ptr) { + ::free(ptr); + } +} +__device__ inline void operator delete(void *ptr, + const std::nothrow_t &) CUDA_NOEXCEPT { + ::operator delete(ptr); +} + +__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +__device__ inline void operator delete[](void *ptr, + const std::nothrow_t &) CUDA_NOEXCEPT { + ::operator delete(ptr); +} + +// Sized delete, C++14 only. +#if __cplusplus >= 201402L +__device__ void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +__device__ void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +#endif + +// Device overrides for placement new and delete. __device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { return __ptr; } @@ -42,6 +90,7 @@ __device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT } __device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {} __device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} + #pragma pop_macro("CUDA_NOEXCEPT") #endif // include guard diff --git a/c_headers/emmintrin.h b/c_headers/emmintrin.h index 709815cbb4..b332eeec20 100644 --- a/c_headers/emmintrin.h +++ b/c_headers/emmintrin.h @@ -217,8 +217,8 @@ _mm_div_pd(__m128d __a, __m128d __b) /// \brief Calculates the square root of the lower double-precision value of /// the second operand and returns it in the lower 64 bits of the result. -/// The upper 64 bits of the result are copied from the upper double- -/// precision value of the first operand. +/// The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. /// /// \headerfile /// @@ -260,8 +260,8 @@ _mm_sqrt_pd(__m128d __a) /// \brief Compares lower 64-bit double-precision values of both operands, and /// returns the lesser of the pair of values in the lower 64-bits of the -/// result. The upper 64 bits of the result are copied from the upper double- -/// precision value of the first operand. +/// result. The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. /// /// \headerfile /// @@ -304,8 +304,8 @@ _mm_min_pd(__m128d __a, __m128d __b) /// \brief Compares lower 64-bit double-precision values of both operands, and /// returns the greater of the pair of values in the lower 64-bits of the -/// result. The upper 64 bits of the result are copied from the upper double- -/// precision value of the first operand. +/// result. The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. 
/// /// \headerfile /// @@ -983,8 +983,10 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b) } /// \brief Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] for equality. The -/// comparison yields 0 for false, 1 for true. +/// the two 128-bit floating-point vectors of [2 x double] for equality. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile /// @@ -996,7 +998,8 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b) { @@ -1008,7 +1011,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b) /// the value in the first parameter is less than the corresponding value in /// the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile /// @@ -1020,7 +1024,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b) { @@ -1032,7 +1037,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b) /// the value in the first parameter is less than or equal to the /// corresponding value in the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile /// @@ -1044,7 +1050,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b) { @@ -1056,7 +1063,8 @@ _mm_comile_sd(__m128d __a, __m128d __b) /// the value in the first parameter is greater than the corresponding value /// in the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile /// @@ -1068,7 +1076,8 @@ _mm_comile_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. 
If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b) { @@ -1080,7 +1089,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b) /// the value in the first parameter is greater than or equal to the /// corresponding value in the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile /// @@ -1092,7 +1102,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b) { @@ -1104,7 +1115,8 @@ _mm_comige_sd(__m128d __a, __m128d __b) /// the value in the first parameter is unequal to the corresponding value in /// the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 1 is returned. /// /// \headerfile /// @@ -1116,7 +1128,8 @@ _mm_comige_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b) { @@ -1127,7 +1140,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b) /// the two 128-bit floating-point vectors of [2 x double] for equality. The /// comparison yields 0 for false, 1 for true. /// -/// If either of the two lower double-precision values is NaN, 1 is returned. +/// If either of the two lower double-precision values is NaN, 0 is returned. /// /// \headerfile /// @@ -1140,7 +1153,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b) /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. /// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 1 is returned. +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b) { @@ -1153,7 +1166,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b) /// the second parameter. /// /// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 1 is returned. +/// double-precision values is NaN, 0 is returned. /// /// \headerfile /// @@ -1166,7 +1179,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b) /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. /// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 1 is returned. +/// lower double-precision values is NaN, 0 is returned. 
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b) { @@ -1179,7 +1192,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b) /// corresponding value in the second parameter. /// /// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 1 is returned. +/// double-precision values is NaN, 0 is returned. /// /// \headerfile /// @@ -1192,7 +1205,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b) /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. /// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 1 is returned. +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b) { @@ -1257,7 +1270,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b) /// the second parameter. /// /// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 0 is returned. +/// double-precision values is NaN, 1 is returned. /// /// \headerfile /// @@ -1270,7 +1283,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b) /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. /// \returns An integer containing the comparison result. If either of the two -/// lower double-precision values is NaN, 0 is returned. +/// lower double-precision values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b) { @@ -1935,14 +1948,15 @@ _mm_store_pd(double *__dp, __m128d __a) /// /// \headerfile /// -/// This intrinsic corresponds to the VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS instruction. +/// This intrinsic corresponds to the +/// VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS instruction. /// /// \param __dp /// A pointer to a memory location that can store two double-precision /// values. /// \param __a /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each -/// of the values in \a dp. +/// of the values in \a __dp. static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a) { @@ -1950,18 +1964,20 @@ _mm_store1_pd(double *__dp, __m128d __a) _mm_store_pd(__dp, __a); } -/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory -/// location. +/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to +/// the upper and lower 64 bits of a memory location. /// /// \headerfile /// -/// This intrinsic corresponds to the VMOVAPD / MOVAPD instruction. +/// This intrinsic corresponds to the +/// VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS instruction. /// /// \param __dp -/// A pointer to a 128-bit memory location. The address of the memory -/// location has to be 16-byte aligned. +/// A pointer to a memory location that can store two double-precision +/// values. /// \param __a -/// A 128-bit vector of [2 x double] containing the values to be stored. +/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each +/// of the values in \a __dp. 
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a) { @@ -2258,7 +2274,11 @@ _mm_adds_epu16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); + typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); + return (__m128i)__builtin_convertvector( + ((__builtin_convertvector((__v16qu)__a, __v16hu) + + __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) + >> 1, __v16qu); } /// \brief Computes the rounded avarages of corresponding elements of two @@ -2278,7 +2298,11 @@ _mm_avg_epu8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); + typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); + return (__m128i)__builtin_convertvector( + ((__builtin_convertvector((__v8hu)__a, __v8su) + + __builtin_convertvector((__v8hu)__b, __v8su)) + 1) + >> 1, __v8hu); } /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] @@ -3838,8 +3862,7 @@ _mm_set1_epi8(char __b) /// /// \headerfile /// -/// This intrinsic corresponds to the VPUNPCKLQDQ / PUNPCKLQDQ -/// instruction. +/// This intrinsic does not correspond to a specific instruction. /// /// \param __q0 /// A 64-bit integral value used to initialize the lower 64 bits of the @@ -4010,7 +4033,7 @@ _mm_storeu_si128(__m128i *__p, __m128i __b) /// specified unaligned memory location. When a mask bit is 1, the /// corresponding byte is written, otherwise it is not written. /// -/// To minimize caching, the date is flagged as non-temporal (unlikely to be +/// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). Exception and trap behavior for elements not selected /// for storage to memory are implementation dependent. /// @@ -4524,8 +4547,8 @@ _mm_unpackhi_epi32(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); } -/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors -/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. +/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of +/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. /// /// \headerfile /// @@ -4657,7 +4680,7 @@ _mm_unpacklo_epi64(__m128i __a, __m128i __b) /// /// \headerfile /// -/// This intrinsic has no corresponding instruction. +/// This intrinsic corresponds to the MOVDQ2Q instruction. /// /// \param __a /// A 128-bit integer vector operand. The lower 64 bits are moved to the @@ -4674,7 +4697,7 @@ _mm_movepi64_pi64(__m128i __a) /// /// \headerfile /// -/// This intrinsic corresponds to the VMOVQ / MOVQ / MOVD instruction. +/// This intrinsic corresponds to the MOVD+VMOVQ instruction. /// /// \param __a /// A 64-bit value. @@ -4704,8 +4727,8 @@ _mm_move_epi64(__m128i __a) return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); } -/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors -/// of [2 x double] and interleaves them into a 128-bit vector of [2 x +/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of +/// [2 x double] and interleaves them into a 128-bit vector of [2 x /// double]. 
/// /// \headerfile @@ -4725,7 +4748,7 @@ _mm_unpackhi_pd(__m128d __a, __m128d __b) return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); } -/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors +/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors /// of [2 x double] and interleaves them into a 128-bit vector of [2 x /// double]. /// @@ -4784,9 +4807,9 @@ _mm_movemask_pd(__m128d __a) /// A 128-bit vector of [2 x double]. /// \param i /// An 8-bit immediate value. The least significant two bits specify which -/// elements to copy from a and b: \n -/// Bit[0] = 0: lower element of a copied to lower element of result. \n -/// Bit[0] = 1: upper element of a copied to lower element of result. \n +/// elements to copy from \a a and \a b: \n +/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n +/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n /// \returns A 128-bit vector of [2 x double] containing the shuffled values. diff --git a/c_headers/float.h b/c_headers/float.h index 502143d4e4..44d4d05494 100644 --- a/c_headers/float.h +++ b/c_headers/float.h @@ -143,4 +143,18 @@ # define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ #endif +#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__ +# define FLT16_MANT_DIG __FLT16_MANT_DIG__ +# define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__ +# define FLT16_DIG __FLT16_DIG__ +# define FLT16_MIN_EXP __FLT16_MIN_EXP__ +# define FLT16_MIN_10_EXP __FLT16_MIN_10_EXP__ +# define FLT16_MAX_EXP __FLT16_MAX_EXP__ +# define FLT16_MAX_10_EXP __FLT16_MAX_10_EXP__ +# define FLT16_MAX __FLT16_MAX__ +# define FLT16_EPSILON __FLT16_EPSILON__ +# define FLT16_MIN __FLT16_MIN__ +# define FLT16_TRUE_MIN __FLT16_TRUE_MIN__ +#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */ + #endif /* __FLOAT_H */ diff --git a/c_headers/fma4intrin.h b/c_headers/fma4intrin.h index 11aa8ceacf..962b1a60a2 100644 --- a/c_headers/fma4intrin.h +++ b/c_headers/fma4intrin.h @@ -60,73 +60,73 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d 
__DEFAULT_FN_ATTRS _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -144,13 +144,13 @@ _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -168,37 +168,37 @@ _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) { - return 
(__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -216,13 +216,13 @@ _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/fmaintrin.h b/c_headers/fmaintrin.h index 0e2ef0b171..478a0ac81c 100644 --- a/c_headers/fmaintrin.h +++ b/c_headers/fmaintrin.h @@ -46,85 +46,85 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fnmadd_pd(__m128d 
__A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -142,13 +142,13 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -166,37 +166,37 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) { - return 
(__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -214,13 +214,13 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/gfniintrin.h b/c_headers/gfniintrin.h new file mode 100644 index 0000000000..20fadccfaa --- /dev/null +++ b/c_headers/gfniintrin.h @@ -0,0 +1,202 @@ +/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
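The fma4intrin.h and fmaintrin.h hunks above drop the dedicated vfmsub*/vfnmadd*/vfnmsub* builtins and instead pass negated operands to the vfmadd* builtins, relying on the identities fms(a,b,c) = fma(a,b,-c), fnma(a,b,c) = fma(-a,b,c) and fnms(a,b,c) = fma(-a,b,-c). The sketch below sanity-checks the first identity through the public intrinsics; it assumes an FMA3-capable CPU at run time, and the helper name check_fmsub is ours.

#include <immintrin.h>
#include <stdio.h>

__attribute__((__target__("fma")))
static void check_fmsub(void) {
    __m128 a = _mm_set1_ps(2.0f);
    __m128 b = _mm_set1_ps(3.0f);
    __m128 c = _mm_set1_ps(1.5f);

    __m128 direct    = _mm_fmsub_ps(a, b, c);               /* a*b - c       */
    __m128 via_fmadd = _mm_fmadd_ps(a, b,                   /* a*b + (-c)    */
                                    _mm_sub_ps(_mm_setzero_ps(), c));

    float d[4], v[4];
    _mm_storeu_ps(d, direct);
    _mm_storeu_ps(v, via_fmadd);
    /* Both lanes print 4.5; the header negates the vector lanes directly
       rather than subtracting from zero, which matters only for -0.0/NaN. */
    printf("fmsub: %.1f, fmadd with -c: %.1f\n", d[0], v[0]);
}

int main(void) {
    check_fmsub();   /* requires FMA3 hardware at run time */
    return 0;
}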
+#endif + +#ifndef __GFNIINTRIN_H +#define __GFNIINTRIN_H + + +#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I)); }) + +#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S)); }) + + +#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I); }) + + +#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I)); }) + +#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S)); }) + +#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I); }) + + +#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I)); }) + +#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v64qi)(__m512i)(S)); }) + +#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(), \ + U, A, B, I); }) + +#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I)); }) + +#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S)); }) + + +#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I); }) + + +#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I)); }) + +#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S)); }) + +#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I); }) + + +#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I)); }) + +#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \ + (__v64qi)(__m512i)(S)); }) + +#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(), \ + U, A, 
B, I); }) + +/* Default attributes for simple form (no masking). */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"))) + +/* Default attributes for ZMM forms. */ +#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"))) + +/* Default attributes for VLX forms. */ +#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, + (__v16qi) __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL +_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectb_128(__U, + (__v16qi) _mm_gf2p8mul_epi8(__A, __B), + (__v16qi) __S); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL +_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(), + __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, + (__v32qi) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL +_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectb_256(__U, + (__v32qi) _mm256_gf2p8mul_epi8(__A, __B), + (__v32qi) __S); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL +_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(), + __U, __A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F +_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A, + (__v64qi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F +_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectb_512(__U, + (__v64qi) _mm512_gf2p8mul_epi8(__A, __B), + (__v64qi) __S); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F +_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(), + __U, __A, __B); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_F +#undef __DEFAULT_FN_ATTRS_VL + +#endif // __GFNIINTRIN_H + diff --git a/c_headers/immintrin.h b/c_headers/immintrin.h index c5f25bfcb5..d3421dc86c 100644 --- a/c_headers/immintrin.h +++ b/c_headers/immintrin.h @@ -58,6 +58,10 @@ #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__) #include #endif @@ -114,6 +118,10 @@ _mm256_cvtph_ps(__m128i __a) } #endif /* __AVX2__ */ +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) #include #endif @@ -142,6 +150,10 @@ _mm256_cvtph_ps(__m128i __a) #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__) #include #endif @@ -150,10 +162,29 @@ _mm256_cvtph_ps(__m128i __a) #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && 
defined(__AVX512VPOPCNTDQ__)) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512VNNI__)) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__) #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512BITALG__)) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || \ (defined(__AVX512VL__) && defined(__AVX512BW__)) #include @@ -191,6 +222,15 @@ _mm256_cvtph_ps(__m128i __a) #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VBMI2__) && defined(__AVX512VL__)) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__) #include #endif @@ -199,6 +239,14 @@ _mm256_cvtph_ps(__m128i __a) #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__) +#include +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__) +#include +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__) static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) _rdrand16_step(unsigned short *__p) @@ -315,6 +363,10 @@ _writegsbase_u64(unsigned long long __V) #include #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__) +#include +#endif + /* Some intrinsics inside adxintrin.h are available only on processors with ADX, * whereas others are also available at all times. */ #include diff --git a/c_headers/intrin.h b/c_headers/intrin.h index 881d05c0d1..b30aa215a4 100644 --- a/c_headers/intrin.h +++ b/c_headers/intrin.h @@ -38,6 +38,10 @@ #include #endif +#if defined(_M_ARM64) +#include +#endif + /* For the definition of jmp_buf. */ #if __STDC_HOSTED__ #include @@ -828,7 +832,7 @@ _InterlockedCompareExchange_nf(long volatile *_Destination, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); return _Comparand; } -static __inline__ short __DEFAULT_FN_ATTRS +static __inline__ long __DEFAULT_FN_ATTRS _InterlockedCompareExchange_rel(long volatile *_Destination, long _Exchange, long _Comparand) { __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, diff --git a/c_headers/opencl-c.h b/c_headers/opencl-c.h index 58c8daf3a5..ce204b04c0 100644 --- a/c_headers/opencl-c.h +++ b/c_headers/opencl-c.h @@ -11381,6 +11381,8 @@ half16 __ovld __cnfn bitselect(half16 a, half16 b, half16 c); * For each component of a vector type, * result[i] = if MSB of c[i] is set ? b[i] : a[i]. * For a scalar type, result = c ? b : a. + * b and a must have the same type. + * c must have the same number of elements and bits as a. 
*/ char __ovld __cnfn select(char a, char b, char c); uchar __ovld __cnfn select(uchar a, uchar b, char c); @@ -11394,60 +11396,7 @@ char8 __ovld __cnfn select(char8 a, char8 b, char8 c); uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c); char16 __ovld __cnfn select(char16 a, char16 b, char16 c); uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c); -short __ovld __cnfn select(short a, short b, char c); -ushort __ovld __cnfn select(ushort a, ushort b, char c); -short2 __ovld __cnfn select(short2 a, short2 b, char2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, char2 c); -short3 __ovld __cnfn select(short3 a, short3 b, char3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, char3 c); -short4 __ovld __cnfn select(short4 a, short4 b, char4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, char4 c); -short8 __ovld __cnfn select(short8 a, short8 b, char8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, char8 c); -short16 __ovld __cnfn select(short16 a, short16 b, char16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, char16 c); -int __ovld __cnfn select(int a, int b, char c); -uint __ovld __cnfn select(uint a, uint b, char c); -int2 __ovld __cnfn select(int2 a, int2 b, char2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, char2 c); -int3 __ovld __cnfn select(int3 a, int3 b, char3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, char3 c); -int4 __ovld __cnfn select(int4 a, int4 b, char4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, char4 c); -int8 __ovld __cnfn select(int8 a, int8 b, char8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, char8 c); -int16 __ovld __cnfn select(int16 a, int16 b, char16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, char16 c); -long __ovld __cnfn select(long a, long b, char c); -ulong __ovld __cnfn select(ulong a, ulong b, char c); -long2 __ovld __cnfn select(long2 a, long2 b, char2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, char2 c); -long3 __ovld __cnfn select(long3 a, long3 b, char3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, char3 c); -long4 __ovld __cnfn select(long4 a, long4 b, char4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, char4 c); -long8 __ovld __cnfn select(long8 a, long8 b, char8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, char8 c); -long16 __ovld __cnfn select(long16 a, long16 b, char16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, char16 c); -float __ovld __cnfn select(float a, float b, char c); -float2 __ovld __cnfn select(float2 a, float2 b, char2 c); -float3 __ovld __cnfn select(float3 a, float3 b, char3 c); -float4 __ovld __cnfn select(float4 a, float4 b, char4 c); -float8 __ovld __cnfn select(float8 a, float8 b, char8 c); -float16 __ovld __cnfn select(float16 a, float16 b, char16 c); -char __ovld __cnfn select(char a, char b, short c); -uchar __ovld __cnfn select(uchar a, uchar b, short c); -char2 __ovld __cnfn select(char2 a, char2 b, short2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, short2 c); -char3 __ovld __cnfn select(char3 a, char3 b, short3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, short3 c); -char4 __ovld __cnfn select(char4 a, char4 b, short4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, short4 c); -char8 __ovld __cnfn select(char8 a, char8 b, short8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, short8 c); -char16 __ovld __cnfn select(char16 a, char16 b, short16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, short16 c); + short __ovld __cnfn select(short a, short b, short c); 
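The retained overloads here follow the rule spelled out in the comment above: the predicate c must have as many elements, and as many bits per element, as a and b. A minimal OpenCL C sketch of that rule (the kernel and buffer names are illustrative, not part of the header):

__kernel void blend(__global int4 *out,
                    __global const int4 *a,
                    __global const int4 *b)
{
    size_t i = get_global_id(0);
    /* int4 operands take an int4 mask; the MSB of each mask element
       decides whether the lane comes from b (set) or a (clear). */
    int4 mask = (int4)(-1, 0, -1, 0);
    out[i] = select(a[i], b[i], mask);   /* lanes 0 and 2 from b, 1 and 3 from a */
}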
ushort __ovld __cnfn select(ushort a, ushort b, short c); short2 __ovld __cnfn select(short2 a, short2 b, short2 c); @@ -11460,60 +11409,7 @@ short8 __ovld __cnfn select(short8 a, short8 b, short8 c); ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c); short16 __ovld __cnfn select(short16 a, short16 b, short16 c); ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c); -int __ovld __cnfn select(int a, int b, short c); -uint __ovld __cnfn select(uint a, uint b, short c); -int2 __ovld __cnfn select(int2 a, int2 b, short2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, short2 c); -int3 __ovld __cnfn select(int3 a, int3 b, short3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, short3 c); -int4 __ovld __cnfn select(int4 a, int4 b, short4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, short4 c); -int8 __ovld __cnfn select(int8 a, int8 b, short8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, short8 c); -int16 __ovld __cnfn select(int16 a, int16 b, short16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, short16 c); -long __ovld __cnfn select(long a, long b, short c); -ulong __ovld __cnfn select(ulong a, ulong b, short c); -long2 __ovld __cnfn select(long2 a, long2 b, short2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, short2 c); -long3 __ovld __cnfn select(long3 a, long3 b, short3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, short3 c); -long4 __ovld __cnfn select(long4 a, long4 b, short4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, short4 c); -long8 __ovld __cnfn select(long8 a, long8 b, short8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, short8 c); -long16 __ovld __cnfn select(long16 a, long16 b, short16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, short16 c); -float __ovld __cnfn select(float a, float b, short c); -float2 __ovld __cnfn select(float2 a, float2 b, short2 c); -float3 __ovld __cnfn select(float3 a, float3 b, short3 c); -float4 __ovld __cnfn select(float4 a, float4 b, short4 c); -float8 __ovld __cnfn select(float8 a, float8 b, short8 c); -float16 __ovld __cnfn select(float16 a, float16 b, short16 c); -char __ovld __cnfn select(char a, char b, int c); -uchar __ovld __cnfn select(uchar a, uchar b, int c); -char2 __ovld __cnfn select(char2 a, char2 b, int2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, int2 c); -char3 __ovld __cnfn select(char3 a, char3 b, int3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, int3 c); -char4 __ovld __cnfn select(char4 a, char4 b, int4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, int4 c); -char8 __ovld __cnfn select(char8 a, char8 b, int8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, int8 c); -char16 __ovld __cnfn select(char16 a, char16 b, int16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, int16 c); -short __ovld __cnfn select(short a, short b, int c); -ushort __ovld __cnfn select(ushort a, ushort b, int c); -short2 __ovld __cnfn select(short2 a, short2 b, int2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, int2 c); -short3 __ovld __cnfn select(short3 a, short3 b, int3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, int3 c); -short4 __ovld __cnfn select(short4 a, short4 b, int4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, int4 c); -short8 __ovld __cnfn select(short8 a, short8 b, int8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, int8 c); -short16 __ovld __cnfn select(short16 a, short16 b, int16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, int16 c); + int __ovld __cnfn select(int a, int b, 
int c); uint __ovld __cnfn select(uint a, uint b, int c); int2 __ovld __cnfn select(int2 a, int2 b, int2 c); @@ -11526,60 +11422,13 @@ int8 __ovld __cnfn select(int8 a, int8 b, int8 c); uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c); int16 __ovld __cnfn select(int16 a, int16 b, int16 c); uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c); -long __ovld __cnfn select(long a, long b, int c); -ulong __ovld __cnfn select(ulong a, ulong b, int c); -long2 __ovld __cnfn select(long2 a, long2 b, int2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, int2 c); -long3 __ovld __cnfn select(long3 a, long3 b, int3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, int3 c); -long4 __ovld __cnfn select(long4 a, long4 b, int4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, int4 c); -long8 __ovld __cnfn select(long8 a, long8 b, int8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, int8 c); -long16 __ovld __cnfn select(long16 a, long16 b, int16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, int16 c); float __ovld __cnfn select(float a, float b, int c); float2 __ovld __cnfn select(float2 a, float2 b, int2 c); float3 __ovld __cnfn select(float3 a, float3 b, int3 c); float4 __ovld __cnfn select(float4 a, float4 b, int4 c); float8 __ovld __cnfn select(float8 a, float8 b, int8 c); float16 __ovld __cnfn select(float16 a, float16 b, int16 c); -char __ovld __cnfn select(char a, char b, long c); -uchar __ovld __cnfn select(uchar a, uchar b, long c); -char2 __ovld __cnfn select(char2 a, char2 b, long2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, long2 c); -char3 __ovld __cnfn select(char3 a, char3 b, long3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, long3 c); -char4 __ovld __cnfn select(char4 a, char4 b, long4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, long4 c); -char8 __ovld __cnfn select(char8 a, char8 b, long8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, long8 c); -char16 __ovld __cnfn select(char16 a, char16 b, long16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, long16 c); -short __ovld __cnfn select(short a, short b, long c); -ushort __ovld __cnfn select(ushort a, ushort b, long c); -short2 __ovld __cnfn select(short2 a, short2 b, long2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, long2 c); -short3 __ovld __cnfn select(short3 a, short3 b, long3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, long3 c); -short4 __ovld __cnfn select(short4 a, short4 b, long4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, long4 c); -short8 __ovld __cnfn select(short8 a, short8 b, long8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, long8 c); -short16 __ovld __cnfn select(short16 a, short16 b, long16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, long16 c); -int __ovld __cnfn select(int a, int b, long c); -uint __ovld __cnfn select(uint a, uint b, long c); -int2 __ovld __cnfn select(int2 a, int2 b, long2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, long2 c); -int3 __ovld __cnfn select(int3 a, int3 b, long3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, long3 c); -int4 __ovld __cnfn select(int4 a, int4 b, long4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, long4 c); -int8 __ovld __cnfn select(int8 a, int8 b, long8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, long8 c); -int16 __ovld __cnfn select(int16 a, int16 b, long16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, long16 c); + long __ovld __cnfn select(long a, long b, long c); ulong __ovld __cnfn select(ulong a, ulong b, long c); 
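The float forms kept here pair float vectors with same-width int masks, which is exactly what vector comparisons produce, so the usual compare-then-select idiom still resolves against a surviving overload. A hedged OpenCL C illustration (names are placeholders):

__kernel void clamp_low(__global float4 *v, float lo)
{
    size_t i = get_global_id(0);
    /* A relational operator on float4 yields an int4 of 0 / -1 lanes,
       matching the retained select(float4, float4, int4) overload. */
    int4 below = v[i] < (float4)(lo);
    v[i] = select(v[i], (float4)(lo), below);   /* raise low lanes up to lo */
}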
long2 __ovld __cnfn select(long2 a, long2 b, long2 c); @@ -11592,12 +11441,7 @@ long8 __ovld __cnfn select(long8 a, long8 b, long8 c); ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c); long16 __ovld __cnfn select(long16 a, long16 b, long16 c); ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c); -float __ovld __cnfn select(float a, float b, long c); -float2 __ovld __cnfn select(float2 a, float2 b, long2 c); -float3 __ovld __cnfn select(float3 a, float3 b, long3 c); -float4 __ovld __cnfn select(float4 a, float4 b, long4 c); -float8 __ovld __cnfn select(float8 a, float8 b, long8 c); -float16 __ovld __cnfn select(float16 a, float16 b, long16 c); + char __ovld __cnfn select(char a, char b, uchar c); uchar __ovld __cnfn select(uchar a, uchar b, uchar c); char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c); @@ -11610,60 +11454,7 @@ char8 __ovld __cnfn select(char8 a, char8 b, uchar8 c); uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c); char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c); uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c); -short __ovld __cnfn select(short a, short b, uchar c); -ushort __ovld __cnfn select(ushort a, ushort b, uchar c); -short2 __ovld __cnfn select(short2 a, short2 b, uchar2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uchar2 c); -short3 __ovld __cnfn select(short3 a, short3 b, uchar3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uchar3 c); -short4 __ovld __cnfn select(short4 a, short4 b, uchar4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uchar4 c); -short8 __ovld __cnfn select(short8 a, short8 b, uchar8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uchar8 c); -short16 __ovld __cnfn select(short16 a, short16 b, uchar16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uchar16 c); -int __ovld __cnfn select(int a, int b, uchar c); -uint __ovld __cnfn select(uint a, uint b, uchar c); -int2 __ovld __cnfn select(int2 a, int2 b, uchar2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, uchar2 c); -int3 __ovld __cnfn select(int3 a, int3 b, uchar3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, uchar3 c); -int4 __ovld __cnfn select(int4 a, int4 b, uchar4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, uchar4 c); -int8 __ovld __cnfn select(int8 a, int8 b, uchar8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, uchar8 c); -int16 __ovld __cnfn select(int16 a, int16 b, uchar16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, uchar16 c); -long __ovld __cnfn select(long a, long b, uchar c); -ulong __ovld __cnfn select(ulong a, ulong b, uchar c); -long2 __ovld __cnfn select(long2 a, long2 b, uchar2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uchar2 c); -long3 __ovld __cnfn select(long3 a, long3 b, uchar3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uchar3 c); -long4 __ovld __cnfn select(long4 a, long4 b, uchar4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uchar4 c); -long8 __ovld __cnfn select(long8 a, long8 b, uchar8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uchar8 c); -long16 __ovld __cnfn select(long16 a, long16 b, uchar16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uchar16 c); -float __ovld __cnfn select(float a, float b, uchar c); -float2 __ovld __cnfn select(float2 a, float2 b, uchar2 c); -float3 __ovld __cnfn select(float3 a, float3 b, uchar3 c); -float4 __ovld __cnfn select(float4 a, float4 b, uchar4 c); -float8 __ovld __cnfn select(float8 a, float8 b, uchar8 c); -float16 __ovld __cnfn select(float16 a, float16 b, 
uchar16 c); -char __ovld __cnfn select(char a, char b, ushort c); -uchar __ovld __cnfn select(uchar a, uchar b, ushort c); -char2 __ovld __cnfn select(char2 a, char2 b, ushort2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ushort2 c); -char3 __ovld __cnfn select(char3 a, char3 b, ushort3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ushort3 c); -char4 __ovld __cnfn select(char4 a, char4 b, ushort4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ushort4 c); -char8 __ovld __cnfn select(char8 a, char8 b, ushort8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ushort8 c); -char16 __ovld __cnfn select(char16 a, char16 b, ushort16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ushort16 c); + short __ovld __cnfn select(short a, short b, ushort c); ushort __ovld __cnfn select(ushort a, ushort b, ushort c); short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c); @@ -11676,60 +11467,7 @@ short8 __ovld __cnfn select(short8 a, short8 b, ushort8 c); ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c); short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c); ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c); -int __ovld __cnfn select(int a, int b, ushort c); -uint __ovld __cnfn select(uint a, uint b, ushort c); -int2 __ovld __cnfn select(int2 a, int2 b, ushort2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, ushort2 c); -int3 __ovld __cnfn select(int3 a, int3 b, ushort3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, ushort3 c); -int4 __ovld __cnfn select(int4 a, int4 b, ushort4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, ushort4 c); -int8 __ovld __cnfn select(int8 a, int8 b, ushort8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, ushort8 c); -int16 __ovld __cnfn select(int16 a, int16 b, ushort16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, ushort16 c); -long __ovld __cnfn select(long a, long b, ushort c); -ulong __ovld __cnfn select(ulong a, ulong b, ushort c); -long2 __ovld __cnfn select(long2 a, long2 b, ushort2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ushort2 c); -long3 __ovld __cnfn select(long3 a, long3 b, ushort3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ushort3 c); -long4 __ovld __cnfn select(long4 a, long4 b, ushort4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ushort4 c); -long8 __ovld __cnfn select(long8 a, long8 b, ushort8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ushort8 c); -long16 __ovld __cnfn select(long16 a, long16 b, ushort16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ushort16 c); -float __ovld __cnfn select(float a, float b, ushort c); -float2 __ovld __cnfn select(float2 a, float2 b, ushort2 c); -float3 __ovld __cnfn select(float3 a, float3 b, ushort3 c); -float4 __ovld __cnfn select(float4 a, float4 b, ushort4 c); -float8 __ovld __cnfn select(float8 a, float8 b, ushort8 c); -float16 __ovld __cnfn select(float16 a, float16 b, ushort16 c); -char __ovld __cnfn select(char a, char b, uint c); -uchar __ovld __cnfn select(uchar a, uchar b, uint c); -char2 __ovld __cnfn select(char2 a, char2 b, uint2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uint2 c); -char3 __ovld __cnfn select(char3 a, char3 b, uint3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uint3 c); -char4 __ovld __cnfn select(char4 a, char4 b, uint4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uint4 c); -char8 __ovld __cnfn select(char8 a, char8 b, uint8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uint8 c); -char16 __ovld __cnfn select(char16 a, char16 
b, uint16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uint16 c); -short __ovld __cnfn select(short a, short b, uint c); -ushort __ovld __cnfn select(ushort a, ushort b, uint c); -short2 __ovld __cnfn select(short2 a, short2 b, uint2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uint2 c); -short3 __ovld __cnfn select(short3 a, short3 b, uint3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uint3 c); -short4 __ovld __cnfn select(short4 a, short4 b, uint4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uint4 c); -short8 __ovld __cnfn select(short8 a, short8 b, uint8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uint8 c); -short16 __ovld __cnfn select(short16 a, short16 b, uint16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uint16 c); + int __ovld __cnfn select(int a, int b, uint c); uint __ovld __cnfn select(uint a, uint b, uint c); int2 __ovld __cnfn select(int2 a, int2 b, uint2 c); @@ -11742,60 +11480,13 @@ int8 __ovld __cnfn select(int8 a, int8 b, uint8 c); uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c); int16 __ovld __cnfn select(int16 a, int16 b, uint16 c); uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c); -long __ovld __cnfn select(long a, long b, uint c); -ulong __ovld __cnfn select(ulong a, ulong b, uint c); -long2 __ovld __cnfn select(long2 a, long2 b, uint2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uint2 c); -long3 __ovld __cnfn select(long3 a, long3 b, uint3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uint3 c); -long4 __ovld __cnfn select(long4 a, long4 b, uint4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uint4 c); -long8 __ovld __cnfn select(long8 a, long8 b, uint8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uint8 c); -long16 __ovld __cnfn select(long16 a, long16 b, uint16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uint16 c); float __ovld __cnfn select(float a, float b, uint c); float2 __ovld __cnfn select(float2 a, float2 b, uint2 c); float3 __ovld __cnfn select(float3 a, float3 b, uint3 c); float4 __ovld __cnfn select(float4 a, float4 b, uint4 c); float8 __ovld __cnfn select(float8 a, float8 b, uint8 c); float16 __ovld __cnfn select(float16 a, float16 b, uint16 c); -char __ovld __cnfn select(char a, char b, ulong c); -uchar __ovld __cnfn select(uchar a, uchar b, ulong c); -char2 __ovld __cnfn select(char2 a, char2 b, ulong2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ulong2 c); -char3 __ovld __cnfn select(char3 a, char3 b, ulong3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ulong3 c); -char4 __ovld __cnfn select(char4 a, char4 b, ulong4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ulong4 c); -char8 __ovld __cnfn select(char8 a, char8 b, ulong8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ulong8 c); -char16 __ovld __cnfn select(char16 a, char16 b, ulong16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ulong16 c); -short __ovld __cnfn select(short a, short b, ulong c); -ushort __ovld __cnfn select(ushort a, ushort b, ulong c); -short2 __ovld __cnfn select(short2 a, short2 b, ulong2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ulong2 c); -short3 __ovld __cnfn select(short3 a, short3 b, ulong3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ulong3 c); -short4 __ovld __cnfn select(short4 a, short4 b, ulong4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ulong4 c); -short8 __ovld __cnfn select(short8 a, short8 b, ulong8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ulong8 c); 
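The overloads being deleted in this hunk mixed element widths, for example a short vector steered by a ulong mask. Code that relied on them now has to narrow the mask so its width matches the data; one possible migration, sketched in OpenCL C with made-up names:

__kernel void pick(__global short8 *out,
                   __global const short8 *a,
                   __global const short8 *b,
                   __global const ulong8 *m)
{
    size_t i = get_global_id(0);
    /* select(short8, short8, ulong8) is gone. Shift bit 63 of each ulong
       lane down into bit 15 before converting, so the MSB test the old
       overload performed still inspects the same bit. */
    short8 mask = convert_short8(m[i] >> 48);
    out[i] = select(a[i], b[i], mask);
}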
-short16 __ovld __cnfn select(short16 a, short16 b, ulong16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ulong16 c); -int __ovld __cnfn select(int a, int b, ulong c); -uint __ovld __cnfn select(uint a, uint b, ulong c); -int2 __ovld __cnfn select(int2 a, int2 b, ulong2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, ulong2 c); -int3 __ovld __cnfn select(int3 a, int3 b, ulong3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, ulong3 c); -int4 __ovld __cnfn select(int4 a, int4 b, ulong4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, ulong4 c); -int8 __ovld __cnfn select(int8 a, int8 b, ulong8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, ulong8 c); -int16 __ovld __cnfn select(int16 a, int16 b, ulong16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, ulong16 c); + long __ovld __cnfn select(long a, long b, ulong c); ulong __ovld __cnfn select(ulong a, ulong b, ulong c); long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c); @@ -11808,12 +11499,7 @@ long8 __ovld __cnfn select(long8 a, long8 b, ulong8 c); ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c); long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c); ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c); -float __ovld __cnfn select(float a, float b, ulong c); -float2 __ovld __cnfn select(float2 a, float2 b, ulong2 c); -float3 __ovld __cnfn select(float3 a, float3 b, ulong3 c); -float4 __ovld __cnfn select(float4 a, float4 b, ulong4 c); -float8 __ovld __cnfn select(float8 a, float8 b, ulong8 c); -float16 __ovld __cnfn select(float16 a, float16 b, ulong16 c); + #ifdef cl_khr_fp64 double __ovld __cnfn select(double a, double b, long c); double2 __ovld __cnfn select(double2 a, double2 b, long2 c); @@ -13141,13 +12827,14 @@ void __ovld __conv barrier(cl_mem_fence_flags flags); #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -typedef enum memory_scope -{ - memory_scope_work_item, - memory_scope_work_group, - memory_scope_device, - memory_scope_all_svm_devices, - memory_scope_sub_group +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif } memory_scope; void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope); @@ -13952,11 +13639,11 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v // enum values aligned with what clang uses in EmitAtomicExpr() typedef enum memory_order { - memory_order_relaxed, - memory_order_acquire, - memory_order_release, - memory_order_acq_rel, - memory_order_seq_cst + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST } memory_order; // double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics @@ -16199,6 +15886,313 @@ double __ovld __conv sub_group_scan_inclusive_max(double x); #endif //cl_khr_subgroups cl_intel_subgroups +#if defined(cl_intel_subgroups) +// Intel-Specific Sub Group Functions +float __ovld __conv intel_sub_group_shuffle( float x, uint c ); +float2 __ovld __conv intel_sub_group_shuffle( float2 x, uint c ); +float3 __ovld __conv intel_sub_group_shuffle( 
float3 x, uint c ); +float4 __ovld __conv intel_sub_group_shuffle( float4 x, uint c ); +float8 __ovld __conv intel_sub_group_shuffle( float8 x, uint c ); +float16 __ovld __conv intel_sub_group_shuffle( float16 x, uint c ); + +int __ovld __conv intel_sub_group_shuffle( int x, uint c ); +int2 __ovld __conv intel_sub_group_shuffle( int2 x, uint c ); +int3 __ovld __conv intel_sub_group_shuffle( int3 x, uint c ); +int4 __ovld __conv intel_sub_group_shuffle( int4 x, uint c ); +int8 __ovld __conv intel_sub_group_shuffle( int8 x, uint c ); +int16 __ovld __conv intel_sub_group_shuffle( int16 x, uint c ); + +uint __ovld __conv intel_sub_group_shuffle( uint x, uint c ); +uint2 __ovld __conv intel_sub_group_shuffle( uint2 x, uint c ); +uint3 __ovld __conv intel_sub_group_shuffle( uint3 x, uint c ); +uint4 __ovld __conv intel_sub_group_shuffle( uint4 x, uint c ); +uint8 __ovld __conv intel_sub_group_shuffle( uint8 x, uint c ); +uint16 __ovld __conv intel_sub_group_shuffle( uint16 x, uint c ); + +long __ovld __conv intel_sub_group_shuffle( long x, uint c ); +ulong __ovld __conv intel_sub_group_shuffle( ulong x, uint c ); + +float __ovld __conv intel_sub_group_shuffle_down( float cur, float next, uint c ); +float2 __ovld __conv intel_sub_group_shuffle_down( float2 cur, float2 next, uint c ); +float3 __ovld __conv intel_sub_group_shuffle_down( float3 cur, float3 next, uint c ); +float4 __ovld __conv intel_sub_group_shuffle_down( float4 cur, float4 next, uint c ); +float8 __ovld __conv intel_sub_group_shuffle_down( float8 cur, float8 next, uint c ); +float16 __ovld __conv intel_sub_group_shuffle_down( float16 cur, float16 next, uint c ); + +int __ovld __conv intel_sub_group_shuffle_down( int cur, int next, uint c ); +int2 __ovld __conv intel_sub_group_shuffle_down( int2 cur, int2 next, uint c ); +int3 __ovld __conv intel_sub_group_shuffle_down( int3 cur, int3 next, uint c ); +int4 __ovld __conv intel_sub_group_shuffle_down( int4 cur, int4 next, uint c ); +int8 __ovld __conv intel_sub_group_shuffle_down( int8 cur, int8 next, uint c ); +int16 __ovld __conv intel_sub_group_shuffle_down( int16 cur, int16 next, uint c ); + +uint __ovld __conv intel_sub_group_shuffle_down( uint cur, uint next, uint c ); +uint2 __ovld __conv intel_sub_group_shuffle_down( uint2 cur, uint2 next, uint c ); +uint3 __ovld __conv intel_sub_group_shuffle_down( uint3 cur, uint3 next, uint c ); +uint4 __ovld __conv intel_sub_group_shuffle_down( uint4 cur, uint4 next, uint c ); +uint8 __ovld __conv intel_sub_group_shuffle_down( uint8 cur, uint8 next, uint c ); +uint16 __ovld __conv intel_sub_group_shuffle_down( uint16 cur, uint16 next, uint c ); + +long __ovld __conv intel_sub_group_shuffle_down( long prev, long cur, uint c ); +ulong __ovld __conv intel_sub_group_shuffle_down( ulong prev, ulong cur, uint c ); + +float __ovld __conv intel_sub_group_shuffle_up( float prev, float cur, uint c ); +float2 __ovld __conv intel_sub_group_shuffle_up( float2 prev, float2 cur, uint c ); +float3 __ovld __conv intel_sub_group_shuffle_up( float3 prev, float3 cur, uint c ); +float4 __ovld __conv intel_sub_group_shuffle_up( float4 prev, float4 cur, uint c ); +float8 __ovld __conv intel_sub_group_shuffle_up( float8 prev, float8 cur, uint c ); +float16 __ovld __conv intel_sub_group_shuffle_up( float16 prev, float16 cur, uint c ); + +int __ovld __conv intel_sub_group_shuffle_up( int prev, int cur, uint c ); +int2 __ovld __conv intel_sub_group_shuffle_up( int2 prev, int2 cur, uint c ); +int3 __ovld __conv intel_sub_group_shuffle_up( int3 prev, int3 cur, uint c 
); +int4 __ovld __conv intel_sub_group_shuffle_up( int4 prev, int4 cur, uint c ); +int8 __ovld __conv intel_sub_group_shuffle_up( int8 prev, int8 cur, uint c ); +int16 __ovld __conv intel_sub_group_shuffle_up( int16 prev, int16 cur, uint c ); + +uint __ovld __conv intel_sub_group_shuffle_up( uint prev, uint cur, uint c ); +uint2 __ovld __conv intel_sub_group_shuffle_up( uint2 prev, uint2 cur, uint c ); +uint3 __ovld __conv intel_sub_group_shuffle_up( uint3 prev, uint3 cur, uint c ); +uint4 __ovld __conv intel_sub_group_shuffle_up( uint4 prev, uint4 cur, uint c ); +uint8 __ovld __conv intel_sub_group_shuffle_up( uint8 prev, uint8 cur, uint c ); +uint16 __ovld __conv intel_sub_group_shuffle_up( uint16 prev, uint16 cur, uint c ); + +long __ovld __conv intel_sub_group_shuffle_up( long prev, long cur, uint c ); +ulong __ovld __conv intel_sub_group_shuffle_up( ulong prev, ulong cur, uint c ); + +float __ovld __conv intel_sub_group_shuffle_xor( float x, uint c ); +float2 __ovld __conv intel_sub_group_shuffle_xor( float2 x, uint c ); +float3 __ovld __conv intel_sub_group_shuffle_xor( float3 x, uint c ); +float4 __ovld __conv intel_sub_group_shuffle_xor( float4 x, uint c ); +float8 __ovld __conv intel_sub_group_shuffle_xor( float8 x, uint c ); +float16 __ovld __conv intel_sub_group_shuffle_xor( float16 x, uint c ); + +int __ovld __conv intel_sub_group_shuffle_xor( int x, uint c ); +int2 __ovld __conv intel_sub_group_shuffle_xor( int2 x, uint c ); +int3 __ovld __conv intel_sub_group_shuffle_xor( int3 x, uint c ); +int4 __ovld __conv intel_sub_group_shuffle_xor( int4 x, uint c ); +int8 __ovld __conv intel_sub_group_shuffle_xor( int8 x, uint c ); +int16 __ovld __conv intel_sub_group_shuffle_xor( int16 x, uint c ); + +uint __ovld __conv intel_sub_group_shuffle_xor( uint x, uint c ); +uint2 __ovld __conv intel_sub_group_shuffle_xor( uint2 x, uint c ); +uint3 __ovld __conv intel_sub_group_shuffle_xor( uint3 x, uint c ); +uint4 __ovld __conv intel_sub_group_shuffle_xor( uint4 x, uint c ); +uint8 __ovld __conv intel_sub_group_shuffle_xor( uint8 x, uint c ); +uint16 __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c ); + +long __ovld __conv intel_sub_group_shuffle_xor( long x, uint c ); +ulong __ovld __conv intel_sub_group_shuffle_xor( ulong x, uint c ); + +uint __ovld __conv intel_sub_group_block_read( read_only image2d_t image, int2 coord ); +uint2 __ovld __conv intel_sub_group_block_read2( read_only image2d_t image, int2 coord ); +uint4 __ovld __conv intel_sub_group_block_read4( read_only image2d_t image, int2 coord ); +uint8 __ovld __conv intel_sub_group_block_read8( read_only image2d_t image, int2 coord ); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +uint __ovld __conv intel_sub_group_block_read(read_write image2d_t image, int2 coord); +uint2 __ovld __conv intel_sub_group_block_read2(read_write image2d_t image, int2 coord); +uint4 __ovld __conv intel_sub_group_block_read4(read_write image2d_t image, int2 coord); +uint8 __ovld __conv intel_sub_group_block_read8(read_write image2d_t image, int2 coord); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +uint __ovld __conv intel_sub_group_block_read( const __global uint* p ); +uint2 __ovld __conv intel_sub_group_block_read2( const __global uint* p ); +uint4 __ovld __conv intel_sub_group_block_read4( const __global uint* p ); +uint8 __ovld __conv intel_sub_group_block_read8( const __global uint* p ); + +void __ovld __conv intel_sub_group_block_write(write_only image2d_t image, int2 coord, uint data); +void __ovld __conv 
intel_sub_group_block_write2(write_only image2d_t image, int2 coord, uint2 data); +void __ovld __conv intel_sub_group_block_write4(write_only image2d_t image, int2 coord, uint4 data); +void __ovld __conv intel_sub_group_block_write8(write_only image2d_t image, int2 coord, uint8 data); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +void __ovld __conv intel_sub_group_block_write(read_write image2d_t image, int2 coord, uint data); +void __ovld __conv intel_sub_group_block_write2(read_write image2d_t image, int2 coord, uint2 data); +void __ovld __conv intel_sub_group_block_write4(read_write image2d_t image, int2 coord, uint4 data); +void __ovld __conv intel_sub_group_block_write8(read_write image2d_t image, int2 coord, uint8 data); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +void __ovld __conv intel_sub_group_block_write( __global uint* p, uint data ); +void __ovld __conv intel_sub_group_block_write2( __global uint* p, uint2 data ); +void __ovld __conv intel_sub_group_block_write4( __global uint* p, uint4 data ); +void __ovld __conv intel_sub_group_block_write8( __global uint* p, uint8 data ); + +#ifdef cl_khr_fp16 +half __ovld __conv intel_sub_group_shuffle( half x, uint c ); +half __ovld __conv intel_sub_group_shuffle_down( half prev, half cur, uint c ); +half __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c ); +half __ovld __conv intel_sub_group_shuffle_xor( half x, uint c ); +#endif + +#if defined(cl_khr_fp64) +double __ovld __conv intel_sub_group_shuffle( double x, uint c ); +double __ovld __conv intel_sub_group_shuffle_down( double prev, double cur, uint c ); +double __ovld __conv intel_sub_group_shuffle_up( double prev, double cur, uint c ); +double __ovld __conv intel_sub_group_shuffle_xor( double x, uint c ); +#endif + +#endif //cl_intel_subgroups + +#if defined(cl_intel_subgroups_short) +short __ovld __conv intel_sub_group_broadcast( short x, uint sub_group_local_id ); +short2 __ovld __conv intel_sub_group_broadcast( short2 x, uint sub_group_local_id ); +short3 __ovld __conv intel_sub_group_broadcast( short3 x, uint sub_group_local_id ); +short4 __ovld __conv intel_sub_group_broadcast( short4 x, uint sub_group_local_id ); +short8 __ovld __conv intel_sub_group_broadcast( short8 x, uint sub_group_local_id ); + +ushort __ovld __conv intel_sub_group_broadcast( ushort x, uint sub_group_local_id ); +ushort2 __ovld __conv intel_sub_group_broadcast( ushort2 x, uint sub_group_local_id ); +ushort3 __ovld __conv intel_sub_group_broadcast( ushort3 x, uint sub_group_local_id ); +ushort4 __ovld __conv intel_sub_group_broadcast( ushort4 x, uint sub_group_local_id ); +ushort8 __ovld __conv intel_sub_group_broadcast( ushort8 x, uint sub_group_local_id ); + +short __ovld __conv intel_sub_group_shuffle( short x, uint c ); +short2 __ovld __conv intel_sub_group_shuffle( short2 x, uint c ); +short3 __ovld __conv intel_sub_group_shuffle( short3 x, uint c ); +short4 __ovld __conv intel_sub_group_shuffle( short4 x, uint c ); +short8 __ovld __conv intel_sub_group_shuffle( short8 x, uint c ); +short16 __ovld __conv intel_sub_group_shuffle( short16 x, uint c); + +ushort __ovld __conv intel_sub_group_shuffle( ushort x, uint c ); +ushort2 __ovld __conv intel_sub_group_shuffle( ushort2 x, uint c ); +ushort3 __ovld __conv intel_sub_group_shuffle( ushort3 x, uint c ); +ushort4 __ovld __conv intel_sub_group_shuffle( ushort4 x, uint c ); +ushort8 __ovld __conv intel_sub_group_shuffle( ushort8 x, uint c ); +ushort16 __ovld __conv intel_sub_group_shuffle( ushort16 x, uint c ); + +short 
__ovld __conv intel_sub_group_shuffle_down( short cur, short next, uint c ); +short2 __ovld __conv intel_sub_group_shuffle_down( short2 cur, short2 next, uint c ); +short3 __ovld __conv intel_sub_group_shuffle_down( short3 cur, short3 next, uint c ); +short4 __ovld __conv intel_sub_group_shuffle_down( short4 cur, short4 next, uint c ); +short8 __ovld __conv intel_sub_group_shuffle_down( short8 cur, short8 next, uint c ); +short16 __ovld __conv intel_sub_group_shuffle_down( short16 cur, short16 next, uint c ); + +ushort __ovld __conv intel_sub_group_shuffle_down( ushort cur, ushort next, uint c ); +ushort2 __ovld __conv intel_sub_group_shuffle_down( ushort2 cur, ushort2 next, uint c ); +ushort3 __ovld __conv intel_sub_group_shuffle_down( ushort3 cur, ushort3 next, uint c ); +ushort4 __ovld __conv intel_sub_group_shuffle_down( ushort4 cur, ushort4 next, uint c ); +ushort8 __ovld __conv intel_sub_group_shuffle_down( ushort8 cur, ushort8 next, uint c ); +ushort16 __ovld __conv intel_sub_group_shuffle_down( ushort16 cur, ushort16 next, uint c ); + +short __ovld __conv intel_sub_group_shuffle_up( short cur, short next, uint c ); +short2 __ovld __conv intel_sub_group_shuffle_up( short2 cur, short2 next, uint c ); +short3 __ovld __conv intel_sub_group_shuffle_up( short3 cur, short3 next, uint c ); +short4 __ovld __conv intel_sub_group_shuffle_up( short4 cur, short4 next, uint c ); +short8 __ovld __conv intel_sub_group_shuffle_up( short8 cur, short8 next, uint c ); +short16 __ovld __conv intel_sub_group_shuffle_up( short16 cur, short16 next, uint c ); + +ushort __ovld __conv intel_sub_group_shuffle_up( ushort cur, ushort next, uint c ); +ushort2 __ovld __conv intel_sub_group_shuffle_up( ushort2 cur, ushort2 next, uint c ); +ushort3 __ovld __conv intel_sub_group_shuffle_up( ushort3 cur, ushort3 next, uint c ); +ushort4 __ovld __conv intel_sub_group_shuffle_up( ushort4 cur, ushort4 next, uint c ); +ushort8 __ovld __conv intel_sub_group_shuffle_up( ushort8 cur, ushort8 next, uint c ); +ushort16 __ovld __conv intel_sub_group_shuffle_up( ushort16 cur, ushort16 next, uint c ); + +short __ovld __conv intel_sub_group_shuffle_xor( short x, uint c ); +short2 __ovld __conv intel_sub_group_shuffle_xor( short2 x, uint c ); +short3 __ovld __conv intel_sub_group_shuffle_xor( short3 x, uint c ); +short4 __ovld __conv intel_sub_group_shuffle_xor( short4 x, uint c ); +short8 __ovld __conv intel_sub_group_shuffle_xor( short8 x, uint c ); +short16 __ovld __conv intel_sub_group_shuffle_xor( short16 x, uint c ); + +ushort __ovld __conv intel_sub_group_shuffle_xor( ushort x, uint c ); +ushort2 __ovld __conv intel_sub_group_shuffle_xor( ushort2 x, uint c ); +ushort3 __ovld __conv intel_sub_group_shuffle_xor( ushort3 x, uint c ); +ushort4 __ovld __conv intel_sub_group_shuffle_xor( ushort4 x, uint c ); +ushort8 __ovld __conv intel_sub_group_shuffle_xor( ushort8 x, uint c ); +ushort16 __ovld __conv intel_sub_group_shuffle_xor( ushort16 x, uint c ); + +short __ovld __conv intel_sub_group_reduce_add( short x ); +ushort __ovld __conv intel_sub_group_reduce_add( ushort x ); +short __ovld __conv intel_sub_group_reduce_min( short x ); +ushort __ovld __conv intel_sub_group_reduce_min( ushort x ); +short __ovld __conv intel_sub_group_reduce_max( short x ); +ushort __ovld __conv intel_sub_group_reduce_max( ushort x ); + +short __ovld __conv intel_sub_group_scan_exclusive_add( short x ); +ushort __ovld __conv intel_sub_group_scan_exclusive_add( ushort x ); +short __ovld __conv intel_sub_group_scan_exclusive_min( short x ); +ushort 
__ovld __conv intel_sub_group_scan_exclusive_min( ushort x ); +short __ovld __conv intel_sub_group_scan_exclusive_max( short x ); +ushort __ovld __conv intel_sub_group_scan_exclusive_max( ushort x ); + +short __ovld __conv intel_sub_group_scan_inclusive_add( short x ); +ushort __ovld __conv intel_sub_group_scan_inclusive_add( ushort x ); +short __ovld __conv intel_sub_group_scan_inclusive_min( short x ); +ushort __ovld __conv intel_sub_group_scan_inclusive_min( ushort x ); +short __ovld __conv intel_sub_group_scan_inclusive_max( short x ); +ushort __ovld __conv intel_sub_group_scan_inclusive_max( ushort x ); + +uint __ovld __conv intel_sub_group_block_read_ui( read_only image2d_t image, int2 byte_coord ); +uint2 __ovld __conv intel_sub_group_block_read_ui2( read_only image2d_t image, int2 byte_coord ); +uint4 __ovld __conv intel_sub_group_block_read_ui4( read_only image2d_t image, int2 byte_coord ); +uint8 __ovld __conv intel_sub_group_block_read_ui8( read_only image2d_t image, int2 byte_coord ); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +uint __ovld __conv intel_sub_group_block_read_ui( read_write image2d_t image, int2 byte_coord ); +uint2 __ovld __conv intel_sub_group_block_read_ui2( read_write image2d_t image, int2 byte_coord ); +uint4 __ovld __conv intel_sub_group_block_read_ui4( read_write image2d_t image, int2 byte_coord ); +uint8 __ovld __conv intel_sub_group_block_read_ui8( read_write image2d_t image, int2 byte_coord ); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +uint __ovld __conv intel_sub_group_block_read_ui( const __global uint* p ); +uint2 __ovld __conv intel_sub_group_block_read_ui2( const __global uint* p ); +uint4 __ovld __conv intel_sub_group_block_read_ui4( const __global uint* p ); +uint8 __ovld __conv intel_sub_group_block_read_ui8( const __global uint* p ); + +void __ovld __conv intel_sub_group_block_write_ui( read_only image2d_t image, int2 byte_coord, uint data ); +void __ovld __conv intel_sub_group_block_write_ui2( read_only image2d_t image, int2 byte_coord, uint2 data ); +void __ovld __conv intel_sub_group_block_write_ui4( read_only image2d_t image, int2 byte_coord, uint4 data ); +void __ovld __conv intel_sub_group_block_write_ui8( read_only image2d_t image, int2 byte_coord, uint8 data ); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +void __ovld __conv intel_sub_group_block_write_ui( read_write image2d_t image, int2 byte_coord, uint data ); +void __ovld __conv intel_sub_group_block_write_ui2( read_write image2d_t image, int2 byte_coord, uint2 data ); +void __ovld __conv intel_sub_group_block_write_ui4( read_write image2d_t image, int2 byte_coord, uint4 data ); +void __ovld __conv intel_sub_group_block_write_ui8( read_write image2d_t image, int2 byte_coord, uint8 data ); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +void __ovld __conv intel_sub_group_block_write_ui( __global uint* p, uint data ); +void __ovld __conv intel_sub_group_block_write_ui2( __global uint* p, uint2 data ); +void __ovld __conv intel_sub_group_block_write_ui4( __global uint* p, uint4 data ); +void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint8 data ); + +ushort __ovld __conv intel_sub_group_block_read_us( read_only image2d_t image, int2 coord ); +ushort2 __ovld __conv intel_sub_group_block_read_us2( read_only image2d_t image, int2 coord ); +ushort4 __ovld __conv intel_sub_group_block_read_us4( read_only image2d_t image, int2 coord ); +ushort8 __ovld __conv intel_sub_group_block_read_us8( read_only image2d_t image, int2 coord ); + +#if 
(__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +ushort __ovld __conv intel_sub_group_block_read_us(read_write image2d_t image, int2 coord); +ushort2 __ovld __conv intel_sub_group_block_read_us2(read_write image2d_t image, int2 coord); +ushort4 __ovld __conv intel_sub_group_block_read_us4(read_write image2d_t image, int2 coord); +ushort8 __ovld __conv intel_sub_group_block_read_us8(read_write image2d_t image, int2 coord); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +ushort __ovld __conv intel_sub_group_block_read_us( const __global ushort* p ); +ushort2 __ovld __conv intel_sub_group_block_read_us2( const __global ushort* p ); +ushort4 __ovld __conv intel_sub_group_block_read_us4( const __global ushort* p ); +ushort8 __ovld __conv intel_sub_group_block_read_us8( const __global ushort* p ); + +void __ovld __conv intel_sub_group_block_write_us(write_only image2d_t image, int2 coord, ushort data); +void __ovld __conv intel_sub_group_block_write_us2(write_only image2d_t image, int2 coord, ushort2 data); +void __ovld __conv intel_sub_group_block_write_us4(write_only image2d_t image, int2 coord, ushort4 data); +void __ovld __conv intel_sub_group_block_write_us8(write_only image2d_t image, int2 coord, ushort8 data); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +void __ovld __conv intel_sub_group_block_write_us(read_write image2d_t image, int2 coord, ushort data); +void __ovld __conv intel_sub_group_block_write_us2(read_write image2d_t image, int2 coord, ushort2 data); +void __ovld __conv intel_sub_group_block_write_us4(read_write image2d_t image, int2 coord, ushort4 data); +void __ovld __conv intel_sub_group_block_write_us8(read_write image2d_t image, int2 coord, ushort8 data); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +void __ovld __conv intel_sub_group_block_write_us( __global ushort* p, ushort data ); +void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data ); +void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data ); +void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data ); +#endif // cl_intel_subgroups_short + #ifdef cl_amd_media_ops uint __ovld amd_bitalign(uint a, uint b, uint c); uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c); diff --git a/c_headers/pmmintrin.h b/c_headers/pmmintrin.h index 559ece2e39..7ec08a1bcb 100644 --- a/c_headers/pmmintrin.h +++ b/c_headers/pmmintrin.h @@ -115,8 +115,8 @@ _mm_hsub_ps(__m128 __a, __m128 __b) return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); } -/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit -/// vector of [4 x float] to float values stored in a 128-bit vector of +/// \brief Moves and duplicates odd-indexed values from a 128-bit vector +/// of [4 x float] to float values stored in a 128-bit vector of /// [4 x float]. /// /// \headerfile @@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); } -/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of +/// \brief Duplicates even-indexed values from a 128-bit vector of /// [4 x float] to float values stored in a 128-bit vector of [4 x float]. /// /// \headerfile diff --git a/c_headers/smmintrin.h b/c_headers/smmintrin.h index c2fa5a452b..e02775cea3 100644 --- a/c_headers/smmintrin.h +++ b/c_headers/smmintrin.h @@ -648,7 +648,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) /// input vectors are used as an input for dot product; otherwise that input /// is treated as zero. 
Bits [1:0] determine which elements of the result /// will receive a copy of the final dot product, with bit [0] corresponding -/// to the lowest element and bit [3] corresponding to the highest element of +/// to the lowest element and bit [1] corresponding to the highest element of /// each [2 x double] vector. If a bit is set, the dot product is returned in /// the corresponding element; otherwise that element is set to zero. #define _mm_dp_pd(X, Y, M) __extension__ ({\ @@ -866,8 +866,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n /// Bits[3:0]: If any of these bits are set, the corresponding result /// element is cleared. -/// \returns A 128-bit vector of [4 x float] containing the copied single- -/// precision floating point elements from the operands. +/// \returns A 128-bit vector of [4 x float] containing the copied +/// single-precision floating point elements from the operands. #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) /// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and diff --git a/c_headers/stdbool.h b/c_headers/stdbool.h index 0467893f34..5cb66b55d0 100644 --- a/c_headers/stdbool.h +++ b/c_headers/stdbool.h @@ -32,12 +32,15 @@ #define true 1 #define false 0 #elif defined(__GNUC__) && !defined(__STRICT_ANSI__) -/* Define _Bool, bool, false, true as a GNU extension. */ +/* Define _Bool as a GNU extension. */ #define _Bool bool +#if __cplusplus < 201103L +/* For C++98, define bool, false, true as a GNU extension. */ #define bool bool #define false false #define true true #endif +#endif #define __bool_true_false_are_defined 1 diff --git a/c_headers/unwind.h b/c_headers/unwind.h index 4f74a34787..345fa4d0c1 100644 --- a/c_headers/unwind.h +++ b/c_headers/unwind.h @@ -76,7 +76,13 @@ typedef intptr_t _sleb128_t; typedef uintptr_t _uleb128_t; struct _Unwind_Context; +#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__)) +struct _Unwind_Control_Block; +typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */ +#else struct _Unwind_Exception; +typedef struct _Unwind_Exception _Unwind_Exception; +#endif typedef enum { _URC_NO_REASON = 0, #if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \ @@ -109,8 +115,42 @@ typedef enum { } _Unwind_Action; typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code, - struct _Unwind_Exception *); + _Unwind_Exception *); +#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__)) +typedef struct _Unwind_Control_Block _Unwind_Control_Block; +typedef uint32_t _Unwind_EHT_Header; + +struct _Unwind_Control_Block { + uint64_t exception_class; + void (*exception_cleanup)(_Unwind_Reason_Code, _Unwind_Control_Block *); + /* unwinder cache (private fields for the unwinder's use) */ + struct { + uint32_t reserved1; /* forced unwind stop function, 0 if not forced */ + uint32_t reserved2; /* personality routine */ + uint32_t reserved3; /* callsite */ + uint32_t reserved4; /* forced unwind stop argument */ + uint32_t reserved5; + } unwinder_cache; + /* propagation barrier cache (valid after phase 1) */ + struct { + uint32_t sp; + uint32_t bitpattern[5]; + } barrier_cache; + /* cleanup cache (preserved over cleanup) */ + struct { + uint32_t bitpattern[4]; + } cleanup_cache; + /* personality cache (for personality's benefit) */ + struct { + uint32_t fnstart; /* function start address */ + _Unwind_EHT_Header *ehtp; /* pointer to EHT entry header 
word */ + uint32_t additional; /* additional data */ + uint32_t reserved1; + } pr_cache; + long long int : 0; /* force alignment of next item to 8-byte boundary */ +} __attribute__((__aligned__(8))); +#else struct _Unwind_Exception { _Unwind_Exception_Class exception_class; _Unwind_Exception_Cleanup_Fn exception_cleanup; @@ -120,23 +160,24 @@ struct _Unwind_Exception { * aligned". GCC has interpreted this to mean "use the maximum useful * alignment for the target"; so do we. */ } __attribute__((__aligned__)); +#endif typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action, _Unwind_Exception_Class, - struct _Unwind_Exception *, + _Unwind_Exception *, struct _Unwind_Context *, void *); -typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)( - int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *, - struct _Unwind_Context *); +typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(int, _Unwind_Action, + _Unwind_Exception_Class, + _Unwind_Exception *, + struct _Unwind_Context *); typedef _Unwind_Personality_Fn __personality_routine; typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *, void *); -#if defined(__arm__) && !defined(__APPLE__) - +#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__)) typedef enum { _UVRSC_CORE = 0, /* integer register */ _UVRSC_VFP = 1, /* vfp */ @@ -158,14 +199,12 @@ typedef enum { _UVRSR_FAILED = 2 } _Unwind_VRS_Result; -#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__) typedef uint32_t _Unwind_State; #define _US_VIRTUAL_UNWIND_FRAME ((_Unwind_State)0) #define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1) #define _US_UNWIND_FRAME_RESUME ((_Unwind_State)2) #define _US_ACTION_MASK ((_Unwind_State)3) #define _US_FORCE_UNWIND ((_Unwind_State)8) -#endif _Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context, _Unwind_VRS_RegClass __regclass, @@ -224,13 +263,12 @@ _Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *); /* DWARF EH functions; currently not available on Darwin/ARM */ #if !defined(__APPLE__) || !defined(__arm__) - -_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *); -_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *, - _Unwind_Stop_Fn, void *); -void _Unwind_DeleteException(struct _Unwind_Exception *); -void _Unwind_Resume(struct _Unwind_Exception *); -_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *); +_Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *); +_Unwind_Reason_Code _Unwind_ForcedUnwind(_Unwind_Exception *, _Unwind_Stop_Fn, + void *); +void _Unwind_DeleteException(_Unwind_Exception *); +void _Unwind_Resume(_Unwind_Exception *); +_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(_Unwind_Exception *); #endif @@ -241,11 +279,11 @@ typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t; void _Unwind_SjLj_Register(_Unwind_FunctionContext_t); void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t); -_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *); -_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *, +_Unwind_Reason_Code _Unwind_SjLj_RaiseException(_Unwind_Exception *); +_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(_Unwind_Exception *, _Unwind_Stop_Fn, void *); -void _Unwind_SjLj_Resume(struct _Unwind_Exception *); -_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *); +void _Unwind_SjLj_Resume(_Unwind_Exception *); +_Unwind_Reason_Code 
_Unwind_SjLj_Resume_or_Rethrow(_Unwind_Exception *); void *_Unwind_FindEnclosingFunction(void *); diff --git a/c_headers/vaesintrin.h b/c_headers/vaesintrin.h new file mode 100644 index 0000000000..efbb8a5652 --- /dev/null +++ b/c_headers/vaesintrin.h @@ -0,0 +1,98 @@ +/*===------------------ vaesintrin.h - VAES intrinsics ---------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __VAESINTRIN_H +#define __VAESINTRIN_H + +/* Default attributes for YMM forms. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"))) + +/* Default attributes for ZMM forms. 
*/ +#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"))) + + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesenc_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesenc256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesenc_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesenc512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesdec_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesdec256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesdec_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesdec512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesenclast_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesenclast_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesdeclast_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesdeclast_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A, + (__v8di) __B); +} + + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_F + +#endif diff --git a/c_headers/vpclmulqdqintrin.h b/c_headers/vpclmulqdqintrin.h new file mode 100644 index 0000000000..21cda22210 --- /dev/null +++ b/c_headers/vpclmulqdqintrin.h @@ -0,0 +1,42 @@ +/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
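(Illustrative aside, not part of the patch: a minimal sketch of how the 256-bit VAES intrinsic declared in the new c_headers/vaesintrin.h might be exercised through <immintrin.h>. The main() below is hypothetical and assumes a compiler and CPU with VAES support, e.g. built with -mvaes plus AVX, and that the compiler predefines __VAES__ when the feature is enabled.)

#include <immintrin.h>
#include <stdio.h>

int main(void) {
#if defined(__VAES__)
    /* _mm256_aesenc_epi128 performs one AES encryption round on each
       128-bit lane of a 256-bit vector, using the matching lane of the
       second operand as the round key. */
    __m256i state = _mm256_set1_epi64x(0x0123456789abcdefLL);
    __m256i rkey  = _mm256_set1_epi64x(0x0f0e0d0c0b0a0908LL);
    __m256i out   = _mm256_aesenc_epi128(state, rkey);

    unsigned long long lanes[4];
    _mm256_storeu_si256((__m256i *)lanes, out);
    printf("lane0 = %016llx\n", lanes[0]);
#else
    puts("rebuild with VAES enabled (e.g. -mvaes) to run this sketch");
#endif
    return 0;
}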
+#endif + +#ifndef __VPCLMULQDQINTRIN_H +#define __VPCLMULQDQINTRIN_H + +#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (char)(I)); }) + +#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (char)(I)); }) + +#endif // __VPCLMULQDQINTRIN_H + diff --git a/c_headers/xmmintrin.h b/c_headers/xmmintrin.h index bbc2117b4e..279c0275d9 100644 --- a/c_headers/xmmintrin.h +++ b/c_headers/xmmintrin.h @@ -2035,9 +2035,11 @@ _mm_storer_ps(float *__p, __m128 __a) _mm_store_ps(__p, __a); } -#define _MM_HINT_T0 3 -#define _MM_HINT_T1 2 -#define _MM_HINT_T2 1 +#define _MM_HINT_ET0 7 +#define _MM_HINT_ET1 6 +#define _MM_HINT_T0 3 +#define _MM_HINT_T1 2 +#define _MM_HINT_T2 1 #define _MM_HINT_NTA 0 #ifndef _MSC_VER @@ -2068,7 +2070,8 @@ _mm_storer_ps(float *__p, __m128 __a) /// be generated. \n /// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will /// be generated. -#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) +#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \ + ((sel) >> 2) & 1, (sel) & 0x3)) #endif /// \brief Stores a 64-bit integer in the specified aligned memory location. To diff --git a/ci/appveyor/appveyor.yml b/ci/appveyor/appveyor.yml index 0cae78027b..2122153494 100644 --- a/ci/appveyor/appveyor.yml +++ b/ci/appveyor/appveyor.yml @@ -7,3 +7,4 @@ after_build: - '%APPVEYOR_BUILD_FOLDER%\ci\appveyor\after_build.bat' cache: - 'llvm+clang-5.0.1-win64-msvc-release.tar.xz' + - 'llvm+clang-6.0.0-win64-msvc-release.tar.xz' diff --git a/ci/appveyor/build_script.bat b/ci/appveyor/build_script.bat index 6621cf059c..5ae47df5ec 100644 --- a/ci/appveyor/build_script.bat +++ b/ci/appveyor/build_script.bat @@ -7,13 +7,13 @@ SET "PATH=C:\msys64\mingw64\bin;C:\msys64\usr\bin;%PATH%" SET "MSYSTEM=MINGW64" SET "APPVEYOR_CACHE_ENTRY_ZIP_ARGS=-m0=Copy" -bash -lc "cd ${APPVEYOR_BUILD_FOLDER} && if [ -s ""llvm+clang-5.0.1-win64-msvc-release.tar.xz"" ]; then echo 'skipping LLVM download'; else wget 'https://s3.amazonaws.com/ziglang.org/deps/llvm%%2bclang-5.0.1-win64-msvc-release.tar.xz'; fi && tar xf llvm+clang-5.0.1-win64-msvc-release.tar.xz" || exit /b +bash -lc "cd ${APPVEYOR_BUILD_FOLDER} && if [ -s ""llvm+clang-6.0.0-win64-msvc-release.tar.xz"" ]; then echo 'skipping LLVM download'; else wget 'https://s3.amazonaws.com/ziglang.org/deps/llvm%%2bclang-6.0.0-win64-msvc-release.tar.xz'; fi && tar xf llvm+clang-6.0.0-win64-msvc-release.tar.xz" || exit /b SET "PATH=%PREVPATH%" SET "MSYSTEM=%PREVMSYSTEM%" SET "ZIGBUILDDIR=%APPVEYOR_BUILD_FOLDER%\build-msvc-release" -SET "ZIGPREFIXPATH=%APPVEYOR_BUILD_FOLDER%\llvm+clang-5.0.1-win64-msvc-release" +SET "ZIGPREFIXPATH=%APPVEYOR_BUILD_FOLDER%\llvm+clang-6.0.0-win64-msvc-release" call "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86_amd64 diff --git a/ci/travis_linux_install b/ci/travis_linux_install index 0dfbe8aa21..02d0d6d2e8 100755 --- a/ci/travis_linux_install +++ b/ci/travis_linux_install @@ -4,4 +4,4 @@ set -x sudo apt-get remove -y llvm-* sudo rm -rf /usr/local/* -sudo apt-get install -y clang-5.0 libclang-5.0 libclang-5.0-dev llvm-5.0 llvm-5.0-dev liblld-5.0 liblld-5.0-dev cmake wine1.6-amd64 +sudo apt-get install -y clang-6.0 libclang-6.0 libclang-6.0-dev llvm-6.0 llvm-6.0-dev liblld-6.0 liblld-6.0-dev cmake 
wine1.6-amd64 diff --git a/cmake/Findclang.cmake b/cmake/Findclang.cmake index 4c4a13e6ce..26617d36fc 100644 --- a/cmake/Findclang.cmake +++ b/cmake/Findclang.cmake @@ -26,16 +26,16 @@ if(MSVC) else() find_path(CLANG_INCLUDE_DIRS NAMES clang/Frontend/ASTUnit.h PATHS - /usr/lib/llvm/5/include - /usr/lib/llvm-5.0/include + /usr/lib/llvm/6/include + /usr/lib/llvm-6.0/include /mingw64/include) macro(FIND_AND_ADD_CLANG_LIB _libname_) string(TOUPPER ${_libname_} _prettylibname_) find_library(CLANG_${_prettylibname_}_LIB NAMES ${_libname_} PATHS - /usr/lib/llvm/5/lib - /usr/lib/llvm-5.0/lib + /usr/lib/llvm/6/lib + /usr/lib/llvm-6.0/lib /mingw64/lib /c/msys64/mingw64/lib c:\\msys64\\mingw64\\lib) diff --git a/cmake/Findlld.cmake b/cmake/Findlld.cmake index 911bb876b9..d3c9011205 100644 --- a/cmake/Findlld.cmake +++ b/cmake/Findlld.cmake @@ -6,12 +6,12 @@ # LLD_INCLUDE_DIRS # LLD_LIBRARIES -find_path(LLD_INCLUDE_DIRS NAMES lld/Driver/Driver.h +find_path(LLD_INCLUDE_DIRS NAMES lld/Common/Driver.h PATHS - /usr/lib/llvm-5.0/include + /usr/lib/llvm-6.0/include /mingw64/include) -find_library(LLD_LIBRARY NAMES lld-5.0 lld PATHS /usr/lib/llvm-5.0/lib) +find_library(LLD_LIBRARY NAMES lld-6.0 lld PATHS /usr/lib/llvm-6.0/lib) if(EXISTS ${LLD_LIBRARY}) set(LLD_LIBRARIES ${LLD_LIBRARY}) else() @@ -19,7 +19,7 @@ else() string(TOUPPER ${_libname_} _prettylibname_) find_library(LLD_${_prettylibname_}_LIB NAMES ${_libname_} PATHS - /usr/lib/llvm-5.0/lib + /usr/lib/llvm-6.0/lib /mingw64/lib /c/msys64/mingw64/lib c:/msys64/mingw64/lib) @@ -29,13 +29,14 @@ else() endmacro(FIND_AND_ADD_LLD_LIB) FIND_AND_ADD_LLD_LIB(lldDriver) + FIND_AND_ADD_LLD_LIB(lldMinGW) FIND_AND_ADD_LLD_LIB(lldELF) FIND_AND_ADD_LLD_LIB(lldCOFF) FIND_AND_ADD_LLD_LIB(lldMachO) FIND_AND_ADD_LLD_LIB(lldReaderWriter) FIND_AND_ADD_LLD_LIB(lldCore) FIND_AND_ADD_LLD_LIB(lldYAML) - FIND_AND_ADD_LLD_LIB(lldConfig) + FIND_AND_ADD_LLD_LIB(lldCommon) endif() include(FindPackageHandleStandardArgs) diff --git a/cmake/Findllvm.cmake b/cmake/Findllvm.cmake index 87e80f24bc..d0a4a3cba8 100644 --- a/cmake/Findllvm.cmake +++ b/cmake/Findllvm.cmake @@ -8,12 +8,12 @@ # LLVM_LIBDIRS find_program(LLVM_CONFIG_EXE - NAMES llvm-config-5.0 llvm-config + NAMES llvm-config-6.0 llvm-config PATHS "/mingw64/bin" "/c/msys64/mingw64/bin" "c:/msys64/mingw64/bin" - "C:/Libraries/llvm-5.0.0/bin") + "C:/Libraries/llvm-6.0.0/bin") if(NOT(CMAKE_BUILD_TYPE STREQUAL "Debug")) execute_process( @@ -62,7 +62,7 @@ execute_process( set(LLVM_LIBRARIES ${LLVM_LIBRARIES} ${LLVM_SYSTEM_LIBS}) if(NOT LLVM_LIBRARIES) - find_library(LLVM_LIBRARIES NAMES LLVM LLVM-5.0 LLVM-5) + find_library(LLVM_LIBRARIES NAMES LLVM LLVM-6.0 LLVM-6) endif() link_directories("${CMAKE_PREFIX_PATH}/lib") diff --git a/deps/lld-prebuilt/COFF/Options.inc b/deps/lld-prebuilt/COFF/Options.inc index a2a0f3f892..b5414ffa0b 100644 --- a/deps/lld-prebuilt/COFF/Options.inc +++ b/deps/lld-prebuilt/COFF/Options.inc @@ -12,9 +12,10 @@ #ifdef PREFIX #define COMMA , PREFIX(prefix_0, {nullptr}) +PREFIX(prefix_3, {"--" COMMA nullptr}) PREFIX(prefix_2, {"/" COMMA "-" COMMA nullptr}) PREFIX(prefix_1, {"/" COMMA "-" COMMA "-?" COMMA nullptr}) -PREFIX(prefix_3, {"/?" COMMA "-?" COMMA nullptr}) +PREFIX(prefix_4, {"/?" COMMA "-?" 
COMMA nullptr}) #undef COMMA #endif // PREFIX @@ -30,19 +31,26 @@ OPTION(prefix_0, "", INPUT, Input, INVALID, INVALID, nullptr, 0, 0, nullp OPTION(prefix_0, "", UNKNOWN, Unknown, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "align:", align, Joined, INVALID, INVALID, nullptr, 0, 0, "Section alignment", nullptr, nullptr) +OPTION(prefix_1, "aligncomm:", aligncomm, Joined, INVALID, INVALID, nullptr, 0, 0, + "Set common symbol alignment", nullptr, nullptr) OPTION(prefix_1, "allowbind:no", allowbind_no, Flag, INVALID, INVALID, nullptr, 0, 0, "Disable DLL binding", nullptr, nullptr) -OPTION(prefix_1, "allowbind", allowbind, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "allowbind", allowbind, Flag, INVALID, INVALID, nullptr, 0, 0, + "Enable DLL binding (default)", nullptr, nullptr) OPTION(prefix_1, "allowisolation:no", allowisolation_no, Flag, INVALID, INVALID, nullptr, 0, 0, - "Set NO_ISOLATION bit", nullptr, nullptr) -OPTION(prefix_1, "allowisolation", allowisolation, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) + "Disable DLL isolation", nullptr, nullptr) +OPTION(prefix_1, "allowisolation", allowisolation, Flag, INVALID, INVALID, nullptr, 0, 0, + "Enable DLL isolation (default)", nullptr, nullptr) OPTION(prefix_1, "alternatename:", alternatename, Joined, INVALID, INVALID, nullptr, 0, 0, "Define weak alias", nullptr, nullptr) OPTION(prefix_1, "appcontainer:no", appcontainer_no, Flag, INVALID, INVALID, nullptr, 0, 0, + "Image can run outside an app container (default)", nullptr, nullptr) +OPTION(prefix_1, "appcontainer", appcontainer, Flag, INVALID, INVALID, nullptr, 0, 0, "Image can only be run in an app container", nullptr, nullptr) -OPTION(prefix_1, "appcontainer", appcontainer, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "base:", base, Joined, INVALID, INVALID, nullptr, 0, 0, "Base address of the program", nullptr, nullptr) +OPTION(prefix_1, "debug:dwarf", debug_dwarf, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "debug:ghash", debug_ghash, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "debugtype:", debugtype, Joined, INVALID, INVALID, nullptr, 0, 0, "Debug Info Options", nullptr, nullptr) OPTION(prefix_1, "debug", debug, Flag, INVALID, INVALID, nullptr, 0, 0, @@ -60,22 +68,25 @@ OPTION(prefix_1, "dll", dll, Flag, INVALID, INVALID, nullptr, 0, 0, OPTION(prefix_1, "driver:", driver, Joined, INVALID, INVALID, nullptr, 0, 0, "Generate a Windows NT Kernel Mode Driver", nullptr, nullptr) OPTION(prefix_1, "dynamicbase:no", dynamicbase_no, Flag, INVALID, INVALID, nullptr, 0, 0, - "Disable address space layout randomization", nullptr, nullptr) -OPTION(prefix_1, "dynamicbase", dynamicbase, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) + "Disable ASLR (default when /fixed)", nullptr, nullptr) +OPTION(prefix_1, "dynamicbase", dynamicbase, Flag, INVALID, INVALID, nullptr, 0, 0, + "Enable ASLR (default unless /fixed)", nullptr, nullptr) OPTION(prefix_1, "editandcontinue", editandcontinue, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "entry:", entry, Joined, INVALID, INVALID, nullptr, 0, 0, "Name of entry point symbol", nullptr, nullptr) OPTION(prefix_1, "errorlimit:", errorlimit, Joined, INVALID, INVALID, nullptr, 0, 0, "Maximum number of errors to emit before stopping (0 = no limit)", nullptr, nullptr) OPTION(prefix_1, "errorreport:", 
errorreport, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "export-all-symbols", export_all_symbols, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "export:", export, Joined, INVALID, INVALID, nullptr, 0, 0, "Export a function", nullptr, nullptr) OPTION(prefix_1, "failifmismatch:", failifmismatch, Joined, INVALID, INVALID, nullptr, 0, 0, "", nullptr, nullptr) OPTION(prefix_1, "fastfail", fastfail, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "fixed:no", fixed_no, Flag, INVALID, INVALID, nullptr, 0, 0, - "Enable base relocations", nullptr, nullptr) -OPTION(prefix_1, "fixed", fixed, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) + "Enable base relocations (default)", nullptr, nullptr) +OPTION(prefix_1, "fixed", fixed, Flag, INVALID, INVALID, nullptr, 0, 0, + "Disable base relocations", nullptr, nullptr) OPTION(prefix_1, "force:unresolved", force_unresolved, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "force", force, Flag, INVALID, INVALID, nullptr, 0, 0, "Allow undefined symbols when creating executables", nullptr, nullptr) @@ -85,10 +96,12 @@ OPTION(prefix_1, "heap:", heap, Joined, INVALID, INVALID, nullptr, 0, 0, "Size of the heap", nullptr, nullptr) OPTION(prefix_1, "help", help, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "highentropyva:no", highentropyva_no, Flag, INVALID, INVALID, nullptr, 0, 0, - "Set HIGH_ENTROPY_VA bit", nullptr, nullptr) -OPTION(prefix_1, "highentropyva", highentropyva, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) + "Disable 64-bit ASLR", nullptr, nullptr) +OPTION(prefix_1, "highentropyva", highentropyva, Flag, INVALID, INVALID, nullptr, 0, 0, + "Enable 64-bit ASLR (default on 64-bit)", nullptr, nullptr) OPTION(prefix_1, "idlout:", idlout, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_1, "ignore:", ignore, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "ignore:", ignore, Joined, INVALID, INVALID, nullptr, 0, 0, + "Specify warning codes to ignore", nullptr, nullptr) OPTION(prefix_1, "ignoreidl", ignoreidl, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "implib:", implib, Joined, INVALID, INVALID, nullptr, 0, 0, "Import library name", nullptr, nullptr) @@ -97,14 +110,20 @@ OPTION(prefix_2, "include:", incl, Joined, INVALID, INVALID, nullptr, 0, 0, OPTION(prefix_1, "incremental:no", no_incremental, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "incremental", incremental, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "largeaddressaware:no", largeaddressaware_no, Flag, INVALID, INVALID, nullptr, 0, 0, - "Disable large addresses", nullptr, nullptr) -OPTION(prefix_1, "largeaddressaware", largeaddressaware, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) + "Disable large addresses (default on 32-bit)", nullptr, nullptr) +OPTION(prefix_1, "largeaddressaware", largeaddressaware, Flag, INVALID, INVALID, nullptr, 0, 0, + "Enable large addresses (default on 64-bit)", nullptr, nullptr) OPTION(prefix_1, "libpath:", libpath, Joined, INVALID, INVALID, nullptr, 0, 0, "Additional library search path", nullptr, nullptr) OPTION(prefix_1, "linkrepro:", linkrepro, Joined, INVALID, INVALID, nullptr, 0, 0, "Dump linker invocation and input files for debugging", 
nullptr, nullptr) +OPTION(prefix_1, "lldltocache:", lldltocache, Joined, INVALID, INVALID, nullptr, 0, 0, + "Path to ThinLTO cached object file directory", nullptr, nullptr) +OPTION(prefix_1, "lldltocachepolicy:", lldltocachepolicy, Joined, INVALID, INVALID, nullptr, 0, 0, + "Pruning policy for the ThinLTO cache", nullptr, nullptr) OPTION(prefix_2, "lldmap:", lldmap_file, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "lldmap", lldmap, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "lldmingw", lldmingw, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "lldsavetemps", lldsavetemps, Flag, INVALID, INVALID, nullptr, 0, 0, "Save temporary files instead of deleting them", nullptr, nullptr) OPTION(prefix_1, "machine:", machine, Joined, INVALID, INVALID, nullptr, 0, 0, @@ -126,28 +145,31 @@ OPTION(prefix_1, "merge:", merge, Joined, INVALID, INVALID, nullptr, 0, 0, OPTION(prefix_1, "mllvm:", mllvm, Joined, INVALID, INVALID, nullptr, 0, 0, "Options to pass to LLVM", nullptr, nullptr) OPTION(prefix_1, "msvclto", msvclto, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "natvis:", natvis, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "nodefaultlib:", nodefaultlib, Joined, INVALID, INVALID, nullptr, 0, 0, "Remove a default library", nullptr, nullptr) OPTION(prefix_1, "nodefaultlib", nodefaultlib_all, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "noentry", noentry, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "nologo", nologo, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_1, "nopdb", nopdb, Flag, INVALID, INVALID, nullptr, 0, 0, - "Disable PDB generation for DWARF users", nullptr, nullptr) -OPTION(prefix_1, "nosymtab", nosymtab, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "nxcompat:no", nxcompat_no, Flag, INVALID, INVALID, nullptr, 0, 0, "Disable data execution provention", nullptr, nullptr) -OPTION(prefix_1, "nxcompat", nxcompat, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "nxcompat", nxcompat, Flag, INVALID, INVALID, nullptr, 0, 0, + "Enable data execution prevention (default)", nullptr, nullptr) OPTION(prefix_1, "opt:", opt, Joined, INVALID, INVALID, nullptr, 0, 0, "Control optimizations", nullptr, nullptr) OPTION(prefix_1, "out:", out, Joined, INVALID, INVALID, nullptr, 0, 0, "Path to file to write output", nullptr, nullptr) +OPTION(prefix_2, "output-def:", output_def, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "pdb:", pdb, Joined, INVALID, INVALID, nullptr, 0, 0, "PDB file path", nullptr, nullptr) OPTION(prefix_1, "pdbaltpath:", pdbaltpath, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "profile", profile, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_3, "rsp-quoting=", rsp_quoting, Joined, INVALID, INVALID, nullptr, 0, 0, + "Quoting style for response files, 'windows' (default) or 'posix'", nullptr, nullptr) OPTION(prefix_1, "safeseh:no", safeseh_no, Flag, INVALID, INVALID, nullptr, 0, 0, - "Produce an image with Safe Exception Handler", nullptr, nullptr) -OPTION(prefix_1, "safeseh", safeseh, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) + "Don't produce an image with Safe Exception Handler", 
nullptr, nullptr) +OPTION(prefix_1, "safeseh", safeseh, Flag, INVALID, INVALID, nullptr, 0, 0, + "Produce an image with Safe Exception Handler (only for x86)", nullptr, nullptr) OPTION(prefix_1, "section:", section, Joined, INVALID, INVALID, nullptr, 0, 0, "Specify section attributes", nullptr, nullptr) OPTION(prefix_1, "stack:", stack, Joined, INVALID, INVALID, nullptr, 0, 0, @@ -163,12 +185,27 @@ OPTION(prefix_1, "tlbid:", tlbid, Joined, INVALID, INVALID, nullptr, 0, 0, nullp OPTION(prefix_1, "tlbout:", tlbout, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "tsaware:no", tsaware_no, Flag, INVALID, INVALID, nullptr, 0, 0, "Create non-Terminal Server aware executable", nullptr, nullptr) -OPTION(prefix_1, "tsaware", tsaware, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "tsaware", tsaware, Flag, INVALID, INVALID, nullptr, 0, 0, + "Create Terminal Server aware executable (default)", nullptr, nullptr) OPTION(prefix_1, "verbose:", verbose_all, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "verbose", verbose, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "version:", version, Joined, INVALID, INVALID, nullptr, 0, 0, "Specify a version number in the PE header", nullptr, nullptr) -OPTION(prefix_1, "wx:no", wx_no, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_1, "wx", wx, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_3, "", help_q, Flag, INVALID, help, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_3, "version", dash_dash_version, Flag, INVALID, INVALID, nullptr, 0, 0, + "Print version information", nullptr, nullptr) +OPTION(prefix_1, "wholearchive:", wholearchive_file, Joined, INVALID, INVALID, nullptr, 0, 0, + "Include all object files from this archive", nullptr, nullptr) +OPTION(prefix_1, "wholearchive", wholearchive_flag, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "WX:no", WX_no, Flag, INVALID, INVALID, nullptr, 0, 0, + "Don't treat warnings as errors", nullptr, nullptr) +OPTION(prefix_1, "WX", WX, Flag, INVALID, INVALID, nullptr, 0, 0, + "Treat warnings as errors", nullptr, nullptr) +OPTION(prefix_4, "", help_q, Flag, INVALID, help, nullptr, 0, 0, nullptr, nullptr, nullptr) #endif // OPTION + +#ifdef OPTTABLE_ARG_INIT +////////// +// Option Values + + +#endif // OPTTABLE_ARG_INIT diff --git a/deps/lld-prebuilt/DarwinLdOptions.inc b/deps/lld-prebuilt/DarwinLdOptions.inc index c70eb2967c..dc4f3dec54 100644 --- a/deps/lld-prebuilt/DarwinLdOptions.inc +++ b/deps/lld-prebuilt/DarwinLdOptions.inc @@ -187,3 +187,10 @@ OPTION(prefix_1, "v", v, Flag, INVALID, INVALID, nullptr, 0, 0, OPTION(prefix_1, "Z", Z, Flag, INVALID, INVALID, nullptr, 0, 0, "Do not search standard directories for libraries or frameworks", nullptr, nullptr) #endif // OPTION + +#ifdef OPTTABLE_ARG_INIT +////////// +// Option Values + + +#endif // OPTTABLE_ARG_INIT diff --git a/deps/lld-prebuilt/ELF/Options.inc b/deps/lld-prebuilt/ELF/Options.inc index 012841973b..640f1bdcc6 100644 --- a/deps/lld-prebuilt/ELF/Options.inc +++ b/deps/lld-prebuilt/ELF/Options.inc @@ -52,19 +52,23 @@ OPTION(prefix_2, "build-id", build_id, Flag, INVALID, INVALID, nullptr, 0, 0, "Generate build ID note", nullptr, nullptr) OPTION(prefix_2, "b", alias_format_b, Separate, INVALID, format, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "call_shared", alias_Bdynamic_call_shared, 
Flag, INVALID, Bdynamic, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "chroot", chroot, Separate, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "color-diagnostics=", color_diagnostics_eq, Joined, INVALID, INVALID, nullptr, 0, 0, "Use colors in diagnostics", nullptr, nullptr) OPTION(prefix_2, "color-diagnostics", color_diagnostics, Flag, INVALID, INVALID, nullptr, 0, 0, "Use colors in diagnostics", nullptr, nullptr) -OPTION(prefix_2, "compress-debug-sections=", compress_debug_sections, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "compress-debug-sections=", compress_debug_sections_eq, Joined, INVALID, compress_debug_sections, nullptr, 0, 0, "Compress DWARF debug sections", nullptr, nullptr) -OPTION(prefix_3, "cref", cref, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "compress-debug-sections", compress_debug_sections, Separate, INVALID, INVALID, nullptr, 0, 0, + "Compress DWARF debug sections", nullptr, nullptr) +OPTION(prefix_2, "cref", cref, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "dc", alias_define_common_dc, Flag, INVALID, define_common, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "define-common", define_common, Flag, INVALID, INVALID, nullptr, 0, 0, "Assign space to common symbols", nullptr, nullptr) -OPTION(prefix_2, "defsym=", defsym, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "defsym=", defsym_eq, Joined, INVALID, defsym, nullptr, 0, 0, + "Define a symbol alias", nullptr, nullptr) +OPTION(prefix_2, "defsym", defsym, Separate, INVALID, INVALID, nullptr, 0, 0, "Define a symbol alias", nullptr, nullptr) -OPTION(prefix_2, "defsym", alias_defsym, Separate, INVALID, defsym, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "demangle", demangle, Flag, INVALID, INVALID, nullptr, 0, 0, "Demangle symbol names", nullptr, nullptr) OPTION(prefix_2, "detect-odr-violations", detect_odr_violations, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) @@ -81,7 +85,8 @@ OPTION(prefix_2, "dn", alias_Bstatic_dn, Flag, INVALID, Bstatic, nullptr, 0, 0, OPTION(prefix_2, "dp", alias_define_common_dp, Flag, INVALID, define_common, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "dynamic-linker", dynamic_linker, Separate, INVALID, INVALID, nullptr, 0, 0, "Which dynamic linker to use", nullptr, nullptr) -OPTION(prefix_2, "dynamic-list=", alias_dynamic_list, Joined, INVALID, dynamic_list, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "dynamic-list=", dynamic_list_eq, Joined, INVALID, dynamic_list, nullptr, 0, 0, + "Read a list of dynamic symbols", nullptr, nullptr) OPTION(prefix_2, "dynamic-list", dynamic_list, Separate, INVALID, INVALID, nullptr, 0, 0, "Read a list of dynamic symbols", nullptr, nullptr) OPTION(prefix_2, "dy", alias_Bdynamic_dy, Flag, INVALID, Bdynamic, nullptr, 0, 0, nullptr, nullptr, nullptr) @@ -97,18 +102,22 @@ OPTION(prefix_2, "enable-new-dtags", enable_new_dtags, Flag, INVALID, INVALID, n OPTION(prefix_2, "end-group", end_group, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "end-lib", end_lib, Flag, INVALID, INVALID, nullptr, 0, 0, "End a grouping of objects that should be treated as if they were together in an archive", nullptr, nullptr) -OPTION(prefix_2, "entry=", alias_entry_entry, Joined, INVALID, entry, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "entry=", entry_eq, Joined, INVALID, entry, nullptr, 0, 0, + "Name of 
entry point symbol", "", nullptr) OPTION(prefix_2, "entry", entry, Separate, INVALID, INVALID, nullptr, 0, 0, "Name of entry point symbol", "", nullptr) -OPTION(prefix_2, "error-limit=", alias_error_limit, Joined, INVALID, error_limit, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "error-limit=", error_limit_eq, Joined, INVALID, error_limit, nullptr, 0, 0, + "Maximum number of errors to emit before stopping (0 = no limit)", nullptr, nullptr) OPTION(prefix_2, "error-limit", error_limit, Separate, INVALID, INVALID, nullptr, 0, 0, "Maximum number of errors to emit before stopping (0 = no limit)", nullptr, nullptr) OPTION(prefix_2, "error-unresolved-symbols", error_unresolved_symbols, Flag, INVALID, INVALID, nullptr, 0, 0, "Report unresolved symbols as errors", nullptr, nullptr) -OPTION(prefix_2, "exclude-libs=", alias_exclude_libs, Joined, INVALID, exclude_libs, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "exclude-libs=", exclude_libs_eq, Joined, INVALID, exclude_libs, nullptr, 0, 0, + "Exclude static libraries from automatic export", nullptr, nullptr) OPTION(prefix_2, "exclude-libs", exclude_libs, Separate, INVALID, INVALID, nullptr, 0, 0, "Exclude static libraries from automatic export", nullptr, nullptr) -OPTION(prefix_2, "export-dynamic-symbol=", alias_export_dynamic_symbol, Joined, INVALID, export_dynamic_symbol, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "export-dynamic-symbol=", export_dynamic_symbol_eq, Joined, INVALID, export_dynamic_symbol, nullptr, 0, 0, + "Put a symbol in the dynamic symbol table", nullptr, nullptr) OPTION(prefix_2, "export-dynamic-symbol", export_dynamic_symbol, Separate, INVALID, INVALID, nullptr, 0, 0, "Put a symbol in the dynamic symbol table", nullptr, nullptr) OPTION(prefix_2, "export-dynamic", export_dynamic, Flag, INVALID, INVALID, nullptr, 0, 0, @@ -117,12 +126,19 @@ OPTION(prefix_1, "E", alias_export_dynamic_E, Flag, INVALID, export_dynamic, nul OPTION(prefix_1, "e", alias_entry_e, JoinedOrSeparate, INVALID, entry, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "fatal-warnings", fatal_warnings, Flag, INVALID, INVALID, nullptr, 0, 0, "Treat warnings as errors", nullptr, nullptr) -OPTION(prefix_2, "filter=", filter, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "filter=", filter_eq, Joined, INVALID, filter, nullptr, 0, 0, "Set DT_FILTER field to the specified name", nullptr, nullptr) -OPTION(prefix_2, "fini=", alias_fini_fini, Joined, INVALID, fini, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "filter", filter, Separate, INVALID, INVALID, nullptr, 0, 0, + "Set DT_FILTER field to the specified name", nullptr, nullptr) +OPTION(prefix_2, "fini=", fini_eq, Joined, INVALID, fini, nullptr, 0, 0, + "Specify a finalizer function", "", nullptr) OPTION(prefix_2, "fini", fini, Separate, INVALID, INVALID, nullptr, 0, 0, "Specify a finalizer function", "", nullptr) -OPTION(prefix_2, "format=", format, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "fix-cortex-a53-843419", fix_cortex_a53_843419, Flag, INVALID, INVALID, nullptr, 0, 0, + "Apply fixes for AArch64 Cortex-A53 erratum 843419", nullptr, nullptr) +OPTION(prefix_2, "format=", format_eq, Joined, INVALID, format, nullptr, 0, 0, + "Change the input format of the inputs following this option", "", nullptr) +OPTION(prefix_2, "format", format, Separate, INVALID, INVALID, nullptr, 0, 0, "Change the input format of the inputs following this option", "", nullptr) OPTION(prefix_2, "full-shutdown", full_shutdown, Flag, 
INVALID, INVALID, nullptr, 0, 0, "Perform a full shutdown instead of calling _exit", nullptr, nullptr) @@ -134,23 +150,36 @@ OPTION(prefix_2, "gdb-index", gdb_index, Flag, INVALID, INVALID, nullptr, 0, 0, "Generate .gdb_index section", nullptr, nullptr) OPTION(prefix_1, "G", G, JoinedOrSeparate, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "g", g, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "hash-style=", alias_hash_style_hash_style, Joined, INVALID, hash_style, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "hash-style=", hash_style_eq, Joined, INVALID, hash_style, nullptr, 0, 0, + "Specify hash style (sysv, gnu or both)", nullptr, nullptr) OPTION(prefix_2, "hash-style", hash_style, Separate, INVALID, INVALID, nullptr, 0, 0, "Specify hash style (sysv, gnu or both)", nullptr, nullptr) OPTION(prefix_2, "help", help, Flag, INVALID, INVALID, nullptr, 0, 0, "Print option help", nullptr, nullptr) OPTION(prefix_1, "h", alias_soname_h, JoinedOrSeparate, INVALID, soname, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "icf-data", icf_data, Flag, INVALID, INVALID, nullptr, 0, 0, + "Enable ICF to also fold identical read only data", nullptr, nullptr) OPTION(prefix_2, "icf=all", icf_all, Flag, INVALID, INVALID, nullptr, 0, 0, "Enable identical code folding", nullptr, nullptr) OPTION(prefix_2, "icf=none", icf_none, Flag, INVALID, INVALID, nullptr, 0, 0, "Disable identical code folding", nullptr, nullptr) -OPTION(prefix_2, "image-base=", image_base, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "image-base=", image_base_eq, Joined, INVALID, image_base, nullptr, 0, 0, "Set the base address", nullptr, nullptr) -OPTION(prefix_2, "init=", alias_init_init, Joined, INVALID, init, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "image-base", image_base, Separate, INVALID, INVALID, nullptr, 0, 0, + "Set the base address", nullptr, nullptr) +OPTION(prefix_2, "init=", init_eq, Joined, INVALID, init, nullptr, 0, 0, + "Specify an initializer function", "", nullptr) OPTION(prefix_2, "init", init, Separate, INVALID, INVALID, nullptr, 0, 0, "Specify an initializer function", "", nullptr) -OPTION(prefix_2, "library-path=", alias_L__library_path, Joined, INVALID, L, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "library=", alias_l__library, Joined, INVALID, l, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "library-path=", library_path_eq, Joined, INVALID, library_path, nullptr, 0, 0, + "Add a directory to the library search path", "", nullptr) +OPTION(prefix_2, "library-path", library_path, Separate, INVALID, INVALID, nullptr, 0, 0, + "Add a directory to the library search path", "", nullptr) +OPTION(prefix_2, "library=", library_eq, Joined, INVALID, library, nullptr, 0, 0, + "Root name of library to use", "", nullptr) +OPTION(prefix_2, "library", library, Separate, INVALID, INVALID, nullptr, 0, 0, + "Root name of library to use", "", nullptr) +OPTION(prefix_2, "long-plt", long_plt, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "lto-aa-pipeline=", lto_aa_pipeline, Joined, INVALID, INVALID, nullptr, 0, 0, "AA pipeline to run during LTO. 
Used in conjunction with -lto-newpm-passes", nullptr, nullptr) OPTION(prefix_2, "lto-newpm-passes=", lto_newpm_passes, Joined, INVALID, INVALID, nullptr, 0, 0, @@ -159,13 +188,14 @@ OPTION(prefix_2, "lto-O", lto_O, Joined, INVALID, INVALID, nullptr, 0, 0, "Optimization level for LTO", "", nullptr) OPTION(prefix_2, "lto-partitions=", lto_partitions, Joined, INVALID, INVALID, nullptr, 0, 0, "Number of LTO codegen partitions", nullptr, nullptr) -OPTION(prefix_1, "L", L, JoinedOrSeparate, INVALID, INVALID, nullptr, 0, 0, - "Add a directory to the library search path", "", nullptr) -OPTION(prefix_1, "l", l, JoinedOrSeparate, INVALID, INVALID, nullptr, 0, 0, - "Root name of library to use", "", nullptr) -OPTION(prefix_2, "Map=", alias_Map_eq, Joined, INVALID, Map, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "Map", Map, JoinedOrSeparate, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_1, "L", alias_library_path, JoinedOrSeparate, INVALID, library_path, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_1, "l", alias_library, JoinedOrSeparate, INVALID, library, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "Map=", Map_eq, Joined, INVALID, Map, nullptr, 0, 0, "Print a link map to the specified file", nullptr, nullptr) +OPTION(prefix_2, "Map", Map, Separate, INVALID, INVALID, nullptr, 0, 0, + "Print a link map to the specified file", nullptr, nullptr) +OPTION(prefix_2, "merge-exidx-entries", merge_exidx_entries, Flag, INVALID, INVALID, nullptr, 0, 0, + "Enable merging .ARM.exidx entries", nullptr, nullptr) OPTION(prefix_2, "mllvm", mllvm, Separate, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "M", alias_print_map_M, Flag, INVALID, print_map, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "m", m, JoinedOrSeparate, INVALID, INVALID, nullptr, 0, 0, @@ -176,21 +206,32 @@ OPTION(prefix_2, "no-as-needed", no_as_needed, Flag, INVALID, INVALID, nullptr, "Always DT_NEEDED for shared libraries", nullptr, nullptr) OPTION(prefix_2, "no-color-diagnostics", no_color_diagnostics, Flag, INVALID, INVALID, nullptr, 0, 0, "Do not use colors in diagnostics", nullptr, nullptr) -OPTION(prefix_2, "no-copy-dt-needed-entries", no_copy_dt_needed_entries, Flag, INVALID, no_add_needed, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "no-copy-dt-needed-entries", no_copy_dt_needed_entries, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "no-ctors-in-init-array", no_ctors_in_init_array, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "no-define-common", no_define_common, Flag, INVALID, INVALID, nullptr, 0, 0, "Do not assign space to common symbols", nullptr, nullptr) OPTION(prefix_2, "no-demangle", no_demangle, Flag, INVALID, INVALID, nullptr, 0, 0, "Do not demangle symbol names", nullptr, nullptr) OPTION(prefix_2, "no-dynamic-linker", no_dynamic_linker, Flag, INVALID, INVALID, nullptr, 0, 0, "Inhibit output of .interp section", nullptr, nullptr) +OPTION(prefix_2, "no-eh-frame-hdr", no_eh_frame_hdr, Flag, INVALID, INVALID, nullptr, 0, 0, + "Do not create .eh_frame_hdr section", nullptr, nullptr) OPTION(prefix_2, "no-export-dynamic", no_export_dynamic, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "no-fatal-warnings", no_fatal_warnings, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "no-gc-sections", no_gc_sections, Flag, INVALID, INVALID, nullptr, 0, 0, "Disable garbage collection of unused 
sections", nullptr, nullptr) +OPTION(prefix_2, "no-gdb-index", no_gdb_index, Flag, INVALID, INVALID, nullptr, 0, 0, + "Do not generate .gdb_index section", nullptr, nullptr) OPTION(prefix_2, "no-gnu-unique", no_gnu_unique, Flag, INVALID, INVALID, nullptr, 0, 0, "Disable STB_GNU_UNIQUE symbol binding", nullptr, nullptr) OPTION(prefix_2, "no-keep-memory", no_keep_memory, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "no-merge-exidx-entries", no_merge_exidx_entries, Flag, INVALID, INVALID, nullptr, 0, 0, + "Disable merging .ARM.exidx entries", nullptr, nullptr) OPTION(prefix_2, "no-mmap-output-file", no_mmap_output_file, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_3, "no-omagic", no_omagic, Flag, INVALID, INVALID, nullptr, 0, 0, + "Do not set the text data sections to be writable", "", nullptr) +OPTION(prefix_2, "no-print-gc-sections", no_print_gc_sections, Flag, INVALID, INVALID, nullptr, 0, 0, + "Do not list removed unused sections", nullptr, nullptr) OPTION(prefix_2, "no-rosegment", no_rosegment, Flag, INVALID, INVALID, nullptr, 0, 0, "Do not put read-only non-executable sections in their own segment", nullptr, nullptr) OPTION(prefix_2, "no-threads", no_threads, Flag, INVALID, INVALID, nullptr, 0, 0, @@ -219,17 +260,25 @@ OPTION(prefix_3, "opt-remarks-filename", opt_remarks_filename, Separate, INVALID "YAML output file for optimization remarks", nullptr, nullptr) OPTION(prefix_3, "opt-remarks-with-hotness", opt_remarks_with_hotness, Flag, INVALID, INVALID, nullptr, 0, 0, "Include hotness informations in the optimization remarks file", nullptr, nullptr) +OPTION(prefix_2, "orphan-handling=", orphan_handling_eq, Joined, INVALID, orphan_handling, nullptr, 0, 0, + "Control how orphan sections are handled when linker script used", nullptr, nullptr) +OPTION(prefix_2, "orphan-handling", orphan_handling, Separate, INVALID, INVALID, nullptr, 0, 0, + "Control how orphan sections are handled when linker script used", nullptr, nullptr) OPTION(prefix_3, "output=", alias_o_output, Joined, INVALID, o, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_3, "output", alias_o_output2, Separate, INVALID, o, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_1, "O", O, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_1, "O", O, JoinedOrSeparate, INVALID, INVALID, nullptr, 0, 0, "Optimize output file size", nullptr, nullptr) OPTION(prefix_1, "o", o, JoinedOrSeparate, INVALID, INVALID, nullptr, 0, 0, "Path to file to write output", "", nullptr) +OPTION(prefix_2, "pack-dyn-relocs=", pack_dyn_relocs_eq, Joined, INVALID, INVALID, nullptr, 0, 0, + "Pack dynamic relocations in the given format (none or android)", "", nullptr) OPTION(prefix_2, "pic-executable", alias_pie_pic_executable, Flag, INVALID, pie, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "pie", pie, Flag, INVALID, INVALID, nullptr, 0, 0, "Create a position independent executable", nullptr, nullptr) -OPTION(prefix_2, "plugin-opt=", plugin_opt_eq, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "plugin-opt", plugin_opt, Separate, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "plugin-opt=", plugin_opt_eq, Joined, INVALID, plugin_opt, nullptr, 0, 0, + "specifies LTO options for compatibility with GNU linkers", nullptr, nullptr) +OPTION(prefix_2, "plugin-opt", plugin_opt, Separate, INVALID, INVALID, nullptr, 0, 0, + "specifies LTO options for compatibility with GNU linkers", nullptr, 
nullptr) OPTION(prefix_2, "plugin=", plugin_eq, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "plugin", plugin, Separate, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "print-gc-sections", print_gc_sections, Flag, INVALID, INVALID, nullptr, 0, 0, @@ -240,34 +289,42 @@ OPTION(prefix_2, "Qy", Qy, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullp OPTION(prefix_1, "q", alias_emit_relocs, Flag, INVALID, emit_relocs, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "relocatable", relocatable, Flag, INVALID, INVALID, nullptr, 0, 0, "Create relocatable object file", nullptr, nullptr) -OPTION(prefix_2, "reproduce=", alias_reproduce_eq, Joined, INVALID, reproduce, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "reproduce=", reproduce_eq, Joined, INVALID, reproduce, nullptr, 0, 0, + "Dump linker invocation and input files for debugging", nullptr, nullptr) OPTION(prefix_2, "reproduce", reproduce, Separate, INVALID, INVALID, nullptr, 0, 0, "Dump linker invocation and input files for debugging", nullptr, nullptr) -OPTION(prefix_2, "retain-symbols-file=", retain_symbols_file, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "retain-symbols-file=", retain_symbols_file_eq, Joined, INVALID, retain_symbols_file, nullptr, 0, 0, + "Retain only the symbols listed in the file", "", nullptr) +OPTION(prefix_2, "retain-symbols-file", retain_symbols_file, Separate, INVALID, INVALID, nullptr, 0, 0, "Retain only the symbols listed in the file", "", nullptr) -OPTION(prefix_2, "retain-symbols-file", alias_retain_symbols_file, Separate, INVALID, retain_symbols_file, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "rpath-link=", rpath_link_eq, Joined, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "rpath-link", rpath_link, Separate, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "rpath=", alias_rpath_rpath, Joined, INVALID, rpath, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "rpath=", rpath_eq, Joined, INVALID, rpath, nullptr, 0, 0, + "Add a DT_RUNPATH to the output", nullptr, nullptr) OPTION(prefix_2, "rpath", rpath, Separate, INVALID, INVALID, nullptr, 0, 0, "Add a DT_RUNPATH to the output", nullptr, nullptr) -OPTION(prefix_2, "rsp-quoting=", rsp_quoting, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "rsp-quoting=", rsp_quoting_eq, Joined, INVALID, rsp_quoting, nullptr, 0, 0, + "Quoting style for response files. Values supported: windows|posix", nullptr, nullptr) +OPTION(prefix_2, "rsp-quoting", rsp_quoting, Separate, INVALID, INVALID, nullptr, 0, 0, "Quoting style for response files. 
Values supported: windows|posix", nullptr, nullptr) OPTION(prefix_1, "R", alias_rpath_R, JoinedOrSeparate, INVALID, rpath, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "r", alias_relocatable_r, Flag, INVALID, relocatable, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "save-temps", save_temps, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "script=", alias_script, Joined, INVALID, script, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "script=", script_eq, Joined, INVALID, script, nullptr, 0, 0, + "Read linker script", nullptr, nullptr) OPTION(prefix_2, "script", script, Separate, INVALID, INVALID, nullptr, 0, 0, "Read linker script", nullptr, nullptr) OPTION(prefix_2, "section-start", section_start, Separate, INVALID, INVALID, nullptr, 0, 0, "Set address of section", "
", nullptr) OPTION(prefix_2, "shared", shared, Flag, INVALID, INVALID, nullptr, 0, 0, "Build a shared object", nullptr, nullptr) -OPTION(prefix_2, "soname=", soname, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "soname=", soname_eq, Joined, INVALID, soname, nullptr, 0, 0, + "Set DT_SONAME", nullptr, nullptr) +OPTION(prefix_2, "soname", soname, Separate, INVALID, INVALID, nullptr, 0, 0, "Set DT_SONAME", nullptr, nullptr) -OPTION(prefix_2, "soname", alias_soname_soname, Separate, INVALID, soname, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "sort-common", sort_common, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "sort-section=", alias_sort_section, Joined, INVALID, sort_section, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "sort-section=", sort_section_eq, Joined, INVALID, sort_section, nullptr, 0, 0, + "Specifies sections sorting rule when linkerscript is used", nullptr, nullptr) OPTION(prefix_2, "sort-section", sort_section, Separate, INVALID, INVALID, nullptr, 0, 0, "Specifies sections sorting rule when linkerscript is used", nullptr, nullptr) OPTION(prefix_2, "start-group", start_group, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) @@ -281,7 +338,9 @@ OPTION(prefix_2, "strip-debug", strip_debug, Flag, INVALID, INVALID, nullptr, 0, "Strip debugging information", nullptr, nullptr) OPTION(prefix_2, "symbol-ordering-file", symbol_ordering_file, Separate, INVALID, INVALID, nullptr, 0, 0, "Layout sections in the order specified by symbol file", nullptr, nullptr) -OPTION(prefix_2, "sysroot=", sysroot, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "sysroot=", sysroot_eq, Joined, INVALID, sysroot, nullptr, 0, 0, + "Set the system root", nullptr, nullptr) +OPTION(prefix_2, "sysroot", sysroot, Separate, INVALID, INVALID, nullptr, 0, 0, "Set the system root", nullptr, nullptr) OPTION(prefix_1, "S", alias_strip_debug_S, Flag, INVALID, strip_debug, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "s", alias_strip_all, Flag, INVALID, strip_all, nullptr, 0, 0, nullptr, nullptr, nullptr) @@ -289,12 +348,16 @@ OPTION(prefix_2, "target1-abs", target1_abs, Flag, INVALID, INVALID, nullptr, 0, "Interpret R_ARM_TARGET1 as R_ARM_ABS32", nullptr, nullptr) OPTION(prefix_2, "target1-rel", target1_rel, Flag, INVALID, INVALID, nullptr, 0, 0, "Interpret R_ARM_TARGET1 as R_ARM_REL32", nullptr, nullptr) -OPTION(prefix_2, "target2=", target2, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "target2=", target2_eq, Joined, INVALID, target2, nullptr, 0, 0, "Interpret R_ARM_TARGET2 as , where is one of rel, abs, or got-rel", "", nullptr) -OPTION(prefix_2, "Tbss=", alias_Tbss, Joined, INVALID, Tbss, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "target2", target2, Separate, INVALID, INVALID, nullptr, 0, 0, + "Interpret R_ARM_TARGET2 as , where is one of rel, abs, or got-rel", "", nullptr) +OPTION(prefix_2, "Tbss=", Tbss_eq, Joined, INVALID, Tbss, nullptr, 0, 0, + "Same as --section-start with .bss as the sectionname", nullptr, nullptr) OPTION(prefix_2, "Tbss", Tbss, Separate, INVALID, INVALID, nullptr, 0, 0, "Same as --section-start with .bss as the sectionname", nullptr, nullptr) -OPTION(prefix_2, "Tdata=", alias_Tdata, Joined, INVALID, Tdata, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "Tdata=", Tdata_eq, Joined, INVALID, Tdata, nullptr, 0, 0, + "Same as --section-start with .data as the sectionname", nullptr, nullptr) OPTION(prefix_2, "Tdata", Tdata, 
Separate, INVALID, INVALID, nullptr, 0, 0, "Same as --section-start with .data as the sectionname", nullptr, nullptr) OPTION(prefix_2, "thinlto-cache-dir=", thinlto_cache_dir, Joined, INVALID, INVALID, nullptr, 0, 0, @@ -305,27 +368,33 @@ OPTION(prefix_2, "thinlto-jobs=", thinlto_jobs, Joined, INVALID, INVALID, nullpt "Number of ThinLTO jobs", nullptr, nullptr) OPTION(prefix_2, "threads", threads, Flag, INVALID, INVALID, nullptr, 0, 0, "Run the linker multi-threaded", nullptr, nullptr) -OPTION(prefix_2, "trace-symbol=", trace_trace_symbol_eq, Joined, INVALID, trace_symbol, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "trace-symbol=", trace_symbol_eq, Joined, INVALID, trace_symbol, nullptr, 0, 0, + "Trace references to symbols", nullptr, nullptr) OPTION(prefix_2, "trace-symbol", trace_symbol, Separate, INVALID, INVALID, nullptr, 0, 0, "Trace references to symbols", nullptr, nullptr) OPTION(prefix_2, "trace", trace, Flag, INVALID, INVALID, nullptr, 0, 0, "Print the names of the input files", nullptr, nullptr) OPTION(prefix_2, "Ttext-segment=", alias_Ttext_segment_eq, Joined, INVALID, Ttext, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "Ttext-segment", alias_Ttext_segment, Separate, INVALID, Ttext, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "Ttext=", alias_Ttext, Joined, INVALID, Ttext, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "Ttext=", Ttext_eq, Joined, INVALID, Ttext, nullptr, 0, 0, + "Same as --section-start with .text as the sectionname", nullptr, nullptr) OPTION(prefix_2, "Ttext", Ttext, Separate, INVALID, INVALID, nullptr, 0, 0, "Same as --section-start with .text as the sectionname", nullptr, nullptr) OPTION(prefix_1, "T", alias_script_T, JoinedOrSeparate, INVALID, script, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_1, "t", alias_trace, Flag, INVALID, trace, nullptr, 0, 0, nullptr, nullptr, nullptr) -OPTION(prefix_2, "undefined=", alias_undefined_eq, Joined, INVALID, undefined, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "undefined=", undefined_eq, Joined, INVALID, undefined, nullptr, 0, 0, + "Force undefined symbol during linking", nullptr, nullptr) OPTION(prefix_2, "undefined", undefined, Separate, INVALID, INVALID, nullptr, 0, 0, "Force undefined symbol during linking", nullptr, nullptr) -OPTION(prefix_2, "unresolved-symbols=", unresolved_symbols, Joined, INVALID, INVALID, nullptr, 0, 0, +OPTION(prefix_2, "unresolved-symbols=", unresolved_symbols_eq, Joined, INVALID, unresolved_symbols, nullptr, 0, 0, + "Determine how to handle unresolved symbols", nullptr, nullptr) +OPTION(prefix_2, "unresolved-symbols", unresolved_symbols, Separate, INVALID, INVALID, nullptr, 0, 0, "Determine how to handle unresolved symbols", nullptr, nullptr) OPTION(prefix_1, "u", alias_undefined_u, JoinedOrSeparate, INVALID, undefined, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "verbose", verbose, Flag, INVALID, INVALID, nullptr, 0, 0, "Verbose mode", nullptr, nullptr) -OPTION(prefix_2, "version-script=", alias_version_script_eq, Joined, INVALID, version_script, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "version-script=", version_script_eq, Joined, INVALID, version_script, nullptr, 0, 0, + "Read a version script", nullptr, nullptr) OPTION(prefix_2, "version-script", version_script, Separate, INVALID, INVALID, nullptr, 0, 0, "Read a version script", nullptr, nullptr) OPTION(prefix_2, "version", version, Flag, INVALID, INVALID, nullptr, 0, 0, @@ -336,12 +405,14 @@ OPTION(prefix_1, "v", v, 
Flag, INVALID, INVALID, nullptr, 0, 0, OPTION(prefix_2, "warn-common", warn_common, Flag, INVALID, INVALID, nullptr, 0, 0, "Warn about duplicate common symbols", nullptr, nullptr) OPTION(prefix_2, "warn-execstack", warn_execstack, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "warn-once", warn_once, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "warn-shared-textrel", warn_shared_textrel, Flag, INVALID, INVALID, nullptr, 0, 0, nullptr, nullptr, nullptr) OPTION(prefix_2, "warn-unresolved-symbols", warn_unresolved_symbols, Flag, INVALID, INVALID, nullptr, 0, 0, "Report unresolved symbols as warnings", nullptr, nullptr) OPTION(prefix_2, "whole-archive", whole_archive, Flag, INVALID, INVALID, nullptr, 0, 0, "Force load of all members in a static library", nullptr, nullptr) -OPTION(prefix_2, "wrap=", alias_wrap_wrap, Joined, INVALID, wrap, nullptr, 0, 0, nullptr, nullptr, nullptr) +OPTION(prefix_2, "wrap=", wrap_eq, Joined, INVALID, wrap, nullptr, 0, 0, + "Use wrapper functions for symbol", "", nullptr) OPTION(prefix_2, "wrap", wrap, Separate, INVALID, INVALID, nullptr, 0, 0, "Use wrapper functions for symbol", "", nullptr) OPTION(prefix_1, "X", alias_discard_locals_X, Flag, INVALID, discard_locals, nullptr, 0, 0, nullptr, nullptr, nullptr) @@ -350,3 +421,10 @@ OPTION(prefix_1, "y", alias_trace_symbol_y, JoinedOrSeparate, INVALID, trace_sym OPTION(prefix_1, "z", z, JoinedOrSeparate, INVALID, INVALID, nullptr, 0, 0, "Linker option extensions", "