From 9c2d8056ce177fef2a1b859016b11362686213b1 Mon Sep 17 00:00:00 2001 From: Jakub Konka Date: Wed, 16 Dec 2020 09:37:32 +0100 Subject: [PATCH 01/67] Update LLVM version numbers in CMake --- cmake/Findclang.cmake | 34 ++++++++++----------- cmake/Findlld.cmake | 20 ++++++------- cmake/Findllvm.cmake | 70 +++++++++++++++++++++---------------------- 3 files changed, 62 insertions(+), 62 deletions(-) diff --git a/cmake/Findclang.cmake b/cmake/Findclang.cmake index b4bd80773d..3ce52df893 100644 --- a/cmake/Findclang.cmake +++ b/cmake/Findclang.cmake @@ -9,27 +9,27 @@ find_path(CLANG_INCLUDE_DIRS NAMES clang/Frontend/ASTUnit.h PATHS - /usr/lib/llvm/11/include - /usr/lib/llvm-11/include - /usr/lib/llvm-11.0/include - /usr/local/llvm110/include - /usr/local/llvm11/include + /usr/lib/llvm/12/include + /usr/lib/llvm-12/include + /usr/lib/llvm-12.0/include + /usr/local/llvm120/include + /usr/local/llvm12/include /mingw64/include ) if(ZIG_PREFER_CLANG_CPP_DYLIB) find_library(CLANG_LIBRARIES NAMES - clang-cpp-11.0 - clang-cpp110 + clang-cpp-12.0 + clang-cpp120 clang-cpp PATHS ${CLANG_LIBDIRS} - /usr/lib/llvm/11/lib - /usr/lib/llvm/11/lib64 - /usr/lib/llvm-11/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm/12/lib + /usr/lib/llvm/12/lib64 + /usr/lib/llvm-12/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib ) endif() @@ -39,11 +39,11 @@ if(NOT CLANG_LIBRARIES) find_library(CLANG_${_prettylibname_}_LIB NAMES ${_libname_} PATHS ${CLANG_LIBDIRS} - /usr/lib/llvm/11/lib - /usr/lib/llvm-11/lib - /usr/lib/llvm-11.0/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm/12/lib + /usr/lib/llvm-12/lib + /usr/lib/llvm-12.0/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib /mingw64/lib /c/msys64/mingw64/lib c:\\msys64\\mingw64\\lib diff --git a/cmake/Findlld.cmake b/cmake/Findlld.cmake index 3103601ff8..72724ecd1e 100644 --- a/cmake/Findlld.cmake +++ b/cmake/Findlld.cmake @@ -8,16 +8,16 @@ find_path(LLD_INCLUDE_DIRS NAMES lld/Common/Driver.h PATHS - /usr/lib/llvm-11/include - /usr/local/llvm110/include - /usr/local/llvm11/include + /usr/lib/llvm-12/include + /usr/local/llvm120/include + /usr/local/llvm12/include /mingw64/include) -find_library(LLD_LIBRARY NAMES lld-11.0 lld110 lld +find_library(LLD_LIBRARY NAMES lld-12.0 lld120 lld PATHS - /usr/lib/llvm-11/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm-12/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib ) if(EXISTS ${LLD_LIBRARY}) set(LLD_LIBRARIES ${LLD_LIBRARY}) @@ -27,9 +27,9 @@ else() find_library(LLD_${_prettylibname_}_LIB NAMES ${_libname_} PATHS ${LLD_LIBDIRS} - /usr/lib/llvm-11/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm-12/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib /mingw64/lib /c/msys64/mingw64/lib c:/msys64/mingw64/lib) diff --git a/cmake/Findllvm.cmake b/cmake/Findllvm.cmake index 02f22037b8..ba29203169 100644 --- a/cmake/Findllvm.cmake +++ b/cmake/Findllvm.cmake @@ -9,37 +9,37 @@ find_path(LLVM_INCLUDE_DIRS NAMES llvm/IR/IRBuilder.h PATHS - /usr/lib/llvm/11/include - /usr/lib/llvm-11/include - /usr/lib/llvm-11.0/include - /usr/local/llvm11/include - /usr/local/llvm110/include + /usr/lib/llvm/12/include + /usr/lib/llvm-12/include + /usr/lib/llvm-12.0/include + /usr/local/llvm12/include + /usr/local/llvm120/include /mingw64/include ) if(ZIG_PREFER_CLANG_CPP_DYLIB) find_library(LLVM_LIBRARIES NAMES - LLVM-11.0 - LLVM-11 - LLVM-110 + LLVM-12.0 + LLVM-12 + LLVM-120 LLVM PATHS ${LLVM_LIBDIRS} - /usr/lib/llvm/11/lib - /usr/lib/llvm/11/lib64 - 
/usr/lib/llvm-11/lib - /usr/local/llvm11/lib - /usr/local/llvm110/lib + /usr/lib/llvm/12/lib + /usr/lib/llvm/12/lib64 + /usr/lib/llvm-12/lib + /usr/local/llvm12/lib + /usr/local/llvm120/lib ) find_program(LLVM_CONFIG_EXE - NAMES llvm-config-11 llvm-config-11.0 llvm-config110 llvm-config11 llvm-config + NAMES llvm-config-12 llvm-config-12.0 llvm-config120 llvm-config12 llvm-config PATHS "/mingw64/bin" "/c/msys64/mingw64/bin" "c:/msys64/mingw64/bin" - "C:/Libraries/llvm-11.0.0/bin") + "C:/Libraries/llvm-12.0.0/bin") if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND") message(FATAL_ERROR "unable to find llvm-config") @@ -54,23 +54,23 @@ if(ZIG_PREFER_CLANG_CPP_DYLIB) OUTPUT_VARIABLE LLVM_CONFIG_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) - if("${LLVM_CONFIG_VERSION}" VERSION_LESS 11) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_LESS 12) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() - if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 12) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 13) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() - if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 12) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 13) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG) find_program(LLVM_CONFIG_EXE - NAMES llvm-config-11 llvm-config-11.0 llvm-config110 llvm-config11 llvm-config + NAMES llvm-config-12 llvm-config-12.0 llvm-config120 llvm-config12 llvm-config PATHS "/mingw64/bin" "/c/msys64/mingw64/bin" "c:/msys64/mingw64/bin" - "C:/Libraries/llvm-11.0.0/bin") + "C:/Libraries/llvm-12.0.0/bin") if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND") message(FATAL_ERROR "unable to find llvm-config") @@ -85,14 +85,14 @@ elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG) OUTPUT_VARIABLE LLVM_CONFIG_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) - if("${LLVM_CONFIG_VERSION}" VERSION_LESS 11) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_LESS 12) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() - if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 12) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 13) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() - if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 12) - message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") + if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 13) + message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}") endif() execute_process( @@ -166,7 +166,7 @@ elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG) set(LLVM_LIBRARIES ${LLVM_LIBRARIES} ${LLVM_SYSTEM_LIBS}) if(NOT LLVM_LIBRARIES) - find_library(LLVM_LIBRARIES NAMES LLVM LLVM-11 LLVM-11.0) + 
find_library(LLVM_LIBRARIES NAMES LLVM LLVM-12 LLVM-12.0) endif() link_directories("${CMAKE_PREFIX_PATH}/lib") @@ -180,11 +180,11 @@ else() find_library(LLVM_${_prettylibname_}_LIB NAMES ${_libname_} PATHS ${LLVM_LIBDIRS} - /usr/lib/llvm/11/lib - /usr/lib/llvm-11/lib - /usr/lib/llvm-11.0/lib - /usr/local/llvm110/lib - /usr/local/llvm11/lib + /usr/lib/llvm/12/lib + /usr/lib/llvm-12/lib + /usr/lib/llvm-12.0/lib + /usr/local/llvm120/lib + /usr/local/llvm12/lib /mingw64/lib /c/msys64/mingw64/lib c:\\msys64\\mingw64\\lib) From 83ff94406e13e18c8826cd48a68c2c8d676feaac Mon Sep 17 00:00:00 2001 From: Jakub Konka Date: Wed, 16 Dec 2020 10:40:56 +0100 Subject: [PATCH 02/67] Update clang drivers llvm commit b2851aea80e5a8f0cfd6c3c5a56a6b00fb28c6b6 --- src/zig_clang_cc1as_main.cpp | 29 ++++++++++++++--------------- src/zig_clang_driver.cpp | 7 +++++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/zig_clang_cc1as_main.cpp b/src/zig_clang_cc1as_main.cpp index 77b99b2013..de71026fbf 100644 --- a/src/zig_clang_cc1as_main.cpp +++ b/src/zig_clang_cc1as_main.cpp @@ -221,19 +221,13 @@ bool AssemblerInvocation::CreateFromArgs(AssemblerInvocation &Opts, // Any DebugInfoKind implies GenDwarfForAssembly. Opts.GenDwarfForAssembly = Args.hasArg(OPT_debug_info_kind_EQ); - if (const Arg *A = Args.getLastArg(OPT_compress_debug_sections, - OPT_compress_debug_sections_EQ)) { - if (A->getOption().getID() == OPT_compress_debug_sections) { - // TODO: be more clever about the compression type auto-detection - Opts.CompressDebugSections = llvm::DebugCompressionType::GNU; - } else { - Opts.CompressDebugSections = - llvm::StringSwitch(A->getValue()) - .Case("none", llvm::DebugCompressionType::None) - .Case("zlib", llvm::DebugCompressionType::Z) - .Case("zlib-gnu", llvm::DebugCompressionType::GNU) - .Default(llvm::DebugCompressionType::None); - } + if (const Arg *A = Args.getLastArg(OPT_compress_debug_sections_EQ)) { + Opts.CompressDebugSections = + llvm::StringSwitch(A->getValue()) + .Case("none", llvm::DebugCompressionType::None) + .Case("zlib", llvm::DebugCompressionType::Z) + .Case("zlib-gnu", llvm::DebugCompressionType::GNU) + .Default(llvm::DebugCompressionType::None); } Opts.RelaxELFRelocations = Args.hasArg(OPT_mrelax_relocations); @@ -434,8 +428,11 @@ static bool ExecuteAssembler(AssemblerInvocation &Opts, std::unique_ptr Str; std::unique_ptr MCII(TheTarget->createMCInstrInfo()); + assert(MCII && "Unable to create instruction info!"); + std::unique_ptr STI( TheTarget->createMCSubtargetInfo(Opts.Triple, Opts.CPU, FS)); + assert(STI && "Unable to create subtarget info!"); raw_pwrite_stream *Out = FDOS.get(); std::unique_ptr BOS; @@ -474,6 +471,8 @@ static bool ExecuteAssembler(AssemblerInvocation &Opts, TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx)); std::unique_ptr MAB( TheTarget->createMCAsmBackend(*STI, *MRI, MCOptions)); + assert(MAB && "Unable to create asm backend!"); + std::unique_ptr OW = DwoOS ? MAB->createDwoObjectWriter(*Out, *DwoOS) : MAB->createObjectWriter(*Out); @@ -526,8 +525,8 @@ static bool ExecuteAssembler(AssemblerInvocation &Opts, Failed = Parser->Run(Opts.NoInitialTextSection); } - // Close Streamer first. - // It might have a reference to the output stream. + // Parser has a reference to the output stream (Str), so close Parser first. + Parser.reset(); Str.reset(); // Close the output stream early. 
BOS.reset(); diff --git a/src/zig_clang_driver.cpp b/src/zig_clang_driver.cpp index fbe407a06c..ac892f95e8 100644 --- a/src/zig_clang_driver.cpp +++ b/src/zig_clang_driver.cpp @@ -528,6 +528,13 @@ int ZigClang_main(int argc_, const char **argv_) { IsCrash = CommandRes < 0 || CommandRes == 70; #ifdef _WIN32 IsCrash |= CommandRes == 3; +#endif +#if LLVM_ON_UNIX + // When running in integrated-cc1 mode, the CrashRecoveryContext returns + // the same codes as if the program crashed. See section "Exit Status for + // Commands": + // https://pubs.opengroup.org/onlinepubs/9699919799/xrat/V4_xcu_chap02.html + IsCrash |= CommandRes > 128; #endif if (IsCrash) { TheDriver.generateCompilationDiagnostics(*C, *FailingCommand); From 8612dac225a55e09293b767aab3c81a459609bb4 Mon Sep 17 00:00:00 2001 From: Jakub Konka Date: Wed, 16 Dec 2020 10:43:57 +0100 Subject: [PATCH 03/67] Update clang headers llvm commit b2851aea80e5a8f0cfd6c3c5a56a6b00fb28c6b6 --- lib/include/__clang_cuda_builtin_vars.h | 9 + lib/include/__clang_cuda_cmath.h | 50 +- lib/include/__clang_cuda_complex_builtins.h | 30 +- lib/include/__clang_cuda_math.h | 9 +- lib/include/__clang_cuda_runtime_wrapper.h | 28 +- lib/include/__clang_hip_cmath.h | 629 +++++++++ lib/include/__clang_hip_libdevice_declares.h | 26 +- lib/include/__clang_hip_math.h | 1194 ++++++++++-------- lib/include/__clang_hip_runtime_wrapper.h | 5 + lib/include/altivec.h | 1169 ++++++++++++++++- lib/include/amxintrin.h | 92 +- lib/include/arm_neon.h | 568 +++++++-- lib/include/arm_sve.h | 160 +-- lib/include/avx512fintrin.h | 157 +-- lib/include/avx512vlvnniintrin.h | 205 ++- lib/include/avxvnniintrin.h | 225 ++++ lib/include/cpuid.h | 8 + lib/include/cuda_wrappers/new | 8 + lib/include/gfniintrin.h | 209 ++- lib/include/hresetintrin.h | 49 + lib/include/ia32intrin.h | 97 +- lib/include/immintrin.h | 12 + lib/include/intrin.h | 173 ++- lib/include/keylockerintrin.h | 506 ++++++++ lib/include/mm_malloc.h | 6 + lib/include/opencl-c-base.h | 18 + lib/include/opencl-c.h | 2 + lib/include/openmp_wrappers/cmath | 5 +- lib/include/openmp_wrappers/complex | 25 + lib/include/openmp_wrappers/complex_cmath.h | 388 ++++++ lib/include/popcntintrin.h | 11 +- lib/include/ppc_wrappers/smmintrin.h | 24 + lib/include/uintrintrin.h | 150 +++ lib/include/wasm_simd128.h | 112 +- lib/include/x86gprintrin.h | 23 + 35 files changed, 5047 insertions(+), 1335 deletions(-) create mode 100644 lib/include/__clang_hip_cmath.h create mode 100644 lib/include/avxvnniintrin.h create mode 100644 lib/include/hresetintrin.h create mode 100644 lib/include/keylockerintrin.h create mode 100644 lib/include/openmp_wrappers/complex_cmath.h create mode 100644 lib/include/uintrintrin.h create mode 100644 lib/include/x86gprintrin.h diff --git a/lib/include/__clang_cuda_builtin_vars.h b/lib/include/__clang_cuda_builtin_vars.h index 2ba1521f25..412e823a82 100644 --- a/lib/include/__clang_cuda_builtin_vars.h +++ b/lib/include/__clang_cuda_builtin_vars.h @@ -55,7 +55,9 @@ struct __cuda_builtin_threadIdx_t { __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z()); // threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a // uint3). This function is defined after we pull in vector_types.h. 
+ __attribute__((device)) operator dim3() const; __attribute__((device)) operator uint3() const; + private: __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t); }; @@ -66,7 +68,9 @@ struct __cuda_builtin_blockIdx_t { __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z()); // blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a // uint3). This function is defined after we pull in vector_types.h. + __attribute__((device)) operator dim3() const; __attribute__((device)) operator uint3() const; + private: __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t); }; @@ -78,6 +82,8 @@ struct __cuda_builtin_blockDim_t { // blockDim should be convertible to dim3 (in fact in nvcc, it *is* a // dim3). This function is defined after we pull in vector_types.h. __attribute__((device)) operator dim3() const; + __attribute__((device)) operator uint3() const; + private: __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t); }; @@ -89,6 +95,8 @@ struct __cuda_builtin_gridDim_t { // gridDim should be convertible to dim3 (in fact in nvcc, it *is* a // dim3). This function is defined after we pull in vector_types.h. __attribute__((device)) operator dim3() const; + __attribute__((device)) operator uint3() const; + private: __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t); }; @@ -108,5 +116,6 @@ __attribute__((device)) const int warpSize = 32; #undef __CUDA_DEVICE_BUILTIN #undef __CUDA_BUILTIN_VAR #undef __CUDA_DISALLOW_BUILTINVAR_ACCESS +#undef __DELETE #endif /* __CUDA_BUILTIN_VARS_H */ diff --git a/lib/include/__clang_cuda_cmath.h b/lib/include/__clang_cuda_cmath.h index 8ba182689a..5bbb59a93b 100644 --- a/lib/include/__clang_cuda_cmath.h +++ b/lib/include/__clang_cuda_cmath.h @@ -66,10 +66,38 @@ __DEVICE__ float frexp(float __arg, int *__exp) { } // For inscrutable reasons, the CUDA headers define these functions for us on -// Windows. For OpenMP we omit these as some old system headers have -// non-conforming `isinf(float)` and `isnan(float)` implementations that return -// an `int`. The system versions of these functions should be fine anyway. -#if !defined(_MSC_VER) && !defined(__OPENMP_NVPTX__) +// Windows. +#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__) + +// For OpenMP we work around some old system headers that have non-conforming +// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do +// this by providing two versions of these functions, differing only in the +// return type. To avoid conflicting definitions we disable implicit base +// function generation. That means we will end up with two specializations, one +// per type, but only one has a base function defined by the system header. +#if defined(__OPENMP_NVPTX__) +#pragma omp begin declare variant match( \ + implementation = {extension(disable_implicit_base)}) + +// FIXME: We lack an extension to customize the mangling of the variants, e.g., +// add a suffix. This means we would clash with the names of the variants +// (note that we do not create implicit base functions here). To avoid +// this clash we add a new trait to some of them that is always true +// (this is LLVM after all ;)). It will only influence the mangled name +// of the variants inside the inner region and avoid the clash. 
+#pragma omp begin declare variant match(implementation = {vendor(llvm)}) + +__DEVICE__ int isinf(float __x) { return ::__isinff(__x); } +__DEVICE__ int isinf(double __x) { return ::__isinf(__x); } +__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); } +__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); } +__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); } +__DEVICE__ int isnan(double __x) { return ::__isnan(__x); } + +#pragma omp end declare variant + +#endif + __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); } __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); } __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } @@ -79,6 +107,11 @@ __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } __DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); } __DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); } __DEVICE__ bool isnan(double __x) { return ::__isnan(__x); } + +#if defined(__OPENMP_NVPTX__) +#pragma omp end declare variant +#endif + #endif __DEVICE__ bool isgreater(float __x, float __y) { @@ -142,6 +175,15 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); } __DEVICE__ float tan(float __x) { return ::tanf(__x); } __DEVICE__ float tanh(float __x) { return ::tanhf(__x); } +// There was a redefinition error for this this overload in CUDA mode. +// We restrict it to OpenMP mode for now, that is where it is actually needed +// anyway. +#ifdef __OPENMP_NVPTX__ +__DEVICE__ float remquo(float __n, float __d, int *__q) { + return ::remquof(__n, __d, __q); +} +#endif + // Notably missing above is nexttoward. We omit it because // libdevice doesn't provide an implementation, and we don't want to be in the // business of implementing tricky libm functions in this header. diff --git a/lib/include/__clang_cuda_complex_builtins.h b/lib/include/__clang_cuda_complex_builtins.h index d924487ab2..2b701fef0e 100644 --- a/lib/include/__clang_cuda_complex_builtins.h +++ b/lib/include/__clang_cuda_complex_builtins.h @@ -41,6 +41,27 @@ #define _ABSf std::abs #define _LOGBd std::logb #define _LOGBf std::logb +// Rather than pulling in std::max from algorithm everytime, use available ::max. +#define _fmaxd max +#define _fmaxf max +#else +#ifdef __AMDGCN__ +#define _ISNANd __ocml_isnan_f64 +#define _ISNANf __ocml_isnan_f32 +#define _ISINFd __ocml_isinf_f64 +#define _ISINFf __ocml_isinf_f32 +#define _ISFINITEd __ocml_isfinite_f64 +#define _ISFINITEf __ocml_isfinite_f32 +#define _COPYSIGNd __ocml_copysign_f64 +#define _COPYSIGNf __ocml_copysign_f32 +#define _SCALBNd __ocml_scalbn_f64 +#define _SCALBNf __ocml_scalbn_f32 +#define _ABSd __ocml_fabs_f64 +#define _ABSf __ocml_fabs_f32 +#define _LOGBd __ocml_logb_f64 +#define _LOGBf __ocml_logb_f32 +#define _fmaxd __ocml_fmax_f64 +#define _fmaxf __ocml_fmax_f32 #else #define _ISNANd __nv_isnand #define _ISNANf __nv_isnanf @@ -56,6 +77,9 @@ #define _ABSf __nv_fabsf #define _LOGBd __nv_logb #define _LOGBf __nv_logbf +#define _fmaxd __nv_fmax +#define _fmaxf __nv_fmaxf +#endif #endif #if defined(__cplusplus) @@ -167,7 +191,7 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c, // Can't use std::max, because that's defined in , and we don't // want to pull that in for every compile. The CUDA headers define // ::max(float, float) and ::max(double, double), which is sufficient for us. 
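// A worked illustration of the scaling below (a sketch, not upstream code):
// for a denominator __c + __d*i with __c == __d == 1e308, evaluating
// __c * __c + __d * __d directly would overflow to infinity. With
// __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d))) == 1023, rescaling
// __c and __d by 2^-1023 keeps the denominator finite, and the final
// _SCALBNd(..., -__ilogbw) restores the quotient's true exponent.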
- double __logbw = _LOGBd(max(_ABSd(__c), _ABSd(__d))); + double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d))); if (_ISFINITEd(__logbw)) { __ilogbw = (int)__logbw; __c = _SCALBNd(__c, -__ilogbw); @@ -200,7 +224,7 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c, __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) { int __ilogbw = 0; - float __logbw = _LOGBf(max(_ABSf(__c), _ABSf(__d))); + float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d))); if (_ISFINITEf(__logbw)) { __ilogbw = (int)__logbw; __c = _SCALBNf(__c, -__ilogbw); @@ -249,6 +273,8 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) { #undef _ABSf #undef _LOGBd #undef _LOGBf +#undef _fmaxd +#undef _fmaxf #ifdef __OPENMP_NVPTX__ #pragma omp end declare target diff --git a/lib/include/__clang_cuda_math.h b/lib/include/__clang_cuda_math.h index 332e616702..acb26ad345 100644 --- a/lib/include/__clang_cuda_math.h +++ b/lib/include/__clang_cuda_math.h @@ -195,8 +195,8 @@ __DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); } __DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); } __DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); } __DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); } -__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); } -__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); } +__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); } +__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); } __DEVICE__ double nextafter(double __a, double __b) { return __nv_nextafter(__a, __b); } @@ -249,8 +249,9 @@ __DEVICE__ double rhypot(double __a, double __b) { __DEVICE__ float rhypotf(float __a, float __b) { return __nv_rhypotf(__a, __b); } -__DEVICE__ double rint(double __a) { return __nv_rint(__a); } -__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); } +// __nv_rint* in libdevice is buggy and produces incorrect results. +__DEVICE__ double rint(double __a) { return __builtin_rint(__a); } +__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); } __DEVICE__ double rnorm(int __a, const double *__b) { return __nv_rnorm(__a, __b); } diff --git a/lib/include/__clang_cuda_runtime_wrapper.h b/lib/include/__clang_cuda_runtime_wrapper.h index f43ed55de4..f88c39a9b6 100644 --- a/lib/include/__clang_cuda_runtime_wrapper.h +++ b/lib/include/__clang_cuda_runtime_wrapper.h @@ -377,30 +377,38 @@ __device__ static inline void *malloc(size_t __size) { // Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to // come after we've pulled in the definition of uint3 and dim3. 
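// Usage sketch (illustrative only): with the conversions defined below,
// the builtin variables can initialize either vector type directly, e.g.
//
//   dim3 t = threadIdx;   // via the new operator dim3()
//   uint3 d = blockDim;   // via the new operator uint3()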
+__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const { + return dim3(x, y, z); +} + __device__ inline __cuda_builtin_threadIdx_t::operator uint3() const { - uint3 ret; - ret.x = x; - ret.y = y; - ret.z = z; - return ret; + return {x, y, z}; +} + +__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const { + return dim3(x, y, z); } __device__ inline __cuda_builtin_blockIdx_t::operator uint3() const { - uint3 ret; - ret.x = x; - ret.y = y; - ret.z = z; - return ret; + return {x, y, z}; } __device__ inline __cuda_builtin_blockDim_t::operator dim3() const { return dim3(x, y, z); } +__device__ inline __cuda_builtin_blockDim_t::operator uint3() const { + return {x, y, z}; +} + __device__ inline __cuda_builtin_gridDim_t::operator dim3() const { return dim3(x, y, z); } +__device__ inline __cuda_builtin_gridDim_t::operator uint3() const { + return {x, y, z}; +} + #include <__clang_cuda_cmath.h> #include <__clang_cuda_intrinsics.h> #include <__clang_cuda_complex_builtins.h> diff --git a/lib/include/__clang_hip_cmath.h b/lib/include/__clang_hip_cmath.h new file mode 100644 index 0000000000..3a702587ee --- /dev/null +++ b/lib/include/__clang_hip_cmath.h @@ -0,0 +1,629 @@ +/*===---- __clang_hip_cmath.h - HIP cmath decls -----------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_HIP_CMATH_H__ +#define __CLANG_HIP_CMATH_H__ + +#if !defined(__HIP__) +#error "This file is for HIP and OpenMP AMDGCN device compilation only." +#endif + +#if defined(__cplusplus) +#include +#include +#include +#endif +#include +#include + +#pragma push_macro("__DEVICE__") +#define __DEVICE__ static __device__ inline __attribute__((always_inline)) + +// Start with functions that cannot be defined by DEF macros below. 
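// Rationale sketch (assuming the __DEF_FUN macros defined later in this
// file): __DEF_FUN1(float, acos) expands to
//
//   __DEVICE__ float acos(float __x) { return acosf(__x); }
//
// i.e. a unary float -> float forwarder. Functions with integer arguments
// (abs), non-float returns (fpclassify), or out-parameters (frexp) do not
// fit that shape, so they are spelled out by hand here.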
+#if defined(__cplusplus) +__DEVICE__ double abs(double __x) { return ::fabs(__x); } +__DEVICE__ float abs(float __x) { return ::fabsf(__x); } +__DEVICE__ long long abs(long long __n) { return ::llabs(__n); } +__DEVICE__ long abs(long __n) { return ::labs(__n); } +__DEVICE__ float fma(float __x, float __y, float __z) { + return ::fmaf(__x, __y, __z); +} +__DEVICE__ int fpclassify(float __x) { + return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, + FP_ZERO, __x); +} +__DEVICE__ int fpclassify(double __x) { + return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, + FP_ZERO, __x); +} +__DEVICE__ float frexp(float __arg, int *__exp) { + return ::frexpf(__arg, __exp); +} +__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } +__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); } +__DEVICE__ bool isgreater(float __x, float __y) { + return __builtin_isgreater(__x, __y); +} +__DEVICE__ bool isgreater(double __x, double __y) { + return __builtin_isgreater(__x, __y); +} +__DEVICE__ bool isgreaterequal(float __x, float __y) { + return __builtin_isgreaterequal(__x, __y); +} +__DEVICE__ bool isgreaterequal(double __x, double __y) { + return __builtin_isgreaterequal(__x, __y); +} +__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); } +__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); } +__DEVICE__ bool isless(float __x, float __y) { + return __builtin_isless(__x, __y); +} +__DEVICE__ bool isless(double __x, double __y) { + return __builtin_isless(__x, __y); +} +__DEVICE__ bool islessequal(float __x, float __y) { + return __builtin_islessequal(__x, __y); +} +__DEVICE__ bool islessequal(double __x, double __y) { + return __builtin_islessequal(__x, __y); +} +__DEVICE__ bool islessgreater(float __x, float __y) { + return __builtin_islessgreater(__x, __y); +} +__DEVICE__ bool islessgreater(double __x, double __y) { + return __builtin_islessgreater(__x, __y); +} +__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); } +__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); } +__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); } +__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); } +__DEVICE__ bool isunordered(float __x, float __y) { + return __builtin_isunordered(__x, __y); +} +__DEVICE__ bool isunordered(double __x, double __y) { + return __builtin_isunordered(__x, __y); +} +__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); } +__DEVICE__ float pow(float __base, int __iexp) { + return ::powif(__base, __iexp); +} +__DEVICE__ double pow(double __base, int __iexp) { + return ::powi(__base, __iexp); +} +__DEVICE__ float remquo(float __x, float __y, int *__quo) { + return ::remquof(__x, __y, __quo); +} +__DEVICE__ float scalbln(float __x, long int __n) { + return ::scalblnf(__x, __n); +} +__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); } +__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); } + +// Notably missing above is nexttoward. We omit it because +// ocml doesn't provide an implementation, and we don't want to be in the +// business of implementing tricky libm functions in this header. + +// Other functions. 
+__DEVICE__ _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) { + return __ocml_fma_f16(__x, __y, __z); +} +__DEVICE__ _Float16 pow(_Float16 __base, int __iexp) { + return __ocml_pown_f16(__base, __iexp); +} + +// BEGIN DEF_FUN and HIP_OVERLOAD + +// BEGIN DEF_FUN + +#pragma push_macro("__DEF_FUN1") +#pragma push_macro("__DEF_FUN2") +#pragma push_macro("__DEF_FUN2_FI") + +// Define cmath functions with float argument and returns __retty. +#define __DEF_FUN1(__retty, __func) \ + __DEVICE__ \ + __retty __func(float __x) { return __func##f(__x); } + +// Define cmath functions with two float arguments and returns __retty. +#define __DEF_FUN2(__retty, __func) \ + __DEVICE__ \ + __retty __func(float __x, float __y) { return __func##f(__x, __y); } + +// Define cmath functions with a float and an int argument and returns __retty. +#define __DEF_FUN2_FI(__retty, __func) \ + __DEVICE__ \ + __retty __func(float __x, int __y) { return __func##f(__x, __y); } + +__DEF_FUN1(float, acos) +__DEF_FUN1(float, acosh) +__DEF_FUN1(float, asin) +__DEF_FUN1(float, asinh) +__DEF_FUN1(float, atan) +__DEF_FUN2(float, atan2) +__DEF_FUN1(float, atanh) +__DEF_FUN1(float, cbrt) +__DEF_FUN1(float, ceil) +__DEF_FUN2(float, copysign) +__DEF_FUN1(float, cos) +__DEF_FUN1(float, cosh) +__DEF_FUN1(float, erf) +__DEF_FUN1(float, erfc) +__DEF_FUN1(float, exp) +__DEF_FUN1(float, exp2) +__DEF_FUN1(float, expm1) +__DEF_FUN1(float, fabs) +__DEF_FUN2(float, fdim) +__DEF_FUN1(float, floor) +__DEF_FUN2(float, fmax) +__DEF_FUN2(float, fmin) +__DEF_FUN2(float, fmod) +__DEF_FUN2(float, hypot) +__DEF_FUN1(int, ilogb) +__DEF_FUN2_FI(float, ldexp) +__DEF_FUN1(float, lgamma) +__DEF_FUN1(float, log) +__DEF_FUN1(float, log10) +__DEF_FUN1(float, log1p) +__DEF_FUN1(float, log2) +__DEF_FUN1(float, logb) +__DEF_FUN1(long long, llrint) +__DEF_FUN1(long long, llround) +__DEF_FUN1(long, lrint) +__DEF_FUN1(long, lround) +__DEF_FUN1(float, nearbyint) +__DEF_FUN2(float, nextafter) +__DEF_FUN2(float, pow) +__DEF_FUN2(float, remainder) +__DEF_FUN1(float, rint) +__DEF_FUN1(float, round) +__DEF_FUN2_FI(float, scalbn) +__DEF_FUN1(float, sin) +__DEF_FUN1(float, sinh) +__DEF_FUN1(float, sqrt) +__DEF_FUN1(float, tan) +__DEF_FUN1(float, tanh) +__DEF_FUN1(float, tgamma) +__DEF_FUN1(float, trunc) + +#pragma pop_macro("__DEF_FUN1") +#pragma pop_macro("__DEF_FUN2") +#pragma pop_macro("__DEF_FUN2_FI") + +// END DEF_FUN + +// BEGIN HIP_OVERLOAD + +#pragma push_macro("__HIP_OVERLOAD1") +#pragma push_macro("__HIP_OVERLOAD2") + +// __hip_enable_if::type is a type function which returns __T if __B is true. +template struct __hip_enable_if {}; + +template struct __hip_enable_if { typedef __T type; }; + +// decltype is only available in C++11 and above. +#if __cplusplus >= 201103L +// __hip_promote +namespace __hip { + +template struct __numeric_type { + static void __test(...); + static _Float16 __test(_Float16); + static float __test(float); + static double __test(char); + static double __test(int); + static double __test(unsigned); + static double __test(long); + static double __test(unsigned long); + static double __test(long long); + static double __test(unsigned long long); + static double __test(double); + // No support for long double, use double instead. 
+ static double __test(long double); + + typedef decltype(__test(std::declval<_Tp>())) type; + static const bool value = !std::is_same::value; +}; + +template <> struct __numeric_type { static const bool value = true; }; + +template ::value &&__numeric_type<_A2>::value + &&__numeric_type<_A3>::value> +class __promote_imp { +public: + static const bool value = false; +}; + +template +class __promote_imp<_A1, _A2, _A3, true> { +private: + typedef typename __promote_imp<_A1>::type __type1; + typedef typename __promote_imp<_A2>::type __type2; + typedef typename __promote_imp<_A3>::type __type3; + +public: + typedef decltype(__type1() + __type2() + __type3()) type; + static const bool value = true; +}; + +template class __promote_imp<_A1, _A2, void, true> { +private: + typedef typename __promote_imp<_A1>::type __type1; + typedef typename __promote_imp<_A2>::type __type2; + +public: + typedef decltype(__type1() + __type2()) type; + static const bool value = true; +}; + +template class __promote_imp<_A1, void, void, true> { +public: + typedef typename __numeric_type<_A1>::type type; + static const bool value = true; +}; + +template +class __promote : public __promote_imp<_A1, _A2, _A3> {}; + +} // namespace __hip +#endif //__cplusplus >= 201103L + +// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to +// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with +// floor(double). +#define __HIP_OVERLOAD1(__retty, __fn) \ + template \ + __DEVICE__ typename __hip_enable_if::is_integer, \ + __retty>::type \ + __fn(__T __x) { \ + return ::__fn((double)__x); \ + } + +// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double +// or integer argument to avoid compilation error due to ambibuity. e.g. +// max(5.0f, 6.0) is resolved with max(double, double). 
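// For example (illustration only): with the C++11 definition below,
// max(5.0f, 6.0) deduces __T1 = float, __T2 = double;
// __hip::__promote<float, double>::type is decltype(float() + double()),
// i.e. double, so both arguments are cast and max(double, double) is the
// overload that runs.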
+#if __cplusplus >= 201103L +#define __HIP_OVERLOAD2(__retty, __fn) \ + template \ + __DEVICE__ typename __hip_enable_if< \ + std::numeric_limits<__T1>::is_specialized && \ + std::numeric_limits<__T2>::is_specialized, \ + typename __hip::__promote<__T1, __T2>::type>::type \ + __fn(__T1 __x, __T2 __y) { \ + typedef typename __hip::__promote<__T1, __T2>::type __result_type; \ + return __fn((__result_type)__x, (__result_type)__y); \ + } +#else +#define __HIP_OVERLOAD2(__retty, __fn) \ + template \ + __DEVICE__ \ + typename __hip_enable_if::is_specialized && \ + std::numeric_limits<__T2>::is_specialized, \ + __retty>::type \ + __fn(__T1 __x, __T2 __y) { \ + return __fn((double)__x, (double)__y); \ + } +#endif + +__HIP_OVERLOAD1(double, abs) +__HIP_OVERLOAD1(double, acos) +__HIP_OVERLOAD1(double, acosh) +__HIP_OVERLOAD1(double, asin) +__HIP_OVERLOAD1(double, asinh) +__HIP_OVERLOAD1(double, atan) +__HIP_OVERLOAD2(double, atan2) +__HIP_OVERLOAD1(double, atanh) +__HIP_OVERLOAD1(double, cbrt) +__HIP_OVERLOAD1(double, ceil) +__HIP_OVERLOAD2(double, copysign) +__HIP_OVERLOAD1(double, cos) +__HIP_OVERLOAD1(double, cosh) +__HIP_OVERLOAD1(double, erf) +__HIP_OVERLOAD1(double, erfc) +__HIP_OVERLOAD1(double, exp) +__HIP_OVERLOAD1(double, exp2) +__HIP_OVERLOAD1(double, expm1) +__HIP_OVERLOAD1(double, fabs) +__HIP_OVERLOAD2(double, fdim) +__HIP_OVERLOAD1(double, floor) +__HIP_OVERLOAD2(double, fmax) +__HIP_OVERLOAD2(double, fmin) +__HIP_OVERLOAD2(double, fmod) +__HIP_OVERLOAD1(int, fpclassify) +__HIP_OVERLOAD2(double, hypot) +__HIP_OVERLOAD1(int, ilogb) +__HIP_OVERLOAD1(bool, isfinite) +__HIP_OVERLOAD2(bool, isgreater) +__HIP_OVERLOAD2(bool, isgreaterequal) +__HIP_OVERLOAD1(bool, isinf) +__HIP_OVERLOAD2(bool, isless) +__HIP_OVERLOAD2(bool, islessequal) +__HIP_OVERLOAD2(bool, islessgreater) +__HIP_OVERLOAD1(bool, isnan) +__HIP_OVERLOAD1(bool, isnormal) +__HIP_OVERLOAD2(bool, isunordered) +__HIP_OVERLOAD1(double, lgamma) +__HIP_OVERLOAD1(double, log) +__HIP_OVERLOAD1(double, log10) +__HIP_OVERLOAD1(double, log1p) +__HIP_OVERLOAD1(double, log2) +__HIP_OVERLOAD1(double, logb) +__HIP_OVERLOAD1(long long, llrint) +__HIP_OVERLOAD1(long long, llround) +__HIP_OVERLOAD1(long, lrint) +__HIP_OVERLOAD1(long, lround) +__HIP_OVERLOAD1(double, nearbyint) +__HIP_OVERLOAD2(double, nextafter) +__HIP_OVERLOAD2(double, pow) +__HIP_OVERLOAD2(double, remainder) +__HIP_OVERLOAD1(double, rint) +__HIP_OVERLOAD1(double, round) +__HIP_OVERLOAD1(bool, signbit) +__HIP_OVERLOAD1(double, sin) +__HIP_OVERLOAD1(double, sinh) +__HIP_OVERLOAD1(double, sqrt) +__HIP_OVERLOAD1(double, tan) +__HIP_OVERLOAD1(double, tanh) +__HIP_OVERLOAD1(double, tgamma) +__HIP_OVERLOAD1(double, trunc) + +// Overload these but don't add them to std, they are not part of cmath. +__HIP_OVERLOAD2(double, max) +__HIP_OVERLOAD2(double, min) + +// Additional Overloads that don't quite match HIP_OVERLOAD. 
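// Usage sketch (illustration only): the three-argument fma overload below
// promotes mixed arguments to a common type first, e.g. fma(2, 3.0f, 0.5)
// has __hip::__promote<int, float, double>::type == double, so every
// argument is cast to double and ::fma(double, double, double) is called.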
+#if __cplusplus >= 201103L +template +__DEVICE__ typename __hip_enable_if< + std::numeric_limits<__T1>::is_specialized && + std::numeric_limits<__T2>::is_specialized && + std::numeric_limits<__T3>::is_specialized, + typename __hip::__promote<__T1, __T2, __T3>::type>::type +fma(__T1 __x, __T2 __y, __T3 __z) { + typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type; + return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z); +} +#else +template +__DEVICE__ + typename __hip_enable_if::is_specialized && + std::numeric_limits<__T2>::is_specialized && + std::numeric_limits<__T3>::is_specialized, + double>::type + fma(__T1 __x, __T2 __y, __T3 __z) { + return ::fma((double)__x, (double)__y, (double)__z); +} +#endif + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + frexp(__T __x, int *__exp) { + return ::frexp((double)__x, __exp); +} + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + ldexp(__T __x, int __exp) { + return ::ldexp((double)__x, __exp); +} + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + modf(__T __x, double *__exp) { + return ::modf((double)__x, __exp); +} + +#if __cplusplus >= 201103L +template +__DEVICE__ + typename __hip_enable_if::is_specialized && + std::numeric_limits<__T2>::is_specialized, + typename __hip::__promote<__T1, __T2>::type>::type + remquo(__T1 __x, __T2 __y, int *__quo) { + typedef typename __hip::__promote<__T1, __T2>::type __result_type; + return ::remquo((__result_type)__x, (__result_type)__y, __quo); +} +#else +template +__DEVICE__ + typename __hip_enable_if::is_specialized && + std::numeric_limits<__T2>::is_specialized, + double>::type + remquo(__T1 __x, __T2 __y, int *__quo) { + return ::remquo((double)__x, (double)__y, __quo); +} +#endif + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + scalbln(__T __x, long int __exp) { + return ::scalbln((double)__x, __exp); +} + +template +__DEVICE__ + typename __hip_enable_if::is_integer, double>::type + scalbn(__T __x, int __exp) { + return ::scalbn((double)__x, __exp); +} + +#pragma pop_macro("__HIP_OVERLOAD1") +#pragma pop_macro("__HIP_OVERLOAD2") + +// END HIP_OVERLOAD + +// END DEF_FUN and HIP_OVERLOAD + +#endif // defined(__cplusplus) + +// Define these overloads inside the namespace our standard library uses. +#ifdef _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_NAMESPACE_STD +#else +namespace std { +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_BEGIN_NAMESPACE_VERSION +#endif +#endif + +// Pull the new overloads we defined above into namespace std. +// using ::abs; - This may be considered for C++. +using ::acos; +using ::acosh; +using ::asin; +using ::asinh; +using ::atan; +using ::atan2; +using ::atanh; +using ::cbrt; +using ::ceil; +using ::copysign; +using ::cos; +using ::cosh; +using ::erf; +using ::erfc; +using ::exp; +using ::exp2; +using ::expm1; +using ::fabs; +using ::fdim; +using ::floor; +using ::fma; +using ::fmax; +using ::fmin; +using ::fmod; +using ::fpclassify; +using ::frexp; +using ::hypot; +using ::ilogb; +using ::isfinite; +using ::isgreater; +using ::isgreaterequal; +using ::isless; +using ::islessequal; +using ::islessgreater; +using ::isnormal; +using ::isunordered; +using ::ldexp; +using ::lgamma; +using ::llrint; +using ::llround; +using ::log; +using ::log10; +using ::log1p; +using ::log2; +using ::logb; +using ::lrint; +using ::lround; +using ::modf; +// using ::nan; - This may be considered for C++. 
+// using ::nanf; - This may be considered for C++. +// using ::nanl; - This is not yet defined. +using ::nearbyint; +using ::nextafter; +// using ::nexttoward; - Omit this since we do not have a definition. +using ::pow; +using ::remainder; +using ::remquo; +using ::rint; +using ::round; +using ::scalbln; +using ::scalbn; +using ::signbit; +using ::sin; +using ::sinh; +using ::sqrt; +using ::tan; +using ::tanh; +using ::tgamma; +using ::trunc; + +// Well this is fun: We need to pull these symbols in for libc++, but we can't +// pull them in with libstdc++, because its ::isinf and ::isnan are different +// than its std::isinf and std::isnan. +#ifndef __GLIBCXX__ +using ::isinf; +using ::isnan; +#endif + +// Finally, pull the "foobarf" functions that HIP defines into std. +using ::acosf; +using ::acoshf; +using ::asinf; +using ::asinhf; +using ::atan2f; +using ::atanf; +using ::atanhf; +using ::cbrtf; +using ::ceilf; +using ::copysignf; +using ::cosf; +using ::coshf; +using ::erfcf; +using ::erff; +using ::exp2f; +using ::expf; +using ::expm1f; +using ::fabsf; +using ::fdimf; +using ::floorf; +using ::fmaf; +using ::fmaxf; +using ::fminf; +using ::fmodf; +using ::frexpf; +using ::hypotf; +using ::ilogbf; +using ::ldexpf; +using ::lgammaf; +using ::llrintf; +using ::llroundf; +using ::log10f; +using ::log1pf; +using ::log2f; +using ::logbf; +using ::logf; +using ::lrintf; +using ::lroundf; +using ::modff; +using ::nearbyintf; +using ::nextafterf; +// using ::nexttowardf; - Omit this since we do not have a definition. +using ::powf; +using ::remainderf; +using ::remquof; +using ::rintf; +using ::roundf; +using ::scalblnf; +using ::scalbnf; +using ::sinf; +using ::sinhf; +using ::sqrtf; +using ::tanf; +using ::tanhf; +using ::tgammaf; +using ::truncf; + +#ifdef _LIBCPP_END_NAMESPACE_STD +_LIBCPP_END_NAMESPACE_STD +#else +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_END_NAMESPACE_VERSION +#endif +} // namespace std +#endif + +#pragma pop_macro("__DEVICE__") + +#endif // __CLANG_HIP_CMATH_H__ diff --git a/lib/include/__clang_hip_libdevice_declares.h b/lib/include/__clang_hip_libdevice_declares.h index e1cd49a39c..ac98907ad5 100644 --- a/lib/include/__clang_hip_libdevice_declares.h +++ b/lib/include/__clang_hip_libdevice_declares.h @@ -10,7 +10,9 @@ #ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__ #define __CLANG_HIP_LIBDEVICE_DECLARES_H__ +#ifdef __cplusplus extern "C" { +#endif // BEGIN FLOAT __device__ __attribute__((const)) float __ocml_acos_f32(float); @@ -78,6 +80,7 @@ __device__ __attribute__((const)) float __ocml_len4_f32(float, float, float, __device__ __attribute__((pure)) float __ocml_ncdf_f32(float); __device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float); __device__ __attribute__((pure)) float __ocml_pow_f32(float, float); +__device__ __attribute__((pure)) float __ocml_pown_f32(float, int); __device__ __attribute__((pure)) float __ocml_rcbrt_f32(float); __device__ __attribute__((const)) float __ocml_remainder_f32(float, float); __device__ float __ocml_remquo_f32(float, float, @@ -126,10 +129,10 @@ __device__ __attribute__((const)) float __ocml_div_rte_f32(float, float); __device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float); __device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float); __device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float); -__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float, float); -__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float, float); -__device__ __attribute__((const)) 
float __ocml_sqrt_rtp_f32(float, float); -__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float, float); +__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float); +__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float); __device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float); __device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float); __device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float); @@ -205,6 +208,7 @@ __device__ __attribute__((const)) double __ocml_len4_f64(double, double, double, __device__ __attribute__((pure)) double __ocml_ncdf_f64(double); __device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double); __device__ __attribute__((pure)) double __ocml_pow_f64(double, double); +__device__ __attribute__((pure)) double __ocml_pown_f64(double, int); __device__ __attribute__((pure)) double __ocml_rcbrt_f64(double); __device__ __attribute__((const)) double __ocml_remainder_f64(double, double); __device__ double __ocml_remquo_f64(double, double, @@ -252,10 +256,10 @@ __device__ __attribute__((const)) double __ocml_div_rte_f64(double, double); __device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double); __device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double); __device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double); -__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double, double); -__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double, double); -__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double, double); -__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double, double); +__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double); +__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double); __device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double, double); __device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double, @@ -290,6 +294,7 @@ __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16); __device__ _Float16 __ocml_sin_f16(_Float16); __device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16); __device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16); +__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int); typedef _Float16 __2f16 __attribute__((ext_vector_type(2))); typedef short __2i16 __attribute__((ext_vector_type(2))); @@ -313,14 +318,17 @@ __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); __device__ inline __2f16 __llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL. 
{ - return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)}; + return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)); } __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); __device__ __2f16 __ocml_sin_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); +__device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16); +#ifdef __cplusplus } // extern "C" +#endif #endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__ diff --git a/lib/include/__clang_hip_math.h b/lib/include/__clang_hip_math.h index cf7014b9ae..14d91c66b3 100644 --- a/lib/include/__clang_hip_math.h +++ b/lib/include/__clang_hip_math.h @@ -1,4 +1,4 @@ -/*===---- __clang_hip_math.h - HIP math decls -------------------------------=== +/*===---- __clang_hip_math.h - Device-side HIP math support ----------------=== * * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. @@ -6,24 +6,57 @@ * *===-----------------------------------------------------------------------=== */ - #ifndef __CLANG_HIP_MATH_H__ #define __CLANG_HIP_MATH_H__ +#if !defined(__HIP__) +#error "This file is for HIP and OpenMP AMDGCN device compilation only." +#endif + +#if defined(__cplusplus) #include +#endif #include -#include #include #pragma push_macro("__DEVICE__") -#pragma push_macro("__RETURN_TYPE") +#define __DEVICE__ static __device__ inline __attribute__((always_inline)) -// to be consistent with __clang_cuda_math_forward_declares -#define __DEVICE__ static __device__ +// A few functions return bool type starting only in C++11. 
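// Illustrative sketch: __RETURN_TYPE keeps the classification helpers
// idiomatic in both languages, e.g. the declaration
//
//   __RETURN_TYPE __isnanf(float __x);
//
// yields bool when this header is compiled as C++ and int when compiled
// as C, matching C's int-valued isnan() macro versus C++'s bool
// std::isnan.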
+#pragma push_macro("__RETURN_TYPE") +#if defined(__cplusplus) #define __RETURN_TYPE bool +#else +#define __RETURN_TYPE int +#endif + +#if defined (__cplusplus) && __cplusplus < 201103L +// emulate static_assert on type sizes +template +struct __compare_result{}; +template<> +struct __compare_result { + static const bool valid; +}; __DEVICE__ -inline uint64_t __make_mantissa_base8(const char *__tagp) { +void __suppress_unused_warning(bool b){}; +template +__DEVICE__ void __static_assert_equal_size() { + __suppress_unused_warning(__compare_result::valid); +} + +#define __static_assert_type_size_equal(A, B) \ + __static_assert_equal_size() + +#else +#define __static_assert_type_size_equal(A,B) \ + static_assert((A) == (B), "") + +#endif + +__DEVICE__ +uint64_t __make_mantissa_base8(const char *__tagp) { uint64_t __r = 0; while (__tagp) { char __tmp = *__tagp; @@ -40,7 +73,7 @@ inline uint64_t __make_mantissa_base8(const char *__tagp) { } __DEVICE__ -inline uint64_t __make_mantissa_base10(const char *__tagp) { +uint64_t __make_mantissa_base10(const char *__tagp) { uint64_t __r = 0; while (__tagp) { char __tmp = *__tagp; @@ -57,7 +90,7 @@ inline uint64_t __make_mantissa_base10(const char *__tagp) { } __DEVICE__ -inline uint64_t __make_mantissa_base16(const char *__tagp) { +uint64_t __make_mantissa_base16(const char *__tagp) { uint64_t __r = 0; while (__tagp) { char __tmp = *__tagp; @@ -78,7 +111,7 @@ inline uint64_t __make_mantissa_base16(const char *__tagp) { } __DEVICE__ -inline uint64_t __make_mantissa(const char *__tagp) { +uint64_t __make_mantissa(const char *__tagp) { if (!__tagp) return 0u; @@ -95,78 +128,124 @@ inline uint64_t __make_mantissa(const char *__tagp) { } // BEGIN FLOAT +#if defined(__cplusplus) __DEVICE__ -inline float abs(float __x) { return __ocml_fabs_f32(__x); } -__DEVICE__ -inline float acosf(float __x) { return __ocml_acos_f32(__x); } -__DEVICE__ -inline float acoshf(float __x) { return __ocml_acosh_f32(__x); } -__DEVICE__ -inline float asinf(float __x) { return __ocml_asin_f32(__x); } -__DEVICE__ -inline float asinhf(float __x) { return __ocml_asinh_f32(__x); } -__DEVICE__ -inline float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } -__DEVICE__ -inline float atanf(float __x) { return __ocml_atan_f32(__x); } -__DEVICE__ -inline float atanhf(float __x) { return __ocml_atanh_f32(__x); } -__DEVICE__ -inline float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } -__DEVICE__ -inline float ceilf(float __x) { return __ocml_ceil_f32(__x); } -__DEVICE__ -inline float copysignf(float __x, float __y) { - return __ocml_copysign_f32(__x, __y); +int abs(int __x) { + int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; } __DEVICE__ -inline float cosf(float __x) { return __ocml_cos_f32(__x); } +long labs(long __x) { + long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; +} __DEVICE__ -inline float coshf(float __x) { return __ocml_cosh_f32(__x); } +long long llabs(long long __x) { + long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1); + return (__x ^ __sgn) - __sgn; +} +#endif + __DEVICE__ -inline float cospif(float __x) { return __ocml_cospi_f32(__x); } +float acosf(float __x) { return __ocml_acos_f32(__x); } + __DEVICE__ -inline float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } +float acoshf(float __x) { return __ocml_acosh_f32(__x); } + __DEVICE__ -inline float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } +float asinf(float __x) { return __ocml_asin_f32(__x); } + 
__DEVICE__ -inline float erfcf(float __x) { return __ocml_erfc_f32(__x); } +float asinhf(float __x) { return __ocml_asinh_f32(__x); } + __DEVICE__ -inline float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } +float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } + __DEVICE__ -inline float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } +float atanf(float __x) { return __ocml_atan_f32(__x); } + __DEVICE__ -inline float erff(float __x) { return __ocml_erf_f32(__x); } +float atanhf(float __x) { return __ocml_atanh_f32(__x); } + __DEVICE__ -inline float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } +float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } + __DEVICE__ -inline float exp10f(float __x) { return __ocml_exp10_f32(__x); } +float ceilf(float __x) { return __ocml_ceil_f32(__x); } + __DEVICE__ -inline float exp2f(float __x) { return __ocml_exp2_f32(__x); } +float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); } + __DEVICE__ -inline float expf(float __x) { return __ocml_exp_f32(__x); } +float cosf(float __x) { return __ocml_cos_f32(__x); } + __DEVICE__ -inline float expm1f(float __x) { return __ocml_expm1_f32(__x); } +float coshf(float __x) { return __ocml_cosh_f32(__x); } + __DEVICE__ -inline float fabsf(float __x) { return __ocml_fabs_f32(__x); } +float cospif(float __x) { return __ocml_cospi_f32(__x); } + __DEVICE__ -inline float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } +float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } + __DEVICE__ -inline float fdividef(float __x, float __y) { return __x / __y; } +float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } + __DEVICE__ -inline float floorf(float __x) { return __ocml_floor_f32(__x); } +float erfcf(float __x) { return __ocml_erfc_f32(__x); } + __DEVICE__ -inline float fmaf(float __x, float __y, float __z) { +float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } + +__DEVICE__ +float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } + +__DEVICE__ +float erff(float __x) { return __ocml_erf_f32(__x); } + +__DEVICE__ +float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } + +__DEVICE__ +float exp10f(float __x) { return __ocml_exp10_f32(__x); } + +__DEVICE__ +float exp2f(float __x) { return __ocml_exp2_f32(__x); } + +__DEVICE__ +float expf(float __x) { return __ocml_exp_f32(__x); } + +__DEVICE__ +float expm1f(float __x) { return __ocml_expm1_f32(__x); } + +__DEVICE__ +float fabsf(float __x) { return __ocml_fabs_f32(__x); } + +__DEVICE__ +float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } + +__DEVICE__ +float fdividef(float __x, float __y) { return __x / __y; } + +__DEVICE__ +float floorf(float __x) { return __ocml_floor_f32(__x); } + +__DEVICE__ +float fmaf(float __x, float __y, float __z) { return __ocml_fma_f32(__x, __y, __z); } + __DEVICE__ -inline float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); } +float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); } + __DEVICE__ -inline float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); } +float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); } + __DEVICE__ -inline float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } +float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } + __DEVICE__ -inline float frexpf(float __x, int *__nptr) { +float frexpf(float __x, int *__nptr) { int __tmp; float __r = __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int 
*)&__tmp); @@ -174,24 +253,31 @@ inline float frexpf(float __x, int *__nptr) { return __r; } + __DEVICE__ -inline float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } +float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } + __DEVICE__ -inline int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } +int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } + __DEVICE__ -inline __RETURN_TYPE isfinite(float __x) { return __ocml_isfinite_f32(__x); } +__RETURN_TYPE __finitef(float __x) { return __ocml_isfinite_f32(__x); } + __DEVICE__ -inline __RETURN_TYPE isinf(float __x) { return __ocml_isinf_f32(__x); } +__RETURN_TYPE __isinff(float __x) { return __ocml_isinf_f32(__x); } + __DEVICE__ -inline __RETURN_TYPE isnan(float __x) { return __ocml_isnan_f32(__x); } +__RETURN_TYPE __isnanf(float __x) { return __ocml_isnan_f32(__x); } + __DEVICE__ -inline float j0f(float __x) { return __ocml_j0_f32(__x); } +float j0f(float __x) { return __ocml_j0_f32(__x); } + __DEVICE__ -inline float j1f(float __x) { return __ocml_j1_f32(__x); } +float j1f(float __x) { return __ocml_j1_f32(__x); } + __DEVICE__ -inline float jnf(int __n, - float __x) { // TODO: we could use Ahmes multiplication - // and the Miller & Brown algorithm +float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm // for linear recurrences to get O(log n) steps, but it's unclear if // it'd be beneficial in this case. if (__n == 0) @@ -209,50 +295,61 @@ inline float jnf(int __n, return __x1; } + __DEVICE__ -inline float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); } +float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); } + __DEVICE__ -inline float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } +float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } + __DEVICE__ -inline long long int llrintf(float __x) { return __ocml_rint_f32(__x); } +long long int llrintf(float __x) { return __ocml_rint_f32(__x); } + __DEVICE__ -inline long long int llroundf(float __x) { return __ocml_round_f32(__x); } +long long int llroundf(float __x) { return __ocml_round_f32(__x); } + __DEVICE__ -inline float log10f(float __x) { return __ocml_log10_f32(__x); } +float log10f(float __x) { return __ocml_log10_f32(__x); } + __DEVICE__ -inline float log1pf(float __x) { return __ocml_log1p_f32(__x); } +float log1pf(float __x) { return __ocml_log1p_f32(__x); } + __DEVICE__ -inline float log2f(float __x) { return __ocml_log2_f32(__x); } +float log2f(float __x) { return __ocml_log2_f32(__x); } + __DEVICE__ -inline float logbf(float __x) { return __ocml_logb_f32(__x); } +float logbf(float __x) { return __ocml_logb_f32(__x); } + __DEVICE__ -inline float logf(float __x) { return __ocml_log_f32(__x); } +float logf(float __x) { return __ocml_log_f32(__x); } + __DEVICE__ -inline long int lrintf(float __x) { return __ocml_rint_f32(__x); } +long int lrintf(float __x) { return __ocml_rint_f32(__x); } + __DEVICE__ -inline long int lroundf(float __x) { return __ocml_round_f32(__x); } +long int lroundf(float __x) { return __ocml_round_f32(__x); } + __DEVICE__ -inline float modff(float __x, float *__iptr) { +float modff(float __x, float *__iptr) { float __tmp; float __r = __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); *__iptr = __tmp; - return __r; } + __DEVICE__ -inline float nanf(const char *__tagp) { +float nanf(const char *__tagp) { union { float val; struct ieee_float { - uint32_t mantissa : 22; - uint32_t quiet : 1; - 
uint32_t exponent : 8; - uint32_t sign : 1; + unsigned int mantissa : 22; + unsigned int quiet : 1; + unsigned int exponent : 8; + unsigned int sign : 1; } bits; - - static_assert(sizeof(float) == sizeof(ieee_float), ""); } __tmp; + __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); __tmp.bits.sign = 0u; __tmp.bits.exponent = ~0u; @@ -261,28 +358,34 @@ inline float nanf(const char *__tagp) { return __tmp.val; } + __DEVICE__ -inline float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); } +float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); } + __DEVICE__ -inline float nextafterf(float __x, float __y) { +float nextafterf(float __x, float __y) { return __ocml_nextafter_f32(__x, __y); } + __DEVICE__ -inline float norm3df(float __x, float __y, float __z) { +float norm3df(float __x, float __y, float __z) { return __ocml_len3_f32(__x, __y, __z); } + __DEVICE__ -inline float norm4df(float __x, float __y, float __z, float __w) { +float norm4df(float __x, float __y, float __z, float __w) { return __ocml_len4_f32(__x, __y, __z, __w); } + __DEVICE__ -inline float normcdff(float __x) { return __ocml_ncdf_f32(__x); } +float normcdff(float __x) { return __ocml_ncdf_f32(__x); } + __DEVICE__ -inline float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } +float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } + __DEVICE__ -inline float -normf(int __dim, - const float *__a) { // TODO: placeholder until OCML adds support. +float normf(int __dim, + const float *__a) { // TODO: placeholder until OCML adds support. float __r = 0; while (__dim--) { __r += __a[0] * __a[0]; @@ -291,16 +394,23 @@ normf(int __dim, return __ocml_sqrt_f32(__r); } + __DEVICE__ -inline float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } +float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } + __DEVICE__ -inline float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } +float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } + __DEVICE__ -inline float remainderf(float __x, float __y) { +float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } + +__DEVICE__ +float remainderf(float __x, float __y) { return __ocml_remainder_f32(__x, __y); } + __DEVICE__ -inline float remquof(float __x, float __y, int *__quo) { +float remquof(float __x, float __y, int *__quo) { int __tmp; float __r = __ocml_remquo_f32( __x, __y, (__attribute__((address_space(5))) int *)&__tmp); @@ -308,25 +418,26 @@ inline float remquof(float __x, float __y, int *__quo) { return __r; } + __DEVICE__ -inline float rhypotf(float __x, float __y) { - return __ocml_rhypot_f32(__x, __y); -} +float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } + __DEVICE__ -inline float rintf(float __x) { return __ocml_rint_f32(__x); } +float rintf(float __x) { return __ocml_rint_f32(__x); } + __DEVICE__ -inline float rnorm3df(float __x, float __y, float __z) { +float rnorm3df(float __x, float __y, float __z) { return __ocml_rlen3_f32(__x, __y, __z); } __DEVICE__ -inline float rnorm4df(float __x, float __y, float __z, float __w) { +float rnorm4df(float __x, float __y, float __z, float __w) { return __ocml_rlen4_f32(__x, __y, __z, __w); } + __DEVICE__ -inline float -rnormf(int __dim, - const float *__a) { // TODO: placeholder until OCML adds support. +float rnormf(int __dim, + const float *__a) { // TODO: placeholder until OCML adds support. 
float __r = 0; while (__dim--) { __r += __a[0] * __a[0]; @@ -335,59 +446,74 @@ rnormf(int __dim, return __ocml_rsqrt_f32(__r); } + __DEVICE__ -inline float roundf(float __x) { return __ocml_round_f32(__x); } +float roundf(float __x) { return __ocml_round_f32(__x); } + __DEVICE__ -inline float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } +float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } + __DEVICE__ -inline float scalblnf(float __x, long int __n) { +float scalblnf(float __x, long int __n) { return (__n < INT_MAX) ? __ocml_scalbn_f32(__x, __n) : __ocml_scalb_f32(__x, __n); } -__DEVICE__ -inline float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); } -__DEVICE__ -inline __RETURN_TYPE signbit(float __x) { return __ocml_signbit_f32(__x); } -__DEVICE__ -inline void sincosf(float __x, float *__sinptr, float *__cosptr) { - float __tmp; +__DEVICE__ +float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); } + +__DEVICE__ +__RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); } + +__DEVICE__ +void sincosf(float __x, float *__sinptr, float *__cosptr) { + float __tmp; *__sinptr = __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); *__cosptr = __tmp; } -__DEVICE__ -inline void sincospif(float __x, float *__sinptr, float *__cosptr) { - float __tmp; +__DEVICE__ +void sincospif(float __x, float *__sinptr, float *__cosptr) { + float __tmp; *__sinptr = __ocml_sincospi_f32( __x, (__attribute__((address_space(5))) float *)&__tmp); *__cosptr = __tmp; } + __DEVICE__ -inline float sinf(float __x) { return __ocml_sin_f32(__x); } +float sinf(float __x) { return __ocml_sin_f32(__x); } + __DEVICE__ -inline float sinhf(float __x) { return __ocml_sinh_f32(__x); } +float sinhf(float __x) { return __ocml_sinh_f32(__x); } + __DEVICE__ -inline float sinpif(float __x) { return __ocml_sinpi_f32(__x); } +float sinpif(float __x) { return __ocml_sinpi_f32(__x); } + __DEVICE__ -inline float sqrtf(float __x) { return __ocml_sqrt_f32(__x); } +float sqrtf(float __x) { return __ocml_sqrt_f32(__x); } + __DEVICE__ -inline float tanf(float __x) { return __ocml_tan_f32(__x); } +float tanf(float __x) { return __ocml_tan_f32(__x); } + __DEVICE__ -inline float tanhf(float __x) { return __ocml_tanh_f32(__x); } +float tanhf(float __x) { return __ocml_tanh_f32(__x); } + __DEVICE__ -inline float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } +float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } + __DEVICE__ -inline float truncf(float __x) { return __ocml_trunc_f32(__x); } +float truncf(float __x) { return __ocml_trunc_f32(__x); } + __DEVICE__ -inline float y0f(float __x) { return __ocml_y0_f32(__x); } +float y0f(float __x) { return __ocml_y0_f32(__x); } + __DEVICE__ -inline float y1f(float __x) { return __ocml_y1_f32(__x); } +float y1f(float __x) { return __ocml_y1_f32(__x); } + __DEVICE__ -inline float ynf(int __n, - float __x) { // TODO: we could use Ahmes multiplication - // and the Miller & Brown algorithm +float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm // for linear recurrences to get O(log n) steps, but it's unclear if // it'd be beneficial in this case. Placeholder until OCML adds // support. 
@@ -408,290 +534,343 @@ inline float ynf(int __n, } // BEGIN INTRINSICS + __DEVICE__ -inline float __cosf(float __x) { return __ocml_native_cos_f32(__x); } +float __cosf(float __x) { return __ocml_native_cos_f32(__x); } + __DEVICE__ -inline float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); } +float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); } + __DEVICE__ -inline float __expf(float __x) { return __ocml_native_exp_f32(__x); } +float __expf(float __x) { return __ocml_native_exp_f32(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fadd_rd(float __x, float __y) { - return __ocml_add_rtn_f32(__x, __y); -} +float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); } +__DEVICE__ +float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); } +__DEVICE__ +float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); } +__DEVICE__ +float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fadd_rn(float __x, float __y) { return __x + __y; } #endif -__DEVICE__ -inline float __fadd_rn(float __x, float __y) { return __x + __y; } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fadd_ru(float __x, float __y) { - return __ocml_add_rtp_f32(__x, __y); -} +float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); } __DEVICE__ -inline float __fadd_rz(float __x, float __y) { - return __ocml_add_rtz_f32(__x, __y); -} +float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); } __DEVICE__ -inline float __fdiv_rd(float __x, float __y) { - return __ocml_div_rtn_f32(__x, __y); -} +float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); } +__DEVICE__ +float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fdiv_rn(float __x, float __y) { return __x / __y; } #endif + __DEVICE__ -inline float __fdiv_rn(float __x, float __y) { return __x / __y; } +float __fdividef(float __x, float __y) { return __x / __y; } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fdiv_ru(float __x, float __y) { - return __ocml_div_rtp_f32(__x, __y); -} -__DEVICE__ -inline float __fdiv_rz(float __x, float __y) { - return __ocml_div_rtz_f32(__x, __y); -} -#endif -__DEVICE__ -inline float __fdividef(float __x, float __y) { return __x / __y; } -#if defined OCML_BASIC_ROUNDED_OPERATIONS -__DEVICE__ -inline float __fmaf_rd(float __x, float __y, float __z) { +float __fmaf_rd(float __x, float __y, float __z) { return __ocml_fma_rtn_f32(__x, __y, __z); } -#endif __DEVICE__ -inline float __fmaf_rn(float __x, float __y, float __z) { - return __ocml_fma_f32(__x, __y, __z); +float __fmaf_rn(float __x, float __y, float __z) { + return __ocml_fma_rte_f32(__x, __y, __z); } -#if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fmaf_ru(float __x, float __y, float __z) { +float __fmaf_ru(float __x, float __y, float __z) { return __ocml_fma_rtp_f32(__x, __y, __z); } __DEVICE__ -inline float __fmaf_rz(float __x, float __y, float __z) { +float __fmaf_rz(float __x, float __y, float __z) { return __ocml_fma_rtz_f32(__x, __y, __z); } +#else __DEVICE__ -inline float __fmul_rd(float __x, float __y) { - return __ocml_mul_rtn_f32(__x, __y); +float __fmaf_rn(float __x, float __y, float __z) { + return __ocml_fma_f32(__x, __y, __z); } #endif -__DEVICE__ -inline float __fmul_rn(float __x, float __y) { return __x * __y; } + #if defined 
OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fmul_ru(float __x, float __y) { - return __ocml_mul_rtp_f32(__x, __y); -} +float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); } __DEVICE__ -inline float __fmul_rz(float __x, float __y) { - return __ocml_mul_rtz_f32(__x, __y); -} +float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); } __DEVICE__ -inline float __frcp_rd(float __x) { return __llvm_amdgcn_rcp_f32(__x); } +float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); } +__DEVICE__ +float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fmul_rn(float __x, float __y) { return __x * __y; } #endif -__DEVICE__ -inline float __frcp_rn(float __x) { return __llvm_amdgcn_rcp_f32(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __frcp_ru(float __x) { return __llvm_amdgcn_rcp_f32(__x); } +float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); } __DEVICE__ -inline float __frcp_rz(float __x) { return __llvm_amdgcn_rcp_f32(__x); } +float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); } +__DEVICE__ +float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); } +__DEVICE__ +float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); } +#else +__DEVICE__ +float __frcp_rn(float __x) { return 1.0f / __x; } #endif + __DEVICE__ -inline float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); } +float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } -#endif +float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } __DEVICE__ -inline float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } +float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); } +__DEVICE__ +float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } +__DEVICE__ +float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } +#else +__DEVICE__ +float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } +#endif + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } +float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); } __DEVICE__ -inline float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } +float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); } __DEVICE__ -inline float __fsub_rd(float __x, float __y) { - return __ocml_sub_rtn_f32(__x, __y); -} +float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); } +__DEVICE__ +float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); } +#else +__DEVICE__ +float __fsub_rn(float __x, float __y) { return __x - __y; } #endif + __DEVICE__ -inline float __fsub_rn(float __x, float __y) { return __x - __y; } -#if defined OCML_BASIC_ROUNDED_OPERATIONS +float __log10f(float __x) { return __ocml_native_log10_f32(__x); } + __DEVICE__ -inline float __fsub_ru(float __x, float __y) { - return __ocml_sub_rtp_f32(__x, __y); -} +float __log2f(float __x) { return __ocml_native_log2_f32(__x); } + __DEVICE__ -inline float __fsub_rz(float __x, float __y) { - return __ocml_sub_rtz_f32(__x, __y); -} -#endif +float __logf(float __x) { return __ocml_native_log_f32(__x); } + __DEVICE__ -inline float __log10f(float __x) { return __ocml_native_log10_f32(__x); } +float 
__powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } + __DEVICE__ -inline float __log2f(float __x) { return __ocml_native_log2_f32(__x); } +float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); } + __DEVICE__ -inline float __logf(float __x) { return __ocml_native_log_f32(__x); } -__DEVICE__ -inline float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } -__DEVICE__ -inline float __saturatef(float __x) { - return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); -} -__DEVICE__ -inline void __sincosf(float __x, float *__sinptr, float *__cosptr) { +void __sincosf(float __x, float *__sinptr, float *__cosptr) { *__sinptr = __ocml_native_sin_f32(__x); *__cosptr = __ocml_native_cos_f32(__x); } + __DEVICE__ -inline float __sinf(float __x) { return __ocml_native_sin_f32(__x); } +float __sinf(float __x) { return __ocml_native_sin_f32(__x); } + __DEVICE__ -inline float __tanf(float __x) { return __ocml_tan_f32(__x); } +float __tanf(float __x) { return __ocml_tan_f32(__x); } // END INTRINSICS // END FLOAT // BEGIN DOUBLE __DEVICE__ -inline double abs(double __x) { return __ocml_fabs_f64(__x); } +double acos(double __x) { return __ocml_acos_f64(__x); } + __DEVICE__ -inline double acos(double __x) { return __ocml_acos_f64(__x); } +double acosh(double __x) { return __ocml_acosh_f64(__x); } + __DEVICE__ -inline double acosh(double __x) { return __ocml_acosh_f64(__x); } +double asin(double __x) { return __ocml_asin_f64(__x); } + __DEVICE__ -inline double asin(double __x) { return __ocml_asin_f64(__x); } +double asinh(double __x) { return __ocml_asinh_f64(__x); } + __DEVICE__ -inline double asinh(double __x) { return __ocml_asinh_f64(__x); } +double atan(double __x) { return __ocml_atan_f64(__x); } + __DEVICE__ -inline double atan(double __x) { return __ocml_atan_f64(__x); } +double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } + __DEVICE__ -inline double atan2(double __x, double __y) { - return __ocml_atan2_f64(__x, __y); -} +double atanh(double __x) { return __ocml_atanh_f64(__x); } + __DEVICE__ -inline double atanh(double __x) { return __ocml_atanh_f64(__x); } +double cbrt(double __x) { return __ocml_cbrt_f64(__x); } + __DEVICE__ -inline double cbrt(double __x) { return __ocml_cbrt_f64(__x); } +double ceil(double __x) { return __ocml_ceil_f64(__x); } + __DEVICE__ -inline double ceil(double __x) { return __ocml_ceil_f64(__x); } -__DEVICE__ -inline double copysign(double __x, double __y) { +double copysign(double __x, double __y) { return __ocml_copysign_f64(__x, __y); } + __DEVICE__ -inline double cos(double __x) { return __ocml_cos_f64(__x); } +double cos(double __x) { return __ocml_cos_f64(__x); } + __DEVICE__ -inline double cosh(double __x) { return __ocml_cosh_f64(__x); } +double cosh(double __x) { return __ocml_cosh_f64(__x); } + __DEVICE__ -inline double cospi(double __x) { return __ocml_cospi_f64(__x); } +double cospi(double __x) { return __ocml_cospi_f64(__x); } + __DEVICE__ -inline double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } +double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } + __DEVICE__ -inline double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } +double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } + __DEVICE__ -inline double erf(double __x) { return __ocml_erf_f64(__x); } +double erf(double __x) { return __ocml_erf_f64(__x); } + __DEVICE__ -inline double erfc(double __x) { return __ocml_erfc_f64(__x); } +double erfc(double __x) { return __ocml_erfc_f64(__x); } + __DEVICE__ 
-inline double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } +double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } + __DEVICE__ -inline double erfcx(double __x) { return __ocml_erfcx_f64(__x); } +double erfcx(double __x) { return __ocml_erfcx_f64(__x); } + __DEVICE__ -inline double erfinv(double __x) { return __ocml_erfinv_f64(__x); } +double erfinv(double __x) { return __ocml_erfinv_f64(__x); } + __DEVICE__ -inline double exp(double __x) { return __ocml_exp_f64(__x); } +double exp(double __x) { return __ocml_exp_f64(__x); } + __DEVICE__ -inline double exp10(double __x) { return __ocml_exp10_f64(__x); } +double exp10(double __x) { return __ocml_exp10_f64(__x); } + __DEVICE__ -inline double exp2(double __x) { return __ocml_exp2_f64(__x); } +double exp2(double __x) { return __ocml_exp2_f64(__x); } + __DEVICE__ -inline double expm1(double __x) { return __ocml_expm1_f64(__x); } +double expm1(double __x) { return __ocml_expm1_f64(__x); } + __DEVICE__ -inline double fabs(double __x) { return __ocml_fabs_f64(__x); } +double fabs(double __x) { return __ocml_fabs_f64(__x); } + __DEVICE__ -inline double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } +double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } + __DEVICE__ -inline double floor(double __x) { return __ocml_floor_f64(__x); } +double floor(double __x) { return __ocml_floor_f64(__x); } + __DEVICE__ -inline double fma(double __x, double __y, double __z) { +double fma(double __x, double __y, double __z) { return __ocml_fma_f64(__x, __y, __z); } + __DEVICE__ -inline double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); } +double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); } + __DEVICE__ -inline double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); } +double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); } + __DEVICE__ -inline double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } +double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } + __DEVICE__ -inline double frexp(double __x, int *__nptr) { +double frexp(double __x, int *__nptr) { int __tmp; double __r = __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp); *__nptr = __tmp; - return __r; } + __DEVICE__ -inline double hypot(double __x, double __y) { - return __ocml_hypot_f64(__x, __y); -} +double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } + __DEVICE__ -inline int ilogb(double __x) { return __ocml_ilogb_f64(__x); } +int ilogb(double __x) { return __ocml_ilogb_f64(__x); } + __DEVICE__ -inline __RETURN_TYPE isfinite(double __x) { return __ocml_isfinite_f64(__x); } +__RETURN_TYPE __finite(double __x) { return __ocml_isfinite_f64(__x); } + __DEVICE__ -inline __RETURN_TYPE isinf(double __x) { return __ocml_isinf_f64(__x); } +__RETURN_TYPE __isinf(double __x) { return __ocml_isinf_f64(__x); } + __DEVICE__ -inline __RETURN_TYPE isnan(double __x) { return __ocml_isnan_f64(__x); } +__RETURN_TYPE __isnan(double __x) { return __ocml_isnan_f64(__x); } + __DEVICE__ -inline double j0(double __x) { return __ocml_j0_f64(__x); } +double j0(double __x) { return __ocml_j0_f64(__x); } + __DEVICE__ -inline double j1(double __x) { return __ocml_j1_f64(__x); } +double j1(double __x) { return __ocml_j1_f64(__x); } + __DEVICE__ -inline double jn(int __n, - double __x) { // TODO: we could use Ahmes multiplication - // and the Miller & Brown algorithm +double jn(int __n, double __x) { // TODO: we could use 
Ahmes multiplication + // and the Miller & Brown algorithm // for linear recurrences to get O(log n) steps, but it's unclear if // it'd be beneficial in this case. Placeholder until OCML adds // support. if (__n == 0) - return j0f(__x); + return j0(__x); if (__n == 1) - return j1f(__x); + return j1(__x); - double __x0 = j0f(__x); - double __x1 = j1f(__x); + double __x0 = j0(__x); + double __x1 = j1(__x); for (int __i = 1; __i < __n; ++__i) { double __x2 = (2 * __i) / __x * __x1 - __x0; __x0 = __x1; __x1 = __x2; } - return __x1; } + __DEVICE__ -inline double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); } +double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); } + __DEVICE__ -inline double lgamma(double __x) { return __ocml_lgamma_f64(__x); } +double lgamma(double __x) { return __ocml_lgamma_f64(__x); } + __DEVICE__ -inline long long int llrint(double __x) { return __ocml_rint_f64(__x); } +long long int llrint(double __x) { return __ocml_rint_f64(__x); } + __DEVICE__ -inline long long int llround(double __x) { return __ocml_round_f64(__x); } +long long int llround(double __x) { return __ocml_round_f64(__x); } + __DEVICE__ -inline double log(double __x) { return __ocml_log_f64(__x); } +double log(double __x) { return __ocml_log_f64(__x); } + __DEVICE__ -inline double log10(double __x) { return __ocml_log10_f64(__x); } +double log10(double __x) { return __ocml_log10_f64(__x); } + __DEVICE__ -inline double log1p(double __x) { return __ocml_log1p_f64(__x); } +double log1p(double __x) { return __ocml_log1p_f64(__x); } + __DEVICE__ -inline double log2(double __x) { return __ocml_log2_f64(__x); } +double log2(double __x) { return __ocml_log2_f64(__x); } + __DEVICE__ -inline double logb(double __x) { return __ocml_logb_f64(__x); } +double logb(double __x) { return __ocml_logb_f64(__x); } + __DEVICE__ -inline long int lrint(double __x) { return __ocml_rint_f64(__x); } +long int lrint(double __x) { return __ocml_rint_f64(__x); } + __DEVICE__ -inline long int lround(double __x) { return __ocml_round_f64(__x); } +long int lround(double __x) { return __ocml_round_f64(__x); } + __DEVICE__ -inline double modf(double __x, double *__iptr) { +double modf(double __x, double *__iptr) { double __tmp; double __r = __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); @@ -699,8 +878,9 @@ inline double modf(double __x, double *__iptr) { return __r; } + __DEVICE__ -inline double nan(const char *__tagp) { +double nan(const char *__tagp) { #if !_WIN32 union { double val; @@ -710,8 +890,8 @@ inline double nan(const char *__tagp) { uint32_t exponent : 11; uint32_t sign : 1; } bits; - static_assert(sizeof(double) == sizeof(ieee_double), ""); } __tmp; + __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); __tmp.bits.sign = 0u; __tmp.bits.exponent = ~0u; @@ -720,22 +900,24 @@ inline double nan(const char *__tagp) { return __tmp.val; #else - static_assert(sizeof(uint64_t) == sizeof(double)); - uint64_t val = __make_mantissa(__tagp); - val |= 0xFFF << 51; - return *reinterpret_cast(&val); + __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double)); + uint64_t __val = __make_mantissa(__tagp); + __val |= 0xFFF << 51; + return *reinterpret_cast(&__val); #endif } + __DEVICE__ -inline double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); } +double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); } + __DEVICE__ -inline double nextafter(double __x, double __y) { +double nextafter(double __x, double __y) { return 
__ocml_nextafter_f64(__x, __y); } + __DEVICE__ -inline double -norm(int __dim, - const double *__a) { // TODO: placeholder until OCML adds support. +double norm(int __dim, + const double *__a) { // TODO: placeholder until OCML adds support. double __r = 0; while (__dim--) { __r += __a[0] * __a[0]; @@ -744,28 +926,39 @@ norm(int __dim, return __ocml_sqrt_f64(__r); } + __DEVICE__ -inline double norm3d(double __x, double __y, double __z) { +double norm3d(double __x, double __y, double __z) { return __ocml_len3_f64(__x, __y, __z); } + __DEVICE__ -inline double norm4d(double __x, double __y, double __z, double __w) { +double norm4d(double __x, double __y, double __z, double __w) { return __ocml_len4_f64(__x, __y, __z, __w); } + __DEVICE__ -inline double normcdf(double __x) { return __ocml_ncdf_f64(__x); } +double normcdf(double __x) { return __ocml_ncdf_f64(__x); } + __DEVICE__ -inline double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } +double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } + __DEVICE__ -inline double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } +double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } + __DEVICE__ -inline double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } +double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } + __DEVICE__ -inline double remainder(double __x, double __y) { +double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } + +__DEVICE__ +double remainder(double __x, double __y) { return __ocml_remainder_f64(__x, __y); } + __DEVICE__ -inline double remquo(double __x, double __y, int *__quo) { +double remquo(double __x, double __y, int *__quo) { int __tmp; double __r = __ocml_remquo_f64( __x, __y, (__attribute__((address_space(5))) int *)&__tmp); @@ -773,16 +966,16 @@ inline double remquo(double __x, double __y, int *__quo) { return __r; } + __DEVICE__ -inline double rhypot(double __x, double __y) { - return __ocml_rhypot_f64(__x, __y); -} +double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } + __DEVICE__ -inline double rint(double __x) { return __ocml_rint_f64(__x); } +double rint(double __x) { return __ocml_rint_f64(__x); } + __DEVICE__ -inline double -rnorm(int __dim, - const double *__a) { // TODO: placeholder until OCML adds support. +double rnorm(int __dim, + const double *__a) { // TODO: placeholder until OCML adds support. double __r = 0; while (__dim--) { __r += __a[0] * __a[0]; @@ -791,77 +984,93 @@ rnorm(int __dim, return __ocml_rsqrt_f64(__r); } + __DEVICE__ -inline double rnorm3d(double __x, double __y, double __z) { +double rnorm3d(double __x, double __y, double __z) { return __ocml_rlen3_f64(__x, __y, __z); } + __DEVICE__ -inline double rnorm4d(double __x, double __y, double __z, double __w) { +double rnorm4d(double __x, double __y, double __z, double __w) { return __ocml_rlen4_f64(__x, __y, __z, __w); } + __DEVICE__ -inline double round(double __x) { return __ocml_round_f64(__x); } +double round(double __x) { return __ocml_round_f64(__x); } + __DEVICE__ -inline double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } +double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } + __DEVICE__ -inline double scalbln(double __x, long int __n) { +double scalbln(double __x, long int __n) { return (__n < INT_MAX) ? 
__ocml_scalbn_f64(__x, __n) : __ocml_scalb_f64(__x, __n); } __DEVICE__ -inline double scalbn(double __x, int __n) { - return __ocml_scalbn_f64(__x, __n); -} +double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); } + __DEVICE__ -inline __RETURN_TYPE signbit(double __x) { return __ocml_signbit_f64(__x); } +__RETURN_TYPE __signbit(double __x) { return __ocml_signbit_f64(__x); } + __DEVICE__ -inline double sin(double __x) { return __ocml_sin_f64(__x); } +double sin(double __x) { return __ocml_sin_f64(__x); } + __DEVICE__ -inline void sincos(double __x, double *__sinptr, double *__cosptr) { +void sincos(double __x, double *__sinptr, double *__cosptr) { double __tmp; *__sinptr = __ocml_sincos_f64( __x, (__attribute__((address_space(5))) double *)&__tmp); *__cosptr = __tmp; } + __DEVICE__ -inline void sincospi(double __x, double *__sinptr, double *__cosptr) { +void sincospi(double __x, double *__sinptr, double *__cosptr) { double __tmp; *__sinptr = __ocml_sincospi_f64( __x, (__attribute__((address_space(5))) double *)&__tmp); *__cosptr = __tmp; } + __DEVICE__ -inline double sinh(double __x) { return __ocml_sinh_f64(__x); } +double sinh(double __x) { return __ocml_sinh_f64(__x); } + __DEVICE__ -inline double sinpi(double __x) { return __ocml_sinpi_f64(__x); } +double sinpi(double __x) { return __ocml_sinpi_f64(__x); } + __DEVICE__ -inline double sqrt(double __x) { return __ocml_sqrt_f64(__x); } +double sqrt(double __x) { return __ocml_sqrt_f64(__x); } + __DEVICE__ -inline double tan(double __x) { return __ocml_tan_f64(__x); } +double tan(double __x) { return __ocml_tan_f64(__x); } + __DEVICE__ -inline double tanh(double __x) { return __ocml_tanh_f64(__x); } +double tanh(double __x) { return __ocml_tanh_f64(__x); } + __DEVICE__ -inline double tgamma(double __x) { return __ocml_tgamma_f64(__x); } +double tgamma(double __x) { return __ocml_tgamma_f64(__x); } + __DEVICE__ -inline double trunc(double __x) { return __ocml_trunc_f64(__x); } +double trunc(double __x) { return __ocml_trunc_f64(__x); } + __DEVICE__ -inline double y0(double __x) { return __ocml_y0_f64(__x); } +double y0(double __x) { return __ocml_y0_f64(__x); } + __DEVICE__ -inline double y1(double __x) { return __ocml_y1_f64(__x); } +double y1(double __x) { return __ocml_y1_f64(__x); } + __DEVICE__ -inline double yn(int __n, - double __x) { // TODO: we could use Ahmes multiplication - // and the Miller & Brown algorithm +double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication + // and the Miller & Brown algorithm // for linear recurrences to get O(log n) steps, but it's unclear if // it'd be beneficial in this case. Placeholder until OCML adds // support. 
if (__n == 0) - return j0f(__x); + return y0(__x); if (__n == 1) - return j1f(__x); + return y1(__x); - double __x0 = j0f(__x); - double __x1 = j1f(__x); + double __x0 = y0(__x); + double __x1 = y1(__x); for (int __i = 1; __i < __n; ++__i) { double __x2 = (2 * __i) / __x * __x1 - __x0; __x0 = __x1; @@ -874,296 +1083,182 @@ inline double yn(int __n, // BEGIN INTRINSICS #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __dadd_rd(double __x, double __y) { +double __dadd_rd(double __x, double __y) { return __ocml_add_rtn_f64(__x, __y); } -#endif __DEVICE__ -inline double __dadd_rn(double __x, double __y) { return __x + __y; } -#if defined OCML_BASIC_ROUNDED_OPERATIONS +double __dadd_rn(double __x, double __y) { + return __ocml_add_rte_f64(__x, __y); +} __DEVICE__ -inline double __dadd_ru(double __x, double __y) { +double __dadd_ru(double __x, double __y) { return __ocml_add_rtp_f64(__x, __y); } __DEVICE__ -inline double __dadd_rz(double __x, double __y) { +double __dadd_rz(double __x, double __y) { return __ocml_add_rtz_f64(__x, __y); } +#else __DEVICE__ -inline double __ddiv_rd(double __x, double __y) { - return __ocml_div_rtn_f64(__x, __y); -} +double __dadd_rn(double __x, double __y) { return __x + __y; } #endif -__DEVICE__ -inline double __ddiv_rn(double __x, double __y) { return __x / __y; } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __ddiv_ru(double __x, double __y) { +double __ddiv_rd(double __x, double __y) { + return __ocml_div_rtn_f64(__x, __y); +} +__DEVICE__ +double __ddiv_rn(double __x, double __y) { + return __ocml_div_rte_f64(__x, __y); +} +__DEVICE__ +double __ddiv_ru(double __x, double __y) { return __ocml_div_rtp_f64(__x, __y); } __DEVICE__ -inline double __ddiv_rz(double __x, double __y) { +double __ddiv_rz(double __x, double __y) { return __ocml_div_rtz_f64(__x, __y); } +#else __DEVICE__ -inline double __dmul_rd(double __x, double __y) { - return __ocml_mul_rtn_f64(__x, __y); -} +double __ddiv_rn(double __x, double __y) { return __x / __y; } #endif -__DEVICE__ -inline double __dmul_rn(double __x, double __y) { return __x * __y; } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __dmul_ru(double __x, double __y) { +double __dmul_rd(double __x, double __y) { + return __ocml_mul_rtn_f64(__x, __y); +} +__DEVICE__ +double __dmul_rn(double __x, double __y) { + return __ocml_mul_rte_f64(__x, __y); +} +__DEVICE__ +double __dmul_ru(double __x, double __y) { return __ocml_mul_rtp_f64(__x, __y); } __DEVICE__ -inline double __dmul_rz(double __x, double __y) { +double __dmul_rz(double __x, double __y) { return __ocml_mul_rtz_f64(__x, __y); } +#else __DEVICE__ -inline double __drcp_rd(double __x) { return __llvm_amdgcn_rcp_f64(__x); } +double __dmul_rn(double __x, double __y) { return __x * __y; } #endif -__DEVICE__ -inline double __drcp_rn(double __x) { return __llvm_amdgcn_rcp_f64(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __drcp_ru(double __x) { return __llvm_amdgcn_rcp_f64(__x); } +double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); } __DEVICE__ -inline double __drcp_rz(double __x) { return __llvm_amdgcn_rcp_f64(__x); } +double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); } __DEVICE__ -inline double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } +double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); } +__DEVICE__ +double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); } +#else +__DEVICE__ +double __drcp_rn(double 
__x) { return 1.0 / __x; } #endif -__DEVICE__ -inline double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); } + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } +double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } __DEVICE__ -inline double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } +double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); } __DEVICE__ -inline double __dsub_rd(double __x, double __y) { +double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } +__DEVICE__ +double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } +#else +__DEVICE__ +double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); } +#endif + +#if defined OCML_BASIC_ROUNDED_OPERATIONS +__DEVICE__ +double __dsub_rd(double __x, double __y) { return __ocml_sub_rtn_f64(__x, __y); } -#endif __DEVICE__ -inline double __dsub_rn(double __x, double __y) { return __x - __y; } -#if defined OCML_BASIC_ROUNDED_OPERATIONS +double __dsub_rn(double __x, double __y) { + return __ocml_sub_rte_f64(__x, __y); +} __DEVICE__ -inline double __dsub_ru(double __x, double __y) { +double __dsub_ru(double __x, double __y) { return __ocml_sub_rtp_f64(__x, __y); } __DEVICE__ -inline double __dsub_rz(double __x, double __y) { +double __dsub_rz(double __x, double __y) { return __ocml_sub_rtz_f64(__x, __y); } +#else __DEVICE__ -inline double __fma_rd(double __x, double __y, double __z) { - return __ocml_fma_rtn_f64(__x, __y, __z); -} +double __dsub_rn(double __x, double __y) { return __x - __y; } #endif -__DEVICE__ -inline double __fma_rn(double __x, double __y, double __z) { - return __ocml_fma_f64(__x, __y, __z); -} + #if defined OCML_BASIC_ROUNDED_OPERATIONS __DEVICE__ -inline double __fma_ru(double __x, double __y, double __z) { +double __fma_rd(double __x, double __y, double __z) { + return __ocml_fma_rtn_f64(__x, __y, __z); +} +__DEVICE__ +double __fma_rn(double __x, double __y, double __z) { + return __ocml_fma_rte_f64(__x, __y, __z); +} +__DEVICE__ +double __fma_ru(double __x, double __y, double __z) { return __ocml_fma_rtp_f64(__x, __y, __z); } __DEVICE__ -inline double __fma_rz(double __x, double __y, double __z) { +double __fma_rz(double __x, double __y, double __z) { return __ocml_fma_rtz_f64(__x, __y, __z); } +#else +__DEVICE__ +double __fma_rn(double __x, double __y, double __z) { + return __ocml_fma_f64(__x, __y, __z); +} #endif // END INTRINSICS // END DOUBLE -// BEGIN INTEGER -__DEVICE__ -inline int abs(int __x) { - int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1); - return (__x ^ __sgn) - __sgn; -} -__DEVICE__ -inline long labs(long __x) { - long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1); - return (__x ^ __sgn) - __sgn; -} -__DEVICE__ -inline long long llabs(long long __x) { - long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1); - return (__x ^ __sgn) - __sgn; -} +// C only macros +#if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L +#define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x) +#define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x) +#define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x) +#define signbit(__x) \ + _Generic((__x), float : __signbitf, double : __signbit)(__x) +#endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L #if defined(__cplusplus) -__DEVICE__ -inline long abs(long __x) { return labs(__x); } -__DEVICE__ -inline long long abs(long long __x) { return llabs(__x); } 
-#endif -// END INTEGER - -__DEVICE__ -inline _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) { - return __ocml_fma_f16(__x, __y, __z); -} - -__DEVICE__ -inline float fma(float __x, float __y, float __z) { - return fmaf(__x, __y, __z); -} - -#pragma push_macro("__DEF_FUN1") -#pragma push_macro("__DEF_FUN2") -#pragma push_macro("__DEF_FUNI") -#pragma push_macro("__DEF_FLOAT_FUN2I") -#pragma push_macro("__HIP_OVERLOAD1") -#pragma push_macro("__HIP_OVERLOAD2") - -// __hip_enable_if::type is a type function which returns __T if __B is true. -template struct __hip_enable_if {}; - -template struct __hip_enable_if { typedef __T type; }; - -// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to -// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with -// floor(double). -#define __HIP_OVERLOAD1(__retty, __fn) \ - template \ - __DEVICE__ typename __hip_enable_if::is_integer, \ - __retty>::type \ - __fn(__T __x) { \ - return ::__fn((double)__x); \ - } - -// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double -// or integer argument to avoid compilation error due to ambibuity. e.g. -// max(5.0f, 6.0) is resolved with max(double, double). -#define __HIP_OVERLOAD2(__retty, __fn) \ - template \ - __DEVICE__ \ - typename __hip_enable_if::is_specialized && \ - std::numeric_limits<__T2>::is_specialized, \ - __retty>::type \ - __fn(__T1 __x, __T2 __y) { \ - return __fn((double)__x, (double)__y); \ - } - -// Define cmath functions with float argument and returns float. -#define __DEF_FUN1(__retty, __func) \ - __DEVICE__ \ - inline float __func(float __x) { return __func##f(__x); } \ - __HIP_OVERLOAD1(__retty, __func) - -// Define cmath functions with float argument and returns __retty. -#define __DEF_FUNI(__retty, __func) \ - __DEVICE__ \ - inline __retty __func(float __x) { return __func##f(__x); } \ - __HIP_OVERLOAD1(__retty, __func) - -// define cmath functions with two float arguments. 
-#define __DEF_FUN2(__retty, __func) \ - __DEVICE__ \ - inline float __func(float __x, float __y) { return __func##f(__x, __y); } \ - __HIP_OVERLOAD2(__retty, __func) - -__DEF_FUN1(double, acos) -__DEF_FUN1(double, acosh) -__DEF_FUN1(double, asin) -__DEF_FUN1(double, asinh) -__DEF_FUN1(double, atan) -__DEF_FUN2(double, atan2); -__DEF_FUN1(double, atanh) -__DEF_FUN1(double, cbrt) -__DEF_FUN1(double, ceil) -__DEF_FUN2(double, copysign); -__DEF_FUN1(double, cos) -__DEF_FUN1(double, cosh) -__DEF_FUN1(double, erf) -__DEF_FUN1(double, erfc) -__DEF_FUN1(double, exp) -__DEF_FUN1(double, exp2) -__DEF_FUN1(double, expm1) -__DEF_FUN1(double, fabs) -__DEF_FUN2(double, fdim); -__DEF_FUN1(double, floor) -__DEF_FUN2(double, fmax); -__DEF_FUN2(double, fmin); -__DEF_FUN2(double, fmod); -//__HIP_OVERLOAD1(int, fpclassify) -__DEF_FUN2(double, hypot); -__DEF_FUNI(int, ilogb) -__HIP_OVERLOAD1(bool, isfinite) -__HIP_OVERLOAD2(bool, isgreater); -__HIP_OVERLOAD2(bool, isgreaterequal); -__HIP_OVERLOAD1(bool, isinf); -__HIP_OVERLOAD2(bool, isless); -__HIP_OVERLOAD2(bool, islessequal); -__HIP_OVERLOAD2(bool, islessgreater); -__HIP_OVERLOAD1(bool, isnan); -//__HIP_OVERLOAD1(bool, isnormal) -__HIP_OVERLOAD2(bool, isunordered); -__DEF_FUN1(double, lgamma) -__DEF_FUN1(double, log) -__DEF_FUN1(double, log10) -__DEF_FUN1(double, log1p) -__DEF_FUN1(double, log2) -__DEF_FUN1(double, logb) -__DEF_FUNI(long long, llrint) -__DEF_FUNI(long long, llround) -__DEF_FUNI(long, lrint) -__DEF_FUNI(long, lround) -__DEF_FUN1(double, nearbyint); -__DEF_FUN2(double, nextafter); -__DEF_FUN2(double, pow); -__DEF_FUN2(double, remainder); -__DEF_FUN1(double, rint); -__DEF_FUN1(double, round); -__HIP_OVERLOAD1(bool, signbit) -__DEF_FUN1(double, sin) -__DEF_FUN1(double, sinh) -__DEF_FUN1(double, sqrt) -__DEF_FUN1(double, tan) -__DEF_FUN1(double, tanh) -__DEF_FUN1(double, tgamma) -__DEF_FUN1(double, trunc); - -// define cmath functions with a float and an integer argument. -#define __DEF_FLOAT_FUN2I(__func) \ - __DEVICE__ \ - inline float __func(float __x, int __y) { return __func##f(__x, __y); } -__DEF_FLOAT_FUN2I(scalbn) - -template __DEVICE__ inline T min(T __arg1, T __arg2) { +template __DEVICE__ T min(T __arg1, T __arg2) { return (__arg1 < __arg2) ? __arg1 : __arg2; } -template __DEVICE__ inline T max(T __arg1, T __arg2) { +template __DEVICE__ T max(T __arg1, T __arg2) { return (__arg1 > __arg2) ? __arg1 : __arg2; } -__DEVICE__ inline int min(int __arg1, int __arg2) { +__DEVICE__ int min(int __arg1, int __arg2) { return (__arg1 < __arg2) ? __arg1 : __arg2; } -__DEVICE__ inline int max(int __arg1, int __arg2) { +__DEVICE__ int max(int __arg1, int __arg2) { return (__arg1 > __arg2) ? 
__arg1 : __arg2; } __DEVICE__ -inline float max(float __x, float __y) { return fmaxf(__x, __y); } +float max(float __x, float __y) { return fmaxf(__x, __y); } __DEVICE__ -inline double max(double __x, double __y) { return fmax(__x, __y); } +double max(double __x, double __y) { return fmax(__x, __y); } __DEVICE__ -inline float min(float __x, float __y) { return fminf(__x, __y); } +float min(float __x, float __y) { return fminf(__x, __y); } __DEVICE__ -inline double min(double __x, double __y) { return fmin(__x, __y); } - -__HIP_OVERLOAD2(double, max) -__HIP_OVERLOAD2(double, min) +double min(double __x, double __y) { return fmin(__x, __y); } __host__ inline static int min(int __arg1, int __arg2) { return std::min(__arg1, __arg2); @@ -1172,13 +1267,8 @@ __host__ inline static int min(int __arg1, int __arg2) { __host__ inline static int max(int __arg1, int __arg2) { return std::max(__arg1, __arg2); } +#endif -#pragma pop_macro("__DEF_FUN1") -#pragma pop_macro("__DEF_FUN2") -#pragma pop_macro("__DEF_FUNI") -#pragma pop_macro("__DEF_FLOAT_FUN2I") -#pragma pop_macro("__HIP_OVERLOAD1") -#pragma pop_macro("__HIP_OVERLOAD2") #pragma pop_macro("__DEVICE__") #pragma pop_macro("__RETURN_TYPE") diff --git a/lib/include/__clang_hip_runtime_wrapper.h b/lib/include/__clang_hip_runtime_wrapper.h index addae5605a..81a16a265a 100644 --- a/lib/include/__clang_hip_runtime_wrapper.h +++ b/lib/include/__clang_hip_runtime_wrapper.h @@ -28,6 +28,10 @@ #define __shared__ __attribute__((shared)) #define __constant__ __attribute__((constant)) +#if !defined(__cplusplus) || __cplusplus < 201103L + #define nullptr NULL; +#endif + #if __HIP_ENABLE_DEVICE_MALLOC__ extern "C" __device__ void *__hip_malloc(size_t __size); extern "C" __device__ void *__hip_free(void *__ptr); @@ -51,6 +55,7 @@ static inline __device__ void *free(void *__ptr) { #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__ #include <__clang_cuda_math_forward_declares.h> +#include <__clang_hip_cmath.h> #include <__clang_cuda_complex_builtins.h> #include diff --git a/lib/include/altivec.h b/lib/include/altivec.h index ac5f438363..2b82113de3 100644 --- a/lib/include/altivec.h +++ b/lib/include/altivec.h @@ -1709,6 +1709,20 @@ vec_cmpeq(vector double __a, vector double __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpeq(vector signed __int128 __a, vector signed __int128 __b) { + return (vector bool __int128)__builtin_altivec_vcmpequq( + (vector bool __int128)__a, (vector bool __int128)__b); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpeq(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return (vector bool __int128)__builtin_altivec_vcmpequq( + (vector bool __int128)__a, (vector bool __int128)__b); +} +#endif + #ifdef __POWER9_VECTOR__ /* vec_cmpne */ @@ -1766,36 +1780,26 @@ vec_cmpne(vector unsigned int __a, vector unsigned int __b) { (vector int)__b); } -static __inline__ vector bool long long __ATTRS_o_ai -vec_cmpne(vector bool long long __a, vector bool long long __b) { - return (vector bool long long) - ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); -} - -static __inline__ vector bool long long __ATTRS_o_ai -vec_cmpne(vector signed long long __a, vector signed long long __b) { - return (vector bool long long) - ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); -} - -static __inline__ vector bool long long __ATTRS_o_ai -vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) { - return 
(vector bool long long) - ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); -} - static __inline__ vector bool int __ATTRS_o_ai vec_cmpne(vector float __a, vector float __b) { return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a, (vector int)__b); } -static __inline__ vector bool long long __ATTRS_o_ai -vec_cmpne(vector double __a, vector double __b) { - return (vector bool long long) - ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpne(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return (vector bool __int128) ~(__builtin_altivec_vcmpequq( + (vector bool __int128)__a, (vector bool __int128)__b)); } +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpne(vector signed __int128 __a, vector signed __int128 __b) { + return (vector bool __int128) ~(__builtin_altivec_vcmpequq( + (vector bool __int128)__a, (vector bool __int128)__b)); +} +#endif + /* vec_cmpnez */ static __inline__ vector bool char __ATTRS_o_ai @@ -1900,6 +1904,86 @@ vec_parity_lsbb(vector signed long long __a) { return __builtin_altivec_vprtybd(__a); } +#else +/* vec_cmpne */ + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector bool char __a, vector bool char __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector signed char __a, vector signed char __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool char __ATTRS_o_ai +vec_cmpne(vector unsigned char __a, vector unsigned char __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector bool short __a, vector bool short __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector signed short __a, vector signed short __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool short __ATTRS_o_ai +vec_cmpne(vector unsigned short __a, vector unsigned short __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector bool int __a, vector bool int __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector signed int __a, vector signed int __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector unsigned int __a, vector unsigned int __b) { + return ~(vec_cmpeq(__a, __b)); +} + +static __inline__ vector bool int __ATTRS_o_ai +vec_cmpne(vector float __a, vector float __b) { + return ~(vec_cmpeq(__a, __b)); +} +#endif + +#ifdef __POWER8_VECTOR__ +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector bool long long __a, vector bool long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector signed long long __a, vector signed long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} + +static __inline__ vector bool long long __ATTRS_o_ai +vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} +#endif + +#ifdef __VSX__ +static __inline__ vector bool long long __ATTRS_o_ai 
+vec_cmpne(vector double __a, vector double __b) { + return (vector bool long long) + ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b)); +} #endif /* vec_cmpgt */ @@ -1962,6 +2046,20 @@ vec_cmpgt(vector double __a, vector double __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpgt(vector signed __int128 __a, vector signed __int128 __b) { + return (vector bool __int128)__builtin_altivec_vcmpgtsq( + (vector bool __int128)__a, (vector bool __int128)__b); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpgt(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return (vector bool __int128)__builtin_altivec_vcmpgtuq( + (vector bool __int128)__a, (vector bool __int128)__b); +} +#endif + /* vec_cmpge */ static __inline__ vector bool char __ATTRS_o_ai @@ -2022,6 +2120,18 @@ vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpge(vector signed __int128 __a, vector signed __int128 __b) { + return ~(vec_cmpgt(__b, __a)); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmpge(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return ~(vec_cmpgt(__b, __a)); +} +#endif + /* vec_vcmpgefp */ static __inline__ vector bool int __attribute__((__always_inline__)) @@ -2134,6 +2244,18 @@ vec_cmple(vector unsigned long long __a, vector unsigned long long __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmple(vector signed __int128 __a, vector signed __int128 __b) { + return vec_cmpge(__b, __a); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmple(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return vec_cmpge(__b, __a); +} +#endif + /* vec_cmplt */ static __inline__ vector bool char __ATTRS_o_ai @@ -2178,6 +2300,18 @@ vec_cmplt(vector double __a, vector double __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmplt(vector signed __int128 __a, vector signed __int128 __b) { + return vec_cmpgt(__b, __a); +} + +static __inline__ vector bool __int128 __ATTRS_o_ai +vec_cmplt(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return vec_cmpgt(__b, __a); +} +#endif + #ifdef __POWER8_VECTOR__ static __inline__ vector bool long long __ATTRS_o_ai vec_cmplt(vector signed long long __a, vector signed long long __b) { @@ -2702,67 +2836,67 @@ vec_insert_exp(vector unsigned int __a, vector unsigned int __b) { } #if defined(__powerpc64__) -static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a, +static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(const signed char *__a, size_t __b) { return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned char __ATTRS_o_ai -vec_xl_len(unsigned char *__a, size_t __b) { +vec_xl_len(const unsigned char *__a, size_t __b) { return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a, +static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(const signed short *__a, size_t __b) { return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned short __ATTRS_o_ai -vec_xl_len(unsigned short *__a, size_t __b) { +vec_xl_len(const unsigned short *__a, size_t __b) { return (vector unsigned 
short)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a, +static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(const signed int *__a, size_t __b) { return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a, +static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(const unsigned int *__a, size_t __b) { return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) { +static __inline__ vector float __ATTRS_o_ai vec_xl_len(const float *__a, size_t __b) { return (vector float)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector signed __int128 __ATTRS_o_ai -vec_xl_len(signed __int128 *__a, size_t __b) { +vec_xl_len(const signed __int128 *__a, size_t __b) { return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned __int128 __ATTRS_o_ai -vec_xl_len(unsigned __int128 *__a, size_t __b) { +vec_xl_len(const unsigned __int128 *__a, size_t __b) { return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector signed long long __ATTRS_o_ai -vec_xl_len(signed long long *__a, size_t __b) { +vec_xl_len(const signed long long *__a, size_t __b) { return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned long long __ATTRS_o_ai -vec_xl_len(unsigned long long *__a, size_t __b) { +vec_xl_len(const unsigned long long *__a, size_t __b) { return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a, +static __inline__ vector double __ATTRS_o_ai vec_xl_len(const double *__a, size_t __b) { return (vector double)__builtin_vsx_lxvl(__a, (__b << 56)); } static __inline__ vector unsigned char __ATTRS_o_ai -vec_xl_len_r(unsigned char *__a, size_t __b) { +vec_xl_len_r(const unsigned char *__a, size_t __b) { vector unsigned char __res = (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56)); #ifdef __LITTLE_ENDIAN__ @@ -2862,12 +2996,12 @@ static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a, #ifdef __VSX__ static __inline__ vector float __ATTRS_o_ai vec_cpsgn(vector float __a, vector float __b) { - return __builtin_vsx_xvcpsgnsp(__a, __b); + return __builtin_vsx_xvcpsgnsp(__b, __a); } static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, vector double __b) { - return __builtin_vsx_xvcpsgndp(__a, __b); + return __builtin_vsx_xvcpsgndp(__b, __a); } #endif @@ -2951,6 +3085,42 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, #define vec_vctuxs __builtin_altivec_vctuxs +/* vec_signext */ + +#ifdef __POWER9_VECTOR__ +static __inline__ vector signed int __ATTRS_o_ai +vec_signexti(vector signed char __a) { + return __builtin_altivec_vextsb2w(__a); +} + +static __inline__ vector signed int __ATTRS_o_ai +vec_signexti(vector signed short __a) { + return __builtin_altivec_vextsh2w(__a); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_signextll(vector signed char __a) { + return __builtin_altivec_vextsb2d(__a); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_signextll(vector signed short __a) { + return __builtin_altivec_vextsh2d(__a); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_signextll(vector signed int __a) { + return 
__builtin_altivec_vextsw2d(__a); +} +#endif + +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_signextq(vector signed long long __a) { + return __builtin_altivec_vextsd2q(__a); +} +#endif + /* vec_signed */ static __inline__ vector signed int __ATTRS_o_ai @@ -3288,6 +3458,66 @@ static __inline__ vector double __ATTRS_o_ai vec_div(vector double __a, } #endif +/* vec_dive */ + +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed int __ATTRS_o_ai +vec_dive(vector signed int __a, vector signed int __b) { + return __builtin_altivec_vdivesw(__a, __b); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_dive(vector unsigned int __a, vector unsigned int __b) { + return __builtin_altivec_vdiveuw(__a, __b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_dive(vector signed long long __a, vector signed long long __b) { + return __builtin_altivec_vdivesd(__a, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_dive(vector unsigned long long __a, vector unsigned long long __b) { + return __builtin_altivec_vdiveud(__a, __b); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_dive(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __builtin_altivec_vdiveuq(__a, __b); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_dive(vector signed __int128 __a, vector signed __int128 __b) { + return __builtin_altivec_vdivesq(__a, __b); +} +#endif + +#ifdef __POWER10_VECTOR__ +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_div(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __a / __b; +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_div(vector signed __int128 __a, vector signed __int128 __b) { + return __a / __b; +} +#endif /* __POWER10_VECTOR__ */ + +/* vec_xvtdiv */ + +#ifdef __VSX__ +static __inline__ int __ATTRS_o_ai vec_test_swdiv(vector double __a, + vector double __b) { + return __builtin_vsx_xvtdivdp(__a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_test_swdivs(vector float __a, + vector float __b) { + return __builtin_vsx_xvtdivsp(__a, __b); +} +#endif + /* vec_dss */ #define vec_dss __builtin_altivec_dss @@ -3300,23 +3530,19 @@ static __inline__ void __attribute__((__always_inline__)) vec_dssall(void) { /* vec_dst */ #define vec_dst(__PTR, __CW, __STR) \ - __extension__( \ - { __builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR)); }) + __builtin_altivec_dst((const void *)(__PTR), (__CW), (__STR)) /* vec_dstst */ #define vec_dstst(__PTR, __CW, __STR) \ - __extension__( \ - { __builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR)); }) + __builtin_altivec_dstst((const void *)(__PTR), (__CW), (__STR)) /* vec_dststt */ #define vec_dststt(__PTR, __CW, __STR) \ - __extension__( \ - { __builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR)); }) + __builtin_altivec_dststt((const void *)(__PTR), (__CW), (__STR)) /* vec_dstt */ #define vec_dstt(__PTR, __CW, __STR) \ - __extension__( \ - { __builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR)); }) + __builtin_altivec_dstt((const void *)(__PTR), (__CW), (__STR)) /* vec_eqv */ @@ -5467,6 +5693,16 @@ vec_msum(vector unsigned short __a, vector unsigned short __b, return __builtin_altivec_vmsumuhm(__a, __b, __c); } +/* vec_msumc */ + +#ifdef __POWER10_VECTOR__ +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_msumc(vector unsigned long long __a, vector unsigned long long __b, + vector unsigned __int128 __c) { + return 
__builtin_altivec_vmsumcud(__a, __b, __c); +} +#endif + /* vec_vmsummbm */ static __inline__ vector int __attribute__((__always_inline__)) @@ -5693,6 +5929,26 @@ vec_mule(vector unsigned int __a, vector unsigned int __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_mule(vector signed long long __a, vector signed long long __b) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmulosd(__a, __b); +#else + return __builtin_altivec_vmulesd(__a, __b); +#endif +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_mule(vector unsigned long long __a, vector unsigned long long __b) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmuloud(__a, __b); +#else + return __builtin_altivec_vmuleud(__a, __b); +#endif +} +#endif + /* vec_vmulesb */ static __inline__ vector short __attribute__((__always_inline__)) @@ -5737,6 +5993,30 @@ vec_vmuleuh(vector unsigned short __a, vector unsigned short __b) { #endif } +/* vec_mulh */ + +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed int __ATTRS_o_ai +vec_mulh(vector signed int __a, vector signed int __b) { + return __builtin_altivec_vmulhsw(__a, __b); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_mulh(vector unsigned int __a, vector unsigned int __b) { + return __builtin_altivec_vmulhuw(__a, __b); +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_mulh(vector signed long long __a, vector signed long long __b) { + return __builtin_altivec_vmulhsd(__a, __b); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_mulh(vector unsigned long long __a, vector unsigned long long __b) { + return __builtin_altivec_vmulhud(__a, __b); +} +#endif + /* vec_mulo */ static __inline__ vector short __ATTRS_o_ai vec_mulo(vector signed char __a, @@ -5795,6 +6075,26 @@ vec_mulo(vector unsigned int __a, vector unsigned int __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_mulo(vector signed long long __a, vector signed long long __b) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmulesd(__a, __b); +#else + return __builtin_altivec_vmulosd(__a, __b); +#endif +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_mulo(vector unsigned long long __a, vector unsigned long long __b) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vmuleud(__a, __b); +#else + return __builtin_altivec_vmuloud(__a, __b); +#endif +} +#endif + /* vec_vmulosb */ static __inline__ vector short __attribute__((__always_inline__)) @@ -7627,6 +7927,18 @@ vec_rl(vector unsigned long long __a, vector unsigned long long __b) { } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_rl(vector signed __int128 __a, vector unsigned __int128 __b) { + return (__b << __a)|(__b >> ((__CHAR_BIT__ * sizeof(vector signed __int128)) - __a)); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_rl(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return (__b << __a)|(__b >> ((__CHAR_BIT__ * sizeof(vector unsigned __int128)) - __a)); +} +#endif + /* vec_rlmi */ #ifdef __POWER9_VECTOR__ static __inline__ vector unsigned int __ATTRS_o_ai @@ -7640,8 +7952,24 @@ vec_rlmi(vector unsigned long long __a, vector unsigned long long __b, vector unsigned long long __c) { return __builtin_altivec_vrldmi(__a, __c, __b); } +#endif + +#ifdef __POWER10_VECTOR__ +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_rlmi(vector unsigned __int128 __a, vector unsigned 
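/* Hedged note: for the doubleword multiplies above, "even"/"odd" follow
 * big-endian element numbering, so the little-endian paths call the
 * opposite builtin to keep the source-level element meaning stable.
 * Illustration (C element order, which the endian swap preserves):
 *
 *   vector signed long long __x = {3, 4}, __y = {5, 6};
 *   vector signed __int128 __e = vec_mule(__x, __y);  // __x[0] * __y[0]
 *   vector signed __int128 __o = vec_mulo(__x, __y);  // __x[1] * __y[1]
 */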
__int128 __b, + vector unsigned __int128 __c) { + return __builtin_altivec_vrlqmi(__a, __c, __b); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_rlmi(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) { + return __builtin_altivec_vrlqmi(__a, __c, __b); +} +#endif /* vec_rlnm */ +#ifdef __POWER9_VECTOR__ static __inline__ vector unsigned int __ATTRS_o_ai vec_rlnm(vector unsigned int __a, vector unsigned int __b, vector unsigned int __c) { @@ -7657,6 +7985,42 @@ vec_rlnm(vector unsigned long long __a, vector unsigned long long __b, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_rlnm(vector unsigned __int128 __a, vector unsigned __int128 __b, + vector unsigned __int128 __c) { + // Merge __b and __c using an appropriate shuffle. + vector unsigned char TmpB = (vector unsigned char)__b; + vector unsigned char TmpC = (vector unsigned char)__c; + vector unsigned char MaskAndShift = +#ifdef __LITTLE_ENDIAN__ + __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 0, + 1, -1, -1, -1, -1, -1); +#else + __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1, + -1, -1, -1, -1, -1, -1, -1); +#endif + return __builtin_altivec_vrlqnm(__a, (vector unsigned __int128) MaskAndShift); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_rlnm(vector signed __int128 __a, vector signed __int128 __b, + vector signed __int128 __c) { + // Merge __b and __c using an appropriate shuffle. + vector unsigned char TmpB = (vector unsigned char)__b; + vector unsigned char TmpC = (vector unsigned char)__c; + vector unsigned char MaskAndShift = +#ifdef __LITTLE_ENDIAN__ + __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16, 0, + 1, -1, -1, -1, -1, -1); +#else + __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1, + -1, -1, -1, -1, -1, -1, -1); +#endif + return __builtin_altivec_vrlqnm(__a, (vector unsigned __int128) MaskAndShift); +} +#endif + /* vec_vrlb */ static __inline__ vector signed char __ATTRS_o_ai @@ -7771,6 +8135,18 @@ vec_vrsqrtefp(vector float __a) { return __builtin_altivec_vrsqrtefp(__a); } +/* vec_xvtsqrt */ + +#ifdef __VSX__ +static __inline__ int __ATTRS_o_ai vec_test_swsqrt(vector double __a) { + return __builtin_vsx_xvtsqrtdp(__a); +} + +static __inline__ int __ATTRS_o_ai vec_test_swsqrts(vector float __a) { + return __builtin_vsx_xvtsqrtsp(__a); +} +#endif + /* vec_sel */ #define __builtin_altivec_vsel_4si vec_sel @@ -13900,6 +14276,18 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b); +} +#endif + /* vec_all_ge */ static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed char __a, @@ -14071,6 +14459,18 @@ static __inline__ int __ATTRS_o_ai vec_all_ge(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_ge(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_EQ, __b, __a); +} + +static __inline__ int __ATTRS_o_ai vec_all_ge(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_EQ, 
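/* Hedged note: the *_p builtins test CR6 after a recording compare --
 * __CR6_LT roughly means "true in every element" and __CR6_EQ "true in no
 * element", with the _REV forms negated -- so vec_all_ge(a, b) is encoded
 * as "b > a holds nowhere". Illustration:
 *
 *   if (vec_all_ge(__a, __b)) { ... }  // the quadword compare held
 */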
__b, __a); +} +#endif + /* vec_all_gt */ static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed char __a, @@ -14242,6 +14642,18 @@ static __inline__ int __ATTRS_o_ai vec_all_gt(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_gt(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_LT, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_all_gt(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_LT, __a, __b); +} +#endif + /* vec_all_in */ static __inline__ int __attribute__((__always_inline__)) @@ -14421,6 +14833,18 @@ static __inline__ int __ATTRS_o_ai vec_all_le(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_le(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_EQ, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_all_le(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_EQ, __a, __b); +} +#endif + /* vec_all_lt */ static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed char __a, @@ -14593,6 +15017,18 @@ static __inline__ int __ATTRS_o_ai vec_all_lt(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_lt(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_LT, __b, __a); +} + +static __inline__ int __ATTRS_o_ai vec_all_lt(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_LT, __b, __a); +} +#endif + /* vec_all_nan */ static __inline__ int __ATTRS_o_ai vec_all_nan(vector float __a) { @@ -14797,6 +15233,18 @@ static __inline__ int __ATTRS_o_ai vec_all_ne(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_all_ne(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b); +} +#endif + /* vec_all_nge */ static __inline__ int __ATTRS_o_ai vec_all_nge(vector float __a, @@ -15042,6 +15490,18 @@ static __inline__ int __ATTRS_o_ai vec_any_eq(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b); +} +#endif + /* vec_any_ge */ static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, @@ -15221,6 +15681,18 @@ static __inline__ int __ATTRS_o_ai vec_any_ge(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_LT_REV, __b, __a); +} + +static __inline__ int __ATTRS_o_ai vec_any_ge(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_LT_REV, __b, __a); +} +#endif + /* vec_any_gt */ static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed char __a, @@ -15400,6 +15872,18 @@ static __inline__ 
int __ATTRS_o_ai vec_any_gt(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_gt(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_EQ_REV, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_any_gt(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_EQ_REV, __a, __b); +} +#endif + /* vec_any_le */ static __inline__ int __ATTRS_o_ai vec_any_le(vector signed char __a, @@ -15579,6 +16063,18 @@ static __inline__ int __ATTRS_o_ai vec_any_le(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_le(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_LT_REV, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_any_le(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_LT_REV, __a, __b); +} +#endif + /* vec_any_lt */ static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed char __a, @@ -15758,6 +16254,18 @@ static __inline__ int __ATTRS_o_ai vec_any_lt(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_lt(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpgtsq_p(__CR6_EQ_REV, __b, __a); +} + +static __inline__ int __ATTRS_o_ai vec_any_lt(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpgtuq_p(__CR6_EQ_REV, __b, __a); +} +#endif + /* vec_any_nan */ static __inline__ int __attribute__((__always_inline__)) @@ -15953,6 +16461,18 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector double __a, } #endif +#ifdef __POWER10_VECTOR__ +static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed __int128 __a, + vector signed __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b); +} + +static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned __int128 __a, + vector unsigned __int128 __b) { + return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b); +} +#endif + /* vec_any_nge */ static __inline__ int __attribute__((__always_inline__)) @@ -16353,41 +16873,41 @@ typedef vector unsigned int unaligned_vec_uint __attribute__((aligned(1))); typedef vector float unaligned_vec_float __attribute__((aligned(1))); static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset, - signed char *__ptr) { + const signed char *__ptr) { return *(unaligned_vec_schar *)(__ptr + __offset); } static inline __ATTRS_o_ai vector unsigned char -vec_xl(signed long long __offset, unsigned char *__ptr) { +vec_xl(signed long long __offset, const unsigned char *__ptr) { return *(unaligned_vec_uchar*)(__ptr + __offset); } static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset, - signed short *__ptr) { + const signed short *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sshort *)__addr; } static inline __ATTRS_o_ai vector unsigned short -vec_xl(signed long long __offset, unsigned short *__ptr) { +vec_xl(signed long long __offset, const unsigned short *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ushort *)__addr; } static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset, - signed int *__ptr) { + const signed int *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sint *)__addr; } static 
inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset, - unsigned int *__ptr) { + const unsigned int *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_uint *)__addr; } static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset, - float *__ptr) { + const float *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_float *)__addr; } @@ -16398,19 +16918,19 @@ typedef vector unsigned long long unaligned_vec_ull __attribute__((aligned(1))); typedef vector double unaligned_vec_double __attribute__((aligned(1))); static inline __ATTRS_o_ai vector signed long long -vec_xl(signed long long __offset, signed long long *__ptr) { +vec_xl(signed long long __offset, const signed long long *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sll *)__addr; } static inline __ATTRS_o_ai vector unsigned long long -vec_xl(signed long long __offset, unsigned long long *__ptr) { +vec_xl(signed long long __offset, const unsigned long long *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ull *)__addr; } static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset, - double *__ptr) { + const double *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_double *)__addr; } @@ -16421,13 +16941,13 @@ typedef vector signed __int128 unaligned_vec_si128 __attribute__((aligned(1))); typedef vector unsigned __int128 unaligned_vec_ui128 __attribute__((aligned(1))); static inline __ATTRS_o_ai vector signed __int128 -vec_xl(signed long long __offset, signed __int128 *__ptr) { +vec_xl(signed long long __offset, const signed __int128 *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_si128 *)__addr; } static inline __ATTRS_o_ai vector unsigned __int128 -vec_xl(signed long long __offset, unsigned __int128 *__ptr) { +vec_xl(signed long long __offset, const unsigned __int128 *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ui128 *)__addr; } @@ -16437,71 +16957,71 @@ vec_xl(signed long long __offset, unsigned __int128 *__ptr) { #ifdef __LITTLE_ENDIAN__ static __inline__ vector signed char __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed char *__ptr) { +vec_xl_be(signed long long __offset, const signed char *__ptr) { vector signed char __vec = (vector signed char)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } static __inline__ vector unsigned char __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned char *__ptr) { +vec_xl_be(signed long long __offset, const unsigned char *__ptr) { vector unsigned char __vec = (vector unsigned char)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } static __inline__ vector signed short __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed short *__ptr) { +vec_xl_be(signed long long __offset, const signed short *__ptr) { vector signed short __vec = (vector signed short)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); } static __inline__ vector unsigned short __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned short *__ptr) { +vec_xl_be(signed long long __offset, const unsigned short *__ptr) { vector unsigned short __vec = (vector 
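/* Illustration only: vec_xl performs an unaligned 16-byte load from
 * __ptr plus a byte offset, and with this patch the source pointer may be
 * const-qualified:
 *
 *   const unsigned char __buf[32];
 *   vector unsigned char __v = vec_xl(1, __buf);  // loads __buf[1..16]
 */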
unsigned short)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); } static __inline__ vector signed int __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed int *__ptr) { +vec_xl_be(signed long long __offset, const signed int *__ptr) { return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr); } static __inline__ vector unsigned int __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned int *__ptr) { +vec_xl_be(signed long long __offset, const unsigned int *__ptr) { return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr); } static __inline__ vector float __ATTRS_o_ai -vec_xl_be(signed long long __offset, float *__ptr) { +vec_xl_be(signed long long __offset, const float *__ptr) { return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr); } #ifdef __VSX__ static __inline__ vector signed long long __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed long long *__ptr) { +vec_xl_be(signed long long __offset, const signed long long *__ptr) { return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr); } static __inline__ vector unsigned long long __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned long long *__ptr) { +vec_xl_be(signed long long __offset, const unsigned long long *__ptr) { return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr); } static __inline__ vector double __ATTRS_o_ai -vec_xl_be(signed long long __offset, double *__ptr) { +vec_xl_be(signed long long __offset, const double *__ptr) { return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr); } #endif #if defined(__POWER8_VECTOR__) && defined(__powerpc64__) static __inline__ vector signed __int128 __ATTRS_o_ai -vec_xl_be(signed long long __offset, signed __int128 *__ptr) { +vec_xl_be(signed long long __offset, const signed __int128 *__ptr) { return vec_xl(__offset, __ptr); } static __inline__ vector unsigned __int128 __ATTRS_o_ai -vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) { +vec_xl_be(signed long long __offset, const unsigned __int128 *__ptr) { return vec_xl(__offset, __ptr); } #endif @@ -16509,6 +17029,54 @@ vec_xl_be(signed long long __offset, unsigned __int128 *__ptr) { #define vec_xl_be vec_xl #endif +#if defined(__POWER10_VECTOR__) && defined(__VSX__) + +/* vect_xl_sext */ + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_sext(signed long long __offset, const signed char *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_sext(signed long long __offset, const signed short *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_sext(signed long long __offset, const signed int *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_sext(signed long long __offset, const signed long long *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +/* vec_xl_zext */ + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_zext(signed long long __offset, const unsigned char *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_zext(signed long long __offset, const unsigned short *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static 
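/* Hedged note: unlike vec_xl, the vec_xl_sext/vec_xl_zext helpers in this
 * block load one scalar element (the offset scales by the pointee type
 * here) and sign-/zero-extend it into the single __int128 lane of the
 * result:
 *
 *   const unsigned int __u = 7;
 *   vector unsigned __int128 __v = vec_xl_zext(0, &__u);  // lane 0 == 7
 */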
__inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_zext(signed long long __offset, const unsigned int *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_xl_zext(signed long long __offset, const unsigned long long *__pointer) { + return (vector unsigned __int128)*(__pointer + __offset); +} + +#endif + /* vec_xst */ static inline __ATTRS_o_ai void vec_xst(vector signed char __vec, @@ -16597,6 +17165,58 @@ static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec, } #endif +/* vec_xst_trunc */ + +#if defined(__POWER10_VECTOR__) && defined(__VSX__) +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed char *__ptr) { + *(__ptr + __offset) = (signed char)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned char *__ptr) { + *(__ptr + __offset) = (unsigned char)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed short *__ptr) { + *(__ptr + __offset) = (signed short)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned short *__ptr) { + *(__ptr + __offset) = (unsigned short)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed int *__ptr) { + *(__ptr + __offset) = (signed int)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned int *__ptr) { + *(__ptr + __offset) = (unsigned int)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + signed long long __offset, + signed long long *__ptr) { + *(__ptr + __offset) = (signed long long)__vec[0]; +} + +static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, + signed long long __offset, + unsigned long long *__ptr) { + *(__ptr + __offset) = (unsigned long long)__vec[0]; +} +#endif + /* vec_xst_be */ #ifdef __LITTLE_ENDIAN__ @@ -16763,6 +17383,100 @@ static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) { } #ifdef __POWER10_VECTOR__ + +/* vec_extractm */ + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned char __a) { + return __builtin_altivec_vextractbm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned short __a) { + return __builtin_altivec_vextracthm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned int __a) { + return __builtin_altivec_vextractwm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned long long __a) { + return __builtin_altivec_vextractdm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned __int128 __a) { + return __builtin_altivec_vextractqm(__a); +} + +/* vec_expandm */ + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_expandm(vector unsigned char __a) { + return __builtin_altivec_vexpandbm(__a); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_expandm(vector unsigned short __a) { + return __builtin_altivec_vexpandhm(__a); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_expandm(vector unsigned int __a) { + return __builtin_altivec_vexpandwm(__a); +} + +static __inline__ vector unsigned long long 
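/* Hedged sketch: vec_extractm gathers the most-significant bit of each
 * lane into an integer mask, while vec_expandm broadcasts each lane's MSB
 * across that whole lane (the exact bit ordering of the mask, per ISA 3.1,
 * is not restated here):
 *
 *   vector unsigned int __m = {0x80000000u, 0u, 0x80000000u, 0u};
 *   unsigned int __bits = vec_extractm(__m);       // one bit per lane
 *   vector unsigned int __all = vec_expandm(__m);  // lanes of ~0u or 0u
 */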
__ATTRS_o_ai +vec_expandm(vector unsigned long long __a) { + return __builtin_altivec_vexpanddm(__a); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_expandm(vector unsigned __int128 __a) { + return __builtin_altivec_vexpandqm(__a); +} + +/* vec_cntm */ + +#define vec_cntm(__a, __mp) \ + _Generic((__a), vector unsigned char \ + : __builtin_altivec_vcntmbb((__a), (unsigned int)(__mp)), \ + vector unsigned short \ + : __builtin_altivec_vcntmbh((__a), (unsigned int)(__mp)), \ + vector unsigned int \ + : __builtin_altivec_vcntmbw((__a), (unsigned int)(__mp)), \ + vector unsigned long long \ + : __builtin_altivec_vcntmbd((__a), (unsigned int)(__mp))) + +/* vec_gen[b|h|w|d|q]m */ + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_genbm(unsigned long long __bm) { + return __builtin_altivec_mtvsrbm(__bm); +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_genhm(unsigned long long __bm) { + return __builtin_altivec_mtvsrhm(__bm); +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_genwm(unsigned long long __bm) { + return __builtin_altivec_mtvsrwm(__bm); +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_gendm(unsigned long long __bm) { + return __builtin_altivec_mtvsrdm(__bm); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_genqm(unsigned long long __bm) { + return __builtin_altivec_mtvsrqm(__bm); +} + /* vec_pdep */ static __inline__ vector unsigned long long __ATTRS_o_ai @@ -16881,6 +17595,38 @@ vec_cnttzm(vector unsigned long long __a, vector unsigned long long __b) { return __builtin_altivec_vctzdm(__a, __b); } +/* vec_mod */ + +static __inline__ vector signed int __ATTRS_o_ai +vec_mod(vector signed int __a, vector signed int __b) { + return __a % __b; +} + +static __inline__ vector unsigned int __ATTRS_o_ai +vec_mod(vector unsigned int __a, vector unsigned int __b) { + return __a % __b; +} + +static __inline__ vector signed long long __ATTRS_o_ai +vec_mod(vector signed long long __a, vector signed long long __b) { + return __a % __b; +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_mod(vector unsigned long long __a, vector unsigned long long __b) { + return __a % __b; +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_mod(vector signed __int128 __a, vector signed __int128 __b) { + return __a % __b; +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_mod(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __a % __b; +} + /* vec_sldbi */ #define vec_sldb(__a, __b, __c) __builtin_altivec_vsldbi(__a, __b, (__c & 0x7)) @@ -17027,6 +17773,92 @@ vec_inserth(vector unsigned int __a, vector unsigned int __b, #endif } +/* vec_extractl */ + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl( + vector unsigned char __a, vector unsigned char __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextdubvrx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextdubvlx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl( + vector unsigned short __a, vector unsigned short __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextduhvrx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextduhvlx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extractl( + vector unsigned int __a, 
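/* Illustration for the vec_mod overloads defined above: they are plain
 * element-wise C remainder, e.g.
 *
 *   vector signed int __a = {7, -7, 9, 4};
 *   vector signed int __b = {3, 3, 5, 2};
 *   vector signed int __r = vec_mod(__a, __b);  // {1, -1, 4, 0}
 */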
vector unsigned int __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextduwvrx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextduwvlx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_extractl(vector unsigned long long __a, vector unsigned long long __b, + unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextddvrx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextddvlx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +/* vec_extracth */ + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth( + vector unsigned char __a, vector unsigned char __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextdubvlx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextdubvrx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth( + vector unsigned short __a, vector unsigned short __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextduhvlx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextduhvrx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai vec_extracth( + vector unsigned int __a, vector unsigned int __b, unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextduwvlx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextduwvrx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + +static __inline__ vector unsigned long long __ATTRS_o_ai +vec_extracth(vector unsigned long long __a, vector unsigned long long __b, + unsigned int __c) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vextddvlx(__a, __b, __c); +#else + vector unsigned long long __ret = __builtin_altivec_vextddvrx(__a, __b, __c); + return vec_sld(__ret, __ret, 8); +#endif +} + #ifdef __VSX__ /* vec_permx */ @@ -17095,6 +17927,14 @@ vec_blendv(vector double __a, vector double __b, return __builtin_vsx_xxblendvd(__a, __b, __c); } +/* vec_replace_elt */ + +#define vec_replace_elt __builtin_altivec_vec_replace_elt + +/* vec_replace_unaligned */ + +#define vec_replace_unaligned __builtin_altivec_vec_replace_unaligned + /* vec_splati */ #define vec_splati(__a) \ @@ -17161,6 +18001,197 @@ vec_test_lsbb_all_zeros(vector unsigned char __a) { return __builtin_vsx_xvtlsbb(__a, 0); } #endif /* __VSX__ */ + +/* vec_stril */ + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_stril(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribr((vector signed char)__a); +#else + return __builtin_altivec_vstribl((vector signed char)__a); +#endif +} + +static __inline__ vector signed char __ATTRS_o_ai +vec_stril(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribr(__a); +#else + return __builtin_altivec_vstribl(__a); +#endif +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_stril(vector unsigned short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihr((vector signed short)__a); +#else + return __builtin_altivec_vstrihl((vector signed short)__a); +#endif +} + +static __inline__ vector signed short __ATTRS_o_ai +vec_stril(vector signed short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihr(__a); 
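/* Hedged note: vec_stril/vec_strir implement "string isolate" -- elements
 * after (stril) or before (strir) the first zero element are cleared, and
 * the l/r builtin choice is flipped on little-endian so the source-level
 * meaning stays endian-neutral. The _p forms are assumed to report whether
 * a zero element was present:
 *
 *   vector unsigned char __s = vec_xl(0, __str);   // NUL-terminated bytes
 *   vector unsigned char __t = vec_stril(__s);     // zero past the first NUL
 *   if (vec_stril_p(__s)) { ... }                  // a NUL was found (assumed)
 */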
+#else + return __builtin_altivec_vstrihl(__a); +#endif +} + +/* vec_stril_p */ + +static __inline__ int __ATTRS_o_ai vec_stril_p(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribr_p(__CR6_EQ, (vector signed char)__a); +#else + return __builtin_altivec_vstribl_p(__CR6_EQ, (vector signed char)__a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_stril_p(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribr_p(__CR6_EQ, __a); +#else + return __builtin_altivec_vstribl_p(__CR6_EQ, __a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_stril_p(vector unsigned short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihr_p(__CR6_EQ, (vector signed short)__a); +#else + return __builtin_altivec_vstrihl_p(__CR6_EQ, (vector signed short)__a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_stril_p(vector signed short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihr_p(__CR6_EQ, __a); +#else + return __builtin_altivec_vstrihl_p(__CR6_EQ, __a); +#endif +} + +/* vec_strir */ + +static __inline__ vector unsigned char __ATTRS_o_ai +vec_strir(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribl((vector signed char)__a); +#else + return __builtin_altivec_vstribr((vector signed char)__a); +#endif +} + +static __inline__ vector signed char __ATTRS_o_ai +vec_strir(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribl(__a); +#else + return __builtin_altivec_vstribr(__a); +#endif +} + +static __inline__ vector unsigned short __ATTRS_o_ai +vec_strir(vector unsigned short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihl((vector signed short)__a); +#else + return __builtin_altivec_vstrihr((vector signed short)__a); +#endif +} + +static __inline__ vector signed short __ATTRS_o_ai +vec_strir(vector signed short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihl(__a); +#else + return __builtin_altivec_vstrihr(__a); +#endif +} + +/* vec_strir_p */ + +static __inline__ int __ATTRS_o_ai vec_strir_p(vector unsigned char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribl_p(__CR6_EQ, (vector signed char)__a); +#else + return __builtin_altivec_vstribr_p(__CR6_EQ, (vector signed char)__a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_strir_p(vector signed char __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstribl_p(__CR6_EQ, __a); +#else + return __builtin_altivec_vstribr_p(__CR6_EQ, __a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_strir_p(vector unsigned short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihl_p(__CR6_EQ, (vector signed short)__a); +#else + return __builtin_altivec_vstrihr_p(__CR6_EQ, (vector signed short)__a); +#endif +} + +static __inline__ int __ATTRS_o_ai vec_strir_p(vector signed short __a) { +#ifdef __LITTLE_ENDIAN__ + return __builtin_altivec_vstrihl_p(__CR6_EQ, __a); +#else + return __builtin_altivec_vstrihr_p(__CR6_EQ, __a); +#endif +} + +/* vs[l | r | ra] */ + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_sl(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __a << (__b % (vector unsigned __int128)(sizeof(unsigned __int128) * + __CHAR_BIT__)); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_sl(vector signed __int128 __a, vector unsigned __int128 __b) { + return __a << (__b % (vector unsigned __int128)(sizeof(unsigned __int128) * + __CHAR_BIT__)); +} + +static 
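/* Hedged note: the quadword shifts in this block reduce the shift count
 * modulo 128 (the "% (sizeof(unsigned __int128) * __CHAR_BIT__)" term), so
 * for example:
 *
 *   vector unsigned __int128 __y = vec_sl(__x, __n);  // __x << (__n % 128)
 */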
__inline__ vector unsigned __int128 __ATTRS_o_ai +vec_sr(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return __a >> (__b % (vector unsigned __int128)(sizeof(unsigned __int128) * + __CHAR_BIT__)); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_sr(vector signed __int128 __a, vector unsigned __int128 __b) { + return ( + vector signed __int128)(((vector unsigned __int128)__a) >> + (__b % + (vector unsigned __int128)(sizeof( + unsigned __int128) * + __CHAR_BIT__))); +} + +static __inline__ vector unsigned __int128 __ATTRS_o_ai +vec_sra(vector unsigned __int128 __a, vector unsigned __int128 __b) { + return ( + vector unsigned __int128)(((vector signed __int128)__a) >> + (__b % + (vector unsigned __int128)(sizeof( + unsigned __int128) * + __CHAR_BIT__))); +} + +static __inline__ vector signed __int128 __ATTRS_o_ai +vec_sra(vector signed __int128 __a, vector unsigned __int128 __b) { + return __a >> (__b % (vector unsigned __int128)(sizeof(unsigned __int128) * + __CHAR_BIT__)); +} + #endif /* __POWER10_VECTOR__ */ #undef __ATTRS_o_ai diff --git a/lib/include/amxintrin.h b/lib/include/amxintrin.h index 58254e21c8..03a468ef15 100644 --- a/lib/include/amxintrin.h +++ b/lib/include/amxintrin.h @@ -15,8 +15,8 @@ #define __AMXINTRIN_H #ifdef __x86_64__ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) +#define __DEFAULT_FN_ATTRS_TILE \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) /// Load tile configuration from a 64-byte memory location specified by /// "mem_addr". The tile configuration includes the tile type palette, the @@ -31,9 +31,8 @@ /// /// \param __config /// A pointer to 512-bits configuration -static __inline__ void __DEFAULT_FN_ATTRS -_tile_loadconfig(const void *__config) -{ +static __inline__ void __DEFAULT_FN_ATTRS_TILE +_tile_loadconfig(const void *__config) { __builtin_ia32_tile_loadconfig(__config); } @@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config) /// /// \param __config /// A pointer to 512-bits configuration -static __inline__ void __DEFAULT_FN_ATTRS -_tile_storeconfig(void *__config) -{ +static __inline__ void __DEFAULT_FN_ATTRS_TILE +_tile_storeconfig(void *__config) { __builtin_ia32_tile_storeconfig(__config); } @@ -60,9 +58,7 @@ _tile_storeconfig(void *__config) /// \headerfile /// /// This intrinsic corresponds to the TILERELEASE instruction. -static __inline__ void __DEFAULT_FN_ATTRS -_tile_release(void) -{ +static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { __builtin_ia32_tilerelease(); } @@ -80,8 +76,9 @@ _tile_release(void) /// A pointer to base address. /// \param stride /// The stride between the rows' data to be loaded in memory. -#define _tile_loadd(dst, base, stride) \ - __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride)) +#define _tile_loadd(dst, base, stride) \ + __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) /// Load tile rows from memory specifieid by "base" address and "stride" into /// destination tile "dst" using the tile configuration previously configured @@ -99,8 +96,9 @@ _tile_release(void) /// A pointer to base address. /// \param stride /// The stride between the rows' data to be loaded in memory. 
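/* Illustrative sketch only (not part of this header): a typical AMX
 * sequence chains these intrinsics; "cfg", "src", "out" and "stride" are
 * hypothetical caller-provided values:
 *
 *   _tile_loadconfig(cfg);         // program palette and tile shapes
 *   _tile_loadd(0, src, stride);   // fill tile register tmm0
 *   ...                            // tile computations
 *   _tile_stored(0, out, stride);  // write tmm0 back to memory
 *   _tile_release();               // return tile state to init
 */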
-#define _tile_stream_loadd(dst, base, stride) \ - __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride)) +#define _tile_stream_loadd(dst, base, stride) \ + __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) /// Store the tile specified by "src" to memory specifieid by "base" address and /// "stride" using the tile configuration previously configured via @@ -116,7 +114,7 @@ _tile_release(void) /// A pointer to base address. /// \param stride /// The stride between the rows' data to be stored in memory. -#define _tile_stored(dst, base, stride) \ +#define _tile_stored(dst, base, stride) \ __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) /// Zero the tile specified by "tdest". @@ -145,7 +143,8 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1)) +#define _tile_dpbssd(dst, src0, src1) \ + __builtin_ia32_tdpbssd((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with @@ -163,7 +162,8 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1)) +#define _tile_dpbsud(dst, src0, src1) \ + __builtin_ia32_tdpbsud((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with @@ -181,7 +181,8 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1)) +#define _tile_dpbusd(dst, src0, src1) \ + __builtin_ia32_tdpbusd((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with @@ -199,7 +200,8 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1)) +#define _tile_dpbuud(dst, src0, src1) \ + __builtin_ia32_tdpbuud((dst), (src0), (src1)) /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and /// src1, accumulating the intermediate single-precision (32-bit) floating-point @@ -216,10 +218,56 @@ _tile_release(void) /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. 
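/* Illustrative sketch only: assuming tiles 1 and 2 hold int8 operand
 * panels and tile 0 the int32 accumulator, one multiply-accumulate step
 * per the dot-product descriptions above is:
 *
 *   _tile_dpbssd(0, 1, 2);  // tmm0 += signed(tmm1) . signed(tmm2)
 */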
-#define _tile_dpbf16ps(dst, src0, src1) \ +#define _tile_dpbf16ps(dst, src0, src1) \ __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) -#undef __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS_INT8 \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) + +typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_loadd_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloadd64_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); +} + +static __inline__ void __DEFAULT_FN_ATTRS_INT8 +_tile_stored_internal(unsigned short m, unsigned short n, void *base, + __SIZE_TYPE__ stride, _tile1024i tile) { + return __builtin_ia32_tilestored64_internal(m, n, base, + (__SIZE_TYPE__)(stride), tile); +} + +typedef struct __tile1024i_str { + const unsigned short row; + const unsigned short col; + _tile1024i tile; +} __tile1024i; + +__DEFAULT_FN_ATTRS_INT8 +static void __tile_loadd(__tile1024i *dst, const void *base, + __SIZE_TYPE__ stride) { + dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); +} + +__DEFAULT_FN_ATTRS_INT8 +static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1, + __tile1024i src2) { + dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + +__DEFAULT_FN_ATTRS_INT8 +static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) { + _tile_stored_internal(src.row, src.col, base, stride, src.tile); +} #endif /* __x86_64__ */ #endif /* __AMXINTRIN_H */ diff --git a/lib/include/arm_neon.h b/lib/include/arm_neon.h index da1e17cc00..4959646dd5 100644 --- a/lib/include/arm_neon.h +++ b/lib/include/arm_neon.h @@ -40429,6 +40429,150 @@ __ai float32x4_t vcaddq_rot90_f32(float32x4_t __p0, float32x4_t __p1) { } #endif +#ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#else +__ai float32x4_t vcmlaq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#else +__ai float32x2_t vcmla_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) 
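/* Hedged note: the vcmla* family treats consecutive even/odd lanes as
 * complex (re, im) pairs and accumulates a rotated product into the first
 * operand; pairing the 0- and 90-degree forms is the usual way to obtain a
 * full complex multiply-accumulate:
 *
 *   __acc = vcmlaq_f32(__acc, __a, __b);        // acc += partial a*b
 *   __acc = vcmlaq_rot90_f32(__acc, __a, __b);  // complete complex a*b
 */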
__builtin_neon_vcmla_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#else +__ai float32x4_t vcmlaq_rot180_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#else +__ai float32x2_t vcmla_rot180_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#else +__ai float32x4_t vcmlaq_rot270_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#else +__ai float32x2_t vcmla_rot270_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { 
+ float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 41); + return __ret; +} +#else +__ai float32x4_t vcmlaq_rot90_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 9); + return __ret; +} +#else +__ai float32x2_t vcmla_rot90_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + #endif #if defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #ifdef __LITTLE_ENDIAN__ @@ -40499,6 +40643,150 @@ __ai float16x8_t vcaddq_rot90_f16(float16x8_t __p0, float16x8_t __p1) { } #endif +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vcmlaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vcmla_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) 
__builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vcmlaq_rot180_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vcmla_rot180_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vcmlaq_rot270_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vcmla_rot270_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t 
vcmlaq_rot90_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vcmla_rot90_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + #endif #if defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ @@ -40535,6 +40823,98 @@ __ai float64x2_t vcaddq_rot90_f64(float64x2_t __p0, float64x2_t __p1) { } #endif +#ifdef __LITTLE_ENDIAN__ +__ai float64x2_t vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#else +__ai float64x2_t vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +__ai float64x1_t vcmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { + float64x1_t __ret; + __ret = (float64x1_t) __builtin_neon_vcmla_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); + return __ret; +} +#ifdef __LITTLE_ENDIAN__ +__ai float64x2_t vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#else +__ai float64x2_t vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +__ai float64x1_t vcmla_rot180_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { + float64x1_t __ret; + __ret = 
(float64x1_t) __builtin_neon_vcmla_rot180_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); + return __ret; +} +#ifdef __LITTLE_ENDIAN__ +__ai float64x2_t vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#else +__ai float64x2_t vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +__ai float64x1_t vcmla_rot270_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { + float64x1_t __ret; + __ret = (float64x1_t) __builtin_neon_vcmla_rot270_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); + return __ret; +} +#ifdef __LITTLE_ENDIAN__ +__ai float64x2_t vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); + return __ret; +} +#else +__ai float64x2_t vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float64x2_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 1, 0); + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +__ai float64x1_t vcmla_rot90_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { + float64x1_t __ret; + __ret = (float64x1_t) __builtin_neon_vcmla_rot90_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); + return __ret; +} #endif #if defined(__ARM_FEATURE_DOTPROD) #ifdef __LITTLE_ENDIAN__ @@ -45860,9 +46240,9 @@ __ai uint64_t vceqd_u64(uint64_t __p0, uint64_t __p1) { __ret = (uint64_t) __builtin_neon_vceqd_u64(__p0, __p1); return __ret; } -__ai int64_t vceqd_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vceqd_s64(__p0, __p1); +__ai uint64_t vceqd_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vceqd_s64(__p0, __p1); return __ret; } __ai uint64_t vceqd_f64(float64_t __p0, float64_t __p1) { @@ -45896,22 +46276,6 @@ __ai uint64x1_t vceqz_p64(poly64x1_t __p0) { __ret = (uint64x1_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 19); return __ret; } -#ifdef __LITTLE_ENDIAN__ -__ai uint16x4_t vceqz_p16(poly16x4_t __p0) { - uint16x4_t __ret; - __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17); - return __ret; -} -#else -__ai uint16x4_t vceqz_p16(poly16x4_t __p0) { - poly16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint16x4_t __ret; - __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - #ifdef __LITTLE_ENDIAN__ __ai uint8x16_t vceqzq_p8(poly8x16_t __p0) { uint8x16_t __ret; @@ -45944,22 +46308,6 @@ __ai uint64x2_t 
vceqzq_p64(poly64x2_t __p0) { } #endif -#ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) { - uint16x8_t __ret; - __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49); - return __ret; -} -#else -__ai uint16x8_t vceqzq_p16(poly16x8_t __p0) { - poly16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - uint16x8_t __ret; - __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - #ifdef __LITTLE_ENDIAN__ __ai uint8x16_t vceqzq_u8(uint8x16_t __p0) { uint8x16_t __ret; @@ -46252,9 +46600,9 @@ __ai uint64_t vceqzd_u64(uint64_t __p0) { __ret = (uint64_t) __builtin_neon_vceqzd_u64(__p0); return __ret; } -__ai int64_t vceqzd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vceqzd_s64(__p0); +__ai uint64_t vceqzd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vceqzd_s64(__p0); return __ret; } __ai uint64_t vceqzd_f64(float64_t __p0) { @@ -46333,9 +46681,9 @@ __ai uint64x1_t vcge_s64(int64x1_t __p0, int64x1_t __p1) { __ret = (uint64x1_t)(__p0 >= __p1); return __ret; } -__ai int64_t vcged_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcged_s64(__p0, __p1); +__ai uint64_t vcged_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcged_s64(__p0, __p1); return __ret; } __ai uint64_t vcged_u64(uint64_t __p0, uint64_t __p1) { @@ -46523,9 +46871,9 @@ __ai uint16x4_t vcgez_s16(int16x4_t __p0) { } #endif -__ai int64_t vcgezd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcgezd_s64(__p0); +__ai uint64_t vcgezd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcgezd_s64(__p0); return __ret; } __ai uint64_t vcgezd_f64(float64_t __p0) { @@ -46604,9 +46952,9 @@ __ai uint64x1_t vcgt_s64(int64x1_t __p0, int64x1_t __p1) { __ret = (uint64x1_t)(__p0 > __p1); return __ret; } -__ai int64_t vcgtd_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcgtd_s64(__p0, __p1); +__ai uint64_t vcgtd_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcgtd_s64(__p0, __p1); return __ret; } __ai uint64_t vcgtd_u64(uint64_t __p0, uint64_t __p1) { @@ -46794,9 +47142,9 @@ __ai uint16x4_t vcgtz_s16(int16x4_t __p0) { } #endif -__ai int64_t vcgtzd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcgtzd_s64(__p0); +__ai uint64_t vcgtzd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcgtzd_s64(__p0); return __ret; } __ai uint64_t vcgtzd_f64(float64_t __p0) { @@ -46880,9 +47228,9 @@ __ai uint64_t vcled_u64(uint64_t __p0, uint64_t __p1) { __ret = (uint64_t) __builtin_neon_vcled_u64(__p0, __p1); return __ret; } -__ai int64_t vcled_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcled_s64(__p0, __p1); +__ai uint64_t vcled_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcled_s64(__p0, __p1); return __ret; } __ai uint64_t vcled_f64(float64_t __p0, float64_t __p1) { @@ -47065,9 +47413,9 @@ __ai uint16x4_t vclez_s16(int16x4_t __p0) { } #endif -__ai int64_t vclezd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vclezd_s64(__p0); +__ai uint64_t vclezd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vclezd_s64(__p0); return __ret; } __ai uint64_t 
vclezd_f64(float64_t __p0) { @@ -47151,9 +47499,9 @@ __ai uint64_t vcltd_u64(uint64_t __p0, uint64_t __p1) { __ret = (uint64_t) __builtin_neon_vcltd_u64(__p0, __p1); return __ret; } -__ai int64_t vcltd_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcltd_s64(__p0, __p1); +__ai uint64_t vcltd_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcltd_s64(__p0, __p1); return __ret; } __ai uint64_t vcltd_f64(float64_t __p0, float64_t __p1) { @@ -47336,9 +47684,9 @@ __ai uint16x4_t vcltz_s16(int16x4_t __p0) { } #endif -__ai int64_t vcltzd_s64(int64_t __p0) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vcltzd_s64(__p0); +__ai uint64_t vcltzd_s64(int64_t __p0) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vcltzd_s64(__p0); return __ret; } __ai uint64_t vcltzd_f64(float64_t __p0) { @@ -52787,23 +53135,6 @@ __ai float64x1_t vmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) }) #endif -#ifdef __LITTLE_ENDIAN__ -__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { - float64x2_t __ret; - __ret = __p0 + __p1 * (float64x2_t) {__p2, __p2}; - return __ret; -} -#else -__ai float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { - float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); - float64x2_t __ret; - __ret = __rev0 + __rev1 * (float64x2_t) {__p2, __p2}; - __ret = __builtin_shufflevector(__ret, __ret, 1, 0); - return __ret; -} -#endif - #ifdef __LITTLE_ENDIAN__ #define vmlal_high_lane_u32(__p0_443, __p1_443, __p2_443, __p3_443) __extension__ ({ \ uint64x2_t __s0_443 = __p0_443; \ @@ -53355,23 +53686,6 @@ __ai float64x1_t vmls_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) }) #endif -#ifdef __LITTLE_ENDIAN__ -__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { - float64x2_t __ret; - __ret = __p0 - __p1 * (float64x2_t) {__p2, __p2}; - return __ret; -} -#else -__ai float64x2_t vmlsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { - float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); - float64x2_t __ret; - __ret = __rev0 - __rev1 * (float64x2_t) {__p2, __p2}; - __ret = __builtin_shufflevector(__ret, __ret, 1, 0); - return __ret; -} -#endif - #ifdef __LITTLE_ENDIAN__ #define vmlsl_high_lane_u32(__p0_487, __p1_487, __p2_487, __p3_487) __extension__ ({ \ uint64x2_t __s0_487 = __p0_487; \ @@ -57188,30 +57502,30 @@ __ai int8x16_t vqmovn_high_s16(int8x8_t __p0, int16x8_t __p1) { } #endif -__ai int16_t vqmovuns_s32(int32_t __p0) { - int16_t __ret; - __ret = (int16_t) __builtin_neon_vqmovuns_s32(__p0); +__ai uint16_t vqmovuns_s32(int32_t __p0) { + uint16_t __ret; + __ret = (uint16_t) __builtin_neon_vqmovuns_s32(__p0); return __ret; } -__ai int32_t vqmovund_s64(int64_t __p0) { - int32_t __ret; - __ret = (int32_t) __builtin_neon_vqmovund_s64(__p0); +__ai uint32_t vqmovund_s64(int64_t __p0) { + uint32_t __ret; + __ret = (uint32_t) __builtin_neon_vqmovund_s64(__p0); return __ret; } -__ai int8_t vqmovunh_s16(int16_t __p0) { - int8_t __ret; - __ret = (int8_t) __builtin_neon_vqmovunh_s16(__p0); +__ai uint8_t vqmovunh_s16(int16_t __p0) { + uint8_t __ret; + __ret = (uint8_t) __builtin_neon_vqmovunh_s16(__p0); return __ret; } #ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) { +__ai uint16x8_t 
vqmovun_high_s32(uint16x4_t __p0, int32x4_t __p1) { uint16x8_t __ret; __ret = vcombine_u16((uint16x4_t)(__p0), vqmovun_s32(__p1)); return __ret; } #else -__ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) { - int16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); +__ai uint16x8_t vqmovun_high_s32(uint16x4_t __p0, int32x4_t __p1) { + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); uint16x8_t __ret; __ret = __noswap_vcombine_u16((uint16x4_t)(__rev0), __noswap_vqmovun_s32(__rev1)); @@ -57221,14 +57535,14 @@ __ai uint16x8_t vqmovun_high_s32(int16x4_t __p0, int32x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) { +__ai uint32x4_t vqmovun_high_s64(uint32x2_t __p0, int64x2_t __p1) { uint32x4_t __ret; __ret = vcombine_u32((uint32x2_t)(__p0), vqmovun_s64(__p1)); return __ret; } #else -__ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) { - int32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); +__ai uint32x4_t vqmovun_high_s64(uint32x2_t __p0, int64x2_t __p1) { + uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); uint32x4_t __ret; __ret = __noswap_vcombine_u32((uint32x2_t)(__rev0), __noswap_vqmovun_s64(__rev1)); @@ -57238,14 +57552,14 @@ __ai uint32x4_t vqmovun_high_s64(int32x2_t __p0, int64x2_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) { +__ai uint8x16_t vqmovun_high_s16(uint8x8_t __p0, int16x8_t __p1) { uint8x16_t __ret; __ret = vcombine_u8((uint8x8_t)(__p0), vqmovun_s16(__p1)); return __ret; } #else -__ai uint8x16_t vqmovun_high_s16(int8x8_t __p0, int16x8_t __p1) { - int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); +__ai uint8x16_t vqmovun_high_s16(uint8x8_t __p0, int16x8_t __p1) { + uint8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); uint8x16_t __ret; __ret = __noswap_vcombine_u8((uint8x8_t)(__rev0), __noswap_vqmovun_s16(__rev1)); @@ -57549,22 +57863,22 @@ __ai int16_t vqrdmulhh_s16(int16_t __p0, int16_t __p1) { }) #endif -__ai uint8_t vqrshlb_u8(uint8_t __p0, uint8_t __p1) { +__ai uint8_t vqrshlb_u8(uint8_t __p0, int8_t __p1) { uint8_t __ret; __ret = (uint8_t) __builtin_neon_vqrshlb_u8(__p0, __p1); return __ret; } -__ai uint32_t vqrshls_u32(uint32_t __p0, uint32_t __p1) { +__ai uint32_t vqrshls_u32(uint32_t __p0, int32_t __p1) { uint32_t __ret; __ret = (uint32_t) __builtin_neon_vqrshls_u32(__p0, __p1); return __ret; } -__ai uint64_t vqrshld_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vqrshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vqrshld_u64(__p0, __p1); return __ret; } -__ai uint16_t vqrshlh_u16(uint16_t __p0, uint16_t __p1) { +__ai uint16_t vqrshlh_u16(uint16_t __p0, int16_t __p1) { uint16_t __ret; __ret = (uint16_t) __builtin_neon_vqrshlh_u16(__p0, __p1); return __ret; @@ -57832,22 +58146,22 @@ __ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) { __ret = (int8_t) __builtin_neon_vqrshrunh_n_s16(__s0, __p1); \ __ret; \ }) -__ai uint8_t vqshlb_u8(uint8_t __p0, uint8_t __p1) { +__ai uint8_t vqshlb_u8(uint8_t __p0, int8_t __p1) { uint8_t __ret; __ret = (uint8_t) __builtin_neon_vqshlb_u8(__p0, __p1); return 
__ret; } -__ai uint32_t vqshls_u32(uint32_t __p0, uint32_t __p1) { +__ai uint32_t vqshls_u32(uint32_t __p0, int32_t __p1) { uint32_t __ret; __ret = (uint32_t) __builtin_neon_vqshls_u32(__p0, __p1); return __ret; } -__ai uint64_t vqshld_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vqshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vqshld_u64(__p0, __p1); return __ret; } -__ai uint16_t vqshlh_u16(uint16_t __p0, uint16_t __p1) { +__ai uint16_t vqshlh_u16(uint16_t __p0, int16_t __p1) { uint16_t __ret; __ret = (uint16_t) __builtin_neon_vqshlh_u16(__p0, __p1); return __ret; @@ -59452,7 +59766,7 @@ __ai float32_t vrecpxs_f32(float32_t __p0) { __ret = (float32_t) __builtin_neon_vrecpxs_f32(__p0); return __ret; } -__ai uint64_t vrshld_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vrshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1); return __ret; @@ -59853,7 +60167,7 @@ __ai int8x16_t vrsubhn_high_s16(int8x8_t __p0, int16x8_t __p1, int16x8_t __p2) { __ret = (float64x1_t) __builtin_neon_vset_lane_f64(__s0, (float64x1_t)__s1, __p2); \ __ret; \ }) -__ai uint64_t vshld_u64(uint64_t __p0, uint64_t __p1) { +__ai uint64_t vshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vshld_u64(__p0, __p1); return __ret; @@ -62423,9 +62737,9 @@ __ai uint64_t vtstd_u64(uint64_t __p0, uint64_t __p1) { __ret = (uint64_t) __builtin_neon_vtstd_u64(__p0, __p1); return __ret; } -__ai int64_t vtstd_s64(int64_t __p0, int64_t __p1) { - int64_t __ret; - __ret = (int64_t) __builtin_neon_vtstd_s64(__p0, __p1); +__ai uint64_t vtstd_s64(int64_t __p0, int64_t __p1) { + uint64_t __ret; + __ret = (uint64_t) __builtin_neon_vtstd_s64(__p0, __p1); return __ret; } __ai int8_t vuqaddb_s8(int8_t __p0, uint8_t __p1) { diff --git a/lib/include/arm_sve.h b/lib/include/arm_sve.h index 1035d41811..8a03f9da58 100644 --- a/lib/include/arm_sve.h +++ b/lib/include/arm_sve.h @@ -94,7 +94,7 @@ typedef __clang_svbfloat16x2_t svbfloat16x2_t; typedef __clang_svbfloat16x3_t svbfloat16x3_t; typedef __clang_svbfloat16x4_t svbfloat16x4_t; #endif -typedef enum +enum svpattern { SV_POW2 = 0, SV_VL1 = 1, @@ -113,9 +113,9 @@ typedef enum SV_MUL4 = 29, SV_MUL3 = 30, SV_ALL = 31 -} sv_pattern; +}; -typedef enum +enum svprfop { SV_PLDL1KEEP = 0, SV_PLDL1STRM = 1, @@ -129,7 +129,7 @@ typedef enum SV_PSTL2STRM = 11, SV_PSTL3KEEP = 12, SV_PSTL3STRM = 13 -} sv_prfop; +}; /* Function attributes */ #define __aio static inline __attribute__((__always_inline__, __nodebug__, __overloadable__)) @@ -10013,69 +10013,69 @@ int16_t svorv(svbool_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b))) svbool_t svpfirst(svbool_t, svbool_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base))) -void svprfb_gather(svbool_t, svuint32_t, sv_prfop); +void svprfb_gather(svbool_t, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base))) -void svprfb_gather(svbool_t, svuint64_t, sv_prfop); +void svprfb_gather(svbool_t, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base_offset))) -void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, sv_prfop); +void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base_offset))) -void svprfb_gather_offset(svbool_t, svuint64_t, 
int64_t, sv_prfop); +void svprfb_gather_offset(svbool_t, svuint64_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s32offset))) -void svprfb_gather_offset(svbool_t, void const *, svint32_t, sv_prfop); +void svprfb_gather_offset(svbool_t, void const *, svint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32offset))) -void svprfb_gather_offset(svbool_t, void const *, svuint32_t, sv_prfop); +void svprfb_gather_offset(svbool_t, void const *, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s64offset))) -void svprfb_gather_offset(svbool_t, void const *, svint64_t, sv_prfop); +void svprfb_gather_offset(svbool_t, void const *, svint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64offset))) -void svprfb_gather_offset(svbool_t, void const *, svuint64_t, sv_prfop); +void svprfb_gather_offset(svbool_t, void const *, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base))) -void svprfd_gather(svbool_t, svuint32_t, sv_prfop); +void svprfd_gather(svbool_t, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base))) -void svprfd_gather(svbool_t, svuint64_t, sv_prfop); +void svprfd_gather(svbool_t, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base_index))) -void svprfd_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop); +void svprfd_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base_index))) -void svprfd_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop); +void svprfd_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s32index))) -void svprfd_gather_index(svbool_t, void const *, svint32_t, sv_prfop); +void svprfd_gather_index(svbool_t, void const *, svint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32index))) -void svprfd_gather_index(svbool_t, void const *, svuint32_t, sv_prfop); +void svprfd_gather_index(svbool_t, void const *, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s64index))) -void svprfd_gather_index(svbool_t, void const *, svint64_t, sv_prfop); +void svprfd_gather_index(svbool_t, void const *, svint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64index))) -void svprfd_gather_index(svbool_t, void const *, svuint64_t, sv_prfop); +void svprfd_gather_index(svbool_t, void const *, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base))) -void svprfh_gather(svbool_t, svuint32_t, sv_prfop); +void svprfh_gather(svbool_t, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base))) -void svprfh_gather(svbool_t, svuint64_t, sv_prfop); +void svprfh_gather(svbool_t, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base_index))) -void svprfh_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop); +void svprfh_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base_index))) -void svprfh_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop); +void svprfh_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s32index))) -void svprfh_gather_index(svbool_t, void const *, svint32_t, sv_prfop); +void svprfh_gather_index(svbool_t, void const *, svint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32index))) -void svprfh_gather_index(svbool_t, void const *, svuint32_t, sv_prfop); +void svprfh_gather_index(svbool_t, void const *, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s64index))) -void svprfh_gather_index(svbool_t, void const *, svint64_t, sv_prfop); +void svprfh_gather_index(svbool_t, void const *, svint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64index))) -void svprfh_gather_index(svbool_t, void const *, svuint64_t, sv_prfop); +void svprfh_gather_index(svbool_t, void const *, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base))) -void svprfw_gather(svbool_t, svuint32_t, sv_prfop); +void svprfw_gather(svbool_t, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base))) -void svprfw_gather(svbool_t, svuint64_t, sv_prfop); +void svprfw_gather(svbool_t, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base_index))) -void svprfw_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop); +void svprfw_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base_index))) -void svprfw_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop); +void svprfw_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s32index))) -void svprfw_gather_index(svbool_t, void const *, svint32_t, sv_prfop); +void svprfw_gather_index(svbool_t, void const *, svint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32index))) -void svprfw_gather_index(svbool_t, void const *, svuint32_t, sv_prfop); +void svprfw_gather_index(svbool_t, void const *, svuint32_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s64index))) -void svprfw_gather_index(svbool_t, void const *, svint64_t, sv_prfop); +void svprfw_gather_index(svbool_t, void const *, svint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64index))) -void svprfw_gather_index(svbool_t, void const *, svuint64_t, sv_prfop); +void svprfw_gather_index(svbool_t, void const *, svuint64_t, enum svprfop); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8))) svint8_t svqadd(svint8_t, int8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32))) @@ -10117,13 +10117,13 @@ uint32_t svqdecb(uint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_u64))) uint64_t svqdecb(uint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s32))) -int32_t svqdecb_pat(int32_t, sv_pattern, uint64_t); +int32_t svqdecb_pat(int32_t, enum svpattern, uint64_t); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s64))) -int64_t svqdecb_pat(int64_t, sv_pattern, uint64_t); +int64_t svqdecb_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u32))) -uint32_t svqdecb_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqdecb_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u64))) -uint64_t svqdecb_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqdecb_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s32))) int32_t svqdecd(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s64))) @@ -10137,17 +10137,17 @@ svint64_t svqdecd(svint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_u64))) svuint64_t svqdecd(svuint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s32))) -int32_t svqdecd_pat(int32_t, sv_pattern, uint64_t); +int32_t svqdecd_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s64))) -int64_t svqdecd_pat(int64_t, sv_pattern, uint64_t); +int64_t svqdecd_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u32))) -uint32_t svqdecd_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqdecd_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u64))) -uint64_t svqdecd_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqdecd_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_s64))) -svint64_t svqdecd_pat(svint64_t, sv_pattern, uint64_t); +svint64_t svqdecd_pat(svint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_u64))) -svuint64_t svqdecd_pat(svuint64_t, sv_pattern, uint64_t); +svuint64_t svqdecd_pat(svuint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s32))) int32_t svqdech(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s64))) @@ -10161,17 +10161,17 @@ svint16_t svqdech(svint16_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_u16))) svuint16_t svqdech(svuint16_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s32))) -int32_t svqdech_pat(int32_t, sv_pattern, uint64_t); +int32_t svqdech_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s64))) -int64_t svqdech_pat(int64_t, sv_pattern, uint64_t); +int64_t svqdech_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u32))) -uint32_t svqdech_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqdech_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u64))) -uint64_t svqdech_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqdech_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_s16))) -svint16_t svqdech_pat(svint16_t, sv_pattern, uint64_t); +svint16_t svqdech_pat(svint16_t, enum svpattern, uint64_t); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_u16))) -svuint16_t svqdech_pat(svuint16_t, sv_pattern, uint64_t); +svuint16_t svqdech_pat(svuint16_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b8))) int32_t svqdecp_b8(int32_t, svbool_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b32))) @@ -10229,17 +10229,17 @@ svint32_t svqdecw(svint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_u32))) svuint32_t svqdecw(svuint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s32))) -int32_t svqdecw_pat(int32_t, sv_pattern, uint64_t); +int32_t svqdecw_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s64))) -int64_t svqdecw_pat(int64_t, sv_pattern, uint64_t); +int64_t svqdecw_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u32))) -uint32_t svqdecw_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqdecw_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u64))) -uint64_t svqdecw_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqdecw_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_s32))) -svint32_t svqdecw_pat(svint32_t, sv_pattern, uint64_t); +svint32_t svqdecw_pat(svint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_u32))) -svuint32_t svqdecw_pat(svuint32_t, sv_pattern, uint64_t); +svuint32_t svqdecw_pat(svuint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s32))) int32_t svqincb(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s64))) @@ -10249,13 +10249,13 @@ uint32_t svqincb(uint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_u64))) uint64_t svqincb(uint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s32))) -int32_t svqincb_pat(int32_t, sv_pattern, uint64_t); +int32_t svqincb_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s64))) -int64_t svqincb_pat(int64_t, sv_pattern, uint64_t); +int64_t svqincb_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u32))) -uint32_t svqincb_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqincb_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u64))) -uint64_t svqincb_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqincb_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s32))) int32_t svqincd(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s64))) @@ -10269,17 +10269,17 @@ svint64_t svqincd(svint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_u64))) svuint64_t svqincd(svuint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s32))) -int32_t svqincd_pat(int32_t, sv_pattern, uint64_t); +int32_t svqincd_pat(int32_t, enum svpattern, uint64_t); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s64))) -int64_t svqincd_pat(int64_t, sv_pattern, uint64_t); +int64_t svqincd_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u32))) -uint32_t svqincd_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqincd_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u64))) -uint64_t svqincd_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqincd_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_s64))) -svint64_t svqincd_pat(svint64_t, sv_pattern, uint64_t); +svint64_t svqincd_pat(svint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_u64))) -svuint64_t svqincd_pat(svuint64_t, sv_pattern, uint64_t); +svuint64_t svqincd_pat(svuint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s32))) int32_t svqinch(int32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s64))) @@ -10293,17 +10293,17 @@ svint16_t svqinch(svint16_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_u16))) svuint16_t svqinch(svuint16_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s32))) -int32_t svqinch_pat(int32_t, sv_pattern, uint64_t); +int32_t svqinch_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s64))) -int64_t svqinch_pat(int64_t, sv_pattern, uint64_t); +int64_t svqinch_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u32))) -uint32_t svqinch_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqinch_pat(uint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u64))) -uint64_t svqinch_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqinch_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_s16))) -svint16_t svqinch_pat(svint16_t, sv_pattern, uint64_t); +svint16_t svqinch_pat(svint16_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_u16))) -svuint16_t svqinch_pat(svuint16_t, sv_pattern, uint64_t); +svuint16_t svqinch_pat(svuint16_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b8))) int32_t svqincp_b8(int32_t, svbool_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b32))) @@ -10361,17 +10361,17 @@ svint32_t svqincw(svint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_u32))) svuint32_t svqincw(svuint32_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s32))) -int32_t svqincw_pat(int32_t, sv_pattern, uint64_t); +int32_t svqincw_pat(int32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s64))) -int64_t svqincw_pat(int64_t, sv_pattern, uint64_t); +int64_t svqincw_pat(int64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u32))) -uint32_t svqincw_pat(uint32_t, sv_pattern, uint64_t); +uint32_t svqincw_pat(uint32_t, enum svpattern, uint64_t); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u64))) -uint64_t svqincw_pat(uint64_t, sv_pattern, uint64_t); +uint64_t svqincw_pat(uint64_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_s32))) -svint32_t svqincw_pat(svint32_t, sv_pattern, uint64_t); +svint32_t svqincw_pat(svint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_u32))) -svuint32_t svqincw_pat(svuint32_t, sv_pattern, uint64_t); +svuint32_t svqincw_pat(svuint32_t, enum svpattern, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8))) svint8_t svqsub(svint8_t, int8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32))) diff --git a/lib/include/avx512fintrin.h b/lib/include/avx512fintrin.h index fa22ef3fdd..2ee4350b14 100644 --- a/lib/include/avx512fintrin.h +++ b/lib/include/avx512fintrin.h @@ -9305,295 +9305,218 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) * This takes log2(n) steps where n is the number of elements in the vector. */ -#define _mm512_mask_reduce_operator(op) \ - __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \ - __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \ - __m256i __t3 = (__m256i)(__t1 op __t2); \ - __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \ - __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \ - __v2du __t6 = __t4 op __t5; \ - __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ - __v2du __t8 = __t6 op __t7; \ - return __t8[0] - static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_add_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_mul_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { - _mm512_mask_reduce_operator(&); + return __builtin_ia32_reduce_and_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { - _mm512_mask_reduce_operator(|); + return __builtin_ia32_reduce_or_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { __W = _mm512_maskz_mov_epi64(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_add_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_mul_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W); - _mm512_mask_reduce_operator(&); + return __builtin_ia32_reduce_and_q512(__W); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { __W = _mm512_maskz_mov_epi64(__M, __W); - _mm512_mask_reduce_operator(|); + return __builtin_ia32_reduce_or_q512(__W); } -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \ - __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \ - __m256d __t3 = __t1 op __t2; \ - __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ - __m128d __t5 = _mm256_extractf128_pd(__t3, 1); 
\ - __m128d __t6 = __t4 op __t5; \ - __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ - __m128d __t8 = __t6 op __t7; \ - return __t8[0] static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_pd512(0.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_pd512(1.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { __W = _mm512_maskz_mov_pd(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_pd512(0.0, __W); } static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_pd512(1.0, __W); } -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \ - __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \ - __m256i __t3 = (__m256i)(__t1 op __t2); \ - __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \ - __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \ - __v4su __t6 = __t4 op __t5; \ - __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ - __v4su __t8 = __t6 op __t7; \ - __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ - __v4su __t10 = __t8 op __t9; \ - return __t10[0] static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi32(__m512i __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_add_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi32(__m512i __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_mul_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi32(__m512i __W) { - _mm512_mask_reduce_operator(&); + return __builtin_ia32_reduce_and_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi32(__m512i __W) { - _mm512_mask_reduce_operator(|); + return __builtin_ia32_reduce_or_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { __W = _mm512_maskz_mov_epi32(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_add_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_mul_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W); - _mm512_mask_reduce_operator(&); + return __builtin_ia32_reduce_and_d512((__v16si)__W); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { __W = _mm512_maskz_mov_epi32(__M, __W); - _mm512_mask_reduce_operator(|); + return __builtin_ia32_reduce_or_d512((__v16si)__W); } -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \ - __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \ - __m256 __t3 = __t1 op __t2; \ - __m128 __t4 = _mm256_extractf128_ps(__t3, 0); 
\ - __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ - __m128 __t6 = __t4 op __t5; \ - __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ - __m128 __t8 = __t6 op __t7; \ - __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ - __m128 __t10 = __t8 op __t9; \ - return __t10[0] static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_add_ps(__m512 __W) { - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_ps512(0.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_ps(__m512 __W) { - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { __W = _mm512_maskz_mov_ps(__M, __W); - _mm512_mask_reduce_operator(+); + return __builtin_ia32_reduce_fadd_ps512(0.0f, __W); } static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); - _mm512_mask_reduce_operator(*); + return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); } -#undef _mm512_mask_reduce_operator - -#define _mm512_mask_reduce_operator(op) \ - __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \ - __m512i __t2 = _mm512_##op(__V, __t1); \ - __m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \ - __m512i __t4 = _mm512_##op(__t2, __t3); \ - __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \ - __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \ - return __t6[0] static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi64(__m512i __V) { - _mm512_mask_reduce_operator(max_epi64); + return __builtin_ia32_reduce_smax_q512(__V); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epu64(__m512i __V) { - _mm512_mask_reduce_operator(max_epu64); + return __builtin_ia32_reduce_umax_q512(__V); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epi64(__m512i __V) { - _mm512_mask_reduce_operator(min_epi64); + return __builtin_ia32_reduce_smin_q512(__V); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epu64(__m512i __V) { - _mm512_mask_reduce_operator(min_epu64); + return __builtin_ia32_reduce_umin_q512(__V); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V); - _mm512_mask_reduce_operator(max_epi64); + return __builtin_ia32_reduce_smax_q512(__V); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { __V = _mm512_maskz_mov_epi64(__M, __V); - _mm512_mask_reduce_operator(max_epu64); + return __builtin_ia32_reduce_umax_q512(__V); } static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V); - _mm512_mask_reduce_operator(min_epi64); + return __builtin_ia32_reduce_smin_q512(__V); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V); - _mm512_mask_reduce_operator(min_epu64); + return __builtin_ia32_reduce_umin_q512(__V); } -#undef _mm512_mask_reduce_operator - -#define 
_mm512_mask_reduce_operator(op) \ - __m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \ - __m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \ - __m256i __t3 = _mm256_##op(__t1, __t2); \ - __m128i __t4 = _mm256_extracti128_si256(__t3, 0); \ - __m128i __t5 = _mm256_extracti128_si256(__t3, 1); \ - __m128i __t6 = _mm_##op(__t4, __t5); \ - __m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \ - __m128i __t8 = _mm_##op(__t6, __t7); \ - __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \ - __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \ - return __t10[0] - static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epi32(__m512i __V) { - _mm512_mask_reduce_operator(max_epi32); + return __builtin_ia32_reduce_smax_d512((__v16si)__V); } static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_reduce_max_epu32(__m512i __V) { - _mm512_mask_reduce_operator(max_epu32); + return __builtin_ia32_reduce_umax_d512((__v16si)__V); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epi32(__m512i __V) { - _mm512_mask_reduce_operator(min_epi32); + return __builtin_ia32_reduce_smin_d512((__v16si)__V); } static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epu32(__m512i __V) { - _mm512_mask_reduce_operator(min_epu32); + return __builtin_ia32_reduce_umin_d512((__v16si)__V); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V); - _mm512_mask_reduce_operator(max_epi32); + return __builtin_ia32_reduce_smax_d512((__v16si)__V); } static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { __V = _mm512_maskz_mov_epi32(__M, __V); - _mm512_mask_reduce_operator(max_epu32); + return __builtin_ia32_reduce_umax_d512((__v16si)__V); } static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V); - _mm512_mask_reduce_operator(min_epi32); + return __builtin_ia32_reduce_smin_d512((__v16si)__V); } static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V); - _mm512_mask_reduce_operator(min_epu32); + return __builtin_ia32_reduce_umin_d512((__v16si)__V); } -#undef _mm512_mask_reduce_operator #define _mm512_mask_reduce_operator(op) \ __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \ diff --git a/lib/include/avx512vlvnniintrin.h b/lib/include/avx512vlvnniintrin.h index b7c8fa08c6..71ac1b4370 100644 --- a/lib/include/avx512vlvnniintrin.h +++ b/lib/include/avx512vlvnniintrin.h @@ -18,13 +18,157 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256))) +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. 
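[Annotation, not part of the patch: the \operation pseudocode that follows for _mm256_dpbusd_epi32 can be read as the scalar C model below — an illustrative sketch only; dpbusd_model and its parameter names are invented for exposition:

    #include <stdint.h>

    /* One VPDPBUSD step: for each of the 8 dword lanes, accumulate four    */
    /* u8 (from a) x s8 (from b) byte products into the 32-bit lane of s.   */
    /* Each byte product fits in 16 bits; the lane sum wraps (no saturation */
    /* here -- in portable C, watch for signed-overflow UB).                */
    static void dpbusd_model(int32_t dst[8], const int32_t s[8],
                             const uint8_t a[32], const int8_t b[32]) {
        for (int j = 0; j < 8; ++j) {
            int32_t acc = s[j];
            for (int k = 0; k < 4; ++k)
                acc += (int32_t)a[4 * j + k] * (int32_t)b[4 * j + k];
            dst[j] = acc;
        }
    }
]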
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpbusd_epi32(S, A, B) \ + (__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpbusds_epi32(S, A, B) \ + (__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpwssd_epi32(S, A, B) \ + (__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. 
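[Annotation, not part of the patch: _mm256_dpwssd_epi32, defined just above, performs the analogous step on signed 16-bit pairs — two products per dword lane instead of four. A hedged scalar model (dpwssd_model is an invented name):

    #include <stdint.h>

    /* One VPDPWSSD step: two s16 x s16 products summed into each of the   */
    /* 8 dword lanes of s, with ordinary wrapping 32-bit addition.         */
    static void dpwssd_model(int32_t dst[8], const int32_t s[8],
                             const int16_t a[16], const int16_t b[16]) {
        for (int j = 0; j < 8; ++j)
            dst[j] = s[j] + (int32_t)a[2 * j]     * b[2 * j]
                          + (int32_t)a[2 * j + 1] * b[2 * j + 1];
    }

The saturating variant whose pseudocode follows differs only in clamping the final sum; see the annotation after the next hunk.]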
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpwssds_epi32(S, A, B) \ + (__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpbusd_epi32(S, A, B) \ + (__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpbusds_epi32(S, A, B) \ + (__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpwssd_epi32(S, A, B) \ + (__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S +/// using signed saturation, and store the packed 32-bit results in DST. 
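[Annotation, not part of the patch: "signed saturation" in these \operation blocks means the wide intermediate sum is clamped to the int32 range rather than wrapped. A minimal model, with sat32 as an invented helper name:

    #include <stdint.h>

    /* Clamp a 64-bit intermediate sum to [INT32_MIN, INT32_MAX].          */
    static int32_t sat32(int64_t v) {
        if (v > INT32_MAX) return INT32_MAX;
        if (v < INT32_MIN) return INT32_MIN;
        return (int32_t)v;
    }

In the saturating forms, DST.dword[j] becomes sat32((int64_t)S.dword[j] + tmp1 + tmp2 + ...) instead of the wrapping sum.]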
+/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpwssds_epi32(S, A, B) \ + (__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) @@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, - (__v8si)__B); -} - static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { @@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, - (__v4si)__B); -} - static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { @@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i 
__DEFAULT_FN_ATTRS128
-_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
diff --git a/lib/include/avxvnniintrin.h b/lib/include/avxvnniintrin.h
new file mode 100644
index 0000000000..ad45cb7962
--- /dev/null
+++ b/lib/include/avxvnniintrin.h
@@ -0,0 +1,225 @@
+/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINTRIN_H
+#define __AVXVNNIINTRIN_H
+
+/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
+/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+
+/* Intrinsics with _avx_ prefix are for compatibility with msvc. */
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the VPDPBUSD instructions.
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. 
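
The VEX-encoded _avx_ forms make this dot product available without AVX-512 masking machinery. Below is a sketch of the common quantized-inference use, assuming -mavxvnni (with AVX2 also available) and an input length that is a multiple of 32; dot_u8s8 is an illustrative name, not part of the header:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Accumulate a u8 x s8 dot product 32 bytes at a time, then reduce. */
int32_t dot_u8s8(const uint8_t *a, const int8_t *b, size_t n) {
  __m256i acc = _mm256_setzero_si256();
  for (size_t i = 0; i < n; i += 32) {
    __m256i va = _mm256_loadu_si256((const __m256i *)(a + i));
    __m256i vb = _mm256_loadu_si256((const __m256i *)(b + i));
    acc = _mm256_dpbusd_avx_epi32(acc, va, vb);
  }
  /* Horizontal sum of the eight 32-bit lanes. */
  __m128i lo = _mm256_castsi256_si128(acc);
  __m128i hi = _mm256_extracti128_si256(acc, 1);
  __m128i s  = _mm_add_epi32(lo, hi);
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2)));
  s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_cvtsi128_si32(s);
}
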
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. 
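
The only difference in the saturating forms is the Saturate32 applied on the accumulate, which the following sketch makes observable (illustrative only; assumes -mavxvnni). With the non-saturating form the lane would wrap instead of pinning at INT32_MAX:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set1_epi8((char)255);     /* unsigned operand: 255 */
  __m128i b = _mm_set1_epi8(127);           /* signed operand: +127 */
  __m128i acc = _mm_setzero_si128();
  for (int i = 0; i < 20000; ++i)           /* adds 4*255*127 = 129540 per step */
    acc = _mm_dpbusds_avx_epi32(acc, a, b);
  printf("lane 0 = %d (INT32_MAX = %d)\n", _mm_cvtsi128_si32(acc), INT32_MAX);
  return 0;
}
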
+///
+/// \operation
+/// FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the VPDPWSSDS instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINTRIN_H
diff --git a/lib/include/cpuid.h b/lib/include/cpuid.h
index 2a88c042d0..34f0e76807 100644
--- a/lib/include/cpuid.h
+++ b/lib/include/cpuid.h
@@ -7,6 +7,9 @@
  *===-----------------------------------------------------------------------===
  */
 
+#ifndef __CPUID_H
+#define __CPUID_H
+
 #if !(__x86_64__ || __i386__)
 #error this header is for x86 only
 #endif
@@ -186,6 +189,7 @@
 /* Features in %edx for leaf 7 sub-leaf 0 */
 #define bit_AVX5124VNNIW  0x00000004
 #define bit_AVX5124FMAPS  0x00000008
+#define bit_UINTR         0x00000020
 #define bit_SERIALIZE     0x00004000
 #define bit_TSXLDTRK      0x00010000
 #define bit_PCONFIG       0x00040000
@@ -195,7 +199,9 @@
 #define bit_AMXINT8       0x02000000
 
 /* Features in %eax for leaf 7 sub-leaf 1 */
+#define bit_AVXVNNI       0x00000008
 #define bit_AVX512BF16    0x00000020
+#define bit_HRESET        0x00400000
 
 /* Features in %eax for leaf 13 sub-leaf 1 */
 #define bit_XSAVEOPT    0x00000001
@@ -309,3 +315,5 @@ static __inline int __get_cpuid_count (unsigned int __leaf,
   __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
   return 1;
 }
+
+#endif /* __CPUID_H */
diff --git a/lib/include/cuda_wrappers/new b/lib/include/cuda_wrappers/new
index f49811c5a5..7f25531405 100644
--- a/lib/include/cuda_wrappers/new
+++ b/lib/include/cuda_wrappers/new
@@ -26,6 +26,13 @@
 
 #include_next <new>
 
+#if !defined(__device__)
+// The header has been included too early from the standard C++ library
+// and CUDA-specific macros are not available yet.
+// Undo the include guard and try again later.
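
With the new include guard and the leaf-7/sub-leaf-1 feature bits added to cpuid.h above, runtime detection of the new ISA extensions is straightforward. A minimal sketch using only what this patch defines:

#include <cpuid.h>
#include <stdio.h>

/* Probe CPUID leaf 7, sub-leaf 1 for the newly defined feature bits. */
int main(void) {
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx)) {
    printf("AVXVNNI: %s\n", (eax & bit_AVXVNNI) ? "yes" : "no");
    printf("HRESET:  %s\n", (eax & bit_HRESET) ? "yes" : "no");
  }
  return 0;
}
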
+#undef __CLANG_CUDA_WRAPPERS_NEW +#else + #pragma push_macro("CUDA_NOEXCEPT") #if __cplusplus >= 201103L #define CUDA_NOEXCEPT noexcept @@ -95,4 +102,5 @@ __device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} #pragma pop_macro("CUDA_NOEXCEPT") +#endif // __device__ #endif // include guard diff --git a/lib/include/gfniintrin.h b/lib/include/gfniintrin.h index 9bff0fcb60..11a321b7c9 100644 --- a/lib/include/gfniintrin.h +++ b/lib/include/gfniintrin.h @@ -14,38 +14,56 @@ #ifndef __GFNIINTRIN_H #define __GFNIINTRIN_H +/* Default attributes for simple form (no masking). */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128))) + +/* Default attributes for YMM unmasked form. */ +#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256))) + +/* Default attributes for ZMM forms. */ +#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512))) + +/* Default attributes for VLX forms. */ +#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), \ (char)(I)) -#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)) - - -#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) +#define _mm_gf2p8affine_epi64_epi8(A, B, I) \ + (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I)) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, + (__v16qi) __B); +} +#ifdef __AVXINTRIN_H #define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ (__v32qi)(__m256i)(B), \ (char)(I)) -#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)) - -#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) +#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ + (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I)) +static __inline__ __m256i __DEFAULT_FN_ATTRS_Y +_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, + (__v32qi) __B); +} +#endif /* __AVXINTRIN_H */ +#ifdef __AVX512BWINTRIN_H #define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ (__v64qi)(__m512i)(B), \ @@ -60,37 +78,6 @@ (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ U, A, B, I) -#define _mm_gf2p8affine_epi64_epi8(A, B, I) \ - 
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), \ - (char)(I)) - -#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)) - - -#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) - - -#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ - (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), \ - (char)(I)) - -#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)) - -#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) - - #define _mm512_gf2p8affine_epi64_epi8(A, B, I) \ (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ (__v64qi)(__m512i)(B), \ @@ -105,63 +92,6 @@ (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ U, A, B, I) -/* Default attributes for simple form (no masking). */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128))) - -/* Default attributes for YMM unmasked form. */ -#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256))) - -/* Default attributes for ZMM forms. */ -#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512))) - -/* Default attributes for VLX forms. 
*/ -#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, - (__v16qi) __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 -_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectb_128(__U, - (__v16qi) _mm_gf2p8mul_epi8(__A, __B), - (__v16qi) __S); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 -_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(), - __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS_Y -_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, - (__v32qi) __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 -_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectb_256(__U, - (__v32qi) _mm256_gf2p8mul_epi8(__A, __B), - (__v32qi) __S); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 -_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(), - __U, __A, __B); -} - static __inline__ __m512i __DEFAULT_FN_ATTRS_Z _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) { @@ -183,6 +113,75 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(), __U, __A, __B); } +#endif /* __AVX512BWINTRIN_H */ + +#ifdef __AVX512VLBWINTRIN_H +#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S)) + +#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ + (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I) + +#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S)) + +#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ + (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) + +#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S)) + +#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ + (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I) + +#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S)) + +#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ + (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 +_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectb_128(__U, + (__v16qi) _mm_gf2p8mul_epi8(__A, 
__B),
+                                              (__v16qi) __S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
+_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
+                                __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
+_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectb_256(__U,
+                                              (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
+                                              (__v32qi) __S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
+_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
+                                   __U, __A, __B);
+}
+#endif /* __AVX512VLBWINTRIN_H */
 
 #undef __DEFAULT_FN_ATTRS
 #undef __DEFAULT_FN_ATTRS_Y
diff --git a/lib/include/hresetintrin.h b/lib/include/hresetintrin.h
new file mode 100644
index 0000000000..13e31a2e03
--- /dev/null
+++ b/lib/include/hresetintrin.h
@@ -0,0 +1,49 @@
+/*===---------------- hresetintrin.h - HRESET intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86GPRINTRIN_H
+#error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef __HRESETINTRIN_H
+#define __HRESETINTRIN_H
+
+#if __has_extension(gnu_asm)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("hreset")))
+
+/// Provides a hint to the processor to selectively reset the prediction
+/// history of the current logical processor specified by a 32-bit integer
+/// value \a __eax.
+///
+/// This intrinsic corresponds to the HRESET instruction.
+///
+/// \operation
+///    IF __eax == 0
+///      // nop
+///    ELSE
+///      FOR i := 0 to 31
+///        IF __eax[i]
+///          ResetPredictionFeature(i)
+///        FI
+///      ENDFOR
+///    FI
+/// \endoperation
+static __inline void __DEFAULT_FN_ATTRS
+_hreset(int __eax)
+{
+  __asm__ ("hreset $0" :: "a"(__eax));
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __has_extension(gnu_asm) */
+
+#endif /* __HRESETINTRIN_H */
diff --git a/lib/include/ia32intrin.h b/lib/include/ia32intrin.h
index 79b7f0655c..00138effd5 100644
--- a/lib/include/ia32intrin.h
+++ b/lib/include/ia32intrin.h
@@ -14,6 +14,18 @@
 #ifndef __IA32INTRIN_H
 #define __IA32INTRIN_H
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#define __DEFAULT_FN_ATTRS_SSE42 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__))
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
 /** Find the first set bit starting from the lsb. Result is undefined if
  *  input is 0.
  *
  *  This intrinsic corresponds to the BSF instruction or the TZCNT instruction.
  *
  *  \param __A
  *     A 32-bit integer operand.
  *  \returns A 32-bit integer containing the bit number.
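
A sketch of how the new _hreset intrinsic is meant to be driven. The valid-bits mask is assumed here to come from CPUID leaf 0x20 (the HRESET enumeration leaf in Intel's documentation; verify against the SDM before relying on it), and the function name is illustrative. Assumes -mhreset:

#include <x86gprintrin.h>
#include <cpuid.h>

/* Reset prediction history for every capability the CPU advertises. */
void reset_prediction_history(void) {
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx) && (eax & bit_HRESET)) {
    __cpuid_count(0x20, 0, eax, ebx, ecx, edx); /* EBX: valid HRESET bits */
    _hreset((int)ebx);
  }
}
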
*/ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsfd(int __A) { return __builtin_ctz(__A); } @@ -43,7 +55,7 @@ __bsfd(int __A) { * A 32-bit integer operand. * \returns A 32-bit integer containing the bit number. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsrd(int __A) { return 31 - __builtin_clz(__A); } @@ -59,12 +71,12 @@ __bsrd(int __A) { * A 32-bit integer operand. * \returns A 32-bit integer containing the swapped bytes. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bswapd(int __A) { return __builtin_bswap32(__A); } -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _bswap(int __A) { return __builtin_bswap32(__A); } @@ -85,7 +97,7 @@ _bswap(int __A) { * A 64-bit integer operand. * \returns A 32-bit integer containing the bit number. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsfq(long long __A) { return __builtin_ctzll(__A); } @@ -102,7 +114,7 @@ __bsfq(long long __A) { * A 64-bit integer operand. * \returns A 32-bit integer containing the bit number. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsrq(long long __A) { return 63 - __builtin_clzll(__A); } @@ -118,7 +130,7 @@ __bsrq(long long __A) { * A 64-bit integer operand. * \returns A 64-bit integer containing the swapped bytes. */ -static __inline__ long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR __bswapq(long long __A) { return __builtin_bswap64(__A); } @@ -138,7 +150,7 @@ __bswapq(long long __A) { * \returns A 32-bit integer containing the number of bits with value 1 in the * source operand. */ -static __inline__ int __attribute__((__always_inline__, __nodebug__)) +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __popcntd(unsigned int __A) { return __builtin_popcount(__A); @@ -159,7 +171,7 @@ __popcntd(unsigned int __A) * \returns A 64-bit integer containing the number of bits with value 1 in the * source operand. */ -static __inline__ long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR __popcntq(unsigned long long __A) { return __builtin_popcountll(__A); @@ -169,26 +181,26 @@ __popcntq(unsigned long long __A) #endif /* __x86_64__ */ #ifdef __x86_64__ -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS __readeflags(void) { return __builtin_ia32_readeflags_u64(); } -static __inline__ void __attribute__((__always_inline__, __nodebug__)) +static __inline__ void __DEFAULT_FN_ATTRS __writeeflags(unsigned long long __f) { __builtin_ia32_writeeflags_u64(__f); } #else /* !__x86_64__ */ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) +static __inline__ unsigned int __DEFAULT_FN_ATTRS __readeflags(void) { return __builtin_ia32_readeflags_u32(); } -static __inline__ void __attribute__((__always_inline__, __nodebug__)) +static __inline__ void __DEFAULT_FN_ATTRS __writeeflags(unsigned int __f) { __builtin_ia32_writeeflags_u32(__f); @@ -205,11 +217,9 @@ __writeeflags(unsigned int __f) * A 32-bit float value. 
* \returns a 32-bit unsigned integer containing the converted value. */ -static __inline__ unsigned int __attribute__((__always_inline__)) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST _castf32_u32(float __A) { - unsigned int D; - __builtin_memcpy(&D, &__A, sizeof(__A)); - return D; + return __builtin_bit_cast(unsigned int, __A); } /** Cast a 64-bit float value to a 64-bit unsigned integer value @@ -222,11 +232,9 @@ _castf32_u32(float __A) { * A 64-bit float value. * \returns a 64-bit unsigned integer containing the converted value. */ -static __inline__ unsigned long long __attribute__((__always_inline__)) +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST _castf64_u64(double __A) { - unsigned long long D; - __builtin_memcpy(&D, &__A, sizeof(__A)); - return D; + return __builtin_bit_cast(unsigned long long, __A); } /** Cast a 32-bit unsigned integer value to a 32-bit float value @@ -239,11 +247,9 @@ _castf64_u64(double __A) { * A 32-bit unsigned integer value. * \returns a 32-bit float value containing the converted value. */ -static __inline__ float __attribute__((__always_inline__)) +static __inline__ float __DEFAULT_FN_ATTRS_CAST _castu32_f32(unsigned int __A) { - float D; - __builtin_memcpy(&D, &__A, sizeof(__A)); - return D; + return __builtin_bit_cast(float, __A); } /** Cast a 64-bit unsigned integer value to a 64-bit float value @@ -256,11 +262,9 @@ _castu32_f32(unsigned int __A) { * A 64-bit unsigned integer value. * \returns a 64-bit float value containing the converted value. */ -static __inline__ double __attribute__((__always_inline__)) +static __inline__ double __DEFAULT_FN_ATTRS_CAST _castu64_f64(unsigned long long __A) { - double D; - __builtin_memcpy(&D, &__A, sizeof(__A)); - return D; + return __builtin_bit_cast(double, __A); } /** Adds the unsigned integer operand to the CRC-32C checksum of the @@ -278,7 +282,7 @@ _castu64_f64(unsigned long long __A) { * \returns The result of adding operand \a __C to the CRC-32C checksum of * operand \a __D. */ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42 __crc32b(unsigned int __C, unsigned char __D) { return __builtin_ia32_crc32qi(__C, __D); @@ -299,7 +303,7 @@ __crc32b(unsigned int __C, unsigned char __D) * \returns The result of adding operand \a __C to the CRC-32C checksum of * operand \a __D. */ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42 __crc32w(unsigned int __C, unsigned short __D) { return __builtin_ia32_crc32hi(__C, __D); @@ -320,7 +324,7 @@ __crc32w(unsigned int __C, unsigned short __D) * \returns The result of adding operand \a __C to the CRC-32C checksum of * operand \a __D. */ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42 __crc32d(unsigned int __C, unsigned int __D) { return __builtin_ia32_crc32si(__C, __D); @@ -342,20 +346,20 @@ __crc32d(unsigned int __C, unsigned int __D) * \returns The result of adding operand \a __C to the CRC-32C checksum of * operand \a __D. 
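
The payoff of moving these casts from __builtin_memcpy to __builtin_bit_cast is that, in C++11 and later, they become usable in constant expressions (note the constexpr added to __DEFAULT_FN_ATTRS_CAST above). A C++ sketch, illustrative only:

// Requires a C++11 (or later) translation unit.
#include <x86intrin.h>

static_assert(_castf32_u32(1.0f) == 0x3f800000u,
              "bit pattern of 1.0f, now computable at compile time");
static_assert(_castu64_f64(0x4000000000000000ull) == 2.0,
              "round-trips through __builtin_bit_cast");
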
 */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_SSE42
 __crc32q(unsigned long long __C, unsigned long long __D)
 {
   return __builtin_ia32_crc32di(__C, __D);
 }
 #endif /* __x86_64__ */
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __rdpmc(int __A) {
   return __builtin_ia32_rdpmc(__A);
 }
 
 /* __rdtscp */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __rdtscp(unsigned int *__A) {
   return __builtin_ia32_rdtscp(__A);
 }
@@ -364,48 +368,48 @@ __rdtscp(unsigned int *__A) {
 
 #define _rdpmc(A) __rdpmc(A)
 
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _wbinvd(void) {
   __builtin_ia32_wbinvd();
 }
 
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
 __rolb(unsigned char __X, int __C) {
   return __builtin_rotateleft8(__X, __C);
 }
 
-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
 __rorb(unsigned char __X, int __C) {
   return __builtin_rotateright8(__X, __C);
 }
 
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
 __rolw(unsigned short __X, int __C) {
   return __builtin_rotateleft16(__X, __C);
 }
 
-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
 __rorw(unsigned short __X, int __C) {
   return __builtin_rotateright16(__X, __C);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
 __rold(unsigned int __X, int __C) {
   return __builtin_rotateleft32(__X, __C);
 }
 
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
 __rord(unsigned int __X, int __C) {
   return __builtin_rotateright32(__X, __C);
 }
 
 #ifdef __x86_64__
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
 __rolq(unsigned long long __X, int __C) {
   return __builtin_rotateleft64(__X, __C);
 }
 
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
 __rorq(unsigned long long __X, int __C) {
   return __builtin_rotateright64(__X, __C);
 }
@@ -429,4 +433,9 @@ __rorq(unsigned long long __X, int __C) {
 #define _rotwl(a,b) __rolw((a), (b))
 #define _rotwr(a,b) __rorw((a), (b))
 
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CAST
+#undef __DEFAULT_FN_ATTRS_SSE42
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
+
 #endif /* __IA32INTRIN_H */
diff --git a/lib/include/immintrin.h b/lib/include/immintrin.h
index e9dff2310f..22f7a520c9 100644
--- a/lib/include/immintrin.h
+++ b/lib/include/immintrin.h
@@ -10,6 +10,8 @@
 #ifndef __IMMINTRIN_H
 #define __IMMINTRIN_H
 
+#include <x86gprintrin.h>
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
     defined(__MMX__)
 #include <mmintrin.h>
@@ -143,6 +145,11 @@
 #include <avx512vlvnniintrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+    defined(__AVXVNNI__)
+#include <avxvnniintrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
     defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
@@ -471,6 +478,11 @@ _storebe_i64(void * __P, long long __D) {
 #include <invpcidintrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+    defined(__KL__) || defined(__WIDEKL__)
+#include <keylockerintrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
     defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
 #include <amxintrin.h>
diff --git a/lib/include/intrin.h b/lib/include/intrin.h
index 871b47ca82..a78b96997d 100644
--- a/lib/include/intrin.h
+++ b/lib/include/intrin.h
@@ -57,16 +57,11 @@ void __addfsbyte(unsigned long, unsigned char);
 void __addfsdword(unsigned long, unsigned long);
 void __addfsword(unsigned long, unsigned short);
 void __code_seg(const char *);
-static __inline__
 void __cpuid(int[4], int);
-static __inline__
 void __cpuidex(int[4], int, int);
-static __inline__
 __int64 __emul(int, int);
-static __inline__
 unsigned __int64 __emulu(unsigned int, unsigned int);
 unsigned int __getcallerseflags(void);
-static __inline__
 void __halt(void);
 unsigned char __inbyte(unsigned short);
 void __inbytestring(unsigned short, unsigned char *, unsigned long);
@@ -82,13 +77,9 @@ void __inwordstring(unsigned short, unsigned short *, unsigned long);
 void __lidt(void *);
 unsigned __int64 __ll_lshift(unsigned __int64, int);
 __int64 __ll_rshift(__int64, int);
-static __inline__
 void __movsb(unsigned char *, unsigned char const *, size_t);
-static __inline__
 void __movsd(unsigned long *, unsigned long const *, size_t);
-static __inline__
 void __movsw(unsigned short *, unsigned short const *, size_t);
-static __inline__
 void __nop(void);
 void __nvreg_restore_fence(void);
 void __nvreg_save_fence(void);
@@ -105,23 +96,16 @@ unsigned long __readcr4(void);
 unsigned long __readcr8(void);
 unsigned int __readdr(unsigned int);
 #ifdef __i386__
-static __inline__
 unsigned char __readfsbyte(unsigned long);
-static __inline__
 unsigned __int64 __readfsqword(unsigned long);
-static __inline__
 unsigned short __readfsword(unsigned long);
 #endif
-static __inline__
 unsigned __int64 __readmsr(unsigned long);
 unsigned __int64 __readpmc(unsigned long);
 unsigned long __segmentlimit(unsigned long);
 void __sidt(void *);
-static __inline__
 void __stosb(unsigned char *, unsigned char, size_t);
-static __inline__
 void __stosd(unsigned long *, unsigned long, size_t);
-static __inline__
 void __stosw(unsigned short *, unsigned short, size_t);
 void __svm_clgi(void);
 void __svm_invlpga(void *, int);
@@ -136,7 +120,6 @@ void __vmx_off(void);
 void __vmx_vmptrst(unsigned __int64 *);
 void __wbinvd(void);
 void __writecr0(unsigned int);
-static __inline__
 void __writecr3(unsigned __INTPTR_TYPE__);
 void __writecr4(unsigned int);
 void __writecr8(unsigned int);
@@ -146,11 +129,8 @@ void __writefsdword(unsigned long, unsigned long);
 void __writefsqword(unsigned long, unsigned __int64);
 void __writefsword(unsigned long, unsigned short);
 void __writemsr(unsigned long, unsigned __int64);
-static __inline__
 void *_AddressOfReturnAddress(void);
-static __inline__
 unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
-static __inline__
 unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
 unsigned char _bittest(long const *, long);
 unsigned char _bittestandcomplement(long *, long);
@@ -169,12 +149,10 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
 long _InterlockedExchangeAdd_HLERelease(long
volatile *, long); __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64); __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_ReadBarrier(void); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_ReadWriteBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void); unsigned int _rorx_u32(unsigned int, const unsigned int); int _sarx_i32(int, unsigned int); #if __STDC_HOSTED__ @@ -185,9 +163,8 @@ unsigned int _shrx_u32(unsigned int, unsigned int); void _Store_HLERelease(long volatile *, long); void _Store64_HLERelease(__int64 volatile *, __int64); void _StorePointer_HLERelease(void *volatile *, void *); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_WriteBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void); unsigned __int32 xbegin(void); void _xend(void); @@ -197,19 +174,14 @@ void __addgsbyte(unsigned long, unsigned char); void __addgsdword(unsigned long, unsigned long); void __addgsqword(unsigned long, unsigned __int64); void __addgsword(unsigned long, unsigned short); -static __inline__ void __faststorefence(void); void __incgsbyte(unsigned long); void __incgsdword(unsigned long); void __incgsqword(unsigned long); void __incgsword(unsigned long); -static __inline__ void __movsq(unsigned long long *, unsigned long long const *, size_t); -static __inline__ unsigned char __readgsbyte(unsigned long); -static __inline__ unsigned long __readgsdword(unsigned long); -static __inline__ unsigned __int64 __readgsqword(unsigned long); unsigned short __readgsword(unsigned long); unsigned __int64 __shiftleft128(unsigned __int64 _LowPart, @@ -218,7 +190,6 @@ unsigned __int64 __shiftleft128(unsigned __int64 _LowPart, unsigned __int64 __shiftright128(unsigned __int64 _LowPart, unsigned __int64 _HighPart, unsigned char _Shift); -static __inline__ void __stosq(unsigned __int64 *, unsigned __int64, size_t); unsigned char __vmx_on(unsigned __int64 *); unsigned char __vmx_vmclear(unsigned __int64 *); @@ -243,10 +214,6 @@ unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64); unsigned char _interlockedbittestandset64(__int64 volatile *, __int64); long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange, long _Comparand); -unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination, - __int64 _ExchangeHigh, - __int64 _ExchangeLow, - __int64 *_CompareandResult); unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination, __int64 _ExchangeHigh, __int64 _ExchangeLow, @@ -269,13 +236,9 @@ unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int); __int64 _sarx_i64(__int64, unsigned int); unsigned __int64 _shlx_u64(unsigned __int64, unsigned int); unsigned __int64 _shrx_u64(unsigned __int64, unsigned int); -static __inline__ __int64 __mulh(__int64, __int64); -static __inline__ unsigned __int64 __umulh(unsigned __int64, unsigned __int64); -static __inline__ __int64 _mul128(__int64, __int64, __int64*); -static __inline__ unsigned __int64 _umul128(unsigned __int64, unsigned __int64, unsigned __int64*); @@ -284,29 +247,19 @@ 
unsigned __int64 _umul128(unsigned __int64, #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) -static __inline__ unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); -static __inline__ unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); #endif #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) -static __inline__ __int64 _InterlockedDecrement64(__int64 volatile *_Addend); -static __inline__ __int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value); -static __inline__ __int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value); -static __inline__ __int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value); -static __inline__ __int64 _InterlockedIncrement64(__int64 volatile *_Addend); -static __inline__ __int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask); -static __inline__ __int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask); -static __inline__ __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask); #endif @@ -470,45 +423,81 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination, __int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination, __int64 _Exchange, __int64 _Comparand); #endif +#if defined(__x86_64__) || defined(__aarch64__) +unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination, + __int64 _ExchangeHigh, + __int64 _ExchangeLow, + __int64 *_ComparandResult); +#endif +#if defined(__aarch64__) +unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination, + __int64 _ExchangeHigh, + __int64 _ExchangeLow, + __int64 *_ComparandResult); +unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination, + __int64 _ExchangeHigh, + __int64 _ExchangeLow, + __int64 *_ComparandResult); +unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination, + __int64 _ExchangeHigh, + __int64 _ExchangeLow, + __int64 *_ComparandResult); +#endif /*----------------------------------------------------------------------------*\ |* movs, stos \*----------------------------------------------------------------------------*/ #if defined(__i386__) || defined(__x86_64__) -static __inline__ void __DEFAULT_FN_ATTRS -__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) { +static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst, + unsigned char const *__src, + size_t __n) { __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n) : : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) { - __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); -} -static __inline__ void __DEFAULT_FN_ATTRS -__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) { - __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); -} -static __inline__ void __DEFAULT_FN_ATTRS -__stosd(unsigned long *__dst, unsigned long __x, size_t __n) { - __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x) +static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst, + unsigned long const *__src, + size_t __n) { + __asm__ __volatile__("rep movsl" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__stosw(unsigned short *__dst, unsigned short __x, size_t __n) { - __asm__ 
__volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x) +static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst, + unsigned short const *__src, + size_t __n) { + __asm__ __volatile__("rep movsw" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : + : "memory"); +} +static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst, + unsigned long __x, + size_t __n) { + __asm__ __volatile__("rep stosl" + : "+D"(__dst), "+c"(__n) + : "a"(__x) + : "memory"); +} +static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst, + unsigned short __x, + size_t __n) { + __asm__ __volatile__("rep stosw" + : "+D"(__dst), "+c"(__n) + : "a"(__x) : "memory"); } #endif #ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS -__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) { - __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); +static __inline__ void __DEFAULT_FN_ATTRS __movsq( + unsigned long long *__dst, unsigned long long const *__src, size_t __n) { + __asm__ __volatile__("rep movsq" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : + : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { +static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst, + unsigned __int64 __x, + size_t __n) { __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x) : "memory"); } @@ -518,26 +507,25 @@ __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { |* Misc \*----------------------------------------------------------------------------*/ #if defined(__i386__) || defined(__x86_64__) -static __inline__ void __DEFAULT_FN_ATTRS -__cpuid(int __info[4], int __level) { - __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) - : "a"(__level), "c"(0)); +static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) { + __asm__("cpuid" + : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3]) + : "a"(__level), "c"(0)); } -static __inline__ void __DEFAULT_FN_ATTRS -__cpuidex(int __info[4], int __level, int __ecx) { - __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) - : "a"(__level), "c"(__ecx)); +static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level, + int __ecx) { + __asm__("cpuid" + : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3]) + : "a"(__level), "c"(__ecx)); } -static __inline__ void __DEFAULT_FN_ATTRS -__halt(void) { - __asm__ volatile ("hlt"); +static __inline__ void __DEFAULT_FN_ATTRS __halt(void) { + __asm__ volatile("hlt"); } #endif #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) -static __inline__ void __DEFAULT_FN_ATTRS -__nop(void) { - __asm__ volatile ("nop"); +static __inline__ void __DEFAULT_FN_ATTRS __nop(void) { + __asm__ volatile("nop"); } #endif @@ -574,8 +562,7 @@ __readmsr(unsigned long __register) { } #endif -static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS -__readcr3(void) { +static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) { unsigned __LPTRINT_TYPE__ __cr3_val; __asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory"); return __cr3_val; diff --git a/lib/include/keylockerintrin.h b/lib/include/keylockerintrin.h new file mode 100644 index 0000000000..c15d39c8e3 --- /dev/null +++ b/lib/include/keylockerintrin.h @@ -0,0 +1,506 @@ +/*===----------------- keylockerintrin.h - KL Intrinsics 
-------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _KEYLOCKERINTRIN_H
+#define _KEYLOCKERINTRIN_H
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+    defined(__KL__)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("kl"),\
+                 __min_vector_width__(128)))
+
+/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl
+/// will be assigned to EAX, which specifies the KeySource and whether backing
+/// up the key is permitted. The 256-bit encryption key is loaded from the two
+/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is
+/// loaded from the implicit operand XMM0, which is assigned by __intkey.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the LOADIWKEY instructions.
+///
+/// \operation
+/// IF CPL > 0 // LOADIWKEY only allowed at ring 0 (supervisor mode)
+///   GP (0)
+/// FI
+/// IF "LOADIWKEY exiting" VM execution control set
+///   VMexit
+/// FI
+/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used
+///   GP (0)
+/// FI
+/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set
+///   GP (0)
+/// FI
+/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part
+///   GP (0)
+/// FI
+/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part
+///   GP (0)
+/// FI
+/// IF (__ctl[4:1] == 0) // KeySource of 0.
+///   IWKey.Encryption Key[127:0] := __enkey_hi[127:0]:
+///   IWKey.Encryption Key[255:128] := __enkey_lo[127:0]
+///   IWKey.IntegrityKey[127:0] := __intkey[127:0]
+///   IWKey.NoBackup := __ctl[0]
+///   IWKey.KeySource := __ctl[4:1]
+///   ZF := 0
+/// ELSE // KeySource of 1. See RDSEED definition for details of randomness
+///   IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received
+///     IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0]
+///     IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128]
+///     IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0]
+///     IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256]
+///     IWKey.NoBackup := __ctl[0]
+///     IWKey.KeySource := __ctl[4:1]
+///     ZF := 0
+///   ELSE // Random data was not returned from RDSEED. IWKey was not loaded
+///     ZF := 1
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
+               __m128i __enkey_lo, __m128i __enkey_hi) {
+  __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl);
+}
+
+/// Wrap a 128-bit AES key from __key into a key handle, write the handle to
+/// ((__m128i*)__h) through ((__m128i*)__h) + 5, and return a 32-bit value.
+/// The explicit source operand __htype specifies handle restrictions.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the ENCODEKEY128 instructions.
+///
+/// \operation
+/// InputKey[127:0] := __key[127:0]
+/// KeyMetadata[2:0] := __htype[2:0]
+/// KeyMetadata[23:3] := 0 // Reserved for future usage
+/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0)
+/// KeyMetadata[127:28] := 0 // Reserved for future usage
+/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0],
+///                  IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
+/// dst[0] := IWKey.NoBackup
+/// dst[4:1] := IWKey.KeySource[3:0]
+/// dst[31:5] := 0
+/// MEM[__h+127:__h]     := Handle[127:0]   // AAD
+/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
+/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
+/// MEM[__h+511:__h+384] := 0 // Reserved for future usage
+/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
+/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
+/// OF := 0
+/// SF := 0
+/// ZF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
+  return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
+}
+
+/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, write the
+/// handle to ((__m128i*)__h) through ((__m128i*)__h) + 6, and return a 32-bit
+/// value. The explicit source operand __htype specifies handle restrictions.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the ENCODEKEY256 instructions.
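
A sketch of the intended flow, assuming -mkl and an OS that has already loaded an IWKey at boot: wrap a raw AES-128 key into a handle, then encrypt through the handle so the raw key never needs to be kept around. encrypt_block is an illustrative name:

#include <immintrin.h>

int encrypt_block(__m128i raw_key, __m128i plaintext, __m128i *ciphertext) {
  __m128i handle[6];   /* ENCODEKEY128 writes 384 handle bits plus reserved space */
  unsigned int iwkey_info =
      _mm_encodekey128_u32(0 /* no handle restrictions */, raw_key, handle);
  (void)iwkey_info;    /* NoBackup / KeySource bits, if the caller cares */
  /* Returns the ZF status: nonzero means the handle was rejected. */
  return _mm_aesenc128kl_u8(ciphertext, plaintext, handle);
}
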
+///
+/// \operation
+/// InputKey[127:0] := __key_lo[127:0]
+/// InputKey[255:128] := __key_hi[255:128]
+/// KeyMetadata[2:0] := __htype[2:0]
+/// KeyMetadata[23:3] := 0 // Reserved for future usage
+/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1)
+/// KeyMetadata[127:28] := 0 // Reserved for future usage
+/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0],
+///                  IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
+/// dst[0] := IWKey.NoBackup
+/// dst[4:1] := IWKey.KeySource[3:0]
+/// dst[31:5] := 0
+/// MEM[__h+127:__h]     := Handle[127:0]   // AAD
+/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
+/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
+/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
+/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
+/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
+/// MEM[__h+895:__h+768] := 0 // Reserved for future usage
+/// OF := 0
+/// SF := 0
+/// ZF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
+                     void *__h) {
+  return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
+                                         (__v2di)__key_hi, __h);
+}
+
+/// The AESENC128KL instruction performs 10 rounds of AES to encrypt __idata
+/// using the 128-bit key in the handle at __h. It stores the result in
+/// __odata and returns the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the AESENC128KL instructions.
+///
+/// \operation
+/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
+/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
+///                    (Handle[127:0] AND (CPL > 0)) ||
+///                    Handle[383:256] ||
+///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+///   IF (Authentic == 0)
+///     ZF := 1
+///   ELSE
+///     MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey)
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+  return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
+
+/// The AESENC256KL instruction performs 14 rounds of AES to encrypt __idata
+/// using the 256-bit key in the handle at __h. It stores the result in
+/// __odata and returns the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the AESENC256KL instructions.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
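
The decrypt side follows the same ZF convention; a companion sketch to the one above, with illustrative names only:

#include <immintrin.h>

int decrypt_block(const __m128i handle[6], __m128i ciphertext, __m128i *plaintext) {
  if (_mm_aesdec128kl_u8(plaintext, ciphertext, handle))
    return -1;  /* ZF set: reserved bits, wrong key type, or failed authentication */
  return 0;
}
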
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using +/// the 128-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDEC128KL instructions. +/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. +/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +/// The AESDEC256KL performs 10 rounds of AES to decrypt the __idata using +/// the 256-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDEC256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ + || defined(__KL__) */ + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__WIDEKL__) + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\ + __min_vector_width__(128))) + +/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENCWIDE128KL instructions. +/// +/// \operation +/// Handle := MEM[__h+383:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENCWIDE256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDECWIDE128KL instructions. 
+/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDECWIDE256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle = ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES512 ) +/// If (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ + || defined(__WIDEKL__) */ + +#endif /* _KEYLOCKERINTRIN_H */ diff --git a/lib/include/mm_malloc.h b/lib/include/mm_malloc.h index 0ea32517ae..933dbaacad 100644 --- a/lib/include/mm_malloc.h +++ b/lib/include/mm_malloc.h @@ -54,7 +54,13 @@ _mm_malloc(size_t __size, size_t __align) static __inline__ void __attribute__((__always_inline__, __nodebug__)) _mm_free(void *__p) { +#if defined(__MINGW32__) + __mingw_aligned_free(__p); +#elif defined(_WIN32) + _aligned_free(__p); +#else free(__p); +#endif } #endif diff --git a/lib/include/opencl-c-base.h b/lib/include/opencl-c-base.h index 430e07d36f..e8dcd70377 100644 --- a/lib/include/opencl-c-base.h +++ b/lib/include/opencl-c-base.h @@ -9,6 +9,21 @@ #ifndef _OPENCL_BASE_H_ #define _OPENCL_BASE_H_ +// Define extension macros + +#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200) +// For SPIR all extensions are supported. 
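// Editorial sketch, not part of the patch: once the macros below are
// predefined for SPIR, kernels can feature-test them directly instead of
// probing the platform at run time. The kernel assumes an OpenCL C 2.0
// compile where cl_khr_subgroup_ballot is in effect:
//
//   #ifdef cl_khr_subgroup_ballot
//   kernel void active_mask(global uint *out, global const int *pred) {
//     size_t i = get_global_id(0);
//     uint4 ballot = sub_group_ballot(pred[i] != 0); // bitmask of lanes
//     if (get_sub_group_local_id() == 0)
//       out[get_group_id(0)] = popcount(ballot.x);   // count low 32 lanes
//   }
//   #endif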
+#if defined(__SPIR__)
+#define cl_khr_subgroup_extended_types 1
+#define cl_khr_subgroup_non_uniform_vote 1
+#define cl_khr_subgroup_ballot 1
+#define cl_khr_subgroup_non_uniform_arithmetic 1
+#define cl_khr_subgroup_shuffle 1
+#define cl_khr_subgroup_shuffle_relative 1
+#define cl_khr_subgroup_clustered_reduce 1
+#endif // defined(__SPIR__)
+#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
+
 // built-in scalar data types:
 /**
@@ -568,4 +583,7 @@ typedef struct {
 #pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
 #endif // cl_intel_device_side_avc_motion_estimation
 
+// Disable any extensions we may have enabled previously.
+#pragma OPENCL EXTENSION all : disable
+
 #endif //_OPENCL_BASE_H_
diff --git a/lib/include/opencl-c.h b/lib/include/opencl-c.h
index 66e18bdd47..ab665628c8 100644
--- a/lib/include/opencl-c.h
+++ b/lib/include/opencl-c.h
@@ -4633,6 +4633,7 @@ float16 __ovld __cnfn convert_float16(float16);
 
 // Conversions with double data type parameters or return value.
 #ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
 char __ovld __cnfn convert_char(double);
 char __ovld __cnfn convert_char_rte(double);
 char __ovld __cnfn convert_char_rtn(double);
@@ -5455,6 +5456,7 @@ double16 __ovld __cnfn convert_double16_rtz(ushort16);
 #endif //cl_khr_fp64
 
 #ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 // Convert half types to non-double types.
 uchar __ovld __cnfn convert_uchar(half);
 uchar __ovld __cnfn convert_uchar_rte(half);
diff --git a/lib/include/openmp_wrappers/cmath b/lib/include/openmp_wrappers/cmath
index bd6011eb6f..1aff66af7d 100644
--- a/lib/include/openmp_wrappers/cmath
+++ b/lib/include/openmp_wrappers/cmath
@@ -24,8 +24,11 @@
 // which might live in cstdlib.
 #include <cstdlib>
 
+// We need limits because __clang_cuda_cmath.h below uses `std::numeric_limits`.
+#include <limits>
+
 #pragma omp begin declare variant match(                                       \
-    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any, allow_templates)})
 
 #define __CUDA__
 #define __OPENMP_NVPTX__
diff --git a/lib/include/openmp_wrappers/complex b/lib/include/openmp_wrappers/complex
index d8dcd41670..142e526b81 100644
--- a/lib/include/openmp_wrappers/complex
+++ b/lib/include/openmp_wrappers/complex
@@ -25,3 +25,28 @@
 
 // Grab the host header too.
 #include_next <complex>
+
+
+#ifdef __cplusplus
+
+// If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set
+// after including <cmath> above. Since the <complex> header we use is a
+// simplified version of the libc++ one, we don't need it in this case. If we
+// compile against libstdc++, or any other standard library, we will overload
+// the (hopefully template) functions in the <complex> header with the ones we
+// got from libc++, which decompose math functions, like `std::sin`, into
+// arithmetic and calls to non-complex functions, all of which we can then
+// handle.
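// Illustration only (an editorial sketch, not wrapper code): with these
// overloads in effect, a target region can evaluate std::complex math on an
// NVPTX device, with std::sin and friends decomposed into plain arithmetic
// there. Assumes an OpenMP offloading toolchain configured for nvptx64.
//
//   #include <complex>
//   double demo() {
//     std::complex<double> z{1.0, 2.0}, w;
//   #pragma omp target map(from : w)
//     w = std::sin(z) * std::conj(z);
//     return w.real();
//   }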
+#ifndef _LIBCPP_STD_VER
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)},                                           \
+    implementation = {extension(match_any, allow_templates)})
+
+#include <complex_cmath.h>
+
+#pragma omp end declare variant
+
+#endif
+
+#endif
diff --git a/lib/include/openmp_wrappers/complex_cmath.h b/lib/include/openmp_wrappers/complex_cmath.h
new file mode 100644
index 0000000000..e3d9aebbbc
--- /dev/null
+++ b/lib/include/openmp_wrappers/complex_cmath.h
@@ -0,0 +1,388 @@
+//===------------------------- __complex_cmath.h --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// std::complex header copied from the libcxx source and simplified for use in
+// OpenMP target offload regions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#ifndef __cplusplus
+#error "This file is for C++ compilation only."
+#endif
+
+#ifndef _LIBCPP_COMPLEX
+#define _LIBCPP_COMPLEX
+
+#include <cmath>
+#include <type_traits>
+
+#define __DEVICE__ static constexpr __attribute__((nothrow))
+
+namespace std {
+
+// abs
+
+template <class _Tp> __DEVICE__ _Tp abs(const std::complex<_Tp> &__c) {
+  return hypot(__c.real(), __c.imag());
+}
+
+// arg
+
+template <class _Tp> __DEVICE__ _Tp arg(const std::complex<_Tp> &__c) {
+  return atan2(__c.imag(), __c.real());
+}
+
+template <class _Tp>
+typename enable_if<is_integral<_Tp>::value || is_same<_Tp, double>::value,
+                   double>::type
+arg(_Tp __re) {
+  return atan2(0., __re);
+}
+
+template <class _Tp>
+typename enable_if<is_same<_Tp, float>::value, float>::type arg(_Tp __re) {
+  return atan2f(0.F, __re);
+}
+
+// norm
+
+template <class _Tp> __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) {
+  if (std::isinf(__c.real()))
+    return abs(__c.real());
+  if (std::isinf(__c.imag()))
+    return abs(__c.imag());
+  return __c.real() * __c.real() + __c.imag() * __c.imag();
+}
+
+// conj
+
+template <class _Tp> std::complex<_Tp> conj(const std::complex<_Tp> &__c) {
+  return std::complex<_Tp>(__c.real(), -__c.imag());
+}
+
+// proj
+
+template <class _Tp> std::complex<_Tp> proj(const std::complex<_Tp> &__c) {
+  std::complex<_Tp> __r = __c;
+  if (std::isinf(__c.real()) || std::isinf(__c.imag()))
+    __r = std::complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
+  return __r;
+}
+
+// polar
+
+template <class _Tp>
+complex<_Tp> polar(const _Tp &__rho, const _Tp &__theta = _Tp()) {
+  if (std::isnan(__rho) || signbit(__rho))
+    return std::complex<_Tp>(_Tp(NAN), _Tp(NAN));
+  if (std::isnan(__theta)) {
+    if (std::isinf(__rho))
+      return std::complex<_Tp>(__rho, __theta);
+    return std::complex<_Tp>(__theta, __theta);
+  }
+  if (std::isinf(__theta)) {
+    if (std::isinf(__rho))
+      return std::complex<_Tp>(__rho, _Tp(NAN));
+    return std::complex<_Tp>(_Tp(NAN), _Tp(NAN));
+  }
+  _Tp __x = __rho * cos(__theta);
+  if (std::isnan(__x))
+    __x = 0;
+  _Tp __y = __rho * sin(__theta);
+  if (std::isnan(__y))
+    __y = 0;
+  return std::complex<_Tp>(__x, __y);
+}
+
+// log
+
+template <class _Tp> std::complex<_Tp> log(const std::complex<_Tp> &__x) {
+  return std::complex<_Tp>(log(abs(__x)), arg(__x));
+}
+
+// log10
+
+template <class _Tp> std::complex<_Tp> log10(const std::complex<_Tp> &__x) {
+  return log(__x) / log(_Tp(10));
+}
+
+// sqrt
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sqrt(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(_Tp(INFINITY), __x.imag());
+  if (std::isinf(__x.real())) {
+    if (__x.real() > _Tp(0))
+      return std::complex<_Tp>(__x.real(), std::isnan(__x.imag())
+                                               ? __x.imag()
+                                               : copysign(_Tp(0), __x.imag()));
+    return std::complex<_Tp>(std::isnan(__x.imag()) ? __x.imag() : _Tp(0),
+                             copysign(__x.real(), __x.imag()));
+  }
+  return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
+}
+
+// exp
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> exp(const std::complex<_Tp> &__x) {
+  _Tp __i = __x.imag();
+  if (std::isinf(__x.real())) {
+    if (__x.real() < _Tp(0)) {
+      if (!std::isfinite(__i))
+        __i = _Tp(1);
+    } else if (__i == 0 || !std::isfinite(__i)) {
+      if (std::isinf(__i))
+        __i = _Tp(NAN);
+      return std::complex<_Tp>(__x.real(), __i);
+    }
+  } else if (std::isnan(__x.real()) && __x.imag() == 0)
+    return __x;
+  _Tp __e = exp(__x.real());
+  return std::complex<_Tp>(__e * cos(__i), __e * sin(__i));
+}
+
+// pow
+
+template <class _Tp>
+std::complex<_Tp> pow(const std::complex<_Tp> &__x,
+                      const std::complex<_Tp> &__y) {
+  return exp(__y * log(__x));
+}
+
+// __sqr, computes pow(x, 2)
+
+template <class _Tp> std::complex<_Tp> __sqr(const std::complex<_Tp> &__x) {
+  return std::complex<_Tp>((__x.real() - __x.imag()) *
+                               (__x.real() + __x.imag()),
+                           _Tp(2) * __x.real() * __x.imag());
+}
+
+// asinh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> asinh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return __x;
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.real(),
+                               copysign(__pi * _Tp(0.25), __x.imag()));
+    return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.imag(), __x.real());
+    if (__x.imag() == 0)
+      return __x;
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(copysign(__x.imag(), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) + _Tp(1)));
+  return std::complex<_Tp>(copysign(__z.real(), __x.real()),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// acosh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> acosh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return std::complex<_Tp>(abs(__x.real()), __x.imag());
+    if (std::isinf(__x.imag())) {
+      if (__x.real() > 0)
+        return std::complex<_Tp>(__x.real(),
+                                 copysign(__pi * _Tp(0.25), __x.imag()));
+      else
+        return std::complex<_Tp>(-__x.real(),
+                                 copysign(__pi * _Tp(0.75), __x.imag()));
+    }
+    if (__x.real() < 0)
+      return std::complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
+    return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(abs(__x.imag()), __x.real());
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(abs(__x.imag()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
+  return std::complex<_Tp>(copysign(__z.real(), _Tp(0)),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// atanh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> atanh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.imag())) {
+    return std::complex<_Tp>(copysign(_Tp(0), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  }
+  if (std::isnan(__x.imag())) {
+    if (std::isinf(__x.real()) || __x.real() == 0)
+      return std::complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
+    return std::complex<_Tp>(__x.imag(), __x.imag());
+  }
+  if (std::isnan(__x.real())) {
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.real())) {
+    return std::complex<_Tp>(copysign(_Tp(0), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  }
+  if (abs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
+    return std::complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
+                             copysign(_Tp(0), __x.imag()));
+  }
+  std::complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
+  return std::complex<_Tp>(copysign(__z.real(), __x.real()),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// sinh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sinh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real()) && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(__x.real(), _Tp(NAN));
+  if (__x.real() == 0 && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(__x.real(), _Tp(NAN));
+  if (__x.imag() == 0 && !std::isfinite(__x.real()))
+    return __x;
+  return std::complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
+                           cosh(__x.real()) * sin(__x.imag()));
+}
+
+// cosh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> cosh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real()) && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(abs(__x.real()), _Tp(NAN));
+  if (__x.real() == 0 && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(_Tp(NAN), __x.real());
+  if (__x.real() == 0 && __x.imag() == 0)
+    return std::complex<_Tp>(_Tp(1), __x.imag());
+  if (__x.imag() == 0 && !std::isfinite(__x.real()))
+    return std::complex<_Tp>(abs(__x.real()), __x.imag());
+  return std::complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
+                           sinh(__x.real()) * sin(__x.imag()));
+}
+
+// tanh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> tanh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real())) {
+    if (!std::isfinite(__x.imag()))
+      return std::complex<_Tp>(_Tp(1), _Tp(0));
+    return std::complex<_Tp>(_Tp(1),
+                             copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
+  }
+  if (std::isnan(__x.real()) && __x.imag() == 0)
+    return __x;
+  _Tp __2r(_Tp(2) * __x.real());
+  _Tp __2i(_Tp(2) * __x.imag());
+  _Tp __d(cosh(__2r) + cos(__2i));
+  _Tp __2rsh(sinh(__2r));
+  if (std::isinf(__2rsh) && std::isinf(__d))
+    return std::complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1),
+                             __2i > _Tp(0) ? _Tp(0) : _Tp(-0.));
+  return std::complex<_Tp>(__2rsh / __d, sin(__2i) / __d);
+}
+
+// asin
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> asin(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// acos
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> acos(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return std::complex<_Tp>(__x.imag(), __x.real());
+    if (std::isinf(__x.imag())) {
+      if (__x.real() < _Tp(0))
+        return std::complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
+      return std::complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
+    }
+    if (__x.real() < _Tp(0))
+      return std::complex<_Tp>(__pi,
+                               signbit(__x.imag()) ? -__x.real() : __x.real());
+    return std::complex<_Tp>(_Tp(0),
+                             signbit(__x.imag()) ? __x.real() : -__x.real());
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.real(), -__x.imag());
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(__pi / _Tp(2), -__x.imag());
+  if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag())))
+    return std::complex<_Tp>(__pi / _Tp(2), -__x.imag());
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
+  if (signbit(__x.imag()))
+    return std::complex<_Tp>(abs(__z.imag()), abs(__z.real()));
+  return std::complex<_Tp>(abs(__z.imag()), -abs(__z.real()));
+}
+
+// atan
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> atan(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// sin
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sin(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// cos
+
+template <class _Tp> std::complex<_Tp> cos(const std::complex<_Tp> &__x) {
+  return cosh(complex<_Tp>(-__x.imag(), __x.real()));
+}
+
+// tan
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> tan(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+} // namespace std
+
+#endif
diff --git a/lib/include/popcntintrin.h b/lib/include/popcntintrin.h
index 3129010147..0aa94aecda 100644
--- a/lib/include/popcntintrin.h
+++ b/lib/include/popcntintrin.h
@@ -13,6 +13,12 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
 /// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// \param __A
 ///    An unsigned 32-bit integer operand.
 /// \returns A 32-bit integer containing the number of bits with value 1 in the
 ///    source operand.
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_popcnt_u32(unsigned int __A)
 {
   return __builtin_popcount(__A);
@@ -40,7 +46,7 @@ _mm_popcnt_u32(unsigned int __A)
 ///    An unsigned 64-bit integer operand.
 /// \returns A 64-bit integer containing the number of bits with value 1 in the
 ///    source operand.
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_popcnt_u64(unsigned long long __A)
 {
   return __builtin_popcountll(__A);
@@ -48,5 +54,6 @@ _mm_popcnt_u64(unsigned long long __A)
 #endif /* __x86_64__ */
 
 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif /* __POPCNTINTRIN_H */
diff --git a/lib/include/ppc_wrappers/smmintrin.h b/lib/include/ppc_wrappers/smmintrin.h
index 56ef6ba76b..64f0c76199 100644
--- a/lib/include/ppc_wrappers/smmintrin.h
+++ b/lib/include/ppc_wrappers/smmintrin.h
@@ -78,6 +78,30 @@ extern __inline __m128i
   return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
 }
 
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
+  __v16qi result = (__v16qi)__A;
+  result[__N & 0xf] = __D;
+  return (__m128i)result;
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
+  __v4si result = (__v4si)__A;
+  result[__N & 3] = __D;
+  return (__m128i)result;
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
+  __v2di result = (__v2di)__A;
+  result[__N & 1] = __D;
+  return (__m128i)result;
+}
+
 #else
 #include_next <smmintrin.h>
 #endif /* defined(__linux__) && defined(__ppc64__) */
diff --git a/lib/include/uintrintrin.h b/lib/include/uintrintrin.h
new file mode 100644
index 0000000000..78aa8779c3
--- /dev/null
+++ b/lib/include/uintrintrin.h
@@ -0,0 +1,150 @@
+/*===------------------ uintrintrin.h - UINTR intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86GPRINTRIN_H
+#error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef __UINTRINTRIN_H
+#define __UINTRINTRIN_H
+
+/* Define the default attributes for the functions in this file */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("uintr")))
+
+#ifdef __x86_64__
+
+/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
+/// user interrupt cannot be delivered on the instruction boundary following
+/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in
+/// 64-bit mode, and software is not executing inside an enclave; otherwise,
+/// each causes an invalid-opcode exception. Causes a transactional abort if
+/// executed inside a transactional region; the abort loads EAX as it would
+/// had it been due to an execution of CLI.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the CLUI instruction.
+///
+/// \operation
+///   UIF := 0
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_clui (void)
+{
+  __builtin_ia32_clui();
+}
+
+/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a
+/// user interrupt may be delivered on the instruction boundary following
+/// STUI. Can be executed only if CR4.UINT = 1, the logical processor is in
+/// 64-bit mode, and software is not executing inside an enclave; otherwise,
+/// each causes an invalid-opcode exception. Causes a transactional abort if
+/// executed inside a transactional region; the abort loads EAX as it would
+/// had it been due to an execution of STI.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the STUI instruction.
+///
+/// \operation
+///   UIF := 1
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_stui (void)
+{
+  __builtin_ia32_stui();
+}
+
+/// Get the current value of the user interrupt flag (UIF). Can be executed
+/// regardless of CPL and inside a transactional region. Can be executed only
+/// if CR4.UINT = 1, the logical processor is in 64-bit mode, and software is
+/// not executing inside an enclave; otherwise, it causes an invalid-opcode
+/// exception.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the TESTUI instruction.
+///
+/// \returns The current value of the user interrupt flag (UIF).
+///
+/// \operation
+///   CF := UIF
+///   ZF := 0
+///   AF := 0
+///   OF := 0
+///   PF := 0
+///   SF := 0
+///   dst := CF
+/// \endoperation
static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_testui (void)
+{
+  return __builtin_ia32_testui();
+}
+
+/// Send interprocessor user interrupt. Can be executed only if
+/// CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode,
+/// and software is not executing inside an enclave; otherwise, it causes an
+/// invalid-opcode exception. May be executed at any privilege level, all of
+/// its memory accesses are performed with supervisor privilege.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the SENDUIPI instruction.
+///
+/// \param __a
+///    Index of user-interrupt target table entry in user-interrupt target
+///    table.
+///
+/// \operation
+///   IF __a > UITTSZ
+///     GP (0)
+///   FI
+///   tempUITTE := MEM[UITTADDR + (a<<4)]
+///   // tempUITTE must be valid, and can't have any reserved bit set
+///   IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0)
+///     GP (0)
+///   FI
+///   tempUPID := MEM[tempUITTE.UPIDADDR] // under lock
+///   // tempUPID can't have any reserved bit set
+///   IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0)
+///     GP (0) // release lock
+///   FI
+///   tempUPID.PIR[tempUITTE.UV] := 1;
+///   IF (tempUPID.SN == 0 AND tempUPID.ON == 0)
+///     tempUPID.ON := 1
+///     sendNotify := 1
+///   ELSE
+///     sendNotify := 0
+///   FI
+///   MEM[tempUITTE.UPIDADDR] := tempUPID // release lock
+///   IF sendNotify == 1
+///     IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode
+///       // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC
+///       // ID tempUPID.NDST
+///       SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST)
+///     ELSE
+///       // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC
+///       // ID tempUPID.NDST[15:8]
+///       SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
+///     FI
+///   FI
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_senduipi (unsigned long long __a)
+{
+  __builtin_ia32_senduipi(__a);
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __UINTRINTRIN_H */
diff --git a/lib/include/wasm_simd128.h b/lib/include/wasm_simd128.h
index b78123834b..ac88516ac9 100644
--- a/lib/include/wasm_simd128.h
+++ b/lib/include/wasm_simd128.h
@@ -18,8 +18,7 @@ typedef int32_t v128_t __attribute__((__vector_size__(16), __aligned__(16)));
 // Internal types determined by clang builtin definitions
 typedef int32_t __v128_u __attribute__((__vector_size__(16), __aligned__(1)));
-typedef char __i8x16 __attribute__((__vector_size__(16), __aligned__(16)));
-typedef signed char __s8x16
+typedef signed char __i8x16
     __attribute__((__vector_size__(16),
__aligned__(16))); typedef unsigned char __u8x16 __attribute__((__vector_size__(16), __aligned__(16))); @@ -35,6 +34,13 @@ typedef unsigned long long __u64x2 typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16))); typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16))); +typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8))); +typedef unsigned char __u8x8 + __attribute__((__vector_size__(8), __aligned__(8))); +typedef short __i16x4 __attribute__((__vector_size__(8), __aligned__(8))); +typedef unsigned short __u16x4 + __attribute__((__vector_size__(8), __aligned__(8))); + #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("simd128"), \ __min_vector_width__(128))) @@ -273,7 +279,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_splat(int8_t __a) { (__builtin_wasm_extract_lane_s_i8x16((__i8x16)(__a), __i)) #define wasm_u8x16_extract_lane(__a, __i) \ - (__builtin_wasm_extract_lane_u_i8x16((__i8x16)(__a), __i)) + (__builtin_wasm_extract_lane_u_i8x16((__u8x16)(__a), __i)) #define wasm_i8x16_replace_lane(__a, __i, __b) \ ((v128_t)__builtin_wasm_replace_lane_i8x16((__i8x16)(__a), __i, __b)) @@ -286,7 +292,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_splat(int16_t __a) { (__builtin_wasm_extract_lane_s_i16x8((__i16x8)(__a), __i)) #define wasm_u16x8_extract_lane(__a, __i) \ - (__builtin_wasm_extract_lane_u_i16x8((__i16x8)(__a), __i)) + (__builtin_wasm_extract_lane_u_i16x8((__u16x8)(__a), __i)) #define wasm_i16x8_replace_lane(__a, __i, __b) \ ((v128_t)__builtin_wasm_replace_lane_i16x8((__i16x8)(__a), __i, __b)) @@ -333,17 +339,17 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_splat(double __a) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_eq(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a == (__s8x16)__b); + return (v128_t)((__i8x16)__a == (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_ne(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a != (__s8x16)__b); + return (v128_t)((__i8x16)__a != (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_lt(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a < (__s8x16)__b); + return (v128_t)((__i8x16)__a < (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_lt(v128_t __a, @@ -353,7 +359,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_lt(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_gt(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a > (__s8x16)__b); + return (v128_t)((__i8x16)__a > (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_gt(v128_t __a, @@ -363,7 +369,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_gt(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_le(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a <= (__s8x16)__b); + return (v128_t)((__i8x16)__a <= (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_le(v128_t __a, @@ -373,7 +379,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_le(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_ge(v128_t __a, v128_t __b) { - return (v128_t)((__s8x16)__a >= (__s8x16)__b); + return (v128_t)((__i8x16)__a >= (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_ge(v128_t __a, @@ -595,7 +601,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS 
wasm_i8x16_shr(v128_t __a, int32_t __b) { - return (v128_t)((__s8x16)__a >> __b); + return (v128_t)((__i8x16)__a >> __b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a, @@ -616,8 +622,8 @@ wasm_i8x16_add_saturate(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_add_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__i8x16)__a, - (__i8x16)__b); + return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__u8x16)__a, + (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a, @@ -633,8 +639,8 @@ wasm_i8x16_sub_saturate(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_sub_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__i8x16)__a, - (__i8x16)__b); + return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__u8x16)__a, + (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a, @@ -644,7 +650,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_u_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_wasm_min_u_i8x16((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a, @@ -654,12 +660,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_wasm_max_u_i8x16((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_avgr(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_avgr_u_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_wasm_avgr_u_i8x16((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_abs(v128_t __a) { @@ -706,8 +712,8 @@ wasm_i16x8_add_saturate(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_add_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__i16x8)__a, - (__i16x8)__b); + return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__u16x8)__a, + (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a, @@ -723,8 +729,8 @@ wasm_i16x8_sub_saturate(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_sub_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__i16x8)__a, - (__i16x8)__b); + return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__u16x8)__a, + (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a, @@ -739,7 +745,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_u_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_wasm_min_u_i16x8((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a, @@ -749,12 +755,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_wasm_max_u_i16x8((__u16x8)__a, 
(__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_avgr(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_avgr_u_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_wasm_avgr_u_i16x8((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_abs(v128_t __a) { @@ -810,7 +816,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_u_i32x4((__i32x4)__a, (__i32x4)__b); + return (v128_t)__builtin_wasm_min_u_i32x4((__u32x4)__a, (__u32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a, @@ -820,7 +826,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i32x4((__i32x4)__a, (__i32x4)__b); + return (v128_t)__builtin_wasm_max_u_i32x4((__u32x4)__a, (__u32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_neg(v128_t __a) { @@ -1071,8 +1077,8 @@ wasm_i8x16_narrow_i16x8(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_narrow_i16x8(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_narrow_u_i8x16_i16x8((__i16x8)__a, - (__i16x8)__b); + return (v128_t)__builtin_wasm_narrow_u_i8x16_i16x8((__u16x8)__a, + (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS @@ -1083,48 +1089,76 @@ wasm_i16x8_narrow_i32x4(v128_t __a, v128_t __b) { static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_narrow_i32x4(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_narrow_u_i16x8_i32x4((__i32x4)__a, - (__i32x4)__b); + return (v128_t)__builtin_wasm_narrow_u_i16x8_i32x4((__u32x4)__a, + (__u32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_low_i8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_s_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__i8x8){((__i8x16)__a)[0], ((__i8x16)__a)[1], ((__i8x16)__a)[2], + ((__i8x16)__a)[3], ((__i8x16)__a)[4], ((__i8x16)__a)[5], + ((__i8x16)__a)[6], ((__i8x16)__a)[7]}, + __i16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_high_i8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_s_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__i8x8){((__i8x16)__a)[8], ((__i8x16)__a)[9], ((__i8x16)__a)[10], + ((__i8x16)__a)[11], ((__i8x16)__a)[12], ((__i8x16)__a)[13], + ((__i8x16)__a)[14], ((__i8x16)__a)[15]}, + __i16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_low_u8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_u_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__u8x8){((__u8x16)__a)[0], ((__u8x16)__a)[1], ((__u8x16)__a)[2], + ((__u8x16)__a)[3], ((__u8x16)__a)[4], ((__u8x16)__a)[5], + ((__u8x16)__a)[6], ((__u8x16)__a)[7]}, + __u16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_widen_high_u8x16(v128_t __a) { - return (v128_t)__builtin_wasm_widen_high_u_i16x8_i8x16((__i8x16)__a); + return (v128_t) __builtin_convertvector( + (__u8x8){((__u8x16)__a)[8], ((__u8x16)__a)[9], ((__u8x16)__a)[10], + ((__u8x16)__a)[11], ((__u8x16)__a)[12], ((__u8x16)__a)[13], + ((__u8x16)__a)[14], ((__u8x16)__a)[15]}, + __u16x8); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_widen_low_i16x8(v128_t __a) { - return (v128_t)__builtin_wasm_widen_low_s_i32x4_i16x8((__i16x8)__a); + return (v128_t) 
__builtin_convertvector(
+      (__i16x4){((__i16x8)__a)[0], ((__i16x8)__a)[1], ((__i16x8)__a)[2],
+                ((__i16x8)__a)[3]},
+      __i32x4);
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_widen_high_i16x8(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_high_s_i32x4_i16x8((__i16x8)__a);
+  return (v128_t) __builtin_convertvector(
+      (__i16x4){((__i16x8)__a)[4], ((__i16x8)__a)[5], ((__i16x8)__a)[6],
+                ((__i16x8)__a)[7]},
+      __i32x4);
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_widen_low_u16x8(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_low_u_i32x4_i16x8((__i16x8)__a);
+  return (v128_t) __builtin_convertvector(
+      (__u16x4){((__u16x8)__a)[0], ((__u16x8)__a)[1], ((__u16x8)__a)[2],
+                ((__u16x8)__a)[3]},
+      __u32x4);
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_widen_high_u16x8(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_high_u_i32x4_i16x8((__i16x8)__a);
+  return (v128_t) __builtin_convertvector(
+      (__u16x4){((__u16x8)__a)[4], ((__u16x8)__a)[5], ((__u16x8)__a)[6],
+                ((__u16x8)__a)[7]},
+      __u32x4);
 }
 
 // Undefine helper macros
diff --git a/lib/include/x86gprintrin.h b/lib/include/x86gprintrin.h
new file mode 100644
index 0000000000..1fc6cab4b2
--- /dev/null
+++ b/lib/include/x86gprintrin.h
@@ -0,0 +1,23 @@
+/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86GPRINTRIN_H
+#define __X86GPRINTRIN_H
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+    defined(__HRESET__)
+#include <hresetintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+    defined(__UINTR__)
+#include <uintrintrin.h>
+#endif
+
+#endif /* __X86GPRINTRIN_H */
From 1e66ac5755ebe8614b8b3a278eabcfca5f78c3c2 Mon Sep 17 00:00:00 2001
From: Jakub Konka
Date: Wed, 16 Dec 2020 11:48:54 +0100
Subject: [PATCH 04/67] Update libunwind

llvm commit b2851aea80e5a8f0cfd6c3c5a56a6b00fb28c6b6
---
 lib/libunwind/include/libunwind.h          |   8 +-
 lib/libunwind/src/AddressSpace.hpp         | 262 +++----
 lib/libunwind/src/DwarfInstructions.hpp    |   9 +-
 lib/libunwind/src/DwarfParser.hpp          | 803 +++++++++++----------
 lib/libunwind/src/FrameHeaderCache.hpp     |   2 +-
 lib/libunwind/src/Registers.hpp            |  16 +-
 lib/libunwind/src/Unwind-seh.cpp           |  20 +-
 lib/libunwind/src/UnwindCursor.hpp         | 115 ++-
 lib/libunwind/src/UnwindLevel1.c           |   4 +-
 lib/libunwind/src/UnwindRegistersRestore.S |  25 +-
 lib/libunwind/src/UnwindRegistersSave.S    |   9 +-
 lib/libunwind/src/Unwind_AppleExtras.cpp   |  70 --
 lib/libunwind/src/assembly.h               |  24 +-
 lib/libunwind/src/config.h                 |  54 +-
 14 files changed, 664 insertions(+), 757 deletions(-)

diff --git a/lib/libunwind/include/libunwind.h b/lib/libunwind/include/libunwind.h
index 23ef47f4ac..6ec649a460 100644
--- a/lib/libunwind/include/libunwind.h
+++ b/lib/libunwind/include/libunwind.h
@@ -43,6 +43,12 @@
 #define LIBUNWIND_AVAIL
 #endif
 
+#if defined(_WIN32) && defined(__SEH__)
+  #define LIBUNWIND_CURSOR_ALIGNMENT_ATTR __attribute__((__aligned__(16)))
+#else
+  #define LIBUNWIND_CURSOR_ALIGNMENT_ATTR
+#endif
+
 /* error codes */
 enum {
   UNW_ESUCCESS = 0, /* no error */
@@ -68,7 +74,7 @@ typedef struct unw_context_t unw_context_t;
 
 struct unw_cursor_t {
   uint64_t data[_LIBUNWIND_CURSOR_SIZE];
-};
+} LIBUNWIND_CURSOR_ALIGNMENT_ATTR;
 typedef struct unw_cursor_t unw_cursor_t;
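// Editorial sketch (not part of the patch) of what the new alignment
// attribute guarantees on Windows SEH targets, where the cursor presumably
// embeds a machine CONTEXT that must be 16-byte aligned; assumes a C11
// compiler for _Static_assert:
//
//   #if defined(_WIN32) && defined(__SEH__)
//   _Static_assert(_Alignof(unw_cursor_t) >= 16,
//                  "SEH unwinding requires a 16-byte aligned cursor");
//   #endif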
typedef struct unw_addr_space *unw_addr_space_t;
diff --git a/lib/libunwind/src/AddressSpace.hpp b/lib/libunwind/src/AddressSpace.hpp
index 93395ffb3b..171318ff63 100644
--- a/lib/libunwind/src/AddressSpace.hpp
+++ b/lib/libunwind/src/AddressSpace.hpp
@@ -17,6 +17,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "libunwind.h"
+#include "config.h"
+#include "dwarf2.h"
+#include "EHHeaderParser.hpp"
+#include "Registers.hpp"
+
 #ifndef _LIBUNWIND_USE_DLADDR
   #if !defined(_LIBUNWIND_IS_BAREMETAL) && !defined(_WIN32)
     #define _LIBUNWIND_USE_DLADDR 1
@@ -39,19 +45,6 @@ struct EHABIIndexEntry {
 };
 #endif
 
-#ifdef __APPLE__
-#include <mach-o/dyld_priv.h>
-namespace libunwind {
-   bool checkKeyMgrRegisteredFDEs(uintptr_t targetAddr, void *&fde);
-}
-#endif
-
-#include "libunwind.h"
-#include "config.h"
-#include "dwarf2.h"
-#include "EHHeaderParser.hpp"
-#include "Registers.hpp"
-
 #ifdef __APPLE__
 
 struct dyld_unwind_sections
@@ -62,43 +55,9 @@ namespace libunwind {
   const void* compact_unwind_section;
   uintptr_t compact_unwind_section_length;
 };
-  #if (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) \
-      && (__MAC_OS_X_VERSION_MIN_REQUIRED >= 1070)) \
-      || defined(__IPHONE_OS_VERSION_MIN_REQUIRED)
-    // In 10.7.0 or later, libSystem.dylib implements this function.
-    extern "C" bool _dyld_find_unwind_sections(void *, dyld_unwind_sections *);
-  #else
-    // In 10.6.x and earlier, we need to implement this functionality. Note
-    // that this requires a newer version of libmacho (from cctools) than is
-    // present in libSystem on 10.6.x (for getsectiondata).
-    static inline bool _dyld_find_unwind_sections(void* addr,
-                                                  dyld_unwind_sections* info) {
-      // Find mach-o image containing address.
-      Dl_info dlinfo;
-      if (!dladdr(addr, &dlinfo))
-        return false;
-#if __LP64__
-      const struct mach_header_64 *mh = (const struct mach_header_64 *)dlinfo.dli_fbase;
-#else
-      const struct mach_header *mh = (const struct mach_header *)dlinfo.dli_fbase;
-#endif
-      // Initialize the return struct
-      info->mh = (const struct mach_header *)mh;
-      info->dwarf_section = getsectiondata(mh, "__TEXT", "__eh_frame", &info->dwarf_section_length);
-      info->compact_unwind_section = getsectiondata(mh, "__TEXT", "__unwind_info", &info->compact_unwind_section_length);
-
-      if (!info->dwarf_section) {
-        info->dwarf_section_length = 0;
-      }
-
-      if (!info->compact_unwind_section) {
-        info->compact_unwind_section_length = 0;
-      }
-
-      return true;
-    }
-  #endif
+  // In 10.7.0 or later, libSystem.dylib implements this function.
+  extern "C" bool _dyld_find_unwind_sections(void *, dyld_unwind_sections *);
 
 #elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL)
@@ -139,22 +98,15 @@ extern char __eh_frame_hdr_end;
 extern char __exidx_start;
 extern char __exidx_end;
 
-#elif defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
+#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_WIN32)
 
-// ELF-based systems may use dl_iterate_phdr() to access sections
-// containing unwinding information. The ElfW() macro for pointer-size
-// independent ELF header traversal is not provided by <link.h> on some
-// systems (e.g., FreeBSD). On these systems the data structures are
-// just called Elf_XXX. Define ElfW() locally.
-#ifndef _WIN32
-#include <link.h>
-#else
 #include <windows.h>
 #include <psapi.h>
-#endif
-#if !defined(ElfW)
-#define ElfW(type) Elf_##type
-#endif
+
+#elif defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) || \
+      defined(_LIBUNWIND_USE_DL_UNWIND_FIND_EXIDX)
+
+#include <link.h>
 
 #endif
 
 namespace libunwind {
 
 /// Used by findUnwindSections() to return info about needed sections.
struct UnwindInfoSections { -#if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) || defined(_LIBUNWIND_SUPPORT_DWARF_INDEX) || \ - defined(_LIBUNWIND_SUPPORT_COMPACT_UNWIND) - // No dso_base for SEH or ARM EHABI. +#if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) || \ + defined(_LIBUNWIND_SUPPORT_COMPACT_UNWIND) || \ + defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) + // No dso_base for SEH. uintptr_t dso_base; #endif +#if defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) + uintptr_t text_segment_length; +#endif #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) uintptr_t dwarf_section; uintptr_t dwarf_section_length; @@ -290,11 +246,11 @@ inline int64_t LocalAddressSpace::getSLEB128(pint_t &addr, pint_t end) { if (p == pend) _LIBUNWIND_ABORT("truncated sleb128 expression"); byte = *p++; - result |= ((byte & 0x7f) << bit); + result |= (uint64_t)(byte & 0x7f) << bit; bit += 7; } while (byte & 0x80); // sign extend negative numbers - if ((byte & 0x40) != 0) + if ((byte & 0x40) != 0 && bit < 64) result |= (-1ULL) << bit; addr = (pint_t) p; return result; @@ -392,23 +348,14 @@ LocalAddressSpace::getEncodedP(pint_t &addr, pint_t end, uint8_t encoding, return result; } -#ifdef __APPLE__ -#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL) -#elif defined(_LIBUNWIND_ARM_EHABI) && defined(_LIBUNWIND_IS_BAREMETAL) -#elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_WIN32) -#elif defined(_LIBUNWIND_SUPPORT_SEH_UNWIND) && defined(_WIN32) -#elif defined(_LIBUNWIND_ARM_EHABI) && defined(__BIONIC__) -// Code inside findUnwindSections handles all these cases. -// -// Although the above ifdef chain is ugly, there doesn't seem to be a cleaner -// way to handle it. The generalized boolean expression is: -// -// A OR (B AND C) OR (D AND C) OR (B AND E) OR (F AND E) OR (D AND G) -// -// Running it through various boolean expression simplifiers gives expressions -// that don't help at all. -#elif defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) +#if defined(_LIBUNWIND_USE_DL_ITERATE_PHDR) +// The ElfW() macro for pointer-size independent ELF header traversal is not +// provided by on some systems (e.g., FreeBSD). On these systems the +// data structures are just called Elf_XXX. Define ElfW() locally. +#if !defined(ElfW) + #define ElfW(type) Elf_##type +#endif #if !defined(Elf_Half) typedef ElfW(Half) Elf_Half; #endif @@ -447,16 +394,12 @@ struct _LIBUNWIND_HIDDEN dl_iterate_cb_data { uintptr_t targetAddr; }; -#if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) - #if !defined(_LIBUNWIND_SUPPORT_DWARF_INDEX) - #error "_LIBUNWIND_SUPPORT_DWARF_UNWIND requires _LIBUNWIND_SUPPORT_DWARF_INDEX on this platform." - #endif - #if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) #include "FrameHeaderCache.hpp" -// There should be just one of these per process. -static FrameHeaderCache ProcessFrameHeaderCache; +// Typically there is one cache per process, but when libunwind is built as a +// hermetic static library, then each shared object may have its own cache. 
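// Usage sketch (editorial, condensed from findUnwindSectionsByPhdr below):
// a cache lookup is attempted first, and a successful scan is published back:
//
//   if (TheFrameHeaderCache.find(pinfo, pinfo_size, data))
//     return 1;                               // hit: sects already filled in
//   ...scan program headers...
//   TheFrameHeaderCache.add(cbdata->sects);   // miss: remember for next time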
+static FrameHeaderCache TheFrameHeaderCache;
 #endif
 
 static bool checkAddrInSegment(const Elf_Phdr *phdr, size_t image_base,
@@ -466,95 +409,93 @@ static bool checkAddrInSegment(const Elf_Phdr *phdr, size_t image_base,
     uintptr_t end = begin + phdr->p_memsz;
     if (cbdata->targetAddr >= begin && cbdata->targetAddr < end) {
       cbdata->sects->dso_base = begin;
-      cbdata->sects->dwarf_section_length = phdr->p_memsz;
+      cbdata->sects->text_segment_length = phdr->p_memsz;
       return true;
     }
   }
   return false;
 }
 
+static bool checkForUnwindInfoSegment(const Elf_Phdr *phdr, size_t image_base,
+                                      dl_iterate_cb_data *cbdata) {
+#if defined(_LIBUNWIND_SUPPORT_DWARF_INDEX)
+  if (phdr->p_type == PT_GNU_EH_FRAME) {
+    EHHeaderParser<LocalAddressSpace>::EHHeaderInfo hdrInfo;
+    uintptr_t eh_frame_hdr_start = image_base + phdr->p_vaddr;
+    cbdata->sects->dwarf_index_section = eh_frame_hdr_start;
+    cbdata->sects->dwarf_index_section_length = phdr->p_memsz;
+    if (EHHeaderParser<LocalAddressSpace>::decodeEHHdr(
+            *cbdata->addressSpace, eh_frame_hdr_start, phdr->p_memsz,
+            hdrInfo)) {
+      // .eh_frame_hdr records the start of .eh_frame, but not its size.
+      // Rely on a zero terminator to find the end of the section.
+      cbdata->sects->dwarf_section = hdrInfo.eh_frame_ptr;
+      cbdata->sects->dwarf_section_length = UINTPTR_MAX;
+      return true;
+    }
+  }
+  return false;
+#elif defined(_LIBUNWIND_ARM_EHABI)
+  if (phdr->p_type == PT_ARM_EXIDX) {
+    uintptr_t exidx_start = image_base + phdr->p_vaddr;
+    cbdata->sects->arm_section = exidx_start;
+    cbdata->sects->arm_section_length = phdr->p_memsz;
+    return true;
+  }
+  return false;
+#else
+#error Need one of _LIBUNWIND_SUPPORT_DWARF_INDEX or _LIBUNWIND_ARM_EHABI
+#endif
+}
+
 static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo,
                                     size_t pinfo_size, void *data) {
   auto cbdata = static_cast<dl_iterate_cb_data *>(data);
   if (pinfo->dlpi_phnum == 0 || cbdata->targetAddr < pinfo->dlpi_addr)
     return 0;
 #if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE)
-  if (ProcessFrameHeaderCache.find(pinfo, pinfo_size, data))
+  if (TheFrameHeaderCache.find(pinfo, pinfo_size, data))
     return 1;
+#else
+  // Avoid warning about unused variable.
+  (void)pinfo_size;
 #endif
 
   Elf_Addr image_base = calculateImageBase(pinfo);
-  bool found_obj = false;
-  bool found_hdr = false;
-
-  // Third phdr is usually the executable phdr.
-  if (pinfo->dlpi_phnum > 2)
-    found_obj = checkAddrInSegment(&pinfo->dlpi_phdr[2], image_base, cbdata);
-
-  // PT_GNU_EH_FRAME is usually near the end. Iterate backward. We already know
-  // that there is one or more phdrs.
-  for (Elf_Half i = pinfo->dlpi_phnum; i > 0; i--) {
-    const Elf_Phdr *phdr = &pinfo->dlpi_phdr[i - 1];
-    if (!found_hdr && phdr->p_type == PT_GNU_EH_FRAME) {
-      EHHeaderParser<LocalAddressSpace>::EHHeaderInfo hdrInfo;
-      uintptr_t eh_frame_hdr_start = image_base + phdr->p_vaddr;
-      cbdata->sects->dwarf_index_section = eh_frame_hdr_start;
-      cbdata->sects->dwarf_index_section_length = phdr->p_memsz;
-      found_hdr = EHHeaderParser<LocalAddressSpace>::decodeEHHdr(
-          *cbdata->addressSpace, eh_frame_hdr_start, phdr->p_memsz,
-          hdrInfo);
-      if (found_hdr)
-        cbdata->sects->dwarf_section = hdrInfo.eh_frame_ptr;
-    } else if (!found_obj) {
-      found_obj = checkAddrInSegment(phdr, image_base, cbdata);
-    }
-    if (found_obj && found_hdr) {
-#if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE)
-      ProcessFrameHeaderCache.add(cbdata->sects);
-#endif
-      return 1;
+  // Most shared objects seen in this callback function likely don't contain the
+  // target address, so optimize for that. Scan for a matching PT_LOAD segment
+  // first and bail when it isn't found.
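// Editorial sketch of the dl_iterate_phdr() protocol this callback serves
// (not libunwind code): iteration stops as soon as the callback returns
// nonzero, so returning 1 both reports success and ends the walk.
//
//   dl_iterate_cb_data data = {&addressSpace, &sects, targetAddr};
//   if (dl_iterate_phdr(findUnwindSectionsByPhdr, &data))
//     /* sects now describes the unwind info covering targetAddr */;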
+  bool found_text = false;
+  for (Elf_Half i = 0; i < pinfo->dlpi_phnum; ++i) {
+    if (checkAddrInSegment(&pinfo->dlpi_phdr[i], image_base, cbdata)) {
+      found_text = true;
+      break;
     }
   }
-  cbdata->sects->dwarf_section_length = 0;
-  return 0;
-}
-
-#else // defined(LIBUNWIND_SUPPORT_DWARF_UNWIND)
-// Given all the #ifdef's above, the code here is for
-// defined(LIBUNWIND_ARM_EHABI)
-
-static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t,
-                                    void *data) {
-  auto *cbdata = static_cast<dl_iterate_cb_data *>(data);
-  bool found_obj = false;
-  bool found_hdr = false;
-
-  assert(cbdata);
-  assert(cbdata->sects);
-
-  if (cbdata->targetAddr < pinfo->dlpi_addr)
+  if (!found_text)
     return 0;
-  Elf_Addr image_base = calculateImageBase(pinfo);
-
-  for (Elf_Half i = 0; i < pinfo->dlpi_phnum; i++) {
-    const Elf_Phdr *phdr = &pinfo->dlpi_phdr[i];
-    if (phdr->p_type == PT_LOAD) {
-      uintptr_t begin = image_base + phdr->p_vaddr;
-      uintptr_t end = begin + phdr->p_memsz;
-      if (cbdata->targetAddr >= begin && cbdata->targetAddr < end)
-        found_obj = true;
-    } else if (phdr->p_type == PT_ARM_EXIDX) {
-      uintptr_t exidx_start = image_base + phdr->p_vaddr;
-      cbdata->sects->arm_section = exidx_start;
-      cbdata->sects->arm_section_length = phdr->p_memsz;
-      found_hdr = true;
+  // PT_GNU_EH_FRAME and PT_ARM_EXIDX are usually near the end. Iterate
+  // backward.
+  bool found_unwind = false;
+  for (Elf_Half i = pinfo->dlpi_phnum; i > 0; i--) {
+    const Elf_Phdr *phdr = &pinfo->dlpi_phdr[i - 1];
+    if (checkForUnwindInfoSegment(phdr, image_base, cbdata)) {
+      found_unwind = true;
+      break;
     }
   }
-  return found_obj && found_hdr;
+  if (!found_unwind)
+    return 0;
+
+#if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE)
+  TheFrameHeaderCache.add(cbdata->sects);
+#endif
+  return 1;
 }
-#endif // defined(LIBUNWIND_SUPPORT_DWARF_UNWIND)
-#endif // defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
+
+#endif // defined(_LIBUNWIND_USE_DL_ITERATE_PHDR)
 
 inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr,
@@ -572,6 +513,7 @@ inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr,
     return true;
   }
 #elif defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) && defined(_LIBUNWIND_IS_BAREMETAL)
+  info.dso_base = 0;
   // Bare metal is statically linked, so no need to ask the dynamic loader
   info.dwarf_section_length = (uintptr_t)(&__eh_frame_end - &__eh_frame_start);
   info.dwarf_section = (uintptr_t)(&__eh_frame_start);
@@ -638,16 +580,14 @@ inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr,
   (void)targetAddr;
   (void)info;
   return true;
-#elif defined(_LIBUNWIND_ARM_EHABI) && defined(__BIONIC__)
-  // For ARM EHABI, Bionic didn't implement dl_iterate_phdr until API 21. After
-  // API 21, dl_iterate_phdr exists, but dl_unwind_find_exidx is much faster.
+#elif defined(_LIBUNWIND_USE_DL_UNWIND_FIND_EXIDX)
   int length = 0;
   info.arm_section =
       (uintptr_t)dl_unwind_find_exidx((_Unwind_Ptr)targetAddr, &length);
   info.arm_section_length = (uintptr_t)length * sizeof(EHABIIndexEntry);
   if (info.arm_section && info.arm_section_length)
     return true;
-#elif defined(_LIBUNWIND_ARM_EHABI) || defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
+#elif defined(_LIBUNWIND_USE_DL_ITERATE_PHDR)
   dl_iterate_cb_data cb_data = {this, &info, targetAddr};
   int found = dl_iterate_phdr(findUnwindSectionsByPhdr, &cb_data);
   return static_cast<bool>(found);
@@ -658,14 +598,10 @@ inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr,

 inline bool LocalAddressSpace::findOtherFDE(pint_t targetAddr, pint_t &fde) {
-#ifdef __APPLE__
-  return checkKeyMgrRegisteredFDEs(targetAddr, *((void**)&fde));
-#else
   // TO DO: if OS has way to dynamically register FDEs, check that.
   (void)targetAddr;
   (void)fde;
   return false;
-#endif
 }

 inline bool LocalAddressSpace::findFunctionName(pint_t addr, char *buf,
diff --git a/lib/libunwind/src/DwarfInstructions.hpp b/lib/libunwind/src/DwarfInstructions.hpp
index ee98f538d4..c39cabe1f7 100644
--- a/lib/libunwind/src/DwarfInstructions.hpp
+++ b/lib/libunwind/src/DwarfInstructions.hpp
@@ -93,7 +93,8 @@ typename A::pint_t DwarfInstructions<A, R>::getSavedRegister(

   case CFI_Parser<A>::kRegisterInRegister:
     return registers.getRegister((int)savedReg.value);
-
+  case CFI_Parser<A>::kRegisterUndefined:
+    return 0;
   case CFI_Parser<A>::kRegisterUnused:
   case CFI_Parser<A>::kRegisterOffsetFromCFA:
     // FIX ME
@@ -117,6 +118,7 @@ double DwarfInstructions<A, R>::getSavedFloatRegister(

   case CFI_Parser<A>::kRegisterIsExpression:
   case CFI_Parser<A>::kRegisterUnused:
+  case CFI_Parser<A>::kRegisterUndefined:
   case CFI_Parser<A>::kRegisterOffsetFromCFA:
   case CFI_Parser<A>::kRegisterInRegister:
     // FIX ME
@@ -140,6 +142,7 @@ v128 DwarfInstructions<A, R>::getSavedVectorRegister(

   case CFI_Parser<A>::kRegisterIsExpression:
   case CFI_Parser<A>::kRegisterUnused:
+  case CFI_Parser<A>::kRegisterUndefined:
   case CFI_Parser<A>::kRegisterOffsetFromCFA:
   case CFI_Parser<A>::kRegisterInRegister:
     // FIX ME
@@ -190,6 +193,10 @@ int DwarfInstructions<A, R>::stepWithDwarf(A &addressSpace, pint_t pc,
                                           prolog.savedRegisters[i]));
           else
             return UNW_EBADREG;
+        } else if (i == (int)cieInfo.returnAddressRegister) {
+          // Leaf functions keep the return address in a register; there are
+          // no explicit instructions for how to restore it.
+          returnAddress = registers.getRegister(cieInfo.returnAddressRegister);
         }
       }

diff --git a/lib/libunwind/src/DwarfParser.hpp b/lib/libunwind/src/DwarfParser.hpp
index d05ac46836..de0eb6de9d 100644
--- a/lib/libunwind/src/DwarfParser.hpp
+++ b/lib/libunwind/src/DwarfParser.hpp
@@ -69,6 +69,7 @@ public:
   };
   enum RegisterSavedWhere {
     kRegisterUnused,
+    kRegisterUndefined,
     kRegisterInCFA,
     kRegisterOffsetFromCFA,
     kRegisterInRegister,
@@ -87,9 +88,6 @@ public:
     int32_t cfaRegisterOffset; // CFA = (cfaRegister)+cfaRegisterOffset
     int64_t cfaExpression;     // CFA = expression
     uint32_t spExtraArgSize;
-    uint32_t codeOffsetAtStackDecrement;
-    bool registersInOtherRegisters;
-    bool sameValueUsed;
     RegisterLocation savedRegisters[kMaxRegisterNumber + 1];
     enum class InitializeTime { kLazy, kNormal };

@@ -134,8 +132,26 @@ public:
     PrologInfo info;
   };

+  struct RememberStack {
+    PrologInfoStackEntry *entry;
+    RememberStack() : entry(nullptr) {}
+    ~RememberStack() {
+#if defined(_LIBUNWIND_REMEMBER_CLEANUP_NEEDED)
+      // Clean up rememberStack. Even in the case where every
+      // DW_CFA_remember_state is paired with a DW_CFA_restore_state,
+      // parseFDEInstructions can skip restore opcodes if it reaches the
+      // target PC and stops interpreting, so we must not leak memory.
+      while (entry) {
+        PrologInfoStackEntry *next = entry->next;
+        _LIBUNWIND_REMEMBER_FREE(entry);
+        entry = next;
+      }
+#endif
+    }
+  };
+
   static bool findFDE(A &addressSpace, pint_t pc, pint_t ehSectionStart,
-                      uint32_t sectionLength, pint_t fdeHint, FDE_Info *fdeInfo,
+                      uintptr_t sectionLength, pint_t fdeHint, FDE_Info *fdeInfo,
                       CIE_Info *cieInfo);
   static const char *decodeFDE(A &addressSpace, pint_t fdeStart,
                                FDE_Info *fdeInfo, CIE_Info *cieInfo);
@@ -144,13 +160,6 @@ public:
                                    int arch, PrologInfo *results);

   static const char *parseCIE(A &addressSpace, pint_t cie, CIE_Info *cieInfo);
-
-private:
-  static bool parseInstructions(A &addressSpace, pint_t instructions,
-                                pint_t instructionsEnd, const CIE_Info &cieInfo,
-                                pint_t pcoffset,
-                                PrologInfoStackEntry *&rememberStack, int arch,
-                                PrologInfo *results);
 };

 /// Parse a FDE into a CIE_Info and an FDE_Info
@@ -166,7 +175,7 @@ const char *CFI_Parser<A>::decodeFDE(A &addressSpace, pint_t fdeStart,
     p += 8;
   }
   if (cfiLength == 0)
-    return "FDE has zero length"; // end marker
+    return "FDE has zero length"; // zero terminator
   uint32_t ciePointer = addressSpace.get32(p);
   if (ciePointer == 0)
     return "FDE is really a CIE"; // this is a CIE not an FDE
@@ -211,11 +220,13 @@ const char *CFI_Parser<A>::decodeFDE(A &addressSpace, pint_t fdeStart,
 /// Scan an eh_frame section to find an FDE for a pc
 template <typename A>
 bool CFI_Parser<A>::findFDE(A &addressSpace, pint_t pc, pint_t ehSectionStart,
-                            uint32_t sectionLength, pint_t fdeHint,
+                            uintptr_t sectionLength, pint_t fdeHint,
                             FDE_Info *fdeInfo, CIE_Info *cieInfo) {
   //fprintf(stderr, "findFDE(0x%llX)\n", (long long)pc);
   pint_t p = (fdeHint != 0) ? fdeHint : ehSectionStart;
-  const pint_t ehSectionEnd = p + sectionLength;
+  const pint_t ehSectionEnd = (sectionLength == UINTPTR_MAX)
+                                  ? static_cast<pint_t>(-1)
+                                  : (ehSectionStart + sectionLength);
   while (p < ehSectionEnd) {
     pint_t currentCFI = p;
     //fprintf(stderr, "findFDE() CFI at 0x%llX\n", (long long)p);
@@ -227,7 +238,7 @@ bool CFI_Parser<A>::findFDE(A &addressSpace, pint_t pc, pint_t ehSectionStart,
       p += 8;
     }
     if (cfiLength == 0)
-      return false; // end marker
+      return false; // zero terminator
     uint32_t id = addressSpace.get32(p);
     if (id == 0) {
       // Skip over CIEs.
@@ -336,7 +347,8 @@ const char *CFI_Parser<A>::parseCIE(A &addressSpace, pint_t cie,
   // parse data alignment factor
   cieInfo->dataAlignFactor = (int)addressSpace.getSLEB128(p, cieContentEnd);
   // parse return address register
-  uint64_t raReg = addressSpace.getULEB128(p, cieContentEnd);
+  uint64_t raReg = (version == 1) ? addressSpace.get8(p++)
+                                  : addressSpace.getULEB128(p, cieContentEnd);
   assert(raReg < 255 && "return address register too large");
   cieInfo->returnAddressRegister = (uint8_t)raReg;
   // parse augmentation data based on augmentation string
@@ -390,418 +402,409 @@ bool CFI_Parser<A>::parseFDEInstructions(A &addressSpace,
                                          const FDE_Info &fdeInfo,
                                          const CIE_Info &cieInfo, pint_t upToPC,
                                          int arch, PrologInfo *results) {
-  PrologInfoStackEntry *rememberStack = NULL;
+  // Alloca is used to allocate the rememberStack entries. This removes the
+  // dependency on new/malloc, but it means the loop below cannot be
+  // refactored into separate functions: an entry saved while processing the
+  // CIE may be restored while processing the FDE.
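+  // Purely as an illustration (these opcodes are not from any particular
+  // binary), a CFI sequence that needs the remember stack looks like:
+  //   DW_CFA_remember_state        // push a snapshot of the current rules
+  //   DW_CFA_def_cfa_offset 16     // temporarily change the CFA rule
+  //   DW_CFA_restore_state         // pop back to the snapshot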
+ RememberStack rememberStack; - // parse CIE then FDE instructions - bool returnValue = - parseInstructions(addressSpace, cieInfo.cieInstructions, - cieInfo.cieStart + cieInfo.cieLength, cieInfo, - (pint_t)(-1), rememberStack, arch, results) && - parseInstructions(addressSpace, fdeInfo.fdeInstructions, - fdeInfo.fdeStart + fdeInfo.fdeLength, cieInfo, - upToPC - fdeInfo.pcStart, rememberStack, arch, results); + struct ParseInfo { + pint_t instructions; + pint_t instructionsEnd; + pint_t pcoffset; + }; -#if !defined(_LIBUNWIND_NO_HEAP) - // Clean up rememberStack. Even in the case where every DW_CFA_remember_state - // is paired with a DW_CFA_restore_state, parseInstructions can skip restore - // opcodes if it reaches the target PC and stops interpreting, so we have to - // make sure we don't leak memory. - while (rememberStack) { - PrologInfoStackEntry *next = rememberStack->next; - free(rememberStack); - rememberStack = next; - } -#endif + ParseInfo parseInfoArray[] = { + {cieInfo.cieInstructions, cieInfo.cieStart + cieInfo.cieLength, + (pint_t)(-1)}, + {fdeInfo.fdeInstructions, fdeInfo.fdeStart + fdeInfo.fdeLength, + upToPC - fdeInfo.pcStart}}; - return returnValue; -} + for (const auto &info : parseInfoArray) { + pint_t p = info.instructions; + pint_t instructionsEnd = info.instructionsEnd; + pint_t pcoffset = info.pcoffset; + pint_t codeOffset = 0; -/// "run" the DWARF instructions -template -bool CFI_Parser::parseInstructions(A &addressSpace, pint_t instructions, - pint_t instructionsEnd, - const CIE_Info &cieInfo, pint_t pcoffset, - PrologInfoStackEntry *&rememberStack, - int arch, PrologInfo *results) { - pint_t p = instructions; - pint_t codeOffset = 0; - // initialState initialized as registers in results are modified. Use - // PrologInfo accessor functions to avoid reading uninitialized data. - PrologInfo initialState(PrologInfo::InitializeTime::kLazy); + // initialState initialized as registers in results are modified. Use + // PrologInfo accessor functions to avoid reading uninitialized data. 
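+    // (InitializeTime::kLazy presumably defers full initialization of
+    // initialState until a DW_CFA_restore* opcode first snapshots *results;
+    // hence the accessor-function advice above.)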
+ PrologInfo initialState(PrologInfo::InitializeTime::kLazy); - _LIBUNWIND_TRACE_DWARF("parseInstructions(instructions=0x%0" PRIx64 ")\n", - static_cast(instructionsEnd)); + _LIBUNWIND_TRACE_DWARF("parseFDEInstructions(instructions=0x%0" PRIx64 + ")\n", + static_cast(instructionsEnd)); - // see DWARF Spec, section 6.4.2 for details on unwind opcodes - while ((p < instructionsEnd) && (codeOffset < pcoffset)) { - uint64_t reg; - uint64_t reg2; - int64_t offset; - uint64_t length; - uint8_t opcode = addressSpace.get8(p); - uint8_t operand; -#if !defined(_LIBUNWIND_NO_HEAP) - PrologInfoStackEntry *entry; -#endif - ++p; - switch (opcode) { - case DW_CFA_nop: - _LIBUNWIND_TRACE_DWARF("DW_CFA_nop\n"); - break; - case DW_CFA_set_loc: - codeOffset = - addressSpace.getEncodedP(p, instructionsEnd, cieInfo.pointerEncoding); - _LIBUNWIND_TRACE_DWARF("DW_CFA_set_loc\n"); - break; - case DW_CFA_advance_loc1: - codeOffset += (addressSpace.get8(p) * cieInfo.codeAlignFactor); - p += 1; - _LIBUNWIND_TRACE_DWARF("DW_CFA_advance_loc1: new offset=%" PRIu64 "\n", - static_cast(codeOffset)); - break; - case DW_CFA_advance_loc2: - codeOffset += (addressSpace.get16(p) * cieInfo.codeAlignFactor); - p += 2; - _LIBUNWIND_TRACE_DWARF("DW_CFA_advance_loc2: new offset=%" PRIu64 "\n", - static_cast(codeOffset)); - break; - case DW_CFA_advance_loc4: - codeOffset += (addressSpace.get32(p) * cieInfo.codeAlignFactor); - p += 4; - _LIBUNWIND_TRACE_DWARF("DW_CFA_advance_loc4: new offset=%" PRIu64 "\n", - static_cast(codeOffset)); - break; - case DW_CFA_offset_extended: - reg = addressSpace.getULEB128(p, instructionsEnd); - offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd) - * cieInfo.dataAlignFactor; - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_offset_extended DWARF unwind, reg too big"); - return false; + // see DWARF Spec, section 6.4.2 for details on unwind opcodes + while ((p < instructionsEnd) && (codeOffset < pcoffset)) { + uint64_t reg; + uint64_t reg2; + int64_t offset; + uint64_t length; + uint8_t opcode = addressSpace.get8(p); + uint8_t operand; + + ++p; + switch (opcode) { + case DW_CFA_nop: + _LIBUNWIND_TRACE_DWARF("DW_CFA_nop\n"); + break; + case DW_CFA_set_loc: + codeOffset = addressSpace.getEncodedP(p, instructionsEnd, + cieInfo.pointerEncoding); + _LIBUNWIND_TRACE_DWARF("DW_CFA_set_loc\n"); + break; + case DW_CFA_advance_loc1: + codeOffset += (addressSpace.get8(p) * cieInfo.codeAlignFactor); + p += 1; + _LIBUNWIND_TRACE_DWARF("DW_CFA_advance_loc1: new offset=%" PRIu64 "\n", + static_cast(codeOffset)); + break; + case DW_CFA_advance_loc2: + codeOffset += (addressSpace.get16(p) * cieInfo.codeAlignFactor); + p += 2; + _LIBUNWIND_TRACE_DWARF("DW_CFA_advance_loc2: new offset=%" PRIu64 "\n", + static_cast(codeOffset)); + break; + case DW_CFA_advance_loc4: + codeOffset += (addressSpace.get32(p) * cieInfo.codeAlignFactor); + p += 4; + _LIBUNWIND_TRACE_DWARF("DW_CFA_advance_loc4: new offset=%" PRIu64 "\n", + static_cast(codeOffset)); + break; + case DW_CFA_offset_extended: + reg = addressSpace.getULEB128(p, instructionsEnd); + offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd) * + cieInfo.dataAlignFactor; + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_offset_extended DWARF unwind, reg too big"); + return false; + } + results->setRegister(reg, kRegisterInCFA, offset, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_offset_extended(reg=%" PRIu64 ", " + "offset=%" PRId64 ")\n", + reg, offset); + break; + case DW_CFA_restore_extended: + reg = 
addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_restore_extended DWARF unwind, reg too big"); + return false; + } + results->restoreRegisterToInitialState(reg, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_restore_extended(reg=%" PRIu64 ")\n", + reg); + break; + case DW_CFA_undefined: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_undefined DWARF unwind, reg too big"); + return false; + } + results->setRegisterLocation(reg, kRegisterUndefined, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_undefined(reg=%" PRIu64 ")\n", reg); + break; + case DW_CFA_same_value: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_same_value DWARF unwind, reg too big"); + return false; + } + // DW_CFA_same_value unsupported + // "same value" means register was stored in frame, but its current + // value has not changed, so no need to restore from frame. + // We model this as if the register was never saved. + results->setRegisterLocation(reg, kRegisterUnused, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_same_value(reg=%" PRIu64 ")\n", reg); + break; + case DW_CFA_register: + reg = addressSpace.getULEB128(p, instructionsEnd); + reg2 = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_register DWARF unwind, reg too big"); + return false; + } + if (reg2 > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_register DWARF unwind, reg2 too big"); + return false; + } + results->setRegister(reg, kRegisterInRegister, (int64_t)reg2, + initialState); + _LIBUNWIND_TRACE_DWARF( + "DW_CFA_register(reg=%" PRIu64 ", reg2=%" PRIu64 ")\n", reg, reg2); + break; + case DW_CFA_remember_state: { + // Avoid operator new because that would be an upward dependency. + // Avoid malloc because it needs heap allocation. 
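+      // (_LIBUNWIND_REMEMBER_ALLOC expands to alloca() on platforms where
+      // that is known to be safe here, to _malloca() on Windows, and to
+      // malloc() otherwise; only the non-alloca variants define
+      // _LIBUNWIND_REMEMBER_CLEANUP_NEEDED. See the config.h hunk below.)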
+ PrologInfoStackEntry *entry = + (PrologInfoStackEntry *)_LIBUNWIND_REMEMBER_ALLOC( + sizeof(PrologInfoStackEntry)); + if (entry != NULL) { + entry->next = rememberStack.entry; + entry->info = *results; + rememberStack.entry = entry; + } else { + return false; + } + _LIBUNWIND_TRACE_DWARF("DW_CFA_remember_state\n"); + break; } - results->setRegister(reg, kRegisterInCFA, offset, initialState); - _LIBUNWIND_TRACE_DWARF("DW_CFA_offset_extended(reg=%" PRIu64 ", " - "offset=%" PRId64 ")\n", - reg, offset); - break; - case DW_CFA_restore_extended: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_restore_extended DWARF unwind, reg too big"); - return false; - } - results->restoreRegisterToInitialState(reg, initialState); - _LIBUNWIND_TRACE_DWARF("DW_CFA_restore_extended(reg=%" PRIu64 ")\n", reg); - break; - case DW_CFA_undefined: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_undefined DWARF unwind, reg too big"); - return false; - } - results->setRegisterLocation(reg, kRegisterUnused, initialState); - _LIBUNWIND_TRACE_DWARF("DW_CFA_undefined(reg=%" PRIu64 ")\n", reg); - break; - case DW_CFA_same_value: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_same_value DWARF unwind, reg too big"); - return false; - } - // DW_CFA_same_value unsupported - // "same value" means register was stored in frame, but its current - // value has not changed, so no need to restore from frame. - // We model this as if the register was never saved. - results->setRegisterLocation(reg, kRegisterUnused, initialState); - // set flag to disable conversion to compact unwind - results->sameValueUsed = true; - _LIBUNWIND_TRACE_DWARF("DW_CFA_same_value(reg=%" PRIu64 ")\n", reg); - break; - case DW_CFA_register: - reg = addressSpace.getULEB128(p, instructionsEnd); - reg2 = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_register DWARF unwind, reg too big"); - return false; - } - if (reg2 > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_register DWARF unwind, reg2 too big"); - return false; - } - results->setRegister(reg, kRegisterInRegister, (int64_t)reg2, - initialState); - // set flag to disable conversion to compact unwind - results->registersInOtherRegisters = true; - _LIBUNWIND_TRACE_DWARF( - "DW_CFA_register(reg=%" PRIu64 ", reg2=%" PRIu64 ")\n", reg, reg2); - break; -#if !defined(_LIBUNWIND_NO_HEAP) - case DW_CFA_remember_state: - // avoid operator new, because that would be an upward dependency - entry = (PrologInfoStackEntry *)malloc(sizeof(PrologInfoStackEntry)); - if (entry != NULL) { - entry->next = rememberStack; - entry->info = *results; - rememberStack = entry; - } else { - return false; - } - _LIBUNWIND_TRACE_DWARF("DW_CFA_remember_state\n"); - break; - case DW_CFA_restore_state: - if (rememberStack != NULL) { - PrologInfoStackEntry *top = rememberStack; - *results = top->info; - rememberStack = top->next; - free((char *)top); - } else { - return false; - } - _LIBUNWIND_TRACE_DWARF("DW_CFA_restore_state\n"); - break; -#endif - case DW_CFA_def_cfa: - reg = addressSpace.getULEB128(p, instructionsEnd); - offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0("malformed DW_CFA_def_cfa DWARF unwind, reg too big"); - return false; - } - 
results->cfaRegister = (uint32_t)reg; - results->cfaRegisterOffset = (int32_t)offset; - _LIBUNWIND_TRACE_DWARF( - "DW_CFA_def_cfa(reg=%" PRIu64 ", offset=%" PRIu64 ")\n", reg, offset); - break; - case DW_CFA_def_cfa_register: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_def_cfa_register DWARF unwind, reg too big"); - return false; - } - results->cfaRegister = (uint32_t)reg; - _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_register(%" PRIu64 ")\n", reg); - break; - case DW_CFA_def_cfa_offset: - results->cfaRegisterOffset = (int32_t) - addressSpace.getULEB128(p, instructionsEnd); - results->codeOffsetAtStackDecrement = (uint32_t)codeOffset; - _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_offset(%d)\n", - results->cfaRegisterOffset); - break; - case DW_CFA_def_cfa_expression: - results->cfaRegister = 0; - results->cfaExpression = (int64_t)p; - length = addressSpace.getULEB128(p, instructionsEnd); - assert(length < static_cast(~0) && "pointer overflow"); - p += static_cast(length); - _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_expression(expression=0x%" PRIx64 - ", length=%" PRIu64 ")\n", - results->cfaExpression, length); - break; - case DW_CFA_expression: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_expression DWARF unwind, reg too big"); - return false; - } - results->setRegister(reg, kRegisterAtExpression, (int64_t)p, - initialState); - length = addressSpace.getULEB128(p, instructionsEnd); - assert(length < static_cast(~0) && "pointer overflow"); - p += static_cast(length); - _LIBUNWIND_TRACE_DWARF("DW_CFA_expression(reg=%" PRIu64 ", " - "expression=0x%" PRIx64 ", " - "length=%" PRIu64 ")\n", - reg, results->savedRegisters[reg].value, length); - break; - case DW_CFA_offset_extended_sf: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_offset_extended_sf DWARF unwind, reg too big"); - return false; - } - offset = - addressSpace.getSLEB128(p, instructionsEnd) * cieInfo.dataAlignFactor; - results->setRegister(reg, kRegisterInCFA, offset, initialState); - _LIBUNWIND_TRACE_DWARF("DW_CFA_offset_extended_sf(reg=%" PRIu64 ", " - "offset=%" PRId64 ")\n", - reg, offset); - break; - case DW_CFA_def_cfa_sf: - reg = addressSpace.getULEB128(p, instructionsEnd); - offset = - addressSpace.getSLEB128(p, instructionsEnd) * cieInfo.dataAlignFactor; - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_def_cfa_sf DWARF unwind, reg too big"); - return false; - } - results->cfaRegister = (uint32_t)reg; - results->cfaRegisterOffset = (int32_t)offset; - _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_sf(reg=%" PRIu64 ", " - "offset=%" PRId64 ")\n", - reg, offset); - break; - case DW_CFA_def_cfa_offset_sf: - results->cfaRegisterOffset = (int32_t) - (addressSpace.getSLEB128(p, instructionsEnd) * cieInfo.dataAlignFactor); - results->codeOffsetAtStackDecrement = (uint32_t)codeOffset; - _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_offset_sf(%d)\n", - results->cfaRegisterOffset); - break; - case DW_CFA_val_offset: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG( - "malformed DW_CFA_val_offset DWARF unwind, reg (%" PRIu64 - ") out of range\n", - reg); - return false; - } - offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd) - * cieInfo.dataAlignFactor; - results->setRegister(reg, kRegisterOffsetFromCFA, offset, initialState); - 
_LIBUNWIND_TRACE_DWARF("DW_CFA_val_offset(reg=%" PRIu64 ", " - "offset=%" PRId64 "\n", - reg, offset); - break; - case DW_CFA_val_offset_sf: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_val_offset_sf DWARF unwind, reg too big"); - return false; - } - offset = - addressSpace.getSLEB128(p, instructionsEnd) * cieInfo.dataAlignFactor; - results->setRegister(reg, kRegisterOffsetFromCFA, offset, initialState); - _LIBUNWIND_TRACE_DWARF("DW_CFA_val_offset_sf(reg=%" PRIu64 ", " - "offset=%" PRId64 "\n", - reg, offset); - break; - case DW_CFA_val_expression: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0( - "malformed DW_CFA_val_expression DWARF unwind, reg too big"); - return false; - } - results->setRegister(reg, kRegisterIsExpression, (int64_t)p, - initialState); - length = addressSpace.getULEB128(p, instructionsEnd); - assert(length < static_cast(~0) && "pointer overflow"); - p += static_cast(length); - _LIBUNWIND_TRACE_DWARF("DW_CFA_val_expression(reg=%" PRIu64 ", " - "expression=0x%" PRIx64 ", length=%" PRIu64 ")\n", - reg, results->savedRegisters[reg].value, length); - break; - case DW_CFA_GNU_args_size: - length = addressSpace.getULEB128(p, instructionsEnd); - results->spExtraArgSize = (uint32_t)length; - _LIBUNWIND_TRACE_DWARF("DW_CFA_GNU_args_size(%" PRIu64 ")\n", length); - break; - case DW_CFA_GNU_negative_offset_extended: - reg = addressSpace.getULEB128(p, instructionsEnd); - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG0("malformed DW_CFA_GNU_negative_offset_extended DWARF " - "unwind, reg too big"); - return false; - } - offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd) - * cieInfo.dataAlignFactor; - results->setRegister(reg, kRegisterInCFA, -offset, initialState); - _LIBUNWIND_TRACE_DWARF( - "DW_CFA_GNU_negative_offset_extended(%" PRId64 ")\n", offset); - break; + case DW_CFA_restore_state: + if (rememberStack.entry != NULL) { + PrologInfoStackEntry *top = rememberStack.entry; + *results = top->info; + rememberStack.entry = top->next; + _LIBUNWIND_REMEMBER_FREE(top); + } else { + return false; + } + _LIBUNWIND_TRACE_DWARF("DW_CFA_restore_state\n"); + break; + case DW_CFA_def_cfa: + reg = addressSpace.getULEB128(p, instructionsEnd); + offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0("malformed DW_CFA_def_cfa DWARF unwind, reg too big"); + return false; + } + results->cfaRegister = (uint32_t)reg; + results->cfaRegisterOffset = (int32_t)offset; + _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa(reg=%" PRIu64 ", offset=%" PRIu64 + ")\n", + reg, offset); + break; + case DW_CFA_def_cfa_register: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_def_cfa_register DWARF unwind, reg too big"); + return false; + } + results->cfaRegister = (uint32_t)reg; + _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_register(%" PRIu64 ")\n", reg); + break; + case DW_CFA_def_cfa_offset: + results->cfaRegisterOffset = + (int32_t)addressSpace.getULEB128(p, instructionsEnd); + _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_offset(%d)\n", + results->cfaRegisterOffset); + break; + case DW_CFA_def_cfa_expression: + results->cfaRegister = 0; + results->cfaExpression = (int64_t)p; + length = addressSpace.getULEB128(p, instructionsEnd); + assert(length < static_cast(~0) && "pointer overflow"); + p += static_cast(length); + 
_LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_expression(expression=0x%" PRIx64 + ", length=%" PRIu64 ")\n", + results->cfaExpression, length); + break; + case DW_CFA_expression: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_expression DWARF unwind, reg too big"); + return false; + } + results->setRegister(reg, kRegisterAtExpression, (int64_t)p, + initialState); + length = addressSpace.getULEB128(p, instructionsEnd); + assert(length < static_cast(~0) && "pointer overflow"); + p += static_cast(length); + _LIBUNWIND_TRACE_DWARF("DW_CFA_expression(reg=%" PRIu64 ", " + "expression=0x%" PRIx64 ", " + "length=%" PRIu64 ")\n", + reg, results->savedRegisters[reg].value, length); + break; + case DW_CFA_offset_extended_sf: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_offset_extended_sf DWARF unwind, reg too big"); + return false; + } + offset = addressSpace.getSLEB128(p, instructionsEnd) * + cieInfo.dataAlignFactor; + results->setRegister(reg, kRegisterInCFA, offset, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_offset_extended_sf(reg=%" PRIu64 ", " + "offset=%" PRId64 ")\n", + reg, offset); + break; + case DW_CFA_def_cfa_sf: + reg = addressSpace.getULEB128(p, instructionsEnd); + offset = addressSpace.getSLEB128(p, instructionsEnd) * + cieInfo.dataAlignFactor; + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_def_cfa_sf DWARF unwind, reg too big"); + return false; + } + results->cfaRegister = (uint32_t)reg; + results->cfaRegisterOffset = (int32_t)offset; + _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_sf(reg=%" PRIu64 ", " + "offset=%" PRId64 ")\n", + reg, offset); + break; + case DW_CFA_def_cfa_offset_sf: + results->cfaRegisterOffset = + (int32_t)(addressSpace.getSLEB128(p, instructionsEnd) * + cieInfo.dataAlignFactor); + _LIBUNWIND_TRACE_DWARF("DW_CFA_def_cfa_offset_sf(%d)\n", + results->cfaRegisterOffset); + break; + case DW_CFA_val_offset: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG( + "malformed DW_CFA_val_offset DWARF unwind, reg (%" PRIu64 + ") out of range\n", + reg); + return false; + } + offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd) * + cieInfo.dataAlignFactor; + results->setRegister(reg, kRegisterOffsetFromCFA, offset, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_val_offset(reg=%" PRIu64 ", " + "offset=%" PRId64 "\n", + reg, offset); + break; + case DW_CFA_val_offset_sf: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_val_offset_sf DWARF unwind, reg too big"); + return false; + } + offset = addressSpace.getSLEB128(p, instructionsEnd) * + cieInfo.dataAlignFactor; + results->setRegister(reg, kRegisterOffsetFromCFA, offset, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_val_offset_sf(reg=%" PRIu64 ", " + "offset=%" PRId64 "\n", + reg, offset); + break; + case DW_CFA_val_expression: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0( + "malformed DW_CFA_val_expression DWARF unwind, reg too big"); + return false; + } + results->setRegister(reg, kRegisterIsExpression, (int64_t)p, + initialState); + length = addressSpace.getULEB128(p, instructionsEnd); + assert(length < static_cast(~0) && "pointer overflow"); + p += static_cast(length); + _LIBUNWIND_TRACE_DWARF("DW_CFA_val_expression(reg=%" PRIu64 ", " + 
"expression=0x%" PRIx64 ", length=%" PRIu64 + ")\n", + reg, results->savedRegisters[reg].value, length); + break; + case DW_CFA_GNU_args_size: + length = addressSpace.getULEB128(p, instructionsEnd); + results->spExtraArgSize = (uint32_t)length; + _LIBUNWIND_TRACE_DWARF("DW_CFA_GNU_args_size(%" PRIu64 ")\n", length); + break; + case DW_CFA_GNU_negative_offset_extended: + reg = addressSpace.getULEB128(p, instructionsEnd); + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG0("malformed DW_CFA_GNU_negative_offset_extended DWARF " + "unwind, reg too big"); + return false; + } + offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd) * + cieInfo.dataAlignFactor; + results->setRegister(reg, kRegisterInCFA, -offset, initialState); + _LIBUNWIND_TRACE_DWARF( + "DW_CFA_GNU_negative_offset_extended(%" PRId64 ")\n", offset); + break; #if defined(_LIBUNWIND_TARGET_AARCH64) || defined(_LIBUNWIND_TARGET_SPARC) - // The same constant is used to represent different instructions on - // AArch64 (negate_ra_state) and SPARC (window_save). - static_assert(DW_CFA_AARCH64_negate_ra_state == DW_CFA_GNU_window_save, - "uses the same constant"); - case DW_CFA_AARCH64_negate_ra_state: - switch (arch) { + // The same constant is used to represent different instructions on + // AArch64 (negate_ra_state) and SPARC (window_save). + static_assert(DW_CFA_AARCH64_negate_ra_state == DW_CFA_GNU_window_save, + "uses the same constant"); + case DW_CFA_AARCH64_negate_ra_state: + switch (arch) { #if defined(_LIBUNWIND_TARGET_AARCH64) case REGISTERS_ARM64: { int64_t value = results->savedRegisters[UNW_ARM64_RA_SIGN_STATE].value ^ 0x1; - results->setRegisterValue(UNW_ARM64_RA_SIGN_STATE, value, initialState); + results->setRegisterValue(UNW_ARM64_RA_SIGN_STATE, value, + initialState); _LIBUNWIND_TRACE_DWARF("DW_CFA_AARCH64_negate_ra_state\n"); } break; #endif #if defined(_LIBUNWIND_TARGET_SPARC) - // case DW_CFA_GNU_window_save: - case REGISTERS_SPARC: - _LIBUNWIND_TRACE_DWARF("DW_CFA_GNU_window_save()\n"); - for (reg = UNW_SPARC_O0; reg <= UNW_SPARC_O7; reg++) { - results->setRegister(reg, kRegisterInRegister, - ((int64_t)reg - UNW_SPARC_O0) + UNW_SPARC_I0, - initialState); - } + // case DW_CFA_GNU_window_save: + case REGISTERS_SPARC: + _LIBUNWIND_TRACE_DWARF("DW_CFA_GNU_window_save()\n"); + for (reg = UNW_SPARC_O0; reg <= UNW_SPARC_O7; reg++) { + results->setRegister(reg, kRegisterInRegister, + ((int64_t)reg - UNW_SPARC_O0) + UNW_SPARC_I0, + initialState); + } - for (reg = UNW_SPARC_L0; reg <= UNW_SPARC_I7; reg++) { - results->setRegister(reg, kRegisterInCFA, - ((int64_t)reg - UNW_SPARC_L0) * 4, initialState); + for (reg = UNW_SPARC_L0; reg <= UNW_SPARC_I7; reg++) { + results->setRegister(reg, kRegisterInCFA, + ((int64_t)reg - UNW_SPARC_L0) * 4, + initialState); + } + break; +#endif } break; -#endif - } - break; #else - (void)arch; + (void)arch; #endif - default: - operand = opcode & 0x3F; - switch (opcode & 0xC0) { - case DW_CFA_offset: - reg = operand; - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG("malformed DW_CFA_offset DWARF unwind, reg (%" PRIu64 - ") out of range", - reg); - return false; - } - offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd) - * cieInfo.dataAlignFactor; - results->setRegister(reg, kRegisterInCFA, offset, initialState); - _LIBUNWIND_TRACE_DWARF("DW_CFA_offset(reg=%d, offset=%" PRId64 ")\n", - operand, offset); - break; - case DW_CFA_advance_loc: - codeOffset += operand * cieInfo.codeAlignFactor; - _LIBUNWIND_TRACE_DWARF("DW_CFA_advance_loc: new offset=%" PRIu64 "\n", - 
static_cast(codeOffset)); - break; - case DW_CFA_restore: - reg = operand; - if (reg > kMaxRegisterNumber) { - _LIBUNWIND_LOG("malformed DW_CFA_restore DWARF unwind, reg (%" PRIu64 - ") out of range", - reg); - return false; - } - results->restoreRegisterToInitialState(reg, initialState); - _LIBUNWIND_TRACE_DWARF("DW_CFA_restore(reg=%" PRIu64 ")\n", - static_cast(operand)); - break; default: - _LIBUNWIND_TRACE_DWARF("unknown CFA opcode 0x%02X\n", opcode); - return false; + operand = opcode & 0x3F; + switch (opcode & 0xC0) { + case DW_CFA_offset: + reg = operand; + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG("malformed DW_CFA_offset DWARF unwind, reg (%" PRIu64 + ") out of range", + reg); + return false; + } + offset = (int64_t)addressSpace.getULEB128(p, instructionsEnd) * + cieInfo.dataAlignFactor; + results->setRegister(reg, kRegisterInCFA, offset, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_offset(reg=%d, offset=%" PRId64 ")\n", + operand, offset); + break; + case DW_CFA_advance_loc: + codeOffset += operand * cieInfo.codeAlignFactor; + _LIBUNWIND_TRACE_DWARF("DW_CFA_advance_loc: new offset=%" PRIu64 "\n", + static_cast(codeOffset)); + break; + case DW_CFA_restore: + reg = operand; + if (reg > kMaxRegisterNumber) { + _LIBUNWIND_LOG( + "malformed DW_CFA_restore DWARF unwind, reg (%" PRIu64 + ") out of range", + reg); + return false; + } + results->restoreRegisterToInitialState(reg, initialState); + _LIBUNWIND_TRACE_DWARF("DW_CFA_restore(reg=%" PRIu64 ")\n", + static_cast(operand)); + break; + default: + _LIBUNWIND_TRACE_DWARF("unknown CFA opcode 0x%02X\n", opcode); + return false; + } } } } - return true; } diff --git a/lib/libunwind/src/FrameHeaderCache.hpp b/lib/libunwind/src/FrameHeaderCache.hpp index 813fcd408b..54d5d33c3c 100644 --- a/lib/libunwind/src/FrameHeaderCache.hpp +++ b/lib/libunwind/src/FrameHeaderCache.hpp @@ -32,7 +32,7 @@ class _LIBUNWIND_HIDDEN FrameHeaderCache { struct CacheEntry { uintptr_t LowPC() { return Info.dso_base; }; - uintptr_t HighPC() { return Info.dso_base + Info.dwarf_section_length; }; + uintptr_t HighPC() { return Info.dso_base + Info.text_segment_length; }; UnwindInfoSections Info; CacheEntry *Next; }; diff --git a/lib/libunwind/src/Registers.hpp b/lib/libunwind/src/Registers.hpp index c76b05bf31..e0cb84f00e 100644 --- a/lib/libunwind/src/Registers.hpp +++ b/lib/libunwind/src/Registers.hpp @@ -39,6 +39,8 @@ enum { }; #if defined(_LIBUNWIND_TARGET_I386) +class _LIBUNWIND_HIDDEN Registers_x86; +extern "C" void __libunwind_Registers_x86_jumpto(Registers_x86 *); /// Registers_x86 holds the register state of a thread in a 32-bit intel /// process. class _LIBUNWIND_HIDDEN Registers_x86 { @@ -56,7 +58,7 @@ public: v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto(); + void jumpto() { __libunwind_Registers_x86_jumpto(this); } static int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_X86; } static int getArch() { return REGISTERS_X86; } @@ -248,6 +250,8 @@ inline void Registers_x86::setVectorRegister(int, v128) { #if defined(_LIBUNWIND_TARGET_X86_64) /// Registers_x86_64 holds the register state of a thread in a 64-bit intel /// process. 
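+// (Same scheme as Registers_x86 above: jumpto() is now an inline wrapper
+// around an extern "C" entry point, so the assembly in
+// UnwindRegistersRestore.S can expose one unmangled symbol per architecture
+// and always receive the register state as an ordinary first argument.)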
+class _LIBUNWIND_HIDDEN Registers_x86_64; +extern "C" void __libunwind_Registers_x86_64_jumpto(Registers_x86_64 *); class _LIBUNWIND_HIDDEN Registers_x86_64 { public: Registers_x86_64(); @@ -263,7 +267,7 @@ public: v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto(); + void jumpto() { __libunwind_Registers_x86_64_jumpto(this); } static int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_X86_64; } static int getArch() { return REGISTERS_X86_64; } @@ -1510,12 +1514,12 @@ inline void Registers_ppc64::setFloatRegister(int regNum, double value) { } inline bool Registers_ppc64::validVectorRegister(int regNum) const { -#ifdef PPC64_HAS_VMX +#if defined(__VSX__) if (regNum >= UNW_PPC64_VS0 && regNum <= UNW_PPC64_VS31) return true; if (regNum >= UNW_PPC64_VS32 && regNum <= UNW_PPC64_VS63) return true; -#else +#elif defined(__ALTIVEC__) if (regNum >= UNW_PPC64_V0 && regNum <= UNW_PPC64_V31) return true; #endif @@ -1771,6 +1775,8 @@ inline const char *Registers_ppc64::getRegisterName(int regNum) { #if defined(_LIBUNWIND_TARGET_AARCH64) /// Registers_arm64 holds the register state of a thread in a 64-bit arm /// process. +class _LIBUNWIND_HIDDEN Registers_arm64; +extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); class _LIBUNWIND_HIDDEN Registers_arm64 { public: Registers_arm64(); @@ -1786,7 +1792,7 @@ public: v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto(); + void jumpto() { __libunwind_Registers_arm64_jumpto(this); } static int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_ARM64; } static int getArch() { return REGISTERS_ARM64; } diff --git a/lib/libunwind/src/Unwind-seh.cpp b/lib/libunwind/src/Unwind-seh.cpp index 403ab2d771..6e2b4e73e4 100644 --- a/lib/libunwind/src/Unwind-seh.cpp +++ b/lib/libunwind/src/Unwind-seh.cpp @@ -46,18 +46,6 @@ using namespace libunwind; /// handling. #define STATUS_GCC_UNWIND MAKE_GCC_EXCEPTION(1) // 0x21474343 -/// Class of foreign exceptions based on unrecognized SEH exceptions. -static const uint64_t kSEHExceptionClass = 0x434C4E4753454800; // CLNGSEH\0 - -/// Exception cleanup routine used by \c _GCC_specific_handler to -/// free foreign exceptions. -static void seh_exc_cleanup(_Unwind_Reason_Code urc, _Unwind_Exception *exc) { - (void)urc; - if (exc->exception_class != kSEHExceptionClass) - _LIBUNWIND_ABORT("SEH cleanup called on non-SEH exception"); - free(exc); -} - static int __unw_init_seh(unw_cursor_t *cursor, CONTEXT *ctx); static DISPATCHER_CONTEXT *__unw_seh_get_disp_ctx(unw_cursor_t *cursor); static void __unw_seh_set_disp_ctx(unw_cursor_t *cursor, @@ -108,10 +96,10 @@ _GCC_specific_handler(PEXCEPTION_RECORD ms_exc, PVOID frame, PCONTEXT ms_ctx, } } else { // Foreign exception. - exc = (_Unwind_Exception *)malloc(sizeof(_Unwind_Exception)); - exc->exception_class = kSEHExceptionClass; - exc->exception_cleanup = seh_exc_cleanup; - memset(exc->private_, 0, sizeof(exc->private_)); + // We can't interact with them (we don't know the original target frame + // that we should pass on to RtlUnwindEx in _Unwind_Resume), so just + // pass without calling our destructors here. 
+ return ExceptionContinueSearch; } if (!ctx) { __unw_init_seh(&cursor, disp->ContextRecord); diff --git a/lib/libunwind/src/UnwindCursor.hpp b/lib/libunwind/src/UnwindCursor.hpp index f346c720d2..9f8fa65107 100644 --- a/lib/libunwind/src/UnwindCursor.hpp +++ b/lib/libunwind/src/UnwindCursor.hpp @@ -81,6 +81,7 @@ template class _LIBUNWIND_HIDDEN DwarfFDECache { typedef typename A::pint_t pint_t; public: + static constexpr pint_t kSearchAll = static_cast(-1); static pint_t findFDE(pint_t mh, pint_t pc); static void add(pint_t mh, pint_t ip_start, pint_t ip_end, pint_t fde); static void removeAllIn(pint_t mh); @@ -138,7 +139,7 @@ typename A::pint_t DwarfFDECache::findFDE(pint_t mh, pint_t pc) { pint_t result = 0; _LIBUNWIND_LOG_IF_FALSE(_lock.lock_shared()); for (entry *p = _buffer; p < _bufferUsed; ++p) { - if ((mh == p->mh) || (mh == 0)) { + if ((mh == p->mh) || (mh == kSearchAll)) { if ((p->ip_start <= pc) && (pc < p->ip_end)) { result = p->fde; break; @@ -530,6 +531,8 @@ UnwindCursor::UnwindCursor(unw_context_t *context, A &as) : _addressSpace(as), _unwindInfoMissing(false) { static_assert((check_fit, unw_cursor_t>::does_fit), "UnwindCursor<> does not fit in unw_cursor_t"); + static_assert((alignof(UnwindCursor) <= alignof(unw_cursor_t)), + "UnwindCursor<> requires more alignment than unw_cursor_t"); memset(&_info, 0, sizeof(_info)); memset(&_histTable, 0, sizeof(_histTable)); _dispContext.ContextRecord = &_msContext; @@ -923,6 +926,9 @@ private: #endif #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) + bool getInfoFromFdeCie(const typename CFI_Parser::FDE_Info &fdeInfo, + const typename CFI_Parser::CIE_Info &cieInfo, + pint_t pc, uintptr_t dso_base); bool getInfoFromDwarfSection(pint_t pc, const UnwindInfoSections §s, uint32_t fdeSectionOffsetHint=0); int stepWithDwarfFDE() { @@ -1182,6 +1188,8 @@ UnwindCursor::UnwindCursor(unw_context_t *context, A &as) _isSignalFrame(false) { static_assert((check_fit, unw_cursor_t>::does_fit), "UnwindCursor<> does not fit in unw_cursor_t"); + static_assert((alignof(UnwindCursor) <= alignof(unw_cursor_t)), + "UnwindCursor<> requires more alignment than unw_cursor_t"); memset(&_info, 0, sizeof(_info)); } @@ -1472,6 +1480,32 @@ bool UnwindCursor::getInfoFromEHABISection( #endif #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) +template +bool UnwindCursor::getInfoFromFdeCie( + const typename CFI_Parser::FDE_Info &fdeInfo, + const typename CFI_Parser::CIE_Info &cieInfo, pint_t pc, + uintptr_t dso_base) { + typename CFI_Parser::PrologInfo prolog; + if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, pc, + R::getArch(), &prolog)) { + // Save off parsed FDE info + _info.start_ip = fdeInfo.pcStart; + _info.end_ip = fdeInfo.pcEnd; + _info.lsda = fdeInfo.lsda; + _info.handler = cieInfo.personality; + // Some frameless functions need SP altered when resuming in function, so + // propagate spExtraArgSize. 
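+    // (spExtraArgSize is whatever DW_CFA_GNU_args_size recorded while the
+    // FDE instructions were parsed; see parseFDEInstructions above.)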
+ _info.gp = prolog.spExtraArgSize; + _info.flags = 0; + _info.format = dwarfEncoding(); + _info.unwind_info = fdeInfo.fdeStart; + _info.unwind_info_size = static_cast(fdeInfo.fdeLength); + _info.extra = static_cast(dso_base); + return true; + } + return false; +} + template bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, const UnwindInfoSections §s, @@ -1483,7 +1517,7 @@ bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, // If compact encoding table gave offset into dwarf section, go directly there if (fdeSectionOffsetHint != 0) { foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, - (uint32_t)sects.dwarf_section_length, + sects.dwarf_section_length, sects.dwarf_section + fdeSectionOffsetHint, &fdeInfo, &cieInfo); } @@ -1500,7 +1534,7 @@ bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, if (cachedFDE != 0) { foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, - (uint32_t)sects.dwarf_section_length, + sects.dwarf_section_length, cachedFDE, &fdeInfo, &cieInfo); foundInCache = foundFDE; } @@ -1508,25 +1542,11 @@ bool UnwindCursor::getInfoFromDwarfSection(pint_t pc, if (!foundFDE) { // Still not found, do full scan of __eh_frame section. foundFDE = CFI_Parser::findFDE(_addressSpace, pc, sects.dwarf_section, - (uint32_t)sects.dwarf_section_length, 0, + sects.dwarf_section_length, 0, &fdeInfo, &cieInfo); } if (foundFDE) { - typename CFI_Parser::PrologInfo prolog; - if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, pc, - R::getArch(), &prolog)) { - // Save off parsed FDE info - _info.start_ip = fdeInfo.pcStart; - _info.end_ip = fdeInfo.pcEnd; - _info.lsda = fdeInfo.lsda; - _info.handler = cieInfo.personality; - _info.gp = prolog.spExtraArgSize; - _info.flags = 0; - _info.format = dwarfEncoding(); - _info.unwind_info = fdeInfo.fdeStart; - _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; - _info.extra = (unw_word_t) sects.dso_base; - + if (getInfoFromFdeCie(fdeInfo, cieInfo, pc, sects.dso_base)) { // Add to cache (to make next lookup faster) if we had no hint // and there was no index. if (!foundInCache && (fdeSectionOffsetHint == 0)) { @@ -1759,12 +1779,12 @@ bool UnwindCursor::getInfoFromCompactEncodingSection(pint_t pc, } } - // extact personality routine, if encoding says function has one + // extract personality routine, if encoding says function has one uint32_t personalityIndex = (encoding & UNWIND_PERSONALITY_MASK) >> (__builtin_ctz(UNWIND_PERSONALITY_MASK)); if (personalityIndex != 0) { --personalityIndex; // change 1-based to zero-based index - if (personalityIndex > sectionHeader.personalityArrayCount()) { + if (personalityIndex >= sectionHeader.personalityArrayCount()) { _LIBUNWIND_DEBUG_LOG("found encoding 0x%08X with personality index %d, " "but personality table has only %d entries", encoding, personalityIndex, @@ -1926,60 +1946,27 @@ void UnwindCursor::setInfoBasedOnIPRegister(bool isReturnAddress) { #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) // There is no static unwind info for this pc. Look to see if an FDE was // dynamically registered for it. 
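+  // (kSearchAll, defined above as static_cast<pint_t>(-1), is the explicit
+  // "match any image" wildcard; plain 0 stays usable as an ordinary mh.)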
- pint_t cachedFDE = DwarfFDECache::findFDE(0, pc); + pint_t cachedFDE = DwarfFDECache::findFDE(DwarfFDECache::kSearchAll, + pc); if (cachedFDE != 0) { - CFI_Parser::FDE_Info fdeInfo; - CFI_Parser::CIE_Info cieInfo; - const char *msg = CFI_Parser::decodeFDE(_addressSpace, - cachedFDE, &fdeInfo, &cieInfo); - if (msg == NULL) { - typename CFI_Parser::PrologInfo prolog; - if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, - pc, R::getArch(), &prolog)) { - // save off parsed FDE info - _info.start_ip = fdeInfo.pcStart; - _info.end_ip = fdeInfo.pcEnd; - _info.lsda = fdeInfo.lsda; - _info.handler = cieInfo.personality; - _info.gp = prolog.spExtraArgSize; - // Some frameless functions need SP - // altered when resuming in function. - _info.flags = 0; - _info.format = dwarfEncoding(); - _info.unwind_info = fdeInfo.fdeStart; - _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; - _info.extra = 0; + typename CFI_Parser::FDE_Info fdeInfo; + typename CFI_Parser::CIE_Info cieInfo; + if (!CFI_Parser::decodeFDE(_addressSpace, cachedFDE, &fdeInfo, &cieInfo)) + if (getInfoFromFdeCie(fdeInfo, cieInfo, pc, 0)) return; - } - } } // Lastly, ask AddressSpace object about platform specific ways to locate // other FDEs. pint_t fde; if (_addressSpace.findOtherFDE(pc, fde)) { - CFI_Parser::FDE_Info fdeInfo; - CFI_Parser::CIE_Info cieInfo; + typename CFI_Parser::FDE_Info fdeInfo; + typename CFI_Parser::CIE_Info cieInfo; if (!CFI_Parser::decodeFDE(_addressSpace, fde, &fdeInfo, &cieInfo)) { // Double check this FDE is for a function that includes the pc. - if ((fdeInfo.pcStart <= pc) && (pc < fdeInfo.pcEnd)) { - typename CFI_Parser::PrologInfo prolog; - if (CFI_Parser::parseFDEInstructions(_addressSpace, fdeInfo, cieInfo, - pc, R::getArch(), &prolog)) { - // save off parsed FDE info - _info.start_ip = fdeInfo.pcStart; - _info.end_ip = fdeInfo.pcEnd; - _info.lsda = fdeInfo.lsda; - _info.handler = cieInfo.personality; - _info.gp = prolog.spExtraArgSize; - _info.flags = 0; - _info.format = dwarfEncoding(); - _info.unwind_info = fdeInfo.fdeStart; - _info.unwind_info_size = (uint32_t)fdeInfo.fdeLength; - _info.extra = 0; + if ((fdeInfo.pcStart <= pc) && (pc < fdeInfo.pcEnd)) + if (getInfoFromFdeCie(fdeInfo, cieInfo, pc, 0)) return; - } - } } } #endif // #if defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND) diff --git a/lib/libunwind/src/UnwindLevel1.c b/lib/libunwind/src/UnwindLevel1.c index 3e75b5f13c..68e5e48b8c 100644 --- a/lib/libunwind/src/UnwindLevel1.c +++ b/lib/libunwind/src/UnwindLevel1.c @@ -39,8 +39,7 @@ unwind_phase1(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *except __unw_init_local(cursor, uc); // Walk each frame looking for a place to stop. - bool handlerNotFound = true; - while (handlerNotFound) { + while (true) { // Ask libunwind to get next frame (skip over first which is // _Unwind_RaiseException). 
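+    // (This loop is the Itanium-ABI search phase: personality routines are
+    // queried with _UA_SEARCH_PHASE, and the SP saved into private_2 below
+    // tells phase 2 where to stop unwinding.)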
int stepResult = __unw_step(cursor); @@ -102,7 +101,6 @@ unwind_phase1(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *except case _URC_HANDLER_FOUND: // found a catch clause or locals that need destructing in this frame // stop search and remember stack pointer at the frame - handlerNotFound = false; __unw_get_reg(cursor, UNW_REG_SP, &sp); exception_object->private_2 = (uintptr_t)sp; _LIBUNWIND_TRACE_UNWINDING( diff --git a/lib/libunwind/src/UnwindRegistersRestore.S b/lib/libunwind/src/UnwindRegistersRestore.S index 5d54432152..289afe98b0 100644 --- a/lib/libunwind/src/UnwindRegistersRestore.S +++ b/lib/libunwind/src/UnwindRegistersRestore.S @@ -13,14 +13,10 @@ #if !defined(__USING_SJLJ_EXCEPTIONS__) #if defined(__i386__) -DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_x866jumptoEv) +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_x86_jumpto) # -# void libunwind::Registers_x86::jumpto() +# extern "C" void __libunwind_Registers_x86_jumpto(Registers_x86 *); # -#if defined(_WIN32) -# On windows, the 'this' pointer is passed in ecx instead of on the stack - movl %ecx, %eax -#else # On entry: # + + # +-----------------------+ @@ -30,7 +26,6 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_x866jumptoEv) # +-----------------------+ <-- SP # + + movl 4(%esp), %eax -#endif # set up eax and ret on new stack location movl 28(%eax), %edx # edx holds new stack pointer subl $8,%edx @@ -60,9 +55,9 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_x866jumptoEv) #elif defined(__x86_64__) -DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind16Registers_x86_646jumptoEv) +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_x86_64_jumpto) # -# void libunwind::Registers_x86_64::jumpto() +# extern "C" void __libunwind_Registers_x86_64_jumpto(Registers_x86_64 *); # #if defined(_WIN64) # On entry, thread_state pointer is in rcx; move it into rdi @@ -175,7 +170,7 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_ppc646jumptoEv) PPC64_LR(30) PPC64_LR(31) -#ifdef PPC64_HAS_VMX +#if defined(__VSX__) // restore VS registers // (note that this also restores floating point registers and V registers, @@ -317,6 +312,7 @@ PPC64_CLVS_BOTTOM(n) PPC64_LF(30) PPC64_LF(31) +#if defined(__ALTIVEC__) // restore vector registers if any are in use ld %r5, PPC64_OFFS_VRSAVE(%r3) // test VRsave cmpwi %r5, 0 @@ -378,6 +374,7 @@ PPC64_CLV_UNALIGNED_BOTTOM(n) PPC64_CLV_UNALIGNEDh(31) #endif +#endif Lnovec: ld %r0, PPC64_OFFS_CR(%r3) @@ -436,6 +433,7 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_ppc6jumptoEv) lwz %r30,128(%r3) lwz %r31,132(%r3) +#ifndef __NO_FPRS__ // restore float registers lfd %f0, 160(%r3) lfd %f1, 168(%r3) @@ -469,7 +467,9 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_ppc6jumptoEv) lfd %f29,392(%r3) lfd %f30,400(%r3) lfd %f31,408(%r3) +#endif +#if defined(__ALTIVEC__) // restore vector registers if any are in use lwz %r5, 156(%r3) // test VRsave cmpwi %r5, 0 @@ -542,6 +542,7 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind13Registers_ppc6jumptoEv) LOAD_VECTOR_UNALIGNEDh(29) LOAD_VECTOR_UNALIGNEDh(30) LOAD_VECTOR_UNALIGNEDh(31) +#endif Lnovec: lwz %r0, 136(%r3) // __cr @@ -560,13 +561,13 @@ Lnovec: #elif defined(__aarch64__) // -// void libunwind::Registers_arm64::jumpto() +// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); // // On entry: // thread_state pointer is in x0 // .p2align 2 -DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_arm646jumptoEv) +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto) // skip restore of x0,x1 for now ldp x2, x3, [x0, #0x010] 
ldp x4, x5, [x0, #0x020] diff --git a/lib/libunwind/src/UnwindRegistersSave.S b/lib/libunwind/src/UnwindRegistersSave.S index 51bb9b0688..94fc836545 100644 --- a/lib/libunwind/src/UnwindRegistersSave.S +++ b/lib/libunwind/src/UnwindRegistersSave.S @@ -384,7 +384,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) mfvrsave %r0 std %r0, PPC64_OFFS_VRSAVE(%r3) -#ifdef PPC64_HAS_VMX +#if defined(__VSX__) // save VS registers // (note that this also saves floating point registers and V registers, // because part of VS is mapped to these registers) @@ -501,6 +501,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) PPC64_STF(30) PPC64_STF(31) +#if defined(__ALTIVEC__) // save vector registers // Use 16-bytes below the stack pointer as an @@ -548,6 +549,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) PPC64_STV_UNALIGNED(30) PPC64_STV_UNALIGNED(31) +#endif #endif li %r3, 0 // return UNW_ESUCCESS @@ -608,6 +610,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) mfctr %r0 stw %r0, 148(%r3) +#if !defined(__NO_FPRS__) // save float registers stfd %f0, 160(%r3) stfd %f1, 168(%r3) @@ -641,8 +644,9 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) stfd %f29,392(%r3) stfd %f30,400(%r3) stfd %f31,408(%r3) +#endif - +#if defined(__ALTIVEC__) // save vector registers subi %r4, %r1, 16 @@ -692,6 +696,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) SAVE_VECTOR_UNALIGNED(%v29, 424+0x1D0) SAVE_VECTOR_UNALIGNED(%v30, 424+0x1E0) SAVE_VECTOR_UNALIGNED(%v31, 424+0x1F0) +#endif li %r3, 0 // return UNW_ESUCCESS blr diff --git a/lib/libunwind/src/Unwind_AppleExtras.cpp b/lib/libunwind/src/Unwind_AppleExtras.cpp index 536303993e..e3d41ca2b4 100644 --- a/lib/libunwind/src/Unwind_AppleExtras.cpp +++ b/lib/libunwind/src/Unwind_AppleExtras.cpp @@ -8,35 +8,6 @@ //===----------------------------------------------------------------------===// #include "config.h" -#include "AddressSpace.hpp" -#include "DwarfParser.hpp" - - -// private keymgr stuff -#define KEYMGR_GCC3_DW2_OBJ_LIST 302 -extern "C" { - extern void _keymgr_set_and_unlock_processwide_ptr(int key, void *ptr); - extern void *_keymgr_get_and_lock_processwide_ptr(int key); -} - -// undocumented libgcc "struct object" -struct libgcc_object { - void *start; - void *unused1; - void *unused2; - void *fde; - unsigned long encoding; - void *fde_end; - libgcc_object *next; -}; - -// undocumented libgcc "struct km_object_info" referenced by -// KEYMGR_GCC3_DW2_OBJ_LIST -struct libgcc_object_info { - libgcc_object *seen_objects; - libgcc_object *unseen_objects; - unsigned spare[2]; -}; // static linker symbols to prevent wrong two level namespace for _Unwind symbols @@ -140,44 +111,3 @@ NOT_HERE_BEFORE_5_0(_Unwind_SjLj_Resume_or_Rethrow) NOT_HERE_BEFORE_5_0(_Unwind_SjLj_Unregister) #endif // defined(_LIBUNWIND_BUILD_SJLJ_APIS) - - -namespace libunwind { - -_LIBUNWIND_HIDDEN -bool checkKeyMgrRegisteredFDEs(uintptr_t pc, void *&fde) { -#if __MAC_OS_X_VERSION_MIN_REQUIRED - // lastly check for old style keymgr registration of dynamically generated - // FDEs acquire exclusive access to libgcc_object_info - libgcc_object_info *head = (libgcc_object_info *) - _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST); - if (head != NULL) { - // look at each FDE in keymgr - for (libgcc_object *ob = head->unseen_objects; ob != NULL; ob = ob->next) { - CFI_Parser::FDE_Info fdeInfo; - CFI_Parser::CIE_Info cieInfo; - const char *msg = CFI_Parser::decodeFDE( - LocalAddressSpace::sThisAddressSpace, - (uintptr_t)ob->fde, &fdeInfo, &cieInfo); - if (msg == NULL) { - // Check if this FDE is for a 
function that includes the pc - if ((fdeInfo.pcStart <= pc) && (pc < fdeInfo.pcEnd)) { - fde = (void*)fdeInfo.pcStart; - _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, - head); - return true; - } - } - } - } - // release libgcc_object_info - _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, head); -#else - (void)pc; - (void)fde; -#endif - return false; -} - -} - diff --git a/lib/libunwind/src/assembly.h b/lib/libunwind/src/assembly.h index 4cf179e13e..f2f7c84830 100644 --- a/lib/libunwind/src/assembly.h +++ b/lib/libunwind/src/assembly.h @@ -25,9 +25,6 @@ #define PPC64_OFFS_VRSAVE 304 #define PPC64_OFFS_FP 312 #define PPC64_OFFS_V 824 -#ifdef _ARCH_PWR8 -#define PPC64_HAS_VMX -#endif #elif defined(__APPLE__) && defined(__aarch64__) #define SEPARATOR %% #else @@ -48,6 +45,24 @@ #define PPC64_OPD2 #endif +#if defined(__ARM_FEATURE_BTI_DEFAULT) + .pushsection ".note.gnu.property", "a" SEPARATOR \ + .balign 8 SEPARATOR \ + .long 4 SEPARATOR \ + .long 0x10 SEPARATOR \ + .long 0x5 SEPARATOR \ + .asciz "GNU" SEPARATOR \ + .long 0xc0000000 SEPARATOR /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ \ + .long 4 SEPARATOR \ + .long 3 SEPARATOR /* GNU_PROPERTY_AARCH64_FEATURE_1_BTI AND */ \ + /* GNU_PROPERTY_AARCH64_FEATURE_1_PAC */ \ + .long 0 SEPARATOR \ + .popsection SEPARATOR +#define AARCH64_BTI bti c +#else +#define AARCH64_BTI +#endif + #define GLUE2(a, b) a ## b #define GLUE(a, b) GLUE2(a, b) #define SYMBOL_NAME(name) GLUE(__USER_LABEL_PREFIX__, name) @@ -144,7 +159,8 @@ SYMBOL_IS_FUNC(SYMBOL_NAME(name)) SEPARATOR \ PPC64_OPD1 \ SYMBOL_NAME(name): \ - PPC64_OPD2 + PPC64_OPD2 \ + AARCH64_BTI #if defined(__arm__) #if !defined(__ARM_ARCH) diff --git a/lib/libunwind/src/config.h b/lib/libunwind/src/config.h index 842fd829af..9efed05405 100644 --- a/lib/libunwind/src/config.h +++ b/lib/libunwind/src/config.h @@ -18,23 +18,15 @@ #include #include -// Define static_assert() unless already defined by compiler. -#ifndef __has_feature - #define __has_feature(__x) 0 -#endif -#if !(__has_feature(cxx_static_assert)) && !defined(static_assert) - #define static_assert(__b, __m) \ - extern int compile_time_assert_failed[ ( __b ) ? 1 : -1 ] \ - __attribute__( ( unused ) ); -#endif +#include <__libunwind_config.h> // Platform specific configuration defines. #ifdef __APPLE__ #if defined(FOR_DYLD) - #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND + #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 1 #else - #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND - #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 + #define _LIBUNWIND_SUPPORT_COMPACT_UNWIND 1 + #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #endif #elif defined(_WIN32) #ifdef __SEH__ @@ -42,8 +34,19 @@ #else #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #endif +#elif defined(_LIBUNWIND_IS_BAREMETAL) + #if !defined(_LIBUNWIND_ARM_EHABI) + #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 + #define _LIBUNWIND_SUPPORT_DWARF_INDEX 1 + #endif +#elif defined(__BIONIC__) && defined(_LIBUNWIND_ARM_EHABI) + // For ARM EHABI, Bionic didn't implement dl_iterate_phdr until API 21. After + // API 21, dl_iterate_phdr exists, but dl_unwind_find_exidx is much faster. + #define _LIBUNWIND_USE_DL_UNWIND_FIND_EXIDX 1 #else - #if defined(__ARM_DWARF_EH__) || !defined(__arm__) + // Assume an ELF system with a dl_iterate_phdr function. 
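+  // (Net effect: ELF targets define exactly one of
+  // _LIBUNWIND_USE_DL_UNWIND_FIND_EXIDX or _LIBUNWIND_USE_DL_ITERATE_PHDR,
+  // matching the #elif ladder in findUnwindSections earlier in this patch.)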
+ #define _LIBUNWIND_USE_DL_ITERATE_PHDR 1 + #if !defined(_LIBUNWIND_ARM_EHABI) #define _LIBUNWIND_SUPPORT_DWARF_UNWIND 1 #define _LIBUNWIND_SUPPORT_DWARF_INDEX 1 #endif @@ -91,6 +94,8 @@ #error Unsupported target #endif +// Apple/armv7k defaults to DWARF/Compact unwinding, but its libunwind also +// needs to include the SJLJ APIs. #if (defined(__APPLE__) && defined(__arm__)) || defined(__USING_SJLJ_EXCEPTIONS__) #define _LIBUNWIND_BUILD_SJLJ_APIS #endif @@ -111,8 +116,27 @@ #endif #endif -#if defined(__powerpc64__) && defined(_ARCH_PWR8) -#define PPC64_HAS_VMX +#ifndef _LIBUNWIND_REMEMBER_HEAP_ALLOC +#if defined(_LIBUNWIND_REMEMBER_STACK_ALLOC) || defined(__APPLE__) || \ + defined(__linux__) || defined(__ANDROID__) || defined(__MINGW32__) || \ + defined(_LIBUNWIND_IS_BAREMETAL) +#define _LIBUNWIND_REMEMBER_ALLOC(_size) alloca(_size) +#define _LIBUNWIND_REMEMBER_FREE(_ptr) \ + do { \ + } while (0) +#elif defined(_WIN32) +#define _LIBUNWIND_REMEMBER_ALLOC(_size) _malloca(_size) +#define _LIBUNWIND_REMEMBER_FREE(_ptr) _freea(_ptr) +#define _LIBUNWIND_REMEMBER_CLEANUP_NEEDED +#else +#define _LIBUNWIND_REMEMBER_ALLOC(_size) malloc(_size) +#define _LIBUNWIND_REMEMBER_FREE(_ptr) free(_ptr) +#define _LIBUNWIND_REMEMBER_CLEANUP_NEEDED +#endif +#else /* _LIBUNWIND_REMEMBER_HEAP_ALLOC */ +#define _LIBUNWIND_REMEMBER_ALLOC(_size) malloc(_size) +#define _LIBUNWIND_REMEMBER_FREE(_ptr) free(_ptr) +#define _LIBUNWIND_REMEMBER_CLEANUP_NEEDED #endif #if defined(NDEBUG) && defined(_LIBUNWIND_IS_BAREMETAL) From f9a11fbfafa7e056c08350f740af8a96a722a235 Mon Sep 17 00:00:00 2001 From: Jakub Konka Date: Wed, 16 Dec 2020 11:52:51 +0100 Subject: [PATCH 05/67] Update libcxx llvm commit b2851aea80e5a8f0cfd6c3c5a56a6b00fb28c6b6 --- lib/libcxx/include/__availability | 206 ++ lib/libcxx/include/__config | 273 +-- lib/libcxx/include/__config_site.in | 5 +- lib/libcxx/include/__debug | 39 +- lib/libcxx/include/__functional_03 | 48 +- lib/libcxx/include/__functional_base | 54 +- lib/libcxx/include/__functional_base_03 | 24 +- lib/libcxx/include/__hash_table | 238 +-- lib/libcxx/include/__libcpp_version | 2 +- lib/libcxx/include/__locale | 268 ++- .../include/__memory/allocator_traits.h | 589 ++++++ lib/libcxx/include/__memory/base.h | 127 ++ lib/libcxx/include/__memory/pointer_traits.h | 169 ++ lib/libcxx/include/__memory/utilities.h | 88 + lib/libcxx/include/__mutex_base | 7 +- lib/libcxx/include/__split_buffer | 24 +- lib/libcxx/include/__sso_allocator | 9 +- lib/libcxx/include/__string | 96 +- lib/libcxx/include/__threading_support | 62 +- lib/libcxx/include/__tree | 130 +- lib/libcxx/include/algorithm | 521 ++--- lib/libcxx/include/any | 30 +- lib/libcxx/include/array | 14 +- lib/libcxx/include/atomic | 170 +- lib/libcxx/include/barrier | 12 +- lib/libcxx/include/bit | 29 +- lib/libcxx/include/bitset | 10 +- lib/libcxx/include/charconv | 27 +- lib/libcxx/include/chrono | 7 +- lib/libcxx/include/cmath | 4 +- lib/libcxx/include/codecvt | 24 + lib/libcxx/include/compare | 13 +- lib/libcxx/include/complex | 17 +- lib/libcxx/include/ctime | 16 +- lib/libcxx/include/deque | 16 +- lib/libcxx/include/exception | 2 + .../include/experimental/memory_resource | 5 +- lib/libcxx/include/experimental/simd | 5 + lib/libcxx/include/ext/hash_map | 2 +- lib/libcxx/include/filesystem | 121 +- lib/libcxx/include/forward_list | 6 +- lib/libcxx/include/fstream | 136 +- lib/libcxx/include/functional | 98 +- lib/libcxx/include/future | 236 +-- lib/libcxx/include/iomanip | 2 +- lib/libcxx/include/ios | 31 +- lib/libcxx/include/iosfwd | 59 + 
lib/libcxx/include/istream | 34 +- lib/libcxx/include/iterator | 121 +- lib/libcxx/include/latch | 8 + lib/libcxx/include/list | 186 +- lib/libcxx/include/locale | 130 +- lib/libcxx/include/locale.h | 6 +- lib/libcxx/include/map | 50 +- lib/libcxx/include/memory | 1756 ++++------------- lib/libcxx/include/new | 187 +- lib/libcxx/include/numbers | 2 +- lib/libcxx/include/numeric | 191 +- lib/libcxx/include/optional | 5 +- lib/libcxx/include/ostream | 28 +- lib/libcxx/include/random | 222 ++- lib/libcxx/include/regex | 173 +- lib/libcxx/include/semaphore | 9 +- lib/libcxx/include/shared_mutex | 1 + lib/libcxx/include/span | 7 +- lib/libcxx/include/sstream | 431 ++-- lib/libcxx/include/stdexcept | 4 +- lib/libcxx/include/streambuf | 14 +- lib/libcxx/include/string | 322 +-- lib/libcxx/include/string_view | 35 +- lib/libcxx/include/strstream | 16 +- lib/libcxx/include/support/ibm/nanosleep.h | 38 + lib/libcxx/include/support/ibm/xlocale.h | 13 +- lib/libcxx/include/support/nuttx/xlocale.h | 18 + .../include/support/win32/locale_win32.h | 4 +- lib/libcxx/include/system_error | 4 +- lib/libcxx/include/thread | 45 +- lib/libcxx/include/tuple | 2 +- lib/libcxx/include/type_traits | 331 +--- lib/libcxx/include/typeinfo | 29 +- lib/libcxx/include/unordered_map | 270 ++- lib/libcxx/include/unordered_set | 160 +- lib/libcxx/include/utility | 38 +- lib/libcxx/include/valarray | 168 +- lib/libcxx/include/variant | 54 +- lib/libcxx/include/vector | 99 +- lib/libcxx/include/version | 36 +- lib/libcxx/include/wctype.h | 4 +- lib/libcxx/src/atomic.cpp | 2 - lib/libcxx/src/barrier.cpp | 12 +- lib/libcxx/src/chrono.cpp | 97 +- .../src/experimental/memory_resource.cpp | 10 - lib/libcxx/src/filesystem/filesystem_common.h | 92 +- lib/libcxx/src/filesystem/operations.cpp | 199 +- lib/libcxx/src/include/config_elast.h | 4 + lib/libcxx/src/include/refstring.h | 25 +- lib/libcxx/src/ios.cpp | 16 - lib/libcxx/src/ios.instantiations.cpp | 43 + lib/libcxx/src/iostream.cpp | 2 +- lib/libcxx/src/locale.cpp | 208 +- lib/libcxx/src/memory.cpp | 4 - lib/libcxx/src/new.cpp | 37 +- lib/libcxx/src/optional.cpp | 1 + .../support/runtime/exception_fallback.ipp | 4 - lib/libcxx/src/thread.cpp | 18 +- src/libcxx.zig | 1 + 106 files changed, 5346 insertions(+), 4751 deletions(-) create mode 100644 lib/libcxx/include/__availability create mode 100644 lib/libcxx/include/__memory/allocator_traits.h create mode 100644 lib/libcxx/include/__memory/base.h create mode 100644 lib/libcxx/include/__memory/pointer_traits.h create mode 100644 lib/libcxx/include/__memory/utilities.h create mode 100644 lib/libcxx/include/support/ibm/nanosleep.h create mode 100644 lib/libcxx/include/support/nuttx/xlocale.h create mode 100644 lib/libcxx/src/ios.instantiations.cpp diff --git a/lib/libcxx/include/__availability b/lib/libcxx/include/__availability new file mode 100644 index 0000000000..db2267c8eb --- /dev/null +++ b/lib/libcxx/include/__availability @@ -0,0 +1,206 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___AVAILABILITY +#define _LIBCPP___AVAILABILITY + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +// Libc++ is shipped by various vendors. In particular, it is used as a system +// library on macOS, iOS and other Apple platforms. In order for users to be +// able to compile a binary that is intended to be deployed to an older version +// of a platform, Clang provides availability attributes [1]. These attributes +// can be placed on declarations and are used to describe the life cycle of a +// symbol in the library. +// +// The main goal is to ensure a compile-time error if a symbol that hasn't been +// introduced in a previously released library is used in a program that targets +// that previously released library. Normally, this would be a load-time error +// when one tries to launch the program against the older library. +// +// For example, the filesystem library was introduced in the dylib in macOS 10.15. +// If a user compiles on a macOS 10.15 host but targets macOS 10.13 with their +// program, the compiler would normally not complain (because the required +// declarations are in the headers), but the dynamic loader would fail to find +// the symbols when actually trying to launch the program on macOS 10.13. To +// turn this into a compile-time issue instead, declarations are annotated with +// when they were introduced, and the compiler can produce a diagnostic if the +// program references something that isn't available on the deployment target. +// +// This mechanism is general in nature, and any vendor can add their markup to +// the library (see below). Whenever a new feature is added that requires support +// in the shared library, a macro should be added below to mark this feature +// as unavailable. When vendors decide to ship the feature as part of their +// shared library, they can update the markup appropriately. +// +// Note that this mechanism is disabled by default in the "upstream" libc++. +// Availability annotations are only meaningful when shipping libc++ inside +// a platform (i.e. as a system library), and so vendors that want them should +// turn those annotations on at CMake configuration time. +// +// [1]: https://clang.llvm.org/docs/AttributeReference.html#availability + + +// For backwards compatibility, allow users to define _LIBCPP_DISABLE_AVAILABILITY +// for a while. +#if defined(_LIBCPP_DISABLE_AVAILABILITY) +# if !defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# endif +#endif + +// Availability markup is disabled when building the library, or when the compiler +// doesn't support the proper attributes. +#if defined(_LIBCPP_BUILDING_LIBRARY) || \ + defined(_LIBCXXABI_BUILDING_LIBRARY) || \ + !__has_feature(attribute_availability_with_strict) || \ + !__has_feature(attribute_availability_in_templates) || \ + !__has_extension(pragma_clang_attribute_external_declaration) +# if !defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# endif +#endif + +#if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) + + // This controls the availability of std::shared_mutex and std::shared_timed_mutex, + // which were added to the dylib later. 
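For a sense of what these macros do when a vendor does enable the markup: each one expands to Clang availability attributes on the affected declarations, exactly as in the __APPLE__ branch further down in this header. A minimal sketch of the compile-time effect, using a hypothetical function introduced in a vendor dylib at macOS 10.15:

    // As annotated in the vendor's headers:
    __attribute__((availability(macosx, strict, introduced=10.15)))
    void vendor_feature();

    void caller() {
      vendor_feature();  // with strict markup, referencing this while
    }                    // targeting a deployment older than 10.15 is a
                         // compile-time error instead of a load-time failure

In this branch the markup is disabled, so every macro expands to nothing, beginning with the shared_mutex macro that follows.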
+# define _LIBCPP_AVAILABILITY_SHARED_MUTEX + + // These macros control the availability of std::bad_optional_access and + // other exception types. These were put in the shared library to prevent + // code bloat from every user program defining the vtable for these exception + // types. +# define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS +# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST + + // This controls the availability of std::uncaught_exceptions(). +# define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS + + // This controls the availability of the sized version of ::operator delete, + // which was added to the dylib later. +# define _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE + + // This controls the availability of the std::future_error exception. +# define _LIBCPP_AVAILABILITY_FUTURE_ERROR + + // This controls the availability of std::type_info's vtable. + // I can't imagine how using std::type_info can work at all if + // this isn't supported. +# define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE + + // This controls the availability of std::locale::category members + // (e.g. std::locale::collate), which are defined in the dylib. +# define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY + + // This controls the availability of atomic operations on std::shared_ptr + // (e.g. `std::atomic_store(std::shared_ptr)`), which require a shared + // lock table located in the dylib. +# define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR + + // These macros control the availability of all parts of that + // depend on something in the dylib. +# define _LIBCPP_AVAILABILITY_FILESYSTEM +# define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH +# define _LIBCPP_AVAILABILITY_FILESYSTEM_POP + + // This controls the availability of std::to_chars. +# define _LIBCPP_AVAILABILITY_TO_CHARS + + // This controls the availability of the C++20 synchronization library, + // which requires shared library support for various operations + // (see libcxx/src/atomic.cpp). 
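Elsewhere in the headers these macros sit directly on the declarations they guard, so the whole feature inherits the life cycle of the dylib symbol. Roughly how that consumption looks, simplified from <optional> (the real declaration carries additional markup):

    class _LIBCPP_EXCEPTION_ABI _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS
        bad_optional_access : public exception {
    public:
      virtual ~bad_optional_access() _NOEXCEPT;   // defined in the dylib, so
      virtual const char* what() const _NOEXCEPT; // the class is only usable
    };                                            // where the dylib has it

The C++20 synchronization macro defined next is the last of these empty fallbacks before the Apple-specific branch.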
+# define _LIBCPP_AVAILABILITY_SYNC + +#elif defined(__APPLE__) + +# define _LIBCPP_AVAILABILITY_SHARED_MUTEX \ + __attribute__((availability(macosx,strict,introduced=10.12))) \ + __attribute__((availability(ios,strict,introduced=10.0))) \ + __attribute__((availability(tvos,strict,introduced=10.0))) \ + __attribute__((availability(watchos,strict,introduced=3.0))) +# define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS \ + __attribute__((availability(macosx,strict,introduced=10.13))) \ + __attribute__((availability(ios,strict,introduced=11.0))) \ + __attribute__((availability(tvos,strict,introduced=11.0))) \ + __attribute__((availability(watchos,strict,introduced=4.0))) +# define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS \ + _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST \ + _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS \ + __attribute__((availability(macosx,strict,introduced=10.12))) \ + __attribute__((availability(ios,strict,introduced=10.0))) \ + __attribute__((availability(tvos,strict,introduced=10.0))) \ + __attribute__((availability(watchos,strict,introduced=3.0))) +# define _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE \ + __attribute__((availability(macosx,strict,introduced=10.12))) \ + __attribute__((availability(ios,strict,introduced=10.0))) \ + __attribute__((availability(tvos,strict,introduced=10.0))) \ + __attribute__((availability(watchos,strict,introduced=3.0))) +# define _LIBCPP_AVAILABILITY_FUTURE_ERROR \ + __attribute__((availability(ios,strict,introduced=6.0))) +# define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE \ + __attribute__((availability(macosx,strict,introduced=10.9))) \ + __attribute__((availability(ios,strict,introduced=7.0))) +# define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY \ + __attribute__((availability(macosx,strict,introduced=10.9))) \ + __attribute__((availability(ios,strict,introduced=7.0))) +# define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR \ + __attribute__((availability(macosx,strict,introduced=10.9))) \ + __attribute__((availability(ios,strict,introduced=7.0))) +# define _LIBCPP_AVAILABILITY_FILESYSTEM \ + __attribute__((availability(macosx,strict,introduced=10.15))) \ + __attribute__((availability(ios,strict,introduced=13.0))) \ + __attribute__((availability(tvos,strict,introduced=13.0))) \ + __attribute__((availability(watchos,strict,introduced=6.0))) +# define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH \ + _Pragma("clang attribute push(__attribute__((availability(macosx,strict,introduced=10.15))), apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))") +# define _LIBCPP_AVAILABILITY_FILESYSTEM_POP \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") +# define _LIBCPP_AVAILABILITY_TO_CHARS \ + _LIBCPP_AVAILABILITY_FILESYSTEM +# define _LIBCPP_AVAILABILITY_SYNC \ + __attribute__((unavailable)) + +#else + +// ...New vendors can add availability markup here... + +# error "It looks like you're trying to enable vendor availability markup, but you haven't defined the corresponding macros yet!" 
+ +#endif + +// Define availability attributes that depend on _LIBCPP_NO_EXCEPTIONS. +// Those are defined in terms of the availability attributes above, and +// should not be vendor-specific. +#if defined(_LIBCPP_NO_EXCEPTIONS) +# define _LIBCPP_AVAILABILITY_FUTURE +# define _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST +# define _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS +#else +# define _LIBCPP_AVAILABILITY_FUTURE _LIBCPP_AVAILABILITY_FUTURE_ERROR +# define _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST _LIBCPP_AVAILABILITY_BAD_ANY_CAST +# define _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS +#endif + +#endif // _LIBCPP___AVAILABILITY diff --git a/lib/libcxx/include/__config b/lib/libcxx/include/__config index 575147cead..033cd8aea0 100644 --- a/lib/libcxx/include/__config +++ b/lib/libcxx/include/__config @@ -32,13 +32,13 @@ # define _GNUC_VER_NEW 0 #endif -#define _LIBCPP_VERSION 11000 +#define _LIBCPP_VERSION 12000 #ifndef _LIBCPP_ABI_VERSION # define _LIBCPP_ABI_VERSION 1 #endif -#ifndef __STDC_HOSTED__ +#if __STDC_HOSTED__ == 0 # define _LIBCPP_FREESTANDING #endif @@ -63,7 +63,7 @@ #elif defined(__wasm__) # define _LIBCPP_OBJECT_FORMAT_WASM 1 #else -# error Unknown object file format + // ... add new file formats here ... #endif #if defined(_LIBCPP_ABI_UNSTABLE) || _LIBCPP_ABI_VERSION >= 2 @@ -105,6 +105,10 @@ // Re-worked external template instantiations for std::string with a focus on // performance and fast-path inlining. # define _LIBCPP_ABI_STRING_OPTIMIZED_EXTERNAL_INSTANTIATION +// Enable clang::trivial_abi on std::unique_ptr. +# define _LIBCPP_ABI_ENABLE_UNIQUE_PTR_TRIVIAL_ABI +// Enable clang::trivial_abi on std::shared_ptr and std::weak_ptr +# define _LIBCPP_ABI_ENABLE_SHARED_PTR_TRIVIAL_ABI #elif _LIBCPP_ABI_VERSION == 1 # if !defined(_LIBCPP_OBJECT_FORMAT_COFF) // Enable compiling copies of now inline methods into the dylib to support @@ -121,9 +125,11 @@ # endif #endif -#ifdef _LIBCPP_TRIVIAL_PAIR_COPY_CTOR -#error "_LIBCPP_TRIVIAL_PAIR_COPY_CTOR" is no longer supported. \ - use _LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR instead +#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCPP_ABI_UNSTABLE) || _LIBCPP_ABI_VERSION >= 2 +// Enable additional explicit instantiations of iostreams components. This +// reduces the number of weak definitions generated in programs that use +// iostreams by providing a single strong definition in the shared library. 
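The mechanism behind this flag is the standard extern-template pairing: the headers suppress implicit instantiation of the common specializations, and the shared library provides the one definition (cf. the new src/ios.instantiations.cpp added by this patch). A minimal sketch of the pattern as it appears inside the library's own namespace, not the actual libc++ declarations:

    // In a header, seen by every translation unit that uses iostreams:
    extern template class basic_ostream<char>;  // declaration only; no weak
                                                // definition is emitted here

    // In exactly one translation unit inside the built library:
    template class basic_ostream<char>;         // the single strong definition

The ABI flag defined next gates this additional set of instantiations: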
+# define _LIBCPP_ABI_ENABLE_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 #endif #define _LIBCPP_CONCAT1(_LIBCPP_X,_LIBCPP_Y) _LIBCPP_X##_LIBCPP_Y @@ -344,13 +350,11 @@ # if defined(__FreeBSD__) # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT -# define _LIBCPP_HAS_C11_FEATURES # if __FreeBSD_version >= 1300064 || \ (__FreeBSD_version >= 1201504 && __FreeBSD_version < 1300000) # define _LIBCPP_HAS_TIMESPEC_GET # endif # elif defined(__BIONIC__) -# define _LIBCPP_HAS_C11_FEATURES # if __ANDROID_API__ >= 21 # define _LIBCPP_HAS_QUICK_EXIT # endif @@ -364,7 +368,6 @@ # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET -# define _LIBCPP_HAS_C11_FEATURES # elif defined(__linux__) # if !defined(_LIBCPP_HAS_MUSL_LIBC) # if _LIBCPP_GLIBC_PREREQ(2, 15) || defined(__BIONIC__) @@ -372,16 +375,24 @@ # endif # if _LIBCPP_GLIBC_PREREQ(2, 17) # define _LIBCPP_HAS_ALIGNED_ALLOC -# define _LIBCPP_HAS_C11_FEATURES # define _LIBCPP_HAS_TIMESPEC_GET # endif # else // defined(_LIBCPP_HAS_MUSL_LIBC) # define _LIBCPP_HAS_ALIGNED_ALLOC # define _LIBCPP_HAS_QUICK_EXIT # define _LIBCPP_HAS_TIMESPEC_GET -# define _LIBCPP_HAS_C11_FEATURES # endif -# endif // __linux__ +# elif defined(__APPLE__) + // timespec_get and aligned_alloc were introduced in macOS 10.15 and + // aligned releases +# if (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= 101500 || \ + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ >= 130000 || \ + __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ >= 60000) +# define _LIBCPP_HAS_ALIGNED_ALLOC +# define _LIBCPP_HAS_TIMESPEC_GET +# endif +# endif // __APPLE__ #endif #ifndef _LIBCPP_CXX03_LANG @@ -389,9 +400,7 @@ #elif defined(_LIBCPP_COMPILER_CLANG) # define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) #else -// This definition is potentially buggy, but it's only taken with GCC in C++03, -// which we barely support anyway. See llvm.org/PR39713 -# define _LIBCPP_ALIGNOF(_Tp) __alignof(_Tp) +# error "We don't know a correct way to implement alignof(T) in C++03 outside of Clang" #endif #define _LIBCPP_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp) @@ -433,10 +442,6 @@ typedef __char32_t char32_t; # define _LIBCPP_NORETURN __attribute__ ((noreturn)) #endif -#if !(__has_feature(cxx_lambdas)) -#define _LIBCPP_HAS_NO_LAMBDAS -#endif - #if !(__has_feature(cxx_nullptr)) # if (__has_extension(cxx_nullptr) || __has_keyword(__nullptr)) && defined(_LIBCPP_ABI_ALWAYS_USE_CXX11_NULLPTR) # define nullptr __nullptr @@ -445,18 +450,6 @@ typedef __char32_t char32_t; # endif #endif -#if !(__has_feature(cxx_rvalue_references)) -#define _LIBCPP_HAS_NO_RVALUE_REFERENCES -#endif - -#if !(__has_feature(cxx_auto_type)) -#define _LIBCPP_HAS_NO_AUTO_TYPE -#endif - -#if !(__has_feature(cxx_variadic_templates)) -#define _LIBCPP_HAS_NO_VARIADICS -#endif - // Objective-C++ features (opt-in) #if __has_feature(objc_arc) #define _LIBCPP_HAS_OBJC_ARC @@ -754,16 +747,6 @@ typedef __char32_t char32_t; # endif #endif -#ifndef _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION -# ifdef _LIBCPP_OBJECT_FORMAT_COFF // Windows binaries can't merge typeinfos. -# define _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION 2 -# else - // TODO: This isn't strictly correct on ELF platforms due to llvm.org/PR37398 - // And we should consider defaulting to OFF. 
-# define _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION 1 -# endif -#endif - #ifndef _LIBCPP_HIDE_FROM_ABI # if _LIBCPP_HIDE_FROM_ABI_PER_TU # define _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDDEN _LIBCPP_INTERNAL_LINKAGE @@ -838,6 +821,12 @@ typedef unsigned int char32_t; # define _LIBCPP_CONSTEXPR constexpr #endif +#ifndef __cpp_consteval +# define _LIBCPP_CONSTEVAL _LIBCPP_CONSTEXPR +#else +# define _LIBCPP_CONSTEVAL consteval +#endif + #ifdef _LIBCPP_CXX03_LANG # define _LIBCPP_DEFAULT {} #else @@ -863,10 +852,6 @@ typedef unsigned int char32_t; # define _LIBCPP_EXPLICIT #endif -#if !__has_builtin(__builtin_operator_new) || !__has_builtin(__builtin_operator_delete) -#define _LIBCPP_HAS_NO_BUILTIN_OPERATOR_NEW_DELETE -#endif - #ifdef _LIBCPP_HAS_NO_STRONG_ENUMS # define _LIBCPP_DECLARE_STRONG_ENUM(x) struct _LIBCPP_TYPE_VIS x { enum __lx # define _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(x) \ @@ -880,36 +865,33 @@ typedef unsigned int char32_t; # define _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(x) #endif // _LIBCPP_HAS_NO_STRONG_ENUMS -#ifdef _LIBCPP_DEBUG -# if _LIBCPP_DEBUG == 0 -# define _LIBCPP_DEBUG_LEVEL 1 -# elif _LIBCPP_DEBUG == 1 -# define _LIBCPP_DEBUG_LEVEL 2 -# else -# error Supported values for _LIBCPP_DEBUG are 0 and 1 -# endif -# if !defined(_LIBCPP_BUILDING_LIBRARY) -# define _LIBCPP_EXTERN_TEMPLATE(...) -# endif +// _LIBCPP_DEBUG potential values: +// - undefined: No assertions. This is the default. +// - 0: Basic assertions +// - 1: Basic assertions + iterator validity checks. +#if !defined(_LIBCPP_DEBUG) +# define _LIBCPP_DEBUG_LEVEL 0 +#elif _LIBCPP_DEBUG == 0 +# define _LIBCPP_DEBUG_LEVEL 1 +#elif _LIBCPP_DEBUG == 1 +# define _LIBCPP_DEBUG_LEVEL 2 +#else +# error Supported values for _LIBCPP_DEBUG are 0 and 1 #endif -#ifndef _LIBCPP_DEBUG_LEVEL -# define _LIBCPP_DEBUG_LEVEL 0 +// _LIBCPP_DEBUG_LEVEL is always defined to one of [0, 1, 2] at this point +#if _LIBCPP_DEBUG_LEVEL >= 1 +# define _LIBCPP_DISABLE_EXTERN_TEMPLATE #endif #ifdef _LIBCPP_DISABLE_EXTERN_TEMPLATE #define _LIBCPP_EXTERN_TEMPLATE(...) -#define _LIBCPP_EXTERN_TEMPLATE2(...) #endif #ifndef _LIBCPP_EXTERN_TEMPLATE #define _LIBCPP_EXTERN_TEMPLATE(...) extern template __VA_ARGS__; #endif -#ifndef _LIBCPP_EXTERN_TEMPLATE2 -#define _LIBCPP_EXTERN_TEMPLATE2(...) extern template __VA_ARGS__; -#endif - #ifndef _LIBCPP_EXTERN_TEMPLATE_DEFINE #define _LIBCPP_EXTERN_TEMPLATE_DEFINE(...) template __VA_ARGS__; #endif @@ -938,6 +920,8 @@ typedef unsigned int char32_t; // We're deferring to Microsoft's STL to provide aligned new et al. We don't // have it unless the language feature test macro is defined. # define _LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION +#elif defined(__MVS__) +# define _LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION #endif #if defined(__APPLE__) @@ -999,6 +983,18 @@ typedef unsigned int char32_t; # define _LIBCPP_DEPRECATED_IN_CXX17 #endif +#if _LIBCPP_STD_VER > 17 +# define _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_DEPRECATED +#else +# define _LIBCPP_DEPRECATED_IN_CXX20 +#endif + +#if !defined(_LIBCPP_NO_HAS_CHAR8_T) +# define _LIBCPP_DEPRECATED_WITH_CHAR8_T _LIBCPP_DEPRECATED +#else +# define _LIBCPP_DEPRECATED_WITH_CHAR8_T +#endif + // Macros to enter and leave a state where deprecation warnings are suppressed. 
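These are used whenever the headers must themselves mention an entity that libc++ has marked deprecated, without leaking warnings into user builds. The usage pattern, shown with an illustrative reference to a member deprecated in C++17:

    _LIBCPP_SUPPRESS_DEPRECATED_PUSH
    typedef allocator<int>::pointer __pointer;  // allocator<T>::pointer is
                                                // deprecated in C++17; the
                                                // reference here is deliberate
    _LIBCPP_SUPPRESS_DEPRECATED_POP

On Clang and GCC the macros expand to the corresponding diagnostic push/ignored/pop pragmas, as defined just below; elsewhere they expand to nothing.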
#if !defined(_LIBCPP_SUPPRESS_DEPRECATED_PUSH) && \ (defined(_LIBCPP_COMPILER_CLANG) || defined(_LIBCPP_COMPILER_GCC)) @@ -1037,14 +1033,6 @@ typedef unsigned int char32_t; # define _LIBCPP_CONSTEXPR_AFTER_CXX17 #endif -#if _LIBCPP_STD_VER > 17 && \ - !defined(_LIBCPP_HAS_NO_CXX14_CONSTEXPR) && \ - !defined(_LIBCPP_HAS_NO_BUILTIN_IS_CONSTANT_EVALUATED) -# define _LIBCPP_CONSTEXPR_AFTER_CXX17_WITH_IS_CONSTANT_EVALUATED constexpr -#else -# define _LIBCPP_CONSTEXPR_AFTER_CXX17_WITH_IS_CONSTANT_EVALUATED -#endif - // The _LIBCPP_NODISCARD_ATTRIBUTE should only be used to define other // NODISCARD macros to the correct attribute. #if __has_cpp_attribute(nodiscard) || defined(_LIBCPP_COMPILER_MSVC) @@ -1079,12 +1067,6 @@ typedef unsigned int char32_t; # define _LIBCPP_INLINE_VAR #endif -#ifdef _LIBCPP_HAS_NO_RVALUE_REFERENCES -# define _LIBCPP_EXPLICIT_MOVE(x) _VSTD::move(x) -#else -# define _LIBCPP_EXPLICIT_MOVE(x) (x) -#endif - #ifndef _LIBCPP_CONSTEXPR_IF_NODEBUG #if defined(_LIBCPP_DEBUG) || defined(_LIBCPP_HAS_NO_CXX14_CONSTEXPR) #define _LIBCPP_CONSTEXPR_IF_NODEBUG @@ -1125,11 +1107,13 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( # if defined(__FreeBSD__) || \ defined(__wasi__) || \ defined(__NetBSD__) || \ + defined(__NuttX__) || \ defined(__linux__) || \ defined(__GNU__) || \ defined(__APPLE__) || \ defined(__CloudABI__) || \ defined(__sun__) || \ + defined(__MVS__) || \ (defined(__MINGW32__) && __has_include()) # define _LIBCPP_HAS_THREAD_API_PTHREAD # elif defined(__Fuchsia__) @@ -1167,10 +1151,6 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( _LIBCPP_HAS_NO_THREADS is defined. #endif -#if defined(__STDCPP_THREADS__) && defined(_LIBCPP_HAS_NO_THREADS) -#error _LIBCPP_HAS_NO_THREADS cannot be set when __STDCPP_THREADS__ is set. -#endif - #if !defined(_LIBCPP_HAS_NO_THREADS) && !defined(__STDCPP_THREADS__) #define __STDCPP_THREADS__ 1 #endif @@ -1227,8 +1207,9 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( # endif #endif -#if defined(__BIONIC__) || defined(__CloudABI__) || \ - defined(__Fuchsia__) || defined(__wasi__) || defined(_LIBCPP_HAS_MUSL_LIBC) +#if defined(__BIONIC__) || defined(__CloudABI__) || defined(__NuttX__) || \ + defined(__Fuchsia__) || defined(__wasi__) || defined(_LIBCPP_HAS_MUSL_LIBC) || \ + defined(__MVS__) #define _LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE #endif @@ -1337,6 +1318,12 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( #endif #endif // !defined(_LIBCPP_NODEBUG_TYPE) +#if __has_attribute(__preferred_name__) +#define _LIBCPP_PREFERRED_NAME(x) __attribute__((__preferred_name__(x))) +#else +#define _LIBCPP_PREFERRED_NAME(x) +#endif + #if defined(_LIBCPP_ABI_MICROSOFT) && \ (defined(_LIBCPP_COMPILER_MSVC) || __has_declspec_attribute(empty_bases)) # define _LIBCPP_DECLSPEC_EMPTY_BASES __declspec(empty_bases) @@ -1367,120 +1354,6 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( #define _LIBCPP_HAS_NO_SPACESHIP_OPERATOR #endif -// Decide whether to use availability macros. -#if !defined(_LIBCPP_BUILDING_LIBRARY) && \ - !defined(_LIBCXXABI_BUILDING_LIBRARY) && \ - !defined(_LIBCPP_DISABLE_AVAILABILITY) && \ - __has_feature(attribute_availability_with_strict) && \ - __has_feature(attribute_availability_in_templates) && \ - __has_extension(pragma_clang_attribute_external_declaration) -# ifdef __APPLE__ -# define _LIBCPP_USE_AVAILABILITY_APPLE -# endif -#endif - -// Define availability macros. 
-#if defined(_LIBCPP_USE_AVAILABILITY_APPLE) -# define _LIBCPP_AVAILABILITY_SHARED_MUTEX \ - __attribute__((availability(macosx,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) -# define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS \ - __attribute__((availability(macosx,strict,introduced=10.13))) \ - __attribute__((availability(ios,strict,introduced=11.0))) \ - __attribute__((availability(tvos,strict,introduced=11.0))) \ - __attribute__((availability(watchos,strict,introduced=4.0))) -# define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS \ - _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST \ - _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS \ - __attribute__((availability(macosx,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) -# define _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE \ - __attribute__((availability(macosx,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) -# define _LIBCPP_AVAILABILITY_FUTURE_ERROR \ - __attribute__((availability(ios,strict,introduced=6.0))) -# define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE \ - __attribute__((availability(macosx,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) -# define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY \ - __attribute__((availability(macosx,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) -# define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR \ - __attribute__((availability(macosx,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) -# define _LIBCPP_AVAILABILITY_FILESYSTEM \ - __attribute__((availability(macosx,strict,introduced=10.15))) \ - __attribute__((availability(ios,strict,introduced=13.0))) \ - __attribute__((availability(tvos,strict,introduced=13.0))) \ - __attribute__((availability(watchos,strict,introduced=6.0))) -# define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH \ - _Pragma("clang attribute push(__attribute__((availability(macosx,strict,introduced=10.15))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))") -# define _LIBCPP_AVAILABILITY_FILESYSTEM_POP \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") -# define _LIBCPP_AVAILABILITY_TO_CHARS \ - _LIBCPP_AVAILABILITY_FILESYSTEM -# define _LIBCPP_AVAILABILITY_SYNC \ - __attribute__((unavailable)) -#else -# define _LIBCPP_AVAILABILITY_SHARED_MUTEX -# define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS -# define _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_BAD_ANY_CAST -# define _LIBCPP_AVAILABILITY_UNCAUGHT_EXCEPTIONS -# define 
_LIBCPP_AVAILABILITY_SIZED_NEW_DELETE -# define _LIBCPP_AVAILABILITY_FUTURE_ERROR -# define _LIBCPP_AVAILABILITY_TYPEINFO_VTABLE -# define _LIBCPP_AVAILABILITY_LOCALE_CATEGORY -# define _LIBCPP_AVAILABILITY_ATOMIC_SHARED_PTR -# define _LIBCPP_AVAILABILITY_FILESYSTEM -# define _LIBCPP_AVAILABILITY_FILESYSTEM_PUSH -# define _LIBCPP_AVAILABILITY_FILESYSTEM_POP -# define _LIBCPP_AVAILABILITY_TO_CHARS -# define _LIBCPP_AVAILABILITY_SYNC -#endif - -// Define availability that depends on _LIBCPP_NO_EXCEPTIONS. -#ifdef _LIBCPP_NO_EXCEPTIONS -# define _LIBCPP_AVAILABILITY_FUTURE -# define _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST -# define _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS -#else -# define _LIBCPP_AVAILABILITY_FUTURE _LIBCPP_AVAILABILITY_FUTURE_ERROR -# define _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST _LIBCPP_AVAILABILITY_BAD_ANY_CAST -# define _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS -#endif - -// The stream API was dropped and re-added in the dylib shipped on macOS -// and iOS. We can only assume the dylib to provide these definitions for -// macosx >= 10.9 and ios >= 7.0. Otherwise, the definitions are available -// from the headers, but not from the dylib. Explicit instantiation -// declarations for streams exist conditionally to this; if we provide -// an explicit instantiation declaration and we try to deploy to a dylib -// that does not provide those symbols, we'll get a load-time error. -#if !defined(_LIBCPP_BUILDING_LIBRARY) && \ - ((defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 1090) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 70000)) -# define _LIBCPP_DO_NOT_ASSUME_STREAMS_EXPLICIT_INSTANTIATION_IN_DYLIB -#endif - #if defined(_LIBCPP_COMPILER_IBM) #define _LIBCPP_HAS_NO_PRAGMA_PUSH_POP_MACRO #endif @@ -1547,6 +1420,12 @@ _LIBCPP_FUNC_VIS extern "C" void __sanitizer_annotate_contiguous_container( #define _LIBCPP_HAS_NO_FGETPOS_FSETPOS #endif +#if __has_attribute(init_priority) +# define _LIBCPP_INIT_PRIORITY_MAX __attribute__((init_priority(101))) +#else +# define _LIBCPP_INIT_PRIORITY_MAX +#endif + #endif // __cplusplus #endif // _LIBCPP_CONFIG diff --git a/lib/libcxx/include/__config_site.in b/lib/libcxx/include/__config_site.in index a6984b2eef..6089fb7d01 100644 --- a/lib/libcxx/include/__config_site.in +++ b/lib/libcxx/include/__config_site.in @@ -26,12 +26,13 @@ #cmakedefine _LIBCPP_HAS_THREAD_API_WIN32 #cmakedefine _LIBCPP_HAS_THREAD_LIBRARY_EXTERNAL #cmakedefine _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS +#cmakedefine _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS #cmakedefine _LIBCPP_NO_VCRUNTIME -#ifndef _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION #cmakedefine _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION @_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION@ -#endif #cmakedefine _LIBCPP_ABI_NAMESPACE @_LIBCPP_ABI_NAMESPACE@ #cmakedefine _LIBCPP_HAS_PARALLEL_ALGORITHMS +#cmakedefine _LIBCPP_HAS_NO_RANDOM_DEVICE +#cmakedefine _LIBCPP_HAS_NO_LOCALIZATION @_LIBCPP_ABI_DEFINES@ diff --git a/lib/libcxx/include/__debug b/lib/libcxx/include/__debug index 11367413fc..7b5bfb3f83 100644 --- a/lib/libcxx/include/__debug +++ b/lib/libcxx/include/__debug @@ -27,26 +27,21 @@ # include #endif -#if _LIBCPP_DEBUG_LEVEL >= 1 && !defined(_LIBCPP_ASSERT) -# define 
_LIBCPP_ASSERT(x, m) ((x) ? (void)0 : \ - _VSTD::__libcpp_debug_function(_VSTD::__libcpp_debug_info(__FILE__, __LINE__, #x, m))) -#endif - -#if _LIBCPP_DEBUG_LEVEL >= 2 -#ifndef _LIBCPP_DEBUG_ASSERT -#define _LIBCPP_DEBUG_ASSERT(x, m) _LIBCPP_ASSERT(x, m) -#endif -#define _LIBCPP_DEBUG_MODE(...) __VA_ARGS__ -#endif - -#ifndef _LIBCPP_ASSERT -# define _LIBCPP_ASSERT(x, m) ((void)0) -#endif -#ifndef _LIBCPP_DEBUG_ASSERT +#if _LIBCPP_DEBUG_LEVEL == 0 # define _LIBCPP_DEBUG_ASSERT(x, m) ((void)0) +# define _LIBCPP_ASSERT_IMPL(x, m) ((void)0) +#elif _LIBCPP_DEBUG_LEVEL == 1 +# define _LIBCPP_DEBUG_ASSERT(x, m) ((void)0) +# define _LIBCPP_ASSERT_IMPL(x, m) ((x) ? (void)0 : _VSTD::__libcpp_debug_function(_VSTD::__libcpp_debug_info(__FILE__, __LINE__, #x, m))) +#elif _LIBCPP_DEBUG_LEVEL == 2 +# define _LIBCPP_DEBUG_ASSERT(x, m) _LIBCPP_ASSERT(x, m) +# define _LIBCPP_ASSERT_IMPL(x, m) ((x) ? (void)0 : _VSTD::__libcpp_debug_function(_VSTD::__libcpp_debug_info(__FILE__, __LINE__, #x, m))) +#else +# error _LIBCPP_DEBUG_LEVEL must be one of 0, 1, 2 #endif -#ifndef _LIBCPP_DEBUG_MODE -#define _LIBCPP_DEBUG_MODE(...) ((void)0) + +#if !defined(_LIBCPP_ASSERT) +# define _LIBCPP_ASSERT(x, m) _LIBCPP_ASSERT_IMPL(x, m) #endif _LIBCPP_BEGIN_NAMESPACE_STD @@ -59,7 +54,7 @@ struct _LIBCPP_TEMPLATE_VIS __libcpp_debug_info { __libcpp_debug_info(const char* __f, int __l, const char* __p, const char* __m) : __file_(__f), __line_(__l), __pred_(__p), __msg_(__m) {} - _LIBCPP_FUNC_VIS std::string what() const; + _LIBCPP_FUNC_VIS string what() const; const char* __file_; int __line_; @@ -83,7 +78,7 @@ void __libcpp_abort_debug_function(__libcpp_debug_info const&); _LIBCPP_FUNC_VIS bool __libcpp_set_debug_function(__libcpp_debug_function_type __func); -#if _LIBCPP_DEBUG_LEVEL >= 2 || defined(_LIBCPP_BUILDING_LIBRARY) +#if _LIBCPP_DEBUG_LEVEL == 2 || defined(_LIBCPP_BUILDING_LIBRARY) struct _LIBCPP_TYPE_VIS __c_node; @@ -226,7 +221,7 @@ public: template _LIBCPP_INLINE_VISIBILITY static __c_node* __create_C_node(void *__mem, void *__c, __c_node *__next) { - return ::new(__mem) _C_node<_Cont>(__c, __next); + return ::new (__mem) _C_node<_Cont>(__c, __next); } template @@ -271,7 +266,7 @@ _LIBCPP_FUNC_VIS __libcpp_db* __get_db(); _LIBCPP_FUNC_VIS const __libcpp_db* __get_const_db(); -#endif // _LIBCPP_DEBUG_LEVEL >= 2 || defined(_LIBCPP_BUILDING_LIBRARY) +#endif // _LIBCPP_DEBUG_LEVEL == 2 || defined(_LIBCPP_BUILDING_LIBRARY) _LIBCPP_END_NAMESPACE_STD diff --git a/lib/libcxx/include/__functional_03 b/lib/libcxx/include/__functional_03 index bf86428dea..9616480611 100644 --- a/lib/libcxx/include/__functional_03 +++ b/lib/libcxx/include/__functional_03 @@ -126,7 +126,7 @@ __func<_Fp, _Alloc, _Rp()>::__clone() const _Ap __a(__f_.second()); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) __func(__f_.first(), _Alloc(__a)); + ::new ((void*)__hold.get()) __func(__f_.first(), _Alloc(__a)); return __hold.release(); } @@ -134,7 +134,7 @@ template void __func<_Fp, _Alloc, _Rp()>::__clone(__base<_Rp()>* __p) const { - ::new (__p) __func(__f_.first(), __f_.second()); + ::new ((void*)__p) __func(__f_.first(), __f_.second()); } template @@ -212,7 +212,7 @@ __func<_Fp, _Alloc, _Rp(_A0)>::__clone() const _Ap __a(__f_.second()); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) __func(__f_.first(), _Alloc(__a)); + ::new ((void*)__hold.get()) __func(__f_.first(), _Alloc(__a)); return 
__hold.release(); } @@ -220,7 +220,7 @@ template void __func<_Fp, _Alloc, _Rp(_A0)>::__clone(__base<_Rp(_A0)>* __p) const { - ::new (__p) __func(__f_.first(), __f_.second()); + ::new ((void*)__p) __func(__f_.first(), __f_.second()); } template @@ -298,7 +298,7 @@ __func<_Fp, _Alloc, _Rp(_A0, _A1)>::__clone() const _Ap __a(__f_.second()); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) __func(__f_.first(), _Alloc(__a)); + ::new ((void*)__hold.get()) __func(__f_.first(), _Alloc(__a)); return __hold.release(); } @@ -306,7 +306,7 @@ template void __func<_Fp, _Alloc, _Rp(_A0, _A1)>::__clone(__base<_Rp(_A0, _A1)>* __p) const { - ::new (__p) __func(__f_.first(), __f_.second()); + ::new ((void*)__p) __func(__f_.first(), __f_.second()); } template @@ -384,7 +384,7 @@ __func<_Fp, _Alloc, _Rp(_A0, _A1, _A2)>::__clone() const _Ap __a(__f_.second()); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) __func(__f_.first(), _Alloc(__a)); + ::new ((void*)__hold.get()) __func(__f_.first(), _Alloc(__a)); return __hold.release(); } @@ -392,7 +392,7 @@ template void __func<_Fp, _Alloc, _Rp(_A0, _A1, _A2)>::__clone(__base<_Rp(_A0, _A1, _A2)>* __p) const { - ::new (__p) __func(__f_.first(), __f_.second()); + ::new ((void*)__p) __func(__f_.first(), __f_.second()); } template @@ -554,7 +554,7 @@ function<_Rp()>::function(_Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f); + ::new ((void*)__f_) _FF(__f); } else { @@ -562,7 +562,7 @@ function<_Rp()>::function(_Fp __f, _Ap __a; typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, allocator<_Fp>(__a)); + ::new ((void*)__hold.get()) _FF(__f, allocator<_Fp>(__a)); __f_ = __hold.release(); } } @@ -581,7 +581,7 @@ function<_Rp()>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f, __a0); + ::new ((void*)__f_) _FF(__f, __a0); } else { @@ -589,7 +589,7 @@ function<_Rp()>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, _Ap __a(__a0); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, _Alloc(__a)); + ::new ((void*)__hold.get()) _FF(__f, _Alloc(__a)); __f_ = __hold.release(); } } @@ -834,7 +834,7 @@ function<_Rp(_A0)>::function(_Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f); + ::new ((void*)__f_) _FF(__f); } else { @@ -842,7 +842,7 @@ function<_Rp(_A0)>::function(_Fp __f, _Ap __a; typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, allocator<_Fp>(__a)); + ::new ((void*)__hold.get()) _FF(__f, allocator<_Fp>(__a)); __f_ = __hold.release(); } } @@ -861,7 +861,7 @@ function<_Rp(_A0)>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f, __a0); + ::new ((void*)__f_) _FF(__f, __a0); } else { @@ -869,7 +869,7 @@ function<_Rp(_A0)>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, _Ap __a(__a0); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, _Alloc(__a)); + ::new ((void*)__hold.get()) _FF(__f, _Alloc(__a)); __f_ = 
__hold.release(); } } @@ -1114,7 +1114,7 @@ function<_Rp(_A0, _A1)>::function(_Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f); + ::new ((void*)__f_) _FF(__f); } else { @@ -1122,7 +1122,7 @@ function<_Rp(_A0, _A1)>::function(_Fp __f, _Ap __a; typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, allocator<_Fp>(__a)); + ::new ((void*)__hold.get()) _FF(__f, allocator<_Fp>(__a)); __f_ = __hold.release(); } } @@ -1141,7 +1141,7 @@ function<_Rp(_A0, _A1)>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f, __a0); + ::new ((void*)__f_) _FF(__f, __a0); } else { @@ -1149,7 +1149,7 @@ function<_Rp(_A0, _A1)>::function(allocator_arg_t, const _Alloc& __a0, _Fp __f, _Ap __a(__a0); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, _Alloc(__a)); + ::new ((void*)__hold.get()) _FF(__f, _Alloc(__a)); __f_ = __hold.release(); } } @@ -1394,7 +1394,7 @@ function<_Rp(_A0, _A1, _A2)>::function(_Fp __f, if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f); + ::new ((void*)__f_) _FF(__f); } else { @@ -1402,7 +1402,7 @@ function<_Rp(_A0, _A1, _A2)>::function(_Fp __f, _Ap __a; typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, allocator<_Fp>(__a)); + ::new ((void*)__hold.get()) _FF(__f, allocator<_Fp>(__a)); __f_ = __hold.release(); } } @@ -1421,7 +1421,7 @@ function<_Rp(_A0, _A1, _A2)>::function(allocator_arg_t, const _Alloc& __a0, _Fp if (sizeof(_FF) <= sizeof(__buf_)) { __f_ = (__base*)&__buf_; - ::new (__f_) _FF(__f, __a0); + ::new ((void*)__f_) _FF(__f, __a0); } else { @@ -1429,7 +1429,7 @@ function<_Rp(_A0, _A1, _A2)>::function(allocator_arg_t, const _Alloc& __a0, _Fp _Ap __a(__a0); typedef __allocator_destructor<_Ap> _Dp; unique_ptr<__base, _Dp> __hold(__a.allocate(1), _Dp(__a, 1)); - ::new (__hold.get()) _FF(__f, _Alloc(__a)); + ::new ((void*)__hold.get()) _FF(__f, _Alloc(__a)); __f_ = __hold.release(); } } diff --git a/lib/libcxx/include/__functional_base b/lib/libcxx/include/__functional_base index f591bf5a9d..c84e7eb115 100644 --- a/lib/libcxx/include/__functional_base +++ b/lib/libcxx/include/__functional_base @@ -298,7 +298,7 @@ struct __weak_result_type<_Rp (_Cp::*)(_A1, _A2, _A3...) const volatile> template struct __invoke_return { - typedef decltype(__invoke(_VSTD::declval<_Tp>(), _VSTD::declval<_Args>()...)) type; + typedef decltype(_VSTD::__invoke(declval<_Tp>(), declval<_Args>()...)) type; }; #else // defined(_LIBCPP_CXX03_LANG) @@ -314,27 +314,27 @@ struct __invoke_void_return_wrapper #ifndef _LIBCPP_CXX03_LANG template static _Ret __call(_Args&&... 
__args) { - return __invoke(_VSTD::forward<_Args>(__args)...); + return _VSTD::__invoke(_VSTD::forward<_Args>(__args)...); } #else template static _Ret __call(_Fn __f) { - return __invoke(__f); + return _VSTD::__invoke(__f); } template static _Ret __call(_Fn __f, _A0& __a0) { - return __invoke(__f, __a0); + return _VSTD::__invoke(__f, __a0); } template static _Ret __call(_Fn __f, _A0& __a0, _A1& __a1) { - return __invoke(__f, __a0, __a1); + return _VSTD::__invoke(__f, __a0, __a1); } template static _Ret __call(_Fn __f, _A0& __a0, _A1& __a1, _A2& __a2){ - return __invoke(__f, __a0, __a1, __a2); + return _VSTD::__invoke(__f, __a0, __a1, __a2); } #endif }; @@ -345,27 +345,27 @@ struct __invoke_void_return_wrapper #ifndef _LIBCPP_CXX03_LANG template static void __call(_Args&&... __args) { - __invoke(_VSTD::forward<_Args>(__args)...); + _VSTD::__invoke(_VSTD::forward<_Args>(__args)...); } #else template static void __call(_Fn __f) { - __invoke(__f); + _VSTD::__invoke(__f); } template static void __call(_Fn __f, _A0& __a0) { - __invoke(__f, __a0); + _VSTD::__invoke(__f, __a0); } template static void __call(_Fn __f, _A0& __a0, _A1& __a1) { - __invoke(__f, __a0, __a1); + _VSTD::__invoke(__f, __a0, __a1); } template static void __call(_Fn __f, _A0& __a0, _A1& __a1, _A2& __a2) { - __invoke(__f, __a0, __a1, __a2); + _VSTD::__invoke(__f, __a0, __a1, __a2); } #endif }; @@ -398,112 +398,112 @@ public: _LIBCPP_INLINE_VISIBILITY typename __invoke_of::type operator() (_ArgTypes&&... __args) const { - return __invoke(get(), _VSTD::forward<_ArgTypes>(__args)...); + return _VSTD::__invoke(get(), _VSTD::forward<_ArgTypes>(__args)...); } #else _LIBCPP_INLINE_VISIBILITY typename __invoke_return::type operator() () const { - return __invoke(get()); + return _VSTD::__invoke(get()); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return0::type operator() (_A0& __a0) const { - return __invoke(get(), __a0); + return _VSTD::__invoke(get(), __a0); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return0::type operator() (_A0 const& __a0) const { - return __invoke(get(), __a0); + return _VSTD::__invoke(get(), __a0); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return1::type operator() (_A0& __a0, _A1& __a1) const { - return __invoke(get(), __a0, __a1); + return _VSTD::__invoke(get(), __a0, __a1); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return1::type operator() (_A0 const& __a0, _A1& __a1) const { - return __invoke(get(), __a0, __a1); + return _VSTD::__invoke(get(), __a0, __a1); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return1::type operator() (_A0& __a0, _A1 const& __a1) const { - return __invoke(get(), __a0, __a1); + return _VSTD::__invoke(get(), __a0, __a1); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return1::type operator() (_A0 const& __a0, _A1 const& __a1) const { - return __invoke(get(), __a0, __a1); + return _VSTD::__invoke(get(), __a0, __a1); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0& __a0, _A1& __a1, _A2& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0 const& __a0, _A1& __a1, _A2& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0& __a0, _A1 const& __a1, _A2& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + 
return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0& __a0, _A1& __a1, _A2 const& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0 const& __a0, _A1 const& __a1, _A2& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0 const& __a0, _A1& __a1, _A2 const& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0& __a0, _A1 const& __a1, _A2 const& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } template _LIBCPP_INLINE_VISIBILITY typename __invoke_return2::type operator() (_A0 const& __a0, _A1 const& __a1, _A2 const& __a2) const { - return __invoke(get(), __a0, __a1, __a2); + return _VSTD::__invoke(get(), __a0, __a1, __a2); } #endif // _LIBCPP_CXX03_LANG }; diff --git a/lib/libcxx/include/__functional_base_03 b/lib/libcxx/include/__functional_base_03 index e6dac90c84..9b08bd26a8 100644 --- a/lib/libcxx/include/__functional_base_03 +++ b/lib/libcxx/include/__functional_base_03 @@ -40,7 +40,7 @@ struct __enable_invoke_imp<_Ret, _T1, false, true> { template struct __enable_invoke_imp<_Ret, _T1, false, false> { typedef typename add_lvalue_reference< - typename __apply_cv()), _Ret>::type + typename __apply_cv()), _Ret>::type >::type _Bullet4; typedef _Bullet4 type; }; @@ -142,7 +142,7 @@ __invoke(_Fn __f, _T1& __t1) { template inline _LIBCPP_INLINE_VISIBILITY -decltype(_VSTD::declval<_Fp&>()()) +decltype(declval<_Fp&>()()) __invoke(_Fp& __f) { return __f(); @@ -150,7 +150,7 @@ __invoke(_Fp& __f) template inline _LIBCPP_INLINE_VISIBILITY -decltype(_VSTD::declval<_Fp&>()(_VSTD::declval<_A0&>())) +decltype(declval<_Fp&>()(declval<_A0&>())) __invoke(_Fp& __f, _A0& __a0) { return __f(__a0); @@ -158,7 +158,7 @@ __invoke(_Fp& __f, _A0& __a0) template inline _LIBCPP_INLINE_VISIBILITY -decltype(_VSTD::declval<_Fp&>()(_VSTD::declval<_A0&>(), _VSTD::declval<_A1&>())) +decltype(declval<_Fp&>()(declval<_A0&>(), declval<_A1&>())) __invoke(_Fp& __f, _A0& __a0, _A1& __a1) { return __f(__a0, __a1); @@ -166,7 +166,7 @@ __invoke(_Fp& __f, _A0& __a0, _A1& __a1) template inline _LIBCPP_INLINE_VISIBILITY -decltype(_VSTD::declval<_Fp&>()(_VSTD::declval<_A0&>(), _VSTD::declval<_A1&>(), _VSTD::declval<_A2&>())) +decltype(declval<_Fp&>()(declval<_A0&>(), declval<_A1&>(), declval<_A2&>())) __invoke(_Fp& __f, _A0& __a0, _A1& __a1, _A2& __a2) { return __f(__a0, __a1, __a2); @@ -181,13 +181,13 @@ struct __invoke_return template struct __invoke_return<_Fp, false> { - typedef decltype(__invoke(_VSTD::declval<_Fp&>())) type; + typedef decltype(_VSTD::__invoke(declval<_Fp&>())) type; }; template struct __invoke_return0 { - typedef decltype(__invoke(_VSTD::declval<_Tp&>(), _VSTD::declval<_A0&>())) type; + typedef decltype(_VSTD::__invoke(declval<_Tp&>(), declval<_A0&>())) type; }; template @@ -199,8 +199,8 @@ struct __invoke_return0<_Rp _Tp::*, _A0> template struct __invoke_return1 { - typedef decltype(__invoke(_VSTD::declval<_Tp&>(), _VSTD::declval<_A0&>(), - _VSTD::declval<_A1&>())) type; + typedef decltype(_VSTD::__invoke(declval<_Tp&>(), declval<_A0&>(), + declval<_A1&>())) 
type; }; template @@ -211,9 +211,9 @@ struct __invoke_return1<_Rp _Class::*, _A0, _A1> { template struct __invoke_return2 { - typedef decltype(__invoke(_VSTD::declval<_Tp&>(), _VSTD::declval<_A0&>(), - _VSTD::declval<_A1&>(), - _VSTD::declval<_A2&>())) type; + typedef decltype(_VSTD::__invoke(declval<_Tp&>(), declval<_A0&>(), + declval<_A1&>(), + declval<_A2&>())) type; }; template diff --git a/lib/libcxx/include/__hash_table b/lib/libcxx/include/__hash_table index 13ff096897..521ebbf2c4 100644 --- a/lib/libcxx/include/__hash_table +++ b/lib/libcxx/include/__hash_table @@ -34,19 +34,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __hash_value_type; -#ifndef _LIBCPP_CXX03_LANG template struct __is_hash_value_type_imp : false_type {}; template -struct __is_hash_value_type_imp<__hash_value_type<_Key, _Value>> : true_type {}; +struct __is_hash_value_type_imp<__hash_value_type<_Key, _Value> > : true_type {}; template struct __is_hash_value_type : false_type {}; template struct __is_hash_value_type<_One> : __is_hash_value_type_imp::type> {}; -#endif _LIBCPP_FUNC_VIS size_t __next_prime(size_t __n); @@ -122,7 +120,7 @@ inline _LIBCPP_INLINE_VISIBILITY size_t __next_hash_pow2(size_t __n) { - return __n < 2 ? __n : (size_t(1) << (std::numeric_limits::digits - __libcpp_clz(__n-1))); + return __n < 2 ? __n : (size_t(1) << (numeric_limits::digits - __libcpp_clz(__n-1))); } @@ -155,12 +153,10 @@ struct __hash_key_value_types { static __container_value_type* __get_ptr(__node_value_type& __n) { return _VSTD::addressof(__n); } -#ifndef _LIBCPP_CXX03_LANG _LIBCPP_INLINE_VISIBILITY static __container_value_type&& __move(__node_value_type& __v) { return _VSTD::move(__v); } -#endif }; template @@ -197,13 +193,10 @@ struct __hash_key_value_types<__hash_value_type<_Key, _Tp> > { static __container_value_type* __get_ptr(__node_value_type& __n) { return _VSTD::addressof(__n.__get_value()); } -#ifndef _LIBCPP_CXX03_LANG _LIBCPP_INLINE_VISIBILITY static pair __move(__node_value_type& __v) { return __v.__move(); } -#endif - }; template , @@ -295,10 +288,12 @@ public: typedef typename _NodeTypes::__node_value_type_pointer pointer; _LIBCPP_INLINE_VISIBILITY __hash_iterator() _NOEXCEPT : __node_(nullptr) { - _LIBCPP_DEBUG_MODE(__get_db()->__insert_i(this)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__insert_i(this); +#endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_iterator(const __hash_iterator& __i) : __node_(__i.__node_) @@ -322,7 +317,7 @@ public: } return *this; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY reference operator*() const { @@ -364,7 +359,7 @@ public: {return !(__x == __y);} private: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_iterator(__next_pointer __node, const void* __c) _NOEXCEPT : __node_(__node) @@ -405,17 +400,21 @@ public: _LIBCPP_INLINE_VISIBILITY __hash_const_iterator() _NOEXCEPT : __node_(nullptr) { - _LIBCPP_DEBUG_MODE(__get_db()->__insert_i(this)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__insert_i(this); +#endif } _LIBCPP_INLINE_VISIBILITY __hash_const_iterator(const __non_const_iterator& __x) _NOEXCEPT : __node_(__x.__node_) { - _LIBCPP_DEBUG_MODE(__get_db()->__iterator_copy(this, &__x)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__iterator_copy(this, &__x); +#endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_const_iterator(const __hash_const_iterator& __i) : 
__node_(__i.__node_) @@ -439,7 +438,7 @@ public: } return *this; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY reference operator*() const { @@ -480,7 +479,7 @@ public: {return !(__x == __y);} private: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_const_iterator(__next_pointer __node, const void* __c) _NOEXCEPT : __node_(__node) @@ -518,10 +517,12 @@ public: typedef typename _NodeTypes::__node_value_type_pointer pointer; _LIBCPP_INLINE_VISIBILITY __hash_local_iterator() _NOEXCEPT : __node_(nullptr) { - _LIBCPP_DEBUG_MODE(__get_db()->__insert_i(this)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__insert_i(this); +#endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_local_iterator(const __hash_local_iterator& __i) : __node_(__i.__node_), @@ -549,7 +550,7 @@ public: } return *this; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY reference operator*() const { @@ -593,7 +594,7 @@ public: {return !(__x == __y);} private: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_local_iterator(__next_pointer __node, size_t __bucket, size_t __bucket_count, const void* __c) _NOEXCEPT @@ -650,7 +651,9 @@ public: _LIBCPP_INLINE_VISIBILITY __hash_const_local_iterator() _NOEXCEPT : __node_(nullptr) { - _LIBCPP_DEBUG_MODE(__get_db()->__insert_i(this)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__insert_i(this); +#endif } _LIBCPP_INLINE_VISIBILITY @@ -659,10 +662,12 @@ public: __bucket_(__x.__bucket_), __bucket_count_(__x.__bucket_count_) { - _LIBCPP_DEBUG_MODE(__get_db()->__iterator_copy(this, &__x)); +#if _LIBCPP_DEBUG_LEVEL == 2 + __get_db()->__iterator_copy(this, &__x); +#endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_const_local_iterator(const __hash_const_local_iterator& __i) : __node_(__i.__node_), @@ -690,7 +695,7 @@ public: } return *this; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY reference operator*() const { @@ -734,7 +739,7 @@ public: {return !(__x == __y);} private: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_INLINE_VISIBILITY __hash_const_local_iterator(__next_pointer __node, size_t __bucket, size_t __bucket_count, const void* __c) _NOEXCEPT @@ -783,7 +788,6 @@ public: _NOEXCEPT_(is_nothrow_copy_constructible::value) : __data_(__size, __a) {} -#ifndef _LIBCPP_CXX03_LANG _LIBCPP_INLINE_VISIBILITY __bucket_list_deallocator(__bucket_list_deallocator&& __x) _NOEXCEPT_(is_nothrow_move_constructible::value) @@ -791,7 +795,6 @@ public: { __x.size() = 0; } -#endif _LIBCPP_INLINE_VISIBILITY size_type& size() _NOEXCEPT {return __data_.first();} @@ -1007,7 +1010,6 @@ public: explicit __hash_table(const allocator_type& __a); __hash_table(const __hash_table& __u); __hash_table(const __hash_table& __u, const allocator_type& __a); -#ifndef _LIBCPP_CXX03_LANG __hash_table(__hash_table&& __u) _NOEXCEPT_( is_nothrow_move_constructible<__bucket_list>::value && @@ -1016,11 +1018,9 @@ public: is_nothrow_move_constructible::value && is_nothrow_move_constructible::value); __hash_table(__hash_table&& __u, const allocator_type& __a); -#endif // _LIBCPP_CXX03_LANG ~__hash_table(); __hash_table& operator=(const __hash_table& __u); -#ifndef _LIBCPP_CXX03_LANG _LIBCPP_INLINE_VISIBILITY __hash_table& operator=(__hash_table&& __u) _NOEXCEPT_( @@ 
-1028,7 +1028,6 @@ public: is_nothrow_move_assignable<__node_allocator>::value && is_nothrow_move_assignable::value && is_nothrow_move_assignable::value); -#endif template void __assign_unique(_InputIterator __first, _InputIterator __last); template @@ -1037,7 +1036,7 @@ public: _LIBCPP_INLINE_VISIBILITY size_type max_size() const _NOEXCEPT { - return std::min( + return _VSTD::min( __node_traits::max_size(__node_alloc()), numeric_limits::max() ); @@ -1066,7 +1065,6 @@ public: iterator __node_insert_multi(const_iterator __p, __node_pointer __nd); -#ifndef _LIBCPP_CXX03_LANG template _LIBCPP_INLINE_VISIBILITY pair __emplace_unique_key_args(_Key const& __k, _Args&&... __args); @@ -1151,15 +1149,6 @@ public: return __emplace_hint_multi(__p, _VSTD::forward<_Pp>(__x)); } -#else // !defined(_LIBCPP_CXX03_LANG) - template - _LIBCPP_INLINE_VISIBILITY - pair __emplace_unique_key_args(_Key const&, _Args& __args); - - iterator __insert_multi(const __container_value_type& __x); - iterator __insert_multi(const_iterator __p, const __container_value_type& __x); -#endif - _LIBCPP_INLINE_VISIBILITY pair __insert_unique(const __container_value_type& __x) { return __emplace_unique_key_args(_NodeTypes::__get_key(__x), __x); @@ -1295,7 +1284,7 @@ public: { _LIBCPP_ASSERT(__n < bucket_count(), "unordered container::begin(n) called with n >= bucket_count()"); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return local_iterator(__bucket_list_[__n], __n, bucket_count(), this); #else return local_iterator(__bucket_list_[__n], __n, bucket_count()); @@ -1308,7 +1297,7 @@ public: { _LIBCPP_ASSERT(__n < bucket_count(), "unordered container::end(n) called with n >= bucket_count()"); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return local_iterator(nullptr, __n, bucket_count(), this); #else return local_iterator(nullptr, __n, bucket_count()); @@ -1321,7 +1310,7 @@ public: { _LIBCPP_ASSERT(__n < bucket_count(), "unordered container::cbegin(n) called with n >= bucket_count()"); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_local_iterator(__bucket_list_[__n], __n, bucket_count(), this); #else return const_local_iterator(__bucket_list_[__n], __n, bucket_count()); @@ -1334,35 +1323,30 @@ public: { _LIBCPP_ASSERT(__n < bucket_count(), "unordered container::cend(n) called with n >= bucket_count()"); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_local_iterator(nullptr, __n, bucket_count(), this); #else return const_local_iterator(nullptr, __n, bucket_count()); #endif } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 bool __dereferenceable(const const_iterator* __i) const; bool __decrementable(const const_iterator* __i) const; bool __addable(const const_iterator* __i, ptrdiff_t __n) const; bool __subscriptable(const const_iterator* __i, ptrdiff_t __n) const; -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 private: void __rehash(size_type __n); -#ifndef _LIBCPP_CXX03_LANG template __node_holder __construct_node(_Args&& ...__args); template __node_holder __construct_node_hash(size_t __hash, _First&& __f, _Rest&&... 
__rest); -#else // _LIBCPP_CXX03_LANG - __node_holder __construct_node(const __container_value_type& __v); - __node_holder __construct_node_hash(size_t __hash, const __container_value_type& __v); -#endif _LIBCPP_INLINE_VISIBILITY @@ -1373,7 +1357,6 @@ private: _LIBCPP_INLINE_VISIBILITY void __copy_assign_alloc(const __hash_table&, false_type) {} -#ifndef _LIBCPP_CXX03_LANG void __move_assign(__hash_table& __u, false_type); void __move_assign(__hash_table& __u, true_type) _NOEXCEPT_( @@ -1400,7 +1383,6 @@ private: } _LIBCPP_INLINE_VISIBILITY void __move_assign_alloc(__hash_table&, false_type) _NOEXCEPT {} -#endif // _LIBCPP_CXX03_LANG void __deallocate_node(__next_pointer __np) _NOEXCEPT; __next_pointer __detach() _NOEXCEPT; @@ -1477,8 +1459,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(const __hash_table& __u, { } -#ifndef _LIBCPP_CXX03_LANG - template __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(__hash_table&& __u) _NOEXCEPT_( @@ -1526,8 +1506,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__hash_table(__hash_table&& __u, } } -#endif // _LIBCPP_CXX03_LANG - template __hash_table<_Tp, _Hash, _Equal, _Alloc>::~__hash_table() { @@ -1539,7 +1517,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::~__hash_table() #endif __deallocate_node(__p1_.first().__next_); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __get_db()->__erase_c(this); #endif } @@ -1583,7 +1561,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer __np) while (__np != nullptr) { __next_pointer __next = __np->__next_; -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __c_node* __c = __get_db()->__find_c_and_lock(this); for (__i_node** __p = __c->end_; __p != __c->beg_; ) { @@ -1593,7 +1571,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer __np) { (*__p)->__c_ = nullptr; if (--__c->end_ != __p) - memmove(__p, __p+1, (__c->end_ - __p)*sizeof(__i_node*)); + _VSTD::memmove(__p, __p+1, (__c->end_ - __p)*sizeof(__i_node*)); } } __get_db()->unlock(); @@ -1618,8 +1596,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__detach() _NOEXCEPT return __cache; } -#ifndef _LIBCPP_CXX03_LANG - template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign( @@ -1646,7 +1622,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__move_assign( __u.__p1_.first().__next_ = nullptr; __u.size() = 0; } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __get_db()->swap(this, &__u); #endif } @@ -1714,8 +1690,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::operator=(__hash_table&& __u) return *this; } -#endif // _LIBCPP_CXX03_LANG - template template void @@ -1800,7 +1774,7 @@ inline typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::begin() _NOEXCEPT { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(__p1_.first().__next_, this); #else return iterator(__p1_.first().__next_); @@ -1812,7 +1786,7 @@ inline typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::end() _NOEXCEPT { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(nullptr, this); #else return iterator(nullptr); @@ -1824,7 +1798,7 @@ inline typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::const_iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::begin() const _NOEXCEPT { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_iterator(__p1_.first().__next_, this); #else return const_iterator(__p1_.first().__next_); @@ -1836,7 
+1810,7 @@ inline typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::const_iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::end() const _NOEXCEPT { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_iterator(nullptr, this); #else return const_iterator(nullptr); @@ -1945,7 +1919,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_unique(__node_pointer __ __existing_node = __nd->__ptr(); __inserted = true; } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return pair(iterator(__existing_node, this), __inserted); #else return pair(iterator(__existing_node), __inserted); @@ -1955,7 +1929,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_unique(__node_pointer __ // Prepare the container for an insertion of the value __cp_val with the hash // __cp_hash. This does a lookup into the container to see if __cp_value is // already present, and performs a rehash if necessary. Returns a pointer to the -// last occurance of __cp_val in the map. +// last occurrence of __cp_val in the map. // // Note that this function does forward exceptions if key_eq() throws, and never // mutates __value or actually inserts into the map. @@ -2043,7 +2017,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi(__node_pointer __c __next_pointer __pn = __node_insert_multi_prepare(__cp->__hash(), __cp->__value_); __node_insert_multi_perform(__cp, __pn); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(__cp->__ptr(), this); #else return iterator(__cp->__ptr()); @@ -2055,7 +2029,7 @@ typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi( const_iterator __p, __node_pointer __cp) { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__p) == this, "unordered container::emplace_hint(const_iterator, args...) called with an iterator not" " referring to this unordered container"); @@ -2078,7 +2052,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi( __cp->__next_ = __np; __pp->__next_ = static_cast<__next_pointer>(__cp); ++size(); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(static_cast<__next_pointer>(__cp), this); #else return iterator(static_cast<__next_pointer>(__cp)); @@ -2089,17 +2063,10 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_insert_multi( -#ifndef _LIBCPP_CXX03_LANG template template pair::iterator, bool> __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& __k, _Args&&... 
__args) -#else -template -template -pair::iterator, bool> -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& __k, _Args& __args) -#endif { size_t __hash = hash_function()(__k); @@ -2123,11 +2090,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& } } { -#ifndef _LIBCPP_CXX03_LANG __node_holder __h = __construct_node_hash(__hash, _VSTD::forward<_Args>(__args)...); -#else - __node_holder __h = __construct_node_hash(__hash, __args); -#endif if (size()+1 > __bc * max_load_factor() || __bc == 0) { rehash(_VSTD::max(2 * __bc + !__is_hash_power2(__bc), @@ -2159,15 +2122,13 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& __inserted = true; } __done: -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return pair(iterator(__nd, this), __inserted); #else return pair(iterator(__nd), __inserted); #endif } -#ifndef _LIBCPP_CXX03_LANG - template template pair::iterator, bool> @@ -2197,7 +2158,7 @@ typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_hint_multi( const_iterator __p, _Args&&... __args) { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__p) == this, "unordered container::emplace_hint(const_iterator, args...) called with an iterator not" " referring to this unordered container"); @@ -2208,36 +2169,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_hint_multi( return __r; } -#else // _LIBCPP_CXX03_LANG - -template -typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__insert_multi(const __container_value_type& __x) -{ - __node_holder __h = __construct_node(__x); - iterator __r = __node_insert_multi(__h.get()); - __h.release(); - return __r; -} - -template -typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__insert_multi(const_iterator __p, - const __container_value_type& __x) -{ -#if _LIBCPP_DEBUG_LEVEL >= 2 - _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__p) == this, - "unordered container::insert(const_iterator, lvalue) called with an iterator not" - " referring to this unordered container"); -#endif - __node_holder __h = __construct_node(__x); - iterator __r = __node_insert_multi(__p, __h.get()); - __h.release(); - return __r; -} - -#endif // _LIBCPP_CXX03_LANG - #if _LIBCPP_STD_VER > 14 template template @@ -2399,9 +2330,9 @@ template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__rehash(size_type __nbc) { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __get_db()->__invalidate_all(this); -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif __pointer_allocator& __npa = __bucket_list_.get_deleter().__alloc(); __bucket_list_.reset(__nbc > 0 ? 
__pointer_alloc_traits::allocate(__npa, __nbc) : nullptr); @@ -2470,7 +2401,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::find(const _Key& __k) { if ((__nd->__hash() == __hash) && key_eq()(__nd->__upcast()->__value_, __k)) -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator(__nd, this); #else return iterator(__nd); @@ -2501,7 +2432,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::find(const _Key& __k) const { if ((__nd->__hash() == __hash) && key_eq()(__nd->__upcast()->__value_, __k)) -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return const_iterator(__nd, this); #else return const_iterator(__nd); @@ -2513,8 +2444,6 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::find(const _Key& __k) const return end(); } -#ifndef _LIBCPP_CXX03_LANG - template template typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_holder @@ -2550,43 +2479,12 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node_hash( return __h; } -#else // _LIBCPP_CXX03_LANG - -template -typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_holder -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node(const __container_value_type& __v) -{ - __node_allocator& __na = __node_alloc(); - __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); - __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__value_), __v); - __h.get_deleter().__value_constructed = true; - __h->__hash_ = hash_function()(__h->__value_); - __h->__next_ = nullptr; - return _LIBCPP_EXPLICIT_MOVE(__h); // explicitly moved for C++03 -} - -template -typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::__node_holder -__hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node_hash(size_t __hash, - const __container_value_type& __v) -{ - __node_allocator& __na = __node_alloc(); - __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); - __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__value_), __v); - __h.get_deleter().__value_constructed = true; - __h->__hash_ = __hash; - __h->__next_ = nullptr; - return _LIBCPP_EXPLICIT_MOVE(__h); // explicitly moved for C++03 -} - -#endif // _LIBCPP_CXX03_LANG - template typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __p) { __next_pointer __np = __p.__node_; -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__p) == this, "unordered container erase(iterator) called with an iterator not" " referring to this container"); @@ -2606,7 +2504,7 @@ typename __hash_table<_Tp, _Hash, _Equal, _Alloc>::iterator __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __first, const_iterator __last) { -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_ASSERT(__get_const_db()->__find_c_from_i(&__first) == this, "unodered container::erase(iterator, iterator) called with an iterator not" " referring to this unodered container"); @@ -2620,7 +2518,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __first, erase(__p); } __next_pointer __np = __last.__node_; -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 return iterator (__np, this); #else return iterator (__np); @@ -2691,7 +2589,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::remove(const_iterator __p) _NOEXCEPT __pn->__next_ = __cn->__next_; __cn->__next_ = nullptr; --size(); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __c_node* __c = __get_db()->__find_c_and_lock(this); for (__i_node** __dp = __c->end_; __dp != 
__c->beg_; ) { @@ -2701,7 +2599,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::remove(const_iterator __p) _NOEXCEPT { (*__dp)->__c_ = nullptr; if (--__c->end_ != __dp) - memmove(__dp, __dp+1, (__c->end_ - __dp)*sizeof(__i_node*)); + _VSTD::memmove(__dp, __dp+1, (__c->end_ - __dp)*sizeof(__i_node*)); } } __get_db()->unlock(); @@ -2830,9 +2728,9 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::swap(__hash_table& __u) __u.__bucket_list_.reset(__npp); } _VSTD::swap(__bucket_list_.get_deleter().size(), __u.__bucket_list_.get_deleter().size()); - __swap_allocator(__bucket_list_.get_deleter().__alloc(), + _VSTD::__swap_allocator(__bucket_list_.get_deleter().__alloc(), __u.__bucket_list_.get_deleter().__alloc()); - __swap_allocator(__node_alloc(), __u.__node_alloc()); + _VSTD::__swap_allocator(__node_alloc(), __u.__node_alloc()); _VSTD::swap(__p1_.first().__next_, __u.__p1_.first().__next_); __p2_.swap(__u.__p2_); __p3_.swap(__u.__p3_); @@ -2842,7 +2740,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::swap(__hash_table& __u) if (__u.size() > 0) __u.__bucket_list_[__constrain_hash(__u.__p1_.first().__next_->__hash(), __u.bucket_count())] = __u.__p1_.first().__ptr(); -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 __get_db()->swap(this, &__u); #endif } @@ -2876,7 +2774,7 @@ swap(__hash_table<_Tp, _Hash, _Equal, _Alloc>& __x, __x.swap(__y); } -#if _LIBCPP_DEBUG_LEVEL >= 2 +#if _LIBCPP_DEBUG_LEVEL == 2 template bool @@ -2906,7 +2804,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__subscriptable(const const_iterator*, return false; } -#endif // _LIBCPP_DEBUG_LEVEL >= 2 +#endif // _LIBCPP_DEBUG_LEVEL == 2 _LIBCPP_END_NAMESPACE_STD diff --git a/lib/libcxx/include/__libcpp_version b/lib/libcxx/include/__libcpp_version index 82b3803a20..e334181b40 100644 --- a/lib/libcxx/include/__libcpp_version +++ b/lib/libcxx/include/__libcpp_version @@ -1 +1 @@ -11000 +12000 diff --git a/lib/libcxx/include/__locale b/lib/libcxx/include/__locale index 6d10fa4d3d..f32bd59ae5 100644 --- a/lib/libcxx/include/__locale +++ b/lib/libcxx/include/__locale @@ -11,6 +11,7 @@ #define _LIBCPP___LOCALE #include <__config> +#include <__availability> #include #include #include @@ -21,7 +22,9 @@ #if defined(_LIBCPP_MSVCRT_LIKE) # include # include -#elif defined(_AIX) +#elif defined(__NuttX__) +# include +#elif defined(_AIX) || defined(__MVS__) # include #elif defined(__ANDROID__) # include @@ -76,7 +79,7 @@ struct __libcpp_locale_guard { // locale name, otherwise it will be a semicolon-separated string listing // each category. In the second case, we know at least one category won't // be what we want, so we only have to check the first case. 
- if (strcmp(__l.__get_locale(), __lc) != 0) { + if (_VSTD::strcmp(__l.__get_locale(), __lc) != 0) { __locale_all = _strdup(__lc); if (__locale_all == nullptr) __throw_bad_alloc(); @@ -105,7 +108,6 @@ struct __libcpp_locale_guard { }; #endif - class _LIBCPP_TYPE_VIS locale; template @@ -335,8 +337,8 @@ collate<_CharT>::do_hash(const char_type* __lo, const char_type* __hi) const return static_cast(__h); } -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate) -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate) +_LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate) +_LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate) // template class collate_byname; @@ -396,7 +398,26 @@ locale::operator()(const basic_string<_CharT, _Traits, _Allocator>& __x, class _LIBCPP_TYPE_VIS ctype_base { public: -#if defined(__GLIBC__) +#if defined(_LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE) + typedef unsigned long mask; + static const mask space = 1<<0; + static const mask print = 1<<1; + static const mask cntrl = 1<<2; + static const mask upper = 1<<3; + static const mask lower = 1<<4; + static const mask alpha = 1<<5; + static const mask digit = 1<<6; + static const mask punct = 1<<7; + static const mask xdigit = 1<<8; + static const mask blank = 1<<9; +#if defined(__BIONIC__) + // Historically this was a part of regex_traits rather than ctype_base. The + // historical value of the constant is preserved for ABI compatibility. + static const mask __regex_word = 0x8000; +#else + static const mask __regex_word = 1<<10; +#endif // defined(__BIONIC__) +#elif defined(__GLIBC__) typedef unsigned short mask; static const mask space = _ISspace; static const mask print = _ISprint; @@ -485,24 +506,7 @@ public: # define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA # define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_XDIGIT #else - typedef unsigned long mask; - static const mask space = 1<<0; - static const mask print = 1<<1; - static const mask cntrl = 1<<2; - static const mask upper = 1<<3; - static const mask lower = 1<<4; - static const mask alpha = 1<<5; - static const mask digit = 1<<6; - static const mask punct = 1<<7; - static const mask xdigit = 1<<8; - static const mask blank = 1<<9; -#if defined(__BIONIC__) - // Historically this was a part of regex_traits rather than ctype_base. The - // historical value of the constant is preserved for ABI compatibility. - static const mask __regex_word = 0x8000; -#else - static const mask __regex_word = 1<<10; -#endif // defined(__BIONIC__) +# error unknown rune table for this platform -- do you mean to define _LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE? 
#endif static const mask alnum = alpha | digit; static const mask graph = alnum | punct; @@ -623,7 +627,7 @@ class _LIBCPP_TYPE_VIS ctype public: typedef char char_type; - explicit ctype(const mask* __tab = 0, bool __del = false, size_t __refs = 0); + explicit ctype(const mask* __tab = nullptr, bool __del = false, size_t __refs = 0); _LIBCPP_INLINE_VISIBILITY bool is(mask __m, char_type __c) const @@ -1069,10 +1073,10 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; -// template <> class codecvt +// template <> class codecvt // deprecated in C++20 template <> -class _LIBCPP_TYPE_VIS codecvt +class _LIBCPP_TYPE_VIS _LIBCPP_DEPRECATED_IN_CXX20 codecvt : public locale::facet, public codecvt_base { @@ -1155,10 +1159,100 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; -// template <> class codecvt +#ifndef _LIBCPP_NO_HAS_CHAR8_T + +// template <> class codecvt // C++20 template <> -class _LIBCPP_TYPE_VIS codecvt +class _LIBCPP_TYPE_VIS codecvt + : public locale::facet, + public codecvt_base +{ +public: + typedef char16_t intern_type; + typedef char8_t extern_type; + typedef mbstate_t state_type; + + _LIBCPP_INLINE_VISIBILITY + explicit codecvt(size_t __refs = 0) + : locale::facet(__refs) {} + + _LIBCPP_INLINE_VISIBILITY + result out(state_type& __st, + const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const + { + return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + result unshift(state_type& __st, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const + { + return do_unshift(__st, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + result in(state_type& __st, + const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt, + intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const + { + return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + int encoding() const _NOEXCEPT + { + return do_encoding(); + } + + _LIBCPP_INLINE_VISIBILITY + bool always_noconv() const _NOEXCEPT + { + return do_always_noconv(); + } + + _LIBCPP_INLINE_VISIBILITY + int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const + { + return do_length(__st, __frm, __end, __mx); + } + + _LIBCPP_INLINE_VISIBILITY + int max_length() const _NOEXCEPT + { + return do_max_length(); + } + + static locale::id id; + +protected: + _LIBCPP_INLINE_VISIBILITY + explicit codecvt(const char*, size_t __refs = 0) + : locale::facet(__refs) {} + + ~codecvt(); + + virtual result do_out(state_type& __st, + const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const; + virtual result do_in(state_type& __st, + const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt, + intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const; + virtual result do_unshift(state_type& __st, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const; + virtual int do_encoding() const _NOEXCEPT; + virtual bool do_always_noconv() const _NOEXCEPT; + virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const; + virtual int do_max_length() const _NOEXCEPT; +}; + +#endif + +// template <> class codecvt // deprecated in C++20 
+ +template <> +class _LIBCPP_TYPE_VIS _LIBCPP_DEPRECATED_IN_CXX20 codecvt : public locale::facet, public codecvt_base { @@ -1241,6 +1335,96 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; +#ifndef _LIBCPP_NO_HAS_CHAR8_T + +// template <> class codecvt // C++20 + +template <> +class _LIBCPP_TYPE_VIS codecvt + : public locale::facet, + public codecvt_base +{ +public: + typedef char32_t intern_type; + typedef char8_t extern_type; + typedef mbstate_t state_type; + + _LIBCPP_INLINE_VISIBILITY + explicit codecvt(size_t __refs = 0) + : locale::facet(__refs) {} + + _LIBCPP_INLINE_VISIBILITY + result out(state_type& __st, + const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const + { + return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + result unshift(state_type& __st, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const + { + return do_unshift(__st, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + result in(state_type& __st, + const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt, + intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const + { + return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt); + } + + _LIBCPP_INLINE_VISIBILITY + int encoding() const _NOEXCEPT + { + return do_encoding(); + } + + _LIBCPP_INLINE_VISIBILITY + bool always_noconv() const _NOEXCEPT + { + return do_always_noconv(); + } + + _LIBCPP_INLINE_VISIBILITY + int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const + { + return do_length(__st, __frm, __end, __mx); + } + + _LIBCPP_INLINE_VISIBILITY + int max_length() const _NOEXCEPT + { + return do_max_length(); + } + + static locale::id id; + +protected: + _LIBCPP_INLINE_VISIBILITY + explicit codecvt(const char*, size_t __refs = 0) + : locale::facet(__refs) {} + + ~codecvt(); + + virtual result do_out(state_type& __st, + const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const; + virtual result do_in(state_type& __st, + const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt, + intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const; + virtual result do_unshift(state_type& __st, + extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const; + virtual int do_encoding() const _NOEXCEPT; + virtual bool do_always_noconv() const _NOEXCEPT; + virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const; + virtual int do_max_length() const _NOEXCEPT; +}; + +#endif + // template class codecvt_byname template @@ -1258,15 +1442,21 @@ protected: ~codecvt_byname(); }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template codecvt_byname<_InternT, _ExternT, _StateT>::~codecvt_byname() { } +_LIBCPP_SUPPRESS_DEPRECATED_POP -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) -_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) +_LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) +_LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS 
codecvt_byname) +_LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS _LIBCPP_DEPRECATED_IN_CXX20 codecvt_byname) // deprecated in C++20 +_LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS _LIBCPP_DEPRECATED_IN_CXX20 codecvt_byname) // deprecated in C++20 +#ifndef _LIBCPP_NO_HAS_CHAR8_T +_LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) // C++20 +_LIBCPP_EXTERN_TEMPLATE(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname) // C++20 +#endif template struct __narrow_to_utf8 @@ -1290,12 +1480,14 @@ struct __narrow_to_utf8<8> } }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> struct _LIBCPP_TEMPLATE_VIS __narrow_to_utf8<16> : public codecvt { _LIBCPP_INLINE_VISIBILITY __narrow_to_utf8() : codecvt(1) {} +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_EXPORTED_FROM_ABI ~__narrow_to_utf8(); @@ -1324,12 +1516,14 @@ struct _LIBCPP_TEMPLATE_VIS __narrow_to_utf8<16> } }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> struct _LIBCPP_TEMPLATE_VIS __narrow_to_utf8<32> : public codecvt { _LIBCPP_INLINE_VISIBILITY __narrow_to_utf8() : codecvt(1) {} +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_EXPORTED_FROM_ABI ~__narrow_to_utf8(); @@ -1380,12 +1574,14 @@ struct __widen_from_utf8<8> } }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<16> : public codecvt { _LIBCPP_INLINE_VISIBILITY __widen_from_utf8() : codecvt(1) {} +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_EXPORTED_FROM_ABI ~__widen_from_utf8(); @@ -1407,19 +1603,21 @@ struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<16> if (__r == codecvt_base::error || __nn == __nb) __throw_runtime_error("locale not supported"); for (const char16_t* __p = __buf; __p < __bn; ++__p, ++__s) - *__s = (wchar_t)*__p; + *__s = *__p; __nb = __nn; } return __s; } }; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH template <> struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<32> : public codecvt { _LIBCPP_INLINE_VISIBILITY __widen_from_utf8() : codecvt(1) {} +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_EXPORTED_FROM_ABI ~__widen_from_utf8(); @@ -1441,7 +1639,7 @@ struct _LIBCPP_TEMPLATE_VIS __widen_from_utf8<32> if (__r == codecvt_base::error || __nn == __nb) __throw_runtime_error("locale not supported"); for (const char32_t* __p = __buf; __p < __bn; ++__p, ++__s) - *__s = (wchar_t)*__p; + *__s = *__p; __nb = __nn; } return __s; diff --git a/lib/libcxx/include/__memory/allocator_traits.h b/lib/libcxx/include/__memory/allocator_traits.h new file mode 100644 index 0000000000..cdbdb9ef8e --- /dev/null +++ b/lib/libcxx/include/__memory/allocator_traits.h @@ -0,0 +1,589 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___MEMORY_ALLOCATOR_TRAITS_H +#define _LIBCPP___MEMORY_ALLOCATOR_TRAITS_H + +#include <__config> +#include <__memory/base.h> +#include <__memory/pointer_traits.h> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +#pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +_LIBCPP_BEGIN_NAMESPACE_STD + +template +struct __has_pointer_type : false_type {}; + +template +struct __has_pointer_type<_Tp, + typename __void_t::type> : true_type {}; + +namespace __pointer_type_imp +{ + +template ::value> +struct __pointer_type +{ + typedef _LIBCPP_NODEBUG_TYPE typename _Dp::pointer type; +}; + +template +struct __pointer_type<_Tp, _Dp, false> +{ + typedef _LIBCPP_NODEBUG_TYPE _Tp* type; +}; + +} // __pointer_type_imp + +template +struct __pointer_type +{ + typedef _LIBCPP_NODEBUG_TYPE typename __pointer_type_imp::__pointer_type<_Tp, typename remove_reference<_Dp>::type>::type type; +}; + +template +struct __has_const_pointer : false_type {}; + +template +struct __has_const_pointer<_Tp, + typename __void_t::type> : true_type {}; + +template ::value> +struct __const_pointer +{ + typedef _LIBCPP_NODEBUG_TYPE typename _Alloc::const_pointer type; +}; + +template +struct __const_pointer<_Tp, _Ptr, _Alloc, false> +{ +#ifndef _LIBCPP_CXX03_LANG + typedef _LIBCPP_NODEBUG_TYPE typename pointer_traits<_Ptr>::template rebind type; +#else + typedef typename pointer_traits<_Ptr>::template rebind::other type; +#endif +}; + +template +struct __has_void_pointer : false_type {}; + +template +struct __has_void_pointer<_Tp, + typename __void_t::type> : true_type {}; + +template ::value> +struct __void_pointer +{ + typedef _LIBCPP_NODEBUG_TYPE typename _Alloc::void_pointer type; +}; + +template +struct __void_pointer<_Ptr, _Alloc, false> +{ +#ifndef _LIBCPP_CXX03_LANG + typedef _LIBCPP_NODEBUG_TYPE typename pointer_traits<_Ptr>::template rebind type; +#else + typedef _LIBCPP_NODEBUG_TYPE typename pointer_traits<_Ptr>::template rebind::other type; +#endif +}; + +template +struct __has_const_void_pointer : false_type {}; + +template +struct __has_const_void_pointer<_Tp, + typename __void_t::type> : true_type {}; + +template ::value> +struct __const_void_pointer +{ + typedef _LIBCPP_NODEBUG_TYPE typename _Alloc::const_void_pointer type; +}; + +template +struct __const_void_pointer<_Ptr, _Alloc, false> +{ +#ifndef _LIBCPP_CXX03_LANG + typedef _LIBCPP_NODEBUG_TYPE typename pointer_traits<_Ptr>::template rebind type; +#else + typedef _LIBCPP_NODEBUG_TYPE typename pointer_traits<_Ptr>::template rebind::other type; +#endif +}; + +template +struct __has_size_type : false_type {}; + +template +struct __has_size_type<_Tp, + typename __void_t::type> : true_type {}; + +template ::value> +struct __size_type +{ + typedef _LIBCPP_NODEBUG_TYPE typename make_unsigned<_DiffType>::type type; +}; + +template +struct __size_type<_Alloc, _DiffType, true> +{ + typedef _LIBCPP_NODEBUG_TYPE typename _Alloc::size_type type; +}; + +template +struct __has_propagate_on_container_copy_assignment : false_type {}; + +template +struct __has_propagate_on_container_copy_assignment<_Tp, + typename __void_t::type> + : true_type {}; + +template ::value> +struct __propagate_on_container_copy_assignment +{ + typedef _LIBCPP_NODEBUG_TYPE false_type type; +}; + +template +struct __propagate_on_container_copy_assignment<_Alloc, true> +{ + typedef 
_LIBCPP_NODEBUG_TYPE typename _Alloc::propagate_on_container_copy_assignment type; +}; + +template +struct __has_propagate_on_container_move_assignment : false_type {}; + +template +struct __has_propagate_on_container_move_assignment<_Tp, + typename __void_t::type> + : true_type {}; + +template ::value> +struct __propagate_on_container_move_assignment +{ + typedef false_type type; +}; + +template +struct __propagate_on_container_move_assignment<_Alloc, true> +{ + typedef _LIBCPP_NODEBUG_TYPE typename _Alloc::propagate_on_container_move_assignment type; +}; + +template +struct __has_propagate_on_container_swap : false_type {}; + +template +struct __has_propagate_on_container_swap<_Tp, + typename __void_t::type> + : true_type {}; + +template ::value> +struct __propagate_on_container_swap +{ + typedef false_type type; +}; + +template +struct __propagate_on_container_swap<_Alloc, true> +{ + typedef _LIBCPP_NODEBUG_TYPE typename _Alloc::propagate_on_container_swap type; +}; + +template +struct __has_is_always_equal : false_type {}; + +template +struct __has_is_always_equal<_Tp, + typename __void_t::type> + : true_type {}; + +template ::value> +struct __is_always_equal +{ + typedef _LIBCPP_NODEBUG_TYPE typename _VSTD::is_empty<_Alloc>::type type; +}; + +template +struct __is_always_equal<_Alloc, true> +{ + typedef _LIBCPP_NODEBUG_TYPE typename _Alloc::is_always_equal type; +}; + +template ::value> +struct __has_rebind_other +{ +private: + struct __two {char __lx; char __lxx;}; + template static __two __test(...); + _LIBCPP_SUPPRESS_DEPRECATED_PUSH + template static char __test(typename _Xp::template rebind<_Up>::other* = 0); + _LIBCPP_SUPPRESS_DEPRECATED_POP +public: + static const bool value = sizeof(__test<_Tp>(0)) == 1; +}; + +template +struct __has_rebind_other<_Tp, _Up, false> +{ + static const bool value = false; +}; + +template ::value> +struct __allocator_traits_rebind +{ + _LIBCPP_SUPPRESS_DEPRECATED_PUSH + typedef _LIBCPP_NODEBUG_TYPE typename _Tp::template rebind<_Up>::other type; + _LIBCPP_SUPPRESS_DEPRECATED_POP +}; + +template