From ce754724b31e141688e1203ba7bb0a2538f2b544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Wed, 5 Feb 2025 10:23:43 +0100 Subject: [PATCH] zig cc: Update intrinsic headers to Clang 20. --- lib/include/adcintrin.h | 5 + lib/include/adxintrin.h | 5 + lib/include/altivec.h | 16 +- lib/include/amxavx512intrin.h | 382 ++ lib/include/amxbf16transposeintrin.h | 94 + lib/include/amxcomplextransposeintrin.h | 303 + lib/include/amxfp16intrin.h | 35 + lib/include/amxfp16transposeintrin.h | 94 + lib/include/amxfp8intrin.h | 230 + lib/include/amxintrin.h | 40 +- lib/include/amxmovrsintrin.h | 48 + lib/include/amxmovrstransposeintrin.h | 200 + lib/include/amxtf32intrin.h | 108 + lib/include/amxtf32transposeintrin.h | 105 + lib/include/amxtransposeintrin.h | 248 + lib/include/arm_acle.h | 39 +- lib/include/arm_neon.h | 5415 +++++++++++------ lib/include/arm_sme.h | 416 +- lib/include/arm_sve.h | 1189 +++- lib/include/arm_vector_types.h | 87 + lib/include/avx10_2_512bf16intrin.h | 561 ++ lib/include/avx10_2_512convertintrin.h | 320 + lib/include/avx10_2_512minmaxintrin.h | 127 + lib/include/avx10_2_512niintrin.h | 314 + lib/include/avx10_2_512satcvtdsintrin.h | 303 + lib/include/avx10_2_512satcvtintrin.h | 301 + lib/include/avx10_2bf16intrin.h | 1085 ++++ lib/include/avx10_2convertintrin.h | 590 ++ lib/include/avx10_2copyintrin.h | 66 + lib/include/avx10_2minmaxintrin.h | 277 + lib/include/avx10_2niintrin.h | 2075 +++++++ lib/include/avx10_2satcvtdsintrin.h | 496 ++ lib/include/avx10_2satcvtintrin.h | 444 ++ lib/include/avx2intrin.h | 9 + lib/include/avx512bitalgintrin.h | 4 +- lib/include/avx512fintrin.h | 36 +- lib/include/avx512vlbitalgintrin.h | 8 +- lib/include/avx512vpopcntdqintrin.h | 16 +- lib/include/avx512vpopcntdqvlintrin.h | 24 +- lib/include/avxintrin.h | 46 +- lib/include/avxvnniint16intrin.h | 113 +- lib/include/avxvnniint8intrin.h | 113 +- lib/include/bmi2intrin.h | 32 +- lib/include/bmiintrin.h | 68 +- lib/include/cmpccxaddintrin.h | 2 +- lib/include/cpuid.h | 23 +- lib/include/emmintrin.h | 248 +- lib/include/gfniintrin.h | 42 +- lib/include/hexagon_types.h | 12 +- lib/include/hvx_hexagon_protos.h | 427 ++ lib/include/immintrin.h | 90 +- lib/include/intrin.h | 30 +- lib/include/intrin0.h | 11 +- lib/include/larchintrin.h | 30 +- lib/include/lasxintrin.h | 52 +- lib/include/limits.h | 11 +- lib/include/llvm_libc_wrappers/ctype.h | 38 + lib/include/llvm_libc_wrappers/stdlib.h | 8 + lib/include/lsxintrin.h | 52 +- lib/include/lzcntintrin.h | 17 +- lib/include/mmintrin.h | 364 +- lib/include/module.modulemap | 6 +- lib/include/movrs_avx10_2_512intrin.h | 98 + lib/include/movrs_avx10_2intrin.h | 174 + lib/include/movrsintrin.h | 59 + .../__clang_openmp_device_functions.h | 9 +- lib/include/openmp_wrappers/complex_cmath.h | 9 +- lib/include/pmmintrin.h | 19 +- lib/include/popcntintrin.h | 14 +- lib/include/ptrauth.h | 6 + lib/include/riscv_corev_alu.h | 128 + lib/include/riscv_vector.h | 1 - lib/include/sm4evexintrin.h | 32 + lib/include/smmintrin.h | 6 + lib/include/stdalign.h | 5 - lib/include/tbmintrin.h | 62 +- lib/include/tmmintrin.h | 102 +- lib/include/vecintrin.h | 1796 +++++- lib/include/wasm_simd128.h | 188 +- lib/include/xmmintrin.h | 347 +- 80 files changed, 18035 insertions(+), 2970 deletions(-) create mode 100644 lib/include/amxavx512intrin.h create mode 100644 lib/include/amxbf16transposeintrin.h create mode 100644 lib/include/amxcomplextransposeintrin.h create mode 100644 lib/include/amxfp16transposeintrin.h create mode 100644 
lib/include/amxfp8intrin.h create mode 100644 lib/include/amxmovrsintrin.h create mode 100644 lib/include/amxmovrstransposeintrin.h create mode 100644 lib/include/amxtf32intrin.h create mode 100644 lib/include/amxtf32transposeintrin.h create mode 100644 lib/include/amxtransposeintrin.h create mode 100644 lib/include/avx10_2_512bf16intrin.h create mode 100644 lib/include/avx10_2_512convertintrin.h create mode 100644 lib/include/avx10_2_512minmaxintrin.h create mode 100644 lib/include/avx10_2_512niintrin.h create mode 100644 lib/include/avx10_2_512satcvtdsintrin.h create mode 100644 lib/include/avx10_2_512satcvtintrin.h create mode 100644 lib/include/avx10_2bf16intrin.h create mode 100644 lib/include/avx10_2convertintrin.h create mode 100644 lib/include/avx10_2copyintrin.h create mode 100644 lib/include/avx10_2minmaxintrin.h create mode 100644 lib/include/avx10_2niintrin.h create mode 100644 lib/include/avx10_2satcvtdsintrin.h create mode 100644 lib/include/avx10_2satcvtintrin.h create mode 100644 lib/include/movrs_avx10_2_512intrin.h create mode 100644 lib/include/movrs_avx10_2intrin.h create mode 100644 lib/include/movrsintrin.h create mode 100644 lib/include/riscv_corev_alu.h create mode 100644 lib/include/sm4evexintrin.h diff --git a/lib/include/adcintrin.h b/lib/include/adcintrin.h index 0065a1b543..5c68fce937 100644 --- a/lib/include/adcintrin.h +++ b/lib/include/adcintrin.h @@ -15,7 +15,12 @@ #endif /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__)) constexpr +#else #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#endif /* Use C++ inline semantics in C++, GNU inline for C mode. */ #if defined(__cplusplus) diff --git a/lib/include/adxintrin.h b/lib/include/adxintrin.h index bc6a4caf35..055e91f8e2 100644 --- a/lib/include/adxintrin.h +++ b/lib/include/adxintrin.h @@ -15,8 +15,13 @@ #define __ADXINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("adx"))) constexpr +#else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("adx"))) +#endif /* Use C++ inline semantics in C++, GNU inline for C mode. 
*/ #if defined(__cplusplus) diff --git a/lib/include/altivec.h b/lib/include/altivec.h index 4971631c50..8da6505501 100644 --- a/lib/include/altivec.h +++ b/lib/include/altivec.h @@ -2502,37 +2502,37 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) { static __inline__ vector unsigned char __ATTRS_o_ai vec_popcnt(vector signed char __a) { - return (vector unsigned char)__builtin_altivec_vpopcntb( + return (vector unsigned char)__builtin_elementwise_popcount( (vector unsigned char)__a); } static __inline__ vector unsigned char __ATTRS_o_ai vec_popcnt(vector unsigned char __a) { - return __builtin_altivec_vpopcntb(__a); + return __builtin_elementwise_popcount(__a); } static __inline__ vector unsigned short __ATTRS_o_ai vec_popcnt(vector signed short __a) { - return (vector unsigned short)__builtin_altivec_vpopcnth( + return (vector unsigned short)__builtin_elementwise_popcount( (vector unsigned short)__a); } static __inline__ vector unsigned short __ATTRS_o_ai vec_popcnt(vector unsigned short __a) { - return __builtin_altivec_vpopcnth(__a); + return __builtin_elementwise_popcount(__a); } static __inline__ vector unsigned int __ATTRS_o_ai vec_popcnt(vector signed int __a) { - return __builtin_altivec_vpopcntw((vector unsigned int)__a); + return __builtin_elementwise_popcount((vector unsigned int)__a); } static __inline__ vector unsigned int __ATTRS_o_ai vec_popcnt(vector unsigned int __a) { - return __builtin_altivec_vpopcntw(__a); + return __builtin_elementwise_popcount(__a); } static __inline__ vector unsigned long long __ATTRS_o_ai vec_popcnt(vector signed long long __a) { - return __builtin_altivec_vpopcntd((vector unsigned long long)__a); + return __builtin_elementwise_popcount((vector unsigned long long)__a); } static __inline__ vector unsigned long long __ATTRS_o_ai vec_popcnt(vector unsigned long long __a) { - return __builtin_altivec_vpopcntd(__a); + return __builtin_elementwise_popcount(__a); } #define vec_vclz vec_cntlz diff --git a/lib/include/amxavx512intrin.h b/lib/include/amxavx512intrin.h new file mode 100644 index 0000000000..a158983482 --- /dev/null +++ b/lib/include/amxavx512intrin.h @@ -0,0 +1,382 @@ +/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_AVX512INTRIN_H +#define __AMX_AVX512INTRIN_H +#if defined(__x86_64__) && defined(__SSE2__) + +#define __DEFAULT_FN_ATTRS_AVX512 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-avx512,avx10.2-512"))) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the int32 source elements to fp32. The row of the tile is selected by a +/// 32b GPR. 
+/// +/// \headerfile +/// +/// \code +/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWD2PS instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The row of the source tile +#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to bf16. It places the resulting bf16 elements +/// in the high 16 bits within each dword. The row of the tile is selected +/// by a 32b GPR. +/// +/// \headerfile +/// +/// \code +/// __m512i _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+0] := 0 +/// dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The the row of the source tile. +#define _tile_cvtrowps2bf16h(tsrc, row) \ + __builtin_ia32_tcvtrowps2bf16h(tsrc, row) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to bf16. It places the resulting bf16 elements +/// in the low 16 bits within each dword. The row of the tile is selected +/// by a 32b GPR. +/// +/// \headerfile +/// +/// \code +/// __m512i _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+1] := 0 +/// dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The the row of the source tile. +#define _tile_cvtrowps2bf16l(tsrc, row) \ + __builtin_ia32_tcvtrowps2bf16l(tsrc, row) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to fp16. It places the resulting fp16 elements +/// in the high 16 bits within each dword. The row of the tile is selected +/// by a 32b GPR. 
+/// +/// \headerfile +/// +/// \code +/// __m512i _tile_cvtrowps2phh(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+0] := 0 +/// dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The the row of the source tile. +#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row) + +/// Moves a row from a tile register to a zmm destination register, converting +/// the fp32 source elements to fp16. It places the resulting fp16 elements +/// in the low 16 bits within each dword. The row of the tile is selected +/// by a 32b GPR. +/// +/// \headerfile +/// +/// \code +/// __m512i _tile_cvtrowps2phl(__tile tsrc, unsigned int row); +/// \endcode +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL >> 3 +/// row_index := row & 0xffff +/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes / 4) - 1 +/// IF i + row_chunk / 4 >= tsrc.colsb / 4 +/// dst.dword[i] := 0 +/// ELSE +/// dst.word[2*i+1] := 0 +/// dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE) +/// FI +/// ENDFOR +/// dst[MAX_VL-1:VL] := 0 +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction. +/// +/// \param tsrc +/// The source tile. Max size is 1024 Bytes. +/// \param row +/// The the row of the source tile. +#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row) + +/// Move one row of a tile data to a v16f32 data. +/// The row of the tile is selected by a 32b GPR. +/// +/// \headerfile +/// +/// \code +/// __m512 _tile_movrow(__tile a, unsigned b); +/// \endcode +/// +/// This intrinsic corresponds to the TILEMOVROW instruction. +/// +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v16f32 data. Size is 64 Bytes. +/// +/// \code{.operation} +/// VL := 512 +/// VL_bytes := VL>>3 +/// row_index := b&0xffff +/// row_chunk := ((b>>16)&0xffff) * VL_bytes +/// FOR i := 0 TO (VL_bytes-1) +/// IF (row_chunk + i >= a.colsb) +/// dst.byte[i] := 0 +/// ELSE +/// dst.byte[i] := a.row[row_index].byte[row_chunk+i] +/// ENDFOR +/// \endcode +#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b) + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. 
+ +static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal( + unsigned short m, unsigned short n, _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 +_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n, + _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512 +_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n, + _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal( + unsigned short m, unsigned short n, _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal( + unsigned short m, unsigned short n, _tile1024i src, unsigned u) { + return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal( + unsigned short m, unsigned short n, _tile1024i src, unsigned u) { + return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u); +} + +/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source +/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if +/// MXCSR.RC=RNE. Embedded rounding is not supported. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCVTROWD2PS instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v16f32 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) { + return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source +/// elements to bf16 at high 16-bits of each dword. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCVTROWPS2BF16H instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v32bf16 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source +/// elements to bf16 at low 16-bits of each dword. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCVTROWPS2BF16L instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v32bf16 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source +/// elements to fp16 at high 16-bits of each dword. 
+/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCVTROWPS2PHH instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v32fp16 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source +/// elements to fp16 at low 16-bits of each dword. +/// The row and chunk elements of tile is fetched from 32bit src1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCVTROWPS2PHL instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v32fp16 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) { + return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1); +} + +/// Move one row of a tile data to a v16f32 data. +/// The row of the tile is selected by a 32b GPR. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILEMOVROW instruction. +/// +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source r32. Size is 4 Bytes. +/// \returns +/// The destination v16i32 data. Size is 64 Bytes. +__DEFAULT_FN_ATTRS_AVX512 +static __m512i __tile_movrow(__tile1024i src0, unsigned src1) { + return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1); +} + +#endif // __x86_64__ && __SSE2__ +#endif // __AMX_AVX512INTRIN_H diff --git a/lib/include/amxbf16transposeintrin.h b/lib/include/amxbf16transposeintrin.h new file mode 100644 index 0000000000..86f09f2ad8 --- /dev/null +++ b/lib/include/amxbf16transposeintrin.h @@ -0,0 +1,94 @@ +/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; use instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_BF16TRANSPOSEINTRIN_H +#define __AMX_BF16TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-bf16,amx-transpose"))) + +/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in +/// tiles \a a and \a b, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in \a dst, and store the +/// 32-bit result back to tile \a dst. 
+/// +/// \headerfile +/// +/// \code +/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO (a.colsb / 4) - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) * +/// FP32(b.row[k].bf16[2*n+0]) +/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) * +/// FP32(b.row[k].bf16[2*n+1]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TTDPBF16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b)) + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS +_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2); +} + +/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in +/// tiles src0 and src1, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in "dst", and store the +/// 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TTDPBF16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __x86_64__ */ +#endif /* __AMX_BF16TRANSPOSEINTRIN_H */ diff --git a/lib/include/amxcomplextransposeintrin.h b/lib/include/amxcomplextransposeintrin.h new file mode 100644 index 0000000000..11abaf98e9 --- /dev/null +++ b/lib/include/amxcomplextransposeintrin.h @@ -0,0 +1,303 @@ +/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H +#define __AMX_COMPLEXTRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-complex,amx-transpose"))) + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles \a a and \a b is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// Calculates the imaginary part of the result. 
For each possible combination +/// of (transposed column of \a a, column of \a b), it performs a set of +/// multiplication and accumulations on all corresponding complex numbers +/// (one from \a a and one from \a b). The imaginary part of the \a a element +/// is multiplied with the real part of the corresponding \a b element, and +/// the real part of the \a a element is multiplied with the imaginary part +/// of the corresponding \a b elements. The two accumulated results are +/// added, and then accumulated into the corresponding row and column of +/// \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO a.rows - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_tcmmimfp16ps(dst, a, b) \ + __builtin_ia32_ttcmmimfp16ps((dst), (a), (b)) + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles \a a and \a b is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// Calculates the real part of the result. For each possible combination +/// of (rtransposed colum of \a a, column of \a b), it performs a set of +/// multiplication and accumulations on all corresponding complex numbers +/// (one from \a a and one from \a b). The real part of the \a a element is +/// multiplied with the real part of the corresponding \a b element, and the +/// negated imaginary part of the \a a element is multiplied with the +/// imaginary part of the corresponding \a b elements. The two accumulated +/// results are added, and then accumulated into the corresponding row and +/// column of \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO a.rows - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) +/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_tcmmrlfp16ps(dst, a, b) \ + __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b)) + +/// Perform matrix conjugate transpose and multiplication of two tiles +/// containing complex elements and accumulate the results into a packed +/// single precision tile. 
Each dword element in input tiles \a a and \a b +/// is interpreted as a complex number with FP16 real part and FP16 imaginary +/// part. +/// Calculates the imaginary part of the result. For each possible combination +/// of (transposed column of \a a, column of \a b), it performs a set of +/// multiplication and accumulations on all corresponding complex numbers +/// (one from \a a and one from \a b). The negated imaginary part of the \a a +/// element is multiplied with the real part of the corresponding \a b +/// element, and the real part of the \a a element is multiplied with the +/// imaginary part of the corresponding \a b elements. The two accumulated +/// results are added, and then accumulated into the corresponding row and +/// column of \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO a.rows - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) +/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_conjtcmmimfp16ps(dst, a, b) \ + __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b)) + +/// Perform conjugate transpose of an FP16-pair of complex elements from \a a +/// and writes the result to \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_conjtfp16(__tile dst, __tile a); +/// \endcode +/// +/// \code{.operation} +/// FOR i := 0 TO dst.rows - 1 +/// FOR j := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0] +/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1] +/// ENDFOR +/// write_row_and_zero(dst, i, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCONJTFP16 instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The source tile. Max size is 1024 Bytes. 
+#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a)) + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal( + unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, + _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal( + unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, + _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal( + unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, + _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS +_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) { + return __builtin_ia32_tconjtfp16_internal(m, n, src); +} + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles src0 and src1 is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// This function calculates the imaginary part of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TTCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles src0 and src1 is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// This function calculates the real part of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TTCMMRLFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +/// Perform matrix conjugate transpose and multiplication of two tiles +/// containing complex elements and accumulate the results into a packed +/// single precision tile. Each dword element in input tiles src0 and src1 +/// is interpreted as a complex number with FP16 real part and FP16 imaginary +/// part. +/// This function calculates the imaginary part of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCONJTCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. 
+__DEFAULT_FN_ATTRS +static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +/// Perform conjugate transpose of an FP16-pair of complex elements from src and +/// writes the result to dst. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCONJTFP16 instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) { + dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile); +} + +#undef __DEFAULT_FN_ATTRS + +#endif // __x86_64__ +#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H diff --git a/lib/include/amxfp16intrin.h b/lib/include/amxfp16intrin.h index ed798245d4..bb4bc31fda 100644 --- a/lib/include/amxfp16intrin.h +++ b/lib/include/amxfp16intrin.h @@ -15,6 +15,10 @@ #define __AMX_FP16INTRIN_H #ifdef __x86_64__ +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16"))) + /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a /// and \a b, accumulating the intermediate single-precision (32-bit) /// floating-point elements with elements in \a dst, and store the 32-bit @@ -54,5 +58,36 @@ #define _tile_dpfp16ps(dst, a, b) \ __builtin_ia32_tdpfp16ps(dst, a, b) +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS +_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2); +} + +/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and +/// src1, accumulating the intermediate single-precision (32-bit) floating-point +/// elements with elements in "dst", and store the 32-bit result back to tile +/// "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +#undef __DEFAULT_FN_ATTRS + #endif /* __x86_64__ */ #endif /* __AMX_FP16INTRIN_H */ diff --git a/lib/include/amxfp16transposeintrin.h b/lib/include/amxfp16transposeintrin.h new file mode 100644 index 0000000000..191f8c6097 --- /dev/null +++ b/lib/include/amxfp16transposeintrin.h @@ -0,0 +1,94 @@ +/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; use instead." 
+#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_FP16TRANSPOSEINTRIN_H +#define __AMX_FP16TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-fp16,amx-transpose"))) + +/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in +/// tiles \a a and \a b, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in \a dst, and store the +/// 32-bit result back to tile \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO (a.colsb / 4) - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * +/// FP32(b.row[k].fp16[2*n+0]) +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * +/// FP32(b.row[k].fp16[2*n+1]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TTDPFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b)) + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS +_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2); +} + +/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in +/// tiles src0 and src1, accumulating the intermediate single-precision +/// (32-bit) floating-point elements with elements in "dst", and store the +/// 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TTDPFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS +static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __x86_64__ */ +#endif /* __AMX_FP16TRANSPOSEINTRIN_H */ diff --git a/lib/include/amxfp8intrin.h b/lib/include/amxfp8intrin.h new file mode 100644 index 0000000000..92e7989974 --- /dev/null +++ b/lib/include/amxfp8intrin.h @@ -0,0 +1,230 @@ +/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif /* __IMMINTRIN_H */ + +#ifndef __AMXFP8INTRIN_H +#define __AMXFP8INTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_FP8 \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8"))) + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8 +_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2); +} + +/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2 +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// +/// \code +/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2) +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// temp1[(dst.colsb / 4 - 1) : 0] = 0 +/// FOR k := 0 TO src1.colsb / 4 - 1 +/// FOR n := 0 TO dst.colsb / 4 - 1 +/// temp1[n] += +/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0]) +/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1]) +/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2]) +/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3]) +/// ENDFOR +/// ENDFOR +/// FOR n := 0 TO dst.colsb / 4 - 1 +/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n]) +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPBF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src1 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src2 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_FP8 static void +__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { + dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8 +_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2); +} + +/// Perform the dot product of a BF8 value \a src1 by an HF8 value \a src2 +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// +/// \code +/// void __tile_dpbhf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2) +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// temp1[(dst.colsb / 4 - 1) : 0] = 0 +/// FOR k := 0 TO src1.colsb / 4 - 1 +/// FOR n := 0 TO dst.colsb / 4 - 1 +/// temp1[n] += +/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0]) +/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1]) +/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2]) +/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3]) +/// ENDFOR +/// ENDFOR +/// FOR n := 0 TO dst.colsb / 4 - 1 +/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n]) +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPBHF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src1 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src2 +/// The 2nd source tile. 
Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_FP8 static void +__tile_dpbhf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { + dst->tile = _tile_dpbhf8ps_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8 +_tile_dphbf8ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdphbf8ps_internal(m, n, k, dst, src1, src2); +} + +/// Perform the dot product of an HF8 value \a src1 by a BF8 value \a src2 +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// +/// \code +/// void __tile_dphbf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2) +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// temp1[(dst.colsb / 4 - 1) : 0] = 0 +/// FOR k := 0 TO src1.colsb / 4 - 1 +/// FOR n := 0 TO dst.colsb / 4 - 1 +/// temp1[n] += +/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0]) +/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1]) +/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2]) +/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3]) +/// ENDFOR +/// ENDFOR +/// FOR n := 0 TO dst.colsb / 4 - 1 +/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n]) +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPHBF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src1 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src2 +/// The 2nd source tile. Max size is 1024 Bytes. + +__DEFAULT_FN_ATTRS_FP8 static void +__tile_dphbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { + dst->tile = _tile_dphbf8ps_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8 +_tile_dphf8ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdphf8ps_internal(m, n, k, dst, src1, src2); +} + +/// Perform the dot product of an HF8 value \a src1 by an HF8 value \a src2 +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// +/// \code +/// void __tile_dphf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2) +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// temp1[(dst.colsb / 4 - 1) : 0] = 0 +/// FOR k := 0 TO src1.colsb / 4 - 1 +/// FOR n := 0 TO dst.colsb / 4 - 1 +/// temp1[n] += +/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0]) +/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1]) +/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2]) +/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3]) +/// ENDFOR +/// ENDFOR +/// FOR n := 0 TO dst.colsb / 4 - 1 +/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n]) +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPHF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src1 +/// The 1st source tile. Max size is 1024 Bytes. 
+/// \param src2 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_FP8 static void +__tile_dphf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { + dst->tile = _tile_dphf8ps_internal(src1.row, src2.col, src1.col, dst->tile, + src1.tile, src2.tile); +} + +#define _tile_dpbf8ps(dst, src1, src2) \ + __builtin_ia32_tdpbf8ps((dst), (src1), (src2)) +#define _tile_dpbhf8ps(dst, src1, src2) \ + __builtin_ia32_tdpbhf8ps((dst), (src1), (src2)) +#define _tile_dphbf8ps(dst, src1, src2) \ + __builtin_ia32_tdphbf8ps((dst), (src1), (src2)) +#define _tile_dphf8ps(dst, src1, src2) \ + __builtin_ia32_tdphf8ps((dst), (src1), (src2)) + +#undef __DEFAULT_FN_ATTRS_FP8 + +#endif /* __x86_64__ */ +#endif /* __AMXFP8INTRIN_H */ diff --git a/lib/include/amxintrin.h b/lib/include/amxintrin.h index baa56f5b28..a7da10d995 100644 --- a/lib/include/amxintrin.h +++ b/lib/include/amxintrin.h @@ -22,8 +22,6 @@ __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) #define __DEFAULT_FN_ATTRS_BF16 \ __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16"))) -#define __DEFAULT_FN_ATTRS_FP16 \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16"))) /// Load tile configuration from a 64-byte memory location specified by /// "mem_addr". The tile configuration includes the tile type palette, the @@ -232,9 +230,11 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { /// bytes. Since there is no 2D type in llvm IR, we use vector type to /// represent 2D tile and the fixed size is maximum amx tile register size. typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); +typedef int _tile1024i_1024a + __attribute__((__vector_size__(1024), __aligned__(1024))); /// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, __SIZE_TYPE__ stride) { return __builtin_ia32_tileloadd64_internal(m, n, base, @@ -242,7 +242,7 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, } /// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, __SIZE_TYPE__ stride) { return __builtin_ia32_tileloaddt164_internal(m, n, base, @@ -278,7 +278,7 @@ _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, } /// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ void __DEFAULT_FN_ATTRS_INT8 +static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_stored_internal(unsigned short m, unsigned short n, void *base, __SIZE_TYPE__ stride, _tile1024i tile) { return __builtin_ia32_tilestored64_internal(m, n, base, @@ -292,13 +292,6 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); } -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16 -_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2); -} - /// This struct pack the shape and tile data together for user. 
We suggest /// initializing the struct as early as possible, because compiler depends /// on the shape information to do configure. The constant value is preferred @@ -493,32 +486,9 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, src0.tile, src1.tile); } -/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and -/// src1, accumulating the intermediate single-precision (32-bit) floating-point -/// elements with elements in "dst", and store the 32-bit result back to tile -/// "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPFP16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_FP16 -static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - #undef __DEFAULT_FN_ATTRS_TILE #undef __DEFAULT_FN_ATTRS_INT8 #undef __DEFAULT_FN_ATTRS_BF16 -#undef __DEFAULT_FN_ATTRS_FP16 #endif /* __x86_64__ */ #endif /* __AMXINTRIN_H */ diff --git a/lib/include/amxmovrsintrin.h b/lib/include/amxmovrsintrin.h new file mode 100644 index 0000000000..5fe2fdecb8 --- /dev/null +++ b/lib/include/amxmovrsintrin.h @@ -0,0 +1,48 @@ +/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-------------------------------------------------------------------=== */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif /* __IMMINTRIN_H */ + +#ifndef __AMXMOVRSINTRIN_H +#define __AMXMOVRSINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_MOVRS \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs"))) + +#define _tile_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +#define _tile_stream_loaddrs(dst, base, stride) \ + __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrs64_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS +_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddrst164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS +__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride); +} +static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs( + __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride); +} +#undef __DEFAULT_FN_ATTRS_MOVRS +#endif /* __x86_64__ */ +#endif /* __AMXMOVRSINTRIN_H */ diff --git a/lib/include/amxmovrstransposeintrin.h b/lib/include/amxmovrstransposeintrin.h new file mode 100644 index 0000000000..17a9f7506a --- /dev/null +++ b/lib/include/amxmovrstransposeintrin.h @@ -0,0 +1,200 @@ +/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; use instead." 
+#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H +#define __AMX_MOVRS_TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-transpose,amx-movrs"))) + +#define _tile_2rpntlvwz0rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride) +#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride) +#define _tile_2rpntlvwz1rs(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride) +#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride) + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rs_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1rst1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. 
+__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0T1RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory.
The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely become +/// read shared in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1T1RS instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS +static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +#undef __DEFAULT_FN_ATTRS +#endif /* __x86_64__ */ +#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */ \ No newline at end of file diff --git a/lib/include/amxtf32intrin.h b/lib/include/amxtf32intrin.h new file mode 100644 index 0000000000..44d002c660 --- /dev/null +++ b/lib/include/amxtf32intrin.h @@ -0,0 +1,108 @@ +/*===------------- amxtf32intrin.h - AMX_TF32 intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <amxtf32intrin.h> directly; include <immintrin.h> instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_TF32INTRIN_H +#define __AMX_TF32INTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_TF32 \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-tf32"))) + +/// Do matrix multiplication of \a a and \a b, and then do matrix addition +/// with \a srcdst. +/// All the calculation is based on float32, but with the lower 13 bits set to 0. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// void _tile_mmultf32ps(constexpr int srcdst, constexpr int a, \ +/// constexpr int b); +/// \endcode +/// +/// This intrinsic corresponds to the TMMULTF32PS instruction. +/// +/// \param srcdst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes.
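+///
+/// A minimal usage sketch, assuming tile registers 0-2 have already been
+/// configured and loaded through the amxintrin.h tile intrinsics:
+/// \code
+///   _tile_mmultf32ps(0, 1, 2); // tmm0 += tf32(tmm1) * tf32(tmm2)
+/// \endcode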
+/// +/// \code{.operation} +/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { +/// dword[12:0] := 0 +/// dword[31:13] := x[31:13] +/// return dword +/// } +/// +/// DEFINE silence_snan_fp32(x[31:0]) { +/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) +/// x.fraction[22] := 1 +/// return x +/// } +/// +/// elements_a := a.colsb / 4 +/// elements_dest := srcdst.colsb / 4 +/// +/// FOR m = 0 TO (srcdst.rows-1) +/// tmp[511:0] := 0 +/// FOR k = 0 TO (elements_a-1) +/// FOR n = 0 TO (elements_dest-1) +/// af := silence_snan_fp32(a.row[m].fp32[k]) +/// bf := silence_snan_fp32(b.row[k].fp32[n]) +/// tmp.fp32[n] += zero_lower_mantissa_bits_fp32(af) +/// * zero_lower_mantissa_bits_fp32(bf) +/// ENDFOR +/// ENDFOR +/// +/// FOR n = 0 TO (elements_dest-1) +/// tmp.fp32[n] += srcdst.row[m].fp32[n] +/// ENDFOR +/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) +/// +/// ENDFOR +/// +/// zero_upper_rows(srcdst, srcdst.rows) +/// zero_tileconfig_start() +/// \endcode +#define _tile_mmultf32ps(srcdst, a, b) \ + __builtin_ia32_tmmultf32ps((srcdst), (a), (b)) + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32 +_tile_mmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tmmultf32ps_internal(m, n, k, dst, src1, src2); +} + +/// Do matrix multiplication of src0 and src1, and then do matrix addition with dst. +/// All the calculation is based on float32, but with the lower 13 bits set to 0. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the TMMULTF32PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_TF32 +static void __tile_mmultf32ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_mmultf32ps_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +#endif // __x86_64__ +#endif // __AMX_TF32INTRIN_H diff --git a/lib/include/amxtf32transposeintrin.h b/lib/include/amxtf32transposeintrin.h new file mode 100644 index 0000000000..60336f953e --- /dev/null +++ b/lib/include/amxtf32transposeintrin.h @@ -0,0 +1,105 @@ +/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_TF32TRANSPOSEINTRIN_H +#define __AMX_TF32TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-tf32,amx-transpose"))) + +/// \code +/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \ +/// constexpr int b); +/// \endcode +/// +/// This intrinsic corresponds to the TTMMULTF32PS instruction. +/// +/// \param srcdst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes.
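+///
+/// A minimal usage sketch, assuming tile registers 0-2 have already been
+/// configured and loaded; tmm1 is consumed in transposed form:
+/// \code
+///   _tile_tmmultf32ps(0, 1, 2); // tmm0 += transpose(tf32(tmm1)) * tf32(tmm2)
+/// \endcode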
+/// +/// \code{.operation} +/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { +/// dword[12:0] := 0 +/// dword[31:13] := x[31:13] +/// return dword +/// } +/// +/// DEFINE silence_snan_fp32(x[31:0]) { +/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) +/// x.fraction[22] := 1 +/// return x +/// } +/// +/// elements_dest:= srcdst.colsb/4 +/// +/// FOR m := 0 TO (srcdst.rows-1) +/// tmp[511:0] := 0 +/// FOR k := 0 TO (a.rows-1) +/// FOR n := 0 TO (elements_dest-1) +/// a1e := silence_snan_fp32(a.row[k].fp32[m]) +/// a2e := silence_snan_fp32(b.row[k].fp32[n]) +/// s1e := zero_lower_mantissa_bits_fp32(a1e) +/// s2e := zero_lower_mantissa_bits_fp32(a2e) +/// tmp.fp32[n] += s1e * s2e +/// ENDFOR +/// ENDFOR +/// +/// FOR n := 0 TO (elements_dest-1) +/// tmp.fp32[n] += srcdst.row[m].fp32[n] +/// ENDFOR +/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) +/// +/// ENDFOR +/// +/// zero_upper_rows(srcdst, srcdst.rows) +/// zero_tileconfig_start() +/// \endcode +#define _tile_tmmultf32ps(srcdst, a, b) \ + __builtin_ia32_ttmmultf32ps((srcdst), (a), (b)) + +// dst = m x n (srcdst), src1 = k x m, src2 = k x n +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE +_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2); +} + +/// Compute the transpose of src0, do matrix multiplication with src1, and then +/// do matrix addition with dst. All the calculation is based on float32, but +/// with the lower 13 bits set to 0. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the TTMMULTF32PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_TF32_TRANSPOSE +static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +#endif // __x86_64__ +#endif // __AMX_TF32TRANSPOSEINTRIN_H diff --git a/lib/include/amxtransposeintrin.h b/lib/include/amxtransposeintrin.h new file mode 100644 index 0000000000..b3fa37d766 --- /dev/null +++ b/lib/include/amxtransposeintrin.h @@ -0,0 +1,248 @@ +/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * ===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */ + +#ifndef __AMX_TRANSPOSEINTRIN_H +#define __AMX_TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_TRANSPOSE \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose"))) + +#define _tile_2rpntlvwz0(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0(tdst, base, stride) +#define _tile_2rpntlvwz0t1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride) +#define _tile_2rpntlvwz1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1(tdst, base, stride) +#define _tile_2rpntlvwz1t1(tdst, base, stride) \ + __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride) + +/// Transpose 32-bit elements from \a src and write the result to \a dst. +/// +/// \headerfile <immintrin.h> +/// +/// \code +/// void _tile_transposed(__tile dst, __tile src); +/// \endcode +/// +/// This intrinsic corresponds to the TTRANSPOSED instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The source tile. Max size is 1024 Bytes. +/// +/// \code{.operation} +/// +/// FOR i := 0 TO (dst.rows-1) +/// tmp[511:0] := 0 +/// FOR j := 0 TO (dst.colsb/4-1) +/// tmp.dword[j] := src.row[j].dword[i] +/// ENDFOR +/// dst.row[i] := tmp +/// ENDFOR +/// +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src) + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + // Use __tile1024i_1024a* to escape the alignment check in + // clang/test/Headers/x86-intrinsics-headers-clean.cpp + __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0, + (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz0t1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0, + (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal( + unsigned short row, unsigned short col0, unsigned short col1, + _tile1024i *dst0, _tile1024i *dst1, const void *base, + __SIZE_TYPE__ stride) { + __builtin_ia32_t2rpntlvwz1t1_internal( + row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base, + (__SIZE_TYPE__)(stride)); +} + +// This is an internal intrinsic. C/C++ users should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE +_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) { + return __builtin_ia32_ttransposed_internal(m, n, src); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst.
The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ0T1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will not be read from memory but instead +/// filled with zeros.
+/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Converts a pair of tiles from memory into VNNI format, and places the +/// results in a pair of destinations specified by dst. The pair of tiles +/// in memory is specified via a tsib; the second tile is after the first +/// one, separated by the same stride that separates each row. +/// The tile configuration for the destination tiles indicates the amount +/// of data to read from memory. The instruction will load a number of rows +/// that is equal to twice the number of rows in tmm1. The size of each row +/// is equal to the average width of the destination tiles. If the second +/// tile is configured with zero rows and columns, only the first tile will +/// be written. The last row will not be read from memory but instead +/// filled with zeros. +/// Provides a hint to the implementation that the data will likely not be +/// reused in the near future and the data caching can be optimized. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the T2RPNTLVWZ1T1 instruction. +/// +/// \param dst0 +/// First tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param dst1 +/// Second tile of destination tile pair. Max size is 1024i*2 Bytes. +/// \param base +/// A pointer to the base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1, + const void *base, __SIZE_TYPE__ stride) { + _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile, + &dst1->tile, base, stride); +} + +/// Transpose 32-bit elements from src and write the result to dst. +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the TTRANSPOSED instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src +/// The source tile. Max size is 1024 Bytes.
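+///
+/// A minimal usage sketch, assuming a matching tile configuration has been
+/// loaded with _tile_loadconfig() and that `buf`/`stride` are a
+/// caller-provided source buffer and row pitch:
+/// \code
+///   __tile1024i src = {16, 64}; // 16 rows x 64 bytes (16 fp32 columns)
+///   __tile1024i dst = {16, 64};
+///   __tile_loadd(&src, buf, stride); // plain load from amxintrin.h
+///   __tile_transposed(&dst, src);    // dst = transpose(src)
+/// \endcode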
+__DEFAULT_FN_ATTRS_TRANSPOSE +static void __tile_transposed(__tile1024i *dst, __tile1024i src) { + dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile); +} + +#endif /* __x86_64__ */ +#endif /* __AMX_TRANSPOSEINTRIN_H */ diff --git a/lib/include/arm_acle.h b/lib/include/arm_acle.h index 1518b0c4c8..b1dc90f84a 100644 --- a/lib/include/arm_acle.h +++ b/lib/include/arm_acle.h @@ -264,28 +264,28 @@ __rbitl(unsigned long __t) { } /* 8.3 16-bit multiplications */ -#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP -static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp"))) __smulbb(int32_t __a, int32_t __b) { return __builtin_arm_smulbb(__a, __b); } -static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp"))) __smulbt(int32_t __a, int32_t __b) { return __builtin_arm_smulbt(__a, __b); } -static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp"))) __smultb(int32_t __a, int32_t __b) { return __builtin_arm_smultb(__a, __b); } -static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp"))) __smultt(int32_t __a, int32_t __b) { return __builtin_arm_smultt(__a, __b); } -static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp"))) __smulwb(int32_t __a, int32_t __b) { return __builtin_arm_smulwb(__a, __b); } -static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp"))) __smulwt(int32_t __a, int32_t __b) { return __builtin_arm_smulwt(__a, __b); } @@ -304,46 +304,46 @@ __smulwt(int32_t __a, int32_t __b) { #endif /* 8.4.2 Saturating addition and subtraction intrinsics */ -#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) __qadd(int32_t __t, int32_t __v) { return __builtin_arm_qadd(__t, __v); } -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) __qsub(int32_t __t, int32_t __v) { return __builtin_arm_qsub(__t, __v); } -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) __qdbl(int32_t __t) { return __builtin_arm_qadd(__t, __t); } #endif /* 8.4.3 Accumulating multiplications */ -#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) __smlabb(int32_t __a, int32_t __b, int32_t __c) { return __builtin_arm_smlabb(__a, __b, __c); } -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) 
__smlabt(int32_t __a, int32_t __b, int32_t __c) { return __builtin_arm_smlabt(__a, __b, __c); } -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) __smlatb(int32_t __a, int32_t __b, int32_t __c) { return __builtin_arm_smlatb(__a, __b, __c); } -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) __smlatt(int32_t __a, int32_t __b, int32_t __c) { return __builtin_arm_smlatt(__a, __b, __c); } -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) __smlawb(int32_t __a, int32_t __b, int32_t __c) { return __builtin_arm_smlawb(__a, __b, __c); } -static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) +static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp"))) __smlawt(int32_t __a, int32_t __b, int32_t __c) { return __builtin_arm_smlawt(__a, __b, __c); } @@ -621,8 +621,6 @@ __rintnf(float __a) { #endif /* 8.8 CRC32 intrinsics */ -#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \ - (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE) static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc"))) __crc32b(uint32_t __a, uint8_t __b) { return __builtin_arm_crc32b(__a, __b); @@ -662,7 +660,6 @@ static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target __crc32cd(uint32_t __a, uint64_t __b) { return __builtin_arm_crc32cd(__a, __b); } -#endif /* 8.6 Floating-point data-processing intrinsics */ /* Armv8.3-A Javascript conversion intrinsic */ diff --git a/lib/include/arm_neon.h b/lib/include/arm_neon.h index b67616134b..ab28e839e4 100644 --- a/lib/include/arm_neon.h +++ b/lib/include/arm_neon.h @@ -359,9 +359,7 @@ __ai __attribute__((target("bf16,neon"))) bfloat16x8_t __noswap_vcombine_bf16(bf }) __ai __attribute__((target("bf16,neon"))) float32_t vcvtah_f32_bf16(bfloat16_t __p0) { float32_t __ret; -bfloat16_t __reint = __p0; -int32_t __reint1 = (int32_t)(*(int16_t *) &__reint) << 16; - __ret = *(float32_t *) &__reint1; + __ret = __builtin_bit_cast(float32_t, (uint32_t)(__builtin_bit_cast(uint16_t, __p0)) << 16); return __ret; } __ai __attribute__((target("bf16,neon"))) bfloat16_t vcvth_bf16_f32(float32_t __p0) { @@ -35841,9 +35839,7 @@ __ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t __noswap_vcmla_f float16x4_t __s0_150 = __p0_150; \ float16x4_t __s1_150 = __p1_150; \ float16x4_t __s2_150 = __p2_150; \ -float16x4_t __reint_150 = __s2_150; \ -uint32x2_t __reint1_150 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_150, __p3_150), vget_lane_u32(*(uint32x2_t *) &__reint_150, __p3_150)}; \ - __ret_150 = vcmla_f16(__s0_150, __s1_150, *(float16x4_t *) &__reint1_150); \ + __ret_150 = vcmla_f16(__s0_150, __s1_150, __builtin_bit_cast(float16x4_t, (uint32x2_t) {vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_150), __p3_150), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_150), __p3_150)})); \ __ret_150; \ }) #else @@ -35855,9 +35851,7 @@ uint32x2_t __reint1_150 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_1 float16x4_t __rev0_151; __rev0_151 = __builtin_shufflevector(__s0_151, __s0_151, 3, 2, 1, 0); \ float16x4_t __rev1_151; __rev1_151 = __builtin_shufflevector(__s1_151, __s1_151, 3, 2, 1, 0); \ float16x4_t __rev2_151; __rev2_151 = 
__builtin_shufflevector(__s2_151, __s2_151, 3, 2, 1, 0); \ -float16x4_t __reint_151 = __rev2_151; \ -uint32x2_t __reint1_151 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_151, __p3_151), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_151, __p3_151)}; \ - __ret_151 = __noswap_vcmla_f16(__rev0_151, __rev1_151, *(float16x4_t *) &__reint1_151); \ + __ret_151 = __noswap_vcmla_f16(__rev0_151, __rev1_151, __builtin_bit_cast(float16x4_t, (uint32x2_t) {__noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_151), __p3_151), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_151), __p3_151)})); \ __ret_151 = __builtin_shufflevector(__ret_151, __ret_151, 3, 2, 1, 0); \ __ret_151; \ }) @@ -35869,9 +35863,7 @@ uint32x2_t __reint1_151 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) & float16x8_t __s0_152 = __p0_152; \ float16x8_t __s1_152 = __p1_152; \ float16x4_t __s2_152 = __p2_152; \ -float16x4_t __reint_152 = __s2_152; \ -uint32x4_t __reint1_152 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_152, __p3_152), vget_lane_u32(*(uint32x2_t *) &__reint_152, __p3_152), vget_lane_u32(*(uint32x2_t *) &__reint_152, __p3_152), vget_lane_u32(*(uint32x2_t *) &__reint_152, __p3_152)}; \ - __ret_152 = vcmlaq_f16(__s0_152, __s1_152, *(float16x8_t *) &__reint1_152); \ + __ret_152 = vcmlaq_f16(__s0_152, __s1_152, __builtin_bit_cast(float16x8_t, (uint32x4_t) {vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_152), __p3_152), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_152), __p3_152), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_152), __p3_152), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_152), __p3_152)})); \ __ret_152; \ }) #else @@ -35883,9 +35875,7 @@ uint32x4_t __reint1_152 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_1 float16x8_t __rev0_153; __rev0_153 = __builtin_shufflevector(__s0_153, __s0_153, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev1_153; __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_153; __rev2_153 = __builtin_shufflevector(__s2_153, __s2_153, 3, 2, 1, 0); \ -float16x4_t __reint_153 = __rev2_153; \ -uint32x4_t __reint1_153 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_153, __p3_153), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_153, __p3_153), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_153, __p3_153), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_153, __p3_153)}; \ - __ret_153 = __noswap_vcmlaq_f16(__rev0_153, __rev1_153, *(float16x8_t *) &__reint1_153); \ + __ret_153 = __noswap_vcmlaq_f16(__rev0_153, __rev1_153, __builtin_bit_cast(float16x8_t, (uint32x4_t) {__noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_153), __p3_153), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_153), __p3_153), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_153), __p3_153), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_153), __p3_153)})); \ __ret_153 = __builtin_shufflevector(__ret_153, __ret_153, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_153; \ }) @@ -35897,9 +35887,7 @@ uint32x4_t __reint1_153 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) & float16x4_t __s0_154 = __p0_154; \ float16x4_t __s1_154 = __p1_154; \ float16x8_t __s2_154 = __p2_154; \ -float16x8_t __reint_154 = __s2_154; \ -uint32x2_t __reint1_154 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_154, __p3_154), vgetq_lane_u32(*(uint32x4_t *) &__reint_154, __p3_154)}; \ - __ret_154 = vcmla_f16(__s0_154, __s1_154, *(float16x4_t *) &__reint1_154); \ + 
__ret_154 = vcmla_f16(__s0_154, __s1_154, __builtin_bit_cast(float16x4_t, (uint32x2_t) {vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_154), __p3_154), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_154), __p3_154)})); \ __ret_154; \ }) #else @@ -35911,9 +35899,7 @@ uint32x2_t __reint1_154 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_ float16x4_t __rev0_155; __rev0_155 = __builtin_shufflevector(__s0_155, __s0_155, 3, 2, 1, 0); \ float16x4_t __rev1_155; __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 3, 2, 1, 0); \ float16x8_t __rev2_155; __rev2_155 = __builtin_shufflevector(__s2_155, __s2_155, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_155 = __rev2_155; \ -uint32x2_t __reint1_155 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_155, __p3_155), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_155, __p3_155)}; \ - __ret_155 = __noswap_vcmla_f16(__rev0_155, __rev1_155, *(float16x4_t *) &__reint1_155); \ + __ret_155 = __noswap_vcmla_f16(__rev0_155, __rev1_155, __builtin_bit_cast(float16x4_t, (uint32x2_t) {__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_155), __p3_155), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_155), __p3_155)})); \ __ret_155 = __builtin_shufflevector(__ret_155, __ret_155, 3, 2, 1, 0); \ __ret_155; \ }) @@ -35925,9 +35911,7 @@ uint32x2_t __reint1_155 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) float16x8_t __s0_156 = __p0_156; \ float16x8_t __s1_156 = __p1_156; \ float16x8_t __s2_156 = __p2_156; \ -float16x8_t __reint_156 = __s2_156; \ -uint32x4_t __reint1_156 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_156, __p3_156), vgetq_lane_u32(*(uint32x4_t *) &__reint_156, __p3_156), vgetq_lane_u32(*(uint32x4_t *) &__reint_156, __p3_156), vgetq_lane_u32(*(uint32x4_t *) &__reint_156, __p3_156)}; \ - __ret_156 = vcmlaq_f16(__s0_156, __s1_156, *(float16x8_t *) &__reint1_156); \ + __ret_156 = vcmlaq_f16(__s0_156, __s1_156, __builtin_bit_cast(float16x8_t, (uint32x4_t) {vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_156), __p3_156), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_156), __p3_156), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_156), __p3_156), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_156), __p3_156)})); \ __ret_156; \ }) #else @@ -35939,9 +35923,7 @@ uint32x4_t __reint1_156 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_ float16x8_t __rev0_157; __rev0_157 = __builtin_shufflevector(__s0_157, __s0_157, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev1_157; __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_157; __rev2_157 = __builtin_shufflevector(__s2_157, __s2_157, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_157 = __rev2_157; \ -uint32x4_t __reint1_157 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_157, __p3_157), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_157, __p3_157), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_157, __p3_157), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_157, __p3_157)}; \ - __ret_157 = __noswap_vcmlaq_f16(__rev0_157, __rev1_157, *(float16x8_t *) &__reint1_157); \ + __ret_157 = __noswap_vcmlaq_f16(__rev0_157, __rev1_157, __builtin_bit_cast(float16x8_t, (uint32x4_t) {__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_157), __p3_157), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_157), __p3_157), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_157), __p3_157), 
__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_157), __p3_157)})); \ __ret_157 = __builtin_shufflevector(__ret_157, __ret_157, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_157; \ }) @@ -35999,9 +35981,7 @@ __ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t __noswap_vcmla_r float16x4_t __s0_158 = __p0_158; \ float16x4_t __s1_158 = __p1_158; \ float16x4_t __s2_158 = __p2_158; \ -float16x4_t __reint_158 = __s2_158; \ -uint32x2_t __reint1_158 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_158, __p3_158), vget_lane_u32(*(uint32x2_t *) &__reint_158, __p3_158)}; \ - __ret_158 = vcmla_rot180_f16(__s0_158, __s1_158, *(float16x4_t *) &__reint1_158); \ + __ret_158 = vcmla_rot180_f16(__s0_158, __s1_158, __builtin_bit_cast(float16x4_t, (uint32x2_t) {vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_158), __p3_158), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_158), __p3_158)})); \ __ret_158; \ }) #else @@ -36013,9 +35993,7 @@ uint32x2_t __reint1_158 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_1 float16x4_t __rev0_159; __rev0_159 = __builtin_shufflevector(__s0_159, __s0_159, 3, 2, 1, 0); \ float16x4_t __rev1_159; __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 3, 2, 1, 0); \ float16x4_t __rev2_159; __rev2_159 = __builtin_shufflevector(__s2_159, __s2_159, 3, 2, 1, 0); \ -float16x4_t __reint_159 = __rev2_159; \ -uint32x2_t __reint1_159 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_159, __p3_159), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_159, __p3_159)}; \ - __ret_159 = __noswap_vcmla_rot180_f16(__rev0_159, __rev1_159, *(float16x4_t *) &__reint1_159); \ + __ret_159 = __noswap_vcmla_rot180_f16(__rev0_159, __rev1_159, __builtin_bit_cast(float16x4_t, (uint32x2_t) {__noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_159), __p3_159), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_159), __p3_159)})); \ __ret_159 = __builtin_shufflevector(__ret_159, __ret_159, 3, 2, 1, 0); \ __ret_159; \ }) @@ -36027,9 +36005,7 @@ uint32x2_t __reint1_159 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) & float16x8_t __s0_160 = __p0_160; \ float16x8_t __s1_160 = __p1_160; \ float16x4_t __s2_160 = __p2_160; \ -float16x4_t __reint_160 = __s2_160; \ -uint32x4_t __reint1_160 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_160, __p3_160), vget_lane_u32(*(uint32x2_t *) &__reint_160, __p3_160), vget_lane_u32(*(uint32x2_t *) &__reint_160, __p3_160), vget_lane_u32(*(uint32x2_t *) &__reint_160, __p3_160)}; \ - __ret_160 = vcmlaq_rot180_f16(__s0_160, __s1_160, *(float16x8_t *) &__reint1_160); \ + __ret_160 = vcmlaq_rot180_f16(__s0_160, __s1_160, __builtin_bit_cast(float16x8_t, (uint32x4_t) {vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_160), __p3_160), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_160), __p3_160), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_160), __p3_160), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_160), __p3_160)})); \ __ret_160; \ }) #else @@ -36041,9 +36017,7 @@ uint32x4_t __reint1_160 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_1 float16x8_t __rev0_161; __rev0_161 = __builtin_shufflevector(__s0_161, __s0_161, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev1_161; __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_161; __rev2_161 = __builtin_shufflevector(__s2_161, __s2_161, 3, 2, 1, 0); \ -float16x4_t __reint_161 = __rev2_161; \ -uint32x4_t __reint1_161 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_161, __p3_161), 
__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_161, __p3_161), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_161, __p3_161), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_161, __p3_161)}; \ - __ret_161 = __noswap_vcmlaq_rot180_f16(__rev0_161, __rev1_161, *(float16x8_t *) &__reint1_161); \ + __ret_161 = __noswap_vcmlaq_rot180_f16(__rev0_161, __rev1_161, __builtin_bit_cast(float16x8_t, (uint32x4_t) {__noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_161), __p3_161), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_161), __p3_161), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_161), __p3_161), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_161), __p3_161)})); \ __ret_161 = __builtin_shufflevector(__ret_161, __ret_161, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_161; \ }) @@ -36055,9 +36029,7 @@ uint32x4_t __reint1_161 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) & float16x4_t __s0_162 = __p0_162; \ float16x4_t __s1_162 = __p1_162; \ float16x8_t __s2_162 = __p2_162; \ -float16x8_t __reint_162 = __s2_162; \ -uint32x2_t __reint1_162 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_162, __p3_162), vgetq_lane_u32(*(uint32x4_t *) &__reint_162, __p3_162)}; \ - __ret_162 = vcmla_rot180_f16(__s0_162, __s1_162, *(float16x4_t *) &__reint1_162); \ + __ret_162 = vcmla_rot180_f16(__s0_162, __s1_162, __builtin_bit_cast(float16x4_t, (uint32x2_t) {vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_162), __p3_162), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_162), __p3_162)})); \ __ret_162; \ }) #else @@ -36069,9 +36041,7 @@ uint32x2_t __reint1_162 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_ float16x4_t __rev0_163; __rev0_163 = __builtin_shufflevector(__s0_163, __s0_163, 3, 2, 1, 0); \ float16x4_t __rev1_163; __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 3, 2, 1, 0); \ float16x8_t __rev2_163; __rev2_163 = __builtin_shufflevector(__s2_163, __s2_163, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_163 = __rev2_163; \ -uint32x2_t __reint1_163 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_163, __p3_163), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_163, __p3_163)}; \ - __ret_163 = __noswap_vcmla_rot180_f16(__rev0_163, __rev1_163, *(float16x4_t *) &__reint1_163); \ + __ret_163 = __noswap_vcmla_rot180_f16(__rev0_163, __rev1_163, __builtin_bit_cast(float16x4_t, (uint32x2_t) {__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_163), __p3_163), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_163), __p3_163)})); \ __ret_163 = __builtin_shufflevector(__ret_163, __ret_163, 3, 2, 1, 0); \ __ret_163; \ }) @@ -36083,9 +36053,7 @@ uint32x2_t __reint1_163 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) float16x8_t __s0_164 = __p0_164; \ float16x8_t __s1_164 = __p1_164; \ float16x8_t __s2_164 = __p2_164; \ -float16x8_t __reint_164 = __s2_164; \ -uint32x4_t __reint1_164 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_164, __p3_164), vgetq_lane_u32(*(uint32x4_t *) &__reint_164, __p3_164), vgetq_lane_u32(*(uint32x4_t *) &__reint_164, __p3_164), vgetq_lane_u32(*(uint32x4_t *) &__reint_164, __p3_164)}; \ - __ret_164 = vcmlaq_rot180_f16(__s0_164, __s1_164, *(float16x8_t *) &__reint1_164); \ + __ret_164 = vcmlaq_rot180_f16(__s0_164, __s1_164, __builtin_bit_cast(float16x8_t, (uint32x4_t) {vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_164), __p3_164), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_164), __p3_164), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_164), 
__p3_164), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_164), __p3_164)})); \ __ret_164; \ }) #else @@ -36097,9 +36065,7 @@ uint32x4_t __reint1_164 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_ float16x8_t __rev0_165; __rev0_165 = __builtin_shufflevector(__s0_165, __s0_165, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev1_165; __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_165; __rev2_165 = __builtin_shufflevector(__s2_165, __s2_165, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_165 = __rev2_165; \ -uint32x4_t __reint1_165 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_165, __p3_165), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_165, __p3_165), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_165, __p3_165), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_165, __p3_165)}; \ - __ret_165 = __noswap_vcmlaq_rot180_f16(__rev0_165, __rev1_165, *(float16x8_t *) &__reint1_165); \ + __ret_165 = __noswap_vcmlaq_rot180_f16(__rev0_165, __rev1_165, __builtin_bit_cast(float16x8_t, (uint32x4_t) {__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_165), __p3_165), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_165), __p3_165), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_165), __p3_165), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_165), __p3_165)})); \ __ret_165 = __builtin_shufflevector(__ret_165, __ret_165, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_165; \ }) @@ -36157,9 +36123,7 @@ __ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t __noswap_vcmla_r float16x4_t __s0_166 = __p0_166; \ float16x4_t __s1_166 = __p1_166; \ float16x4_t __s2_166 = __p2_166; \ -float16x4_t __reint_166 = __s2_166; \ -uint32x2_t __reint1_166 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_166, __p3_166), vget_lane_u32(*(uint32x2_t *) &__reint_166, __p3_166)}; \ - __ret_166 = vcmla_rot270_f16(__s0_166, __s1_166, *(float16x4_t *) &__reint1_166); \ + __ret_166 = vcmla_rot270_f16(__s0_166, __s1_166, __builtin_bit_cast(float16x4_t, (uint32x2_t) {vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_166), __p3_166), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_166), __p3_166)})); \ __ret_166; \ }) #else @@ -36171,9 +36135,7 @@ uint32x2_t __reint1_166 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_1 float16x4_t __rev0_167; __rev0_167 = __builtin_shufflevector(__s0_167, __s0_167, 3, 2, 1, 0); \ float16x4_t __rev1_167; __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 3, 2, 1, 0); \ float16x4_t __rev2_167; __rev2_167 = __builtin_shufflevector(__s2_167, __s2_167, 3, 2, 1, 0); \ -float16x4_t __reint_167 = __rev2_167; \ -uint32x2_t __reint1_167 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_167, __p3_167), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_167, __p3_167)}; \ - __ret_167 = __noswap_vcmla_rot270_f16(__rev0_167, __rev1_167, *(float16x4_t *) &__reint1_167); \ + __ret_167 = __noswap_vcmla_rot270_f16(__rev0_167, __rev1_167, __builtin_bit_cast(float16x4_t, (uint32x2_t) {__noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_167), __p3_167), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_167), __p3_167)})); \ __ret_167 = __builtin_shufflevector(__ret_167, __ret_167, 3, 2, 1, 0); \ __ret_167; \ }) @@ -36185,9 +36147,7 @@ uint32x2_t __reint1_167 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) & float16x8_t __s0_168 = __p0_168; \ float16x8_t __s1_168 = __p1_168; \ float16x4_t __s2_168 = __p2_168; \ -float16x4_t 
__reint_168 = __s2_168; \ -uint32x4_t __reint1_168 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_168, __p3_168), vget_lane_u32(*(uint32x2_t *) &__reint_168, __p3_168), vget_lane_u32(*(uint32x2_t *) &__reint_168, __p3_168), vget_lane_u32(*(uint32x2_t *) &__reint_168, __p3_168)}; \ - __ret_168 = vcmlaq_rot270_f16(__s0_168, __s1_168, *(float16x8_t *) &__reint1_168); \ + __ret_168 = vcmlaq_rot270_f16(__s0_168, __s1_168, __builtin_bit_cast(float16x8_t, (uint32x4_t) {vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_168), __p3_168), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_168), __p3_168), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_168), __p3_168), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_168), __p3_168)})); \ __ret_168; \ }) #else @@ -36199,9 +36159,7 @@ uint32x4_t __reint1_168 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_1 float16x8_t __rev0_169; __rev0_169 = __builtin_shufflevector(__s0_169, __s0_169, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev1_169; __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_169; __rev2_169 = __builtin_shufflevector(__s2_169, __s2_169, 3, 2, 1, 0); \ -float16x4_t __reint_169 = __rev2_169; \ -uint32x4_t __reint1_169 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_169, __p3_169)}; \ - __ret_169 = __noswap_vcmlaq_rot270_f16(__rev0_169, __rev1_169, *(float16x8_t *) &__reint1_169); \ + __ret_169 = __noswap_vcmlaq_rot270_f16(__rev0_169, __rev1_169, __builtin_bit_cast(float16x8_t, (uint32x4_t) {__noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_169), __p3_169), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_169), __p3_169), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_169), __p3_169), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_169), __p3_169)})); \ __ret_169 = __builtin_shufflevector(__ret_169, __ret_169, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_169; \ }) @@ -36213,9 +36171,7 @@ uint32x4_t __reint1_169 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) & float16x4_t __s0_170 = __p0_170; \ float16x4_t __s1_170 = __p1_170; \ float16x8_t __s2_170 = __p2_170; \ -float16x8_t __reint_170 = __s2_170; \ -uint32x2_t __reint1_170 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_170, __p3_170), vgetq_lane_u32(*(uint32x4_t *) &__reint_170, __p3_170)}; \ - __ret_170 = vcmla_rot270_f16(__s0_170, __s1_170, *(float16x4_t *) &__reint1_170); \ + __ret_170 = vcmla_rot270_f16(__s0_170, __s1_170, __builtin_bit_cast(float16x4_t, (uint32x2_t) {vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_170), __p3_170), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_170), __p3_170)})); \ __ret_170; \ }) #else @@ -36227,9 +36183,7 @@ uint32x2_t __reint1_170 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_ float16x4_t __rev0_171; __rev0_171 = __builtin_shufflevector(__s0_171, __s0_171, 3, 2, 1, 0); \ float16x4_t __rev1_171; __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \ float16x8_t __rev2_171; __rev2_171 = __builtin_shufflevector(__s2_171, __s2_171, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_171 = __rev2_171; \ -uint32x2_t __reint1_171 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_171, __p3_171), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_171, __p3_171)}; \ - __ret_171 = 
__noswap_vcmla_rot270_f16(__rev0_171, __rev1_171, *(float16x4_t *) &__reint1_171); \ + __ret_171 = __noswap_vcmla_rot270_f16(__rev0_171, __rev1_171, __builtin_bit_cast(float16x4_t, (uint32x2_t) {__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_171), __p3_171), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_171), __p3_171)})); \ __ret_171 = __builtin_shufflevector(__ret_171, __ret_171, 3, 2, 1, 0); \ __ret_171; \ }) @@ -36241,9 +36195,7 @@ uint32x2_t __reint1_171 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) float16x8_t __s0_172 = __p0_172; \ float16x8_t __s1_172 = __p1_172; \ float16x8_t __s2_172 = __p2_172; \ -float16x8_t __reint_172 = __s2_172; \ -uint32x4_t __reint1_172 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_172, __p3_172), vgetq_lane_u32(*(uint32x4_t *) &__reint_172, __p3_172), vgetq_lane_u32(*(uint32x4_t *) &__reint_172, __p3_172), vgetq_lane_u32(*(uint32x4_t *) &__reint_172, __p3_172)}; \ - __ret_172 = vcmlaq_rot270_f16(__s0_172, __s1_172, *(float16x8_t *) &__reint1_172); \ + __ret_172 = vcmlaq_rot270_f16(__s0_172, __s1_172, __builtin_bit_cast(float16x8_t, (uint32x4_t) {vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_172), __p3_172), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_172), __p3_172), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_172), __p3_172), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_172), __p3_172)})); \ __ret_172; \ }) #else @@ -36255,9 +36207,7 @@ uint32x4_t __reint1_172 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_ float16x8_t __rev0_173; __rev0_173 = __builtin_shufflevector(__s0_173, __s0_173, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev1_173; __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_173; __rev2_173 = __builtin_shufflevector(__s2_173, __s2_173, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_173 = __rev2_173; \ -uint32x4_t __reint1_173 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_173, __p3_173), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_173, __p3_173), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_173, __p3_173), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_173, __p3_173)}; \ - __ret_173 = __noswap_vcmlaq_rot270_f16(__rev0_173, __rev1_173, *(float16x8_t *) &__reint1_173); \ + __ret_173 = __noswap_vcmlaq_rot270_f16(__rev0_173, __rev1_173, __builtin_bit_cast(float16x8_t, (uint32x4_t) {__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_173), __p3_173), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_173), __p3_173), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_173), __p3_173), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_173), __p3_173)})); \ __ret_173 = __builtin_shufflevector(__ret_173, __ret_173, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_173; \ }) @@ -36315,9 +36265,7 @@ __ai __attribute__((target("v8.3a,fullfp16,neon"))) float16x4_t __noswap_vcmla_r float16x4_t __s0_174 = __p0_174; \ float16x4_t __s1_174 = __p1_174; \ float16x4_t __s2_174 = __p2_174; \ -float16x4_t __reint_174 = __s2_174; \ -uint32x2_t __reint1_174 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_174, __p3_174), vget_lane_u32(*(uint32x2_t *) &__reint_174, __p3_174)}; \ - __ret_174 = vcmla_rot90_f16(__s0_174, __s1_174, *(float16x4_t *) &__reint1_174); \ + __ret_174 = vcmla_rot90_f16(__s0_174, __s1_174, __builtin_bit_cast(float16x4_t, (uint32x2_t) {vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_174), __p3_174), vget_lane_u32(__builtin_bit_cast(uint32x2_t, 
__s2_174), __p3_174)})); \ __ret_174; \ }) #else @@ -36329,9 +36277,7 @@ uint32x2_t __reint1_174 = (uint32x2_t) {vget_lane_u32(*(uint32x2_t *) &__reint_1 float16x4_t __rev0_175; __rev0_175 = __builtin_shufflevector(__s0_175, __s0_175, 3, 2, 1, 0); \ float16x4_t __rev1_175; __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 3, 2, 1, 0); \ float16x4_t __rev2_175; __rev2_175 = __builtin_shufflevector(__s2_175, __s2_175, 3, 2, 1, 0); \ -float16x4_t __reint_175 = __rev2_175; \ -uint32x2_t __reint1_175 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_175, __p3_175), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_175, __p3_175)}; \ - __ret_175 = __noswap_vcmla_rot90_f16(__rev0_175, __rev1_175, *(float16x4_t *) &__reint1_175); \ + __ret_175 = __noswap_vcmla_rot90_f16(__rev0_175, __rev1_175, __builtin_bit_cast(float16x4_t, (uint32x2_t) {__noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_175), __p3_175), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_175), __p3_175)})); \ __ret_175 = __builtin_shufflevector(__ret_175, __ret_175, 3, 2, 1, 0); \ __ret_175; \ }) @@ -36343,9 +36289,7 @@ uint32x2_t __reint1_175 = (uint32x2_t) {__noswap_vget_lane_u32(*(uint32x2_t *) & float16x8_t __s0_176 = __p0_176; \ float16x8_t __s1_176 = __p1_176; \ float16x4_t __s2_176 = __p2_176; \ -float16x4_t __reint_176 = __s2_176; \ -uint32x4_t __reint1_176 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_176, __p3_176), vget_lane_u32(*(uint32x2_t *) &__reint_176, __p3_176), vget_lane_u32(*(uint32x2_t *) &__reint_176, __p3_176), vget_lane_u32(*(uint32x2_t *) &__reint_176, __p3_176)}; \ - __ret_176 = vcmlaq_rot90_f16(__s0_176, __s1_176, *(float16x8_t *) &__reint1_176); \ + __ret_176 = vcmlaq_rot90_f16(__s0_176, __s1_176, __builtin_bit_cast(float16x8_t, (uint32x4_t) {vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_176), __p3_176), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_176), __p3_176), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_176), __p3_176), vget_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_176), __p3_176)})); \ __ret_176; \ }) #else @@ -36357,9 +36301,7 @@ uint32x4_t __reint1_176 = (uint32x4_t) {vget_lane_u32(*(uint32x2_t *) &__reint_1 float16x8_t __rev0_177; __rev0_177 = __builtin_shufflevector(__s0_177, __s0_177, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev1_177; __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x4_t __rev2_177; __rev2_177 = __builtin_shufflevector(__s2_177, __s2_177, 3, 2, 1, 0); \ -float16x4_t __reint_177 = __rev2_177; \ -uint32x4_t __reint1_177 = (uint32x4_t) {__noswap_vget_lane_u32(*(uint32x2_t *) &__reint_177, __p3_177), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_177, __p3_177), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_177, __p3_177), __noswap_vget_lane_u32(*(uint32x2_t *) &__reint_177, __p3_177)}; \ - __ret_177 = __noswap_vcmlaq_rot90_f16(__rev0_177, __rev1_177, *(float16x8_t *) &__reint1_177); \ + __ret_177 = __noswap_vcmlaq_rot90_f16(__rev0_177, __rev1_177, __builtin_bit_cast(float16x8_t, (uint32x4_t) {__noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_177), __p3_177), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_177), __p3_177), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_177), __p3_177), __noswap_vget_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_177), __p3_177)})); \ __ret_177 = __builtin_shufflevector(__ret_177, __ret_177, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_177; \ }) @@ -36371,9 +36313,7 @@ uint32x4_t __reint1_177 = (uint32x4_t) 
{__noswap_vget_lane_u32(*(uint32x2_t *) & float16x4_t __s0_178 = __p0_178; \ float16x4_t __s1_178 = __p1_178; \ float16x8_t __s2_178 = __p2_178; \ -float16x8_t __reint_178 = __s2_178; \ -uint32x2_t __reint1_178 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_178, __p3_178), vgetq_lane_u32(*(uint32x4_t *) &__reint_178, __p3_178)}; \ - __ret_178 = vcmla_rot90_f16(__s0_178, __s1_178, *(float16x4_t *) &__reint1_178); \ + __ret_178 = vcmla_rot90_f16(__s0_178, __s1_178, __builtin_bit_cast(float16x4_t, (uint32x2_t) {vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_178), __p3_178), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_178), __p3_178)})); \ __ret_178; \ }) #else @@ -36385,9 +36325,7 @@ uint32x2_t __reint1_178 = (uint32x2_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_ float16x4_t __rev0_179; __rev0_179 = __builtin_shufflevector(__s0_179, __s0_179, 3, 2, 1, 0); \ float16x4_t __rev1_179; __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 3, 2, 1, 0); \ float16x8_t __rev2_179; __rev2_179 = __builtin_shufflevector(__s2_179, __s2_179, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_179 = __rev2_179; \ -uint32x2_t __reint1_179 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_179, __p3_179), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_179, __p3_179)}; \ - __ret_179 = __noswap_vcmla_rot90_f16(__rev0_179, __rev1_179, *(float16x4_t *) &__reint1_179); \ + __ret_179 = __noswap_vcmla_rot90_f16(__rev0_179, __rev1_179, __builtin_bit_cast(float16x4_t, (uint32x2_t) {__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_179), __p3_179), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_179), __p3_179)})); \ __ret_179 = __builtin_shufflevector(__ret_179, __ret_179, 3, 2, 1, 0); \ __ret_179; \ }) @@ -36399,9 +36337,7 @@ uint32x2_t __reint1_179 = (uint32x2_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) float16x8_t __s0_180 = __p0_180; \ float16x8_t __s1_180 = __p1_180; \ float16x8_t __s2_180 = __p2_180; \ -float16x8_t __reint_180 = __s2_180; \ -uint32x4_t __reint1_180 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_180, __p3_180), vgetq_lane_u32(*(uint32x4_t *) &__reint_180, __p3_180), vgetq_lane_u32(*(uint32x4_t *) &__reint_180, __p3_180), vgetq_lane_u32(*(uint32x4_t *) &__reint_180, __p3_180)}; \ - __ret_180 = vcmlaq_rot90_f16(__s0_180, __s1_180, *(float16x8_t *) &__reint1_180); \ + __ret_180 = vcmlaq_rot90_f16(__s0_180, __s1_180, __builtin_bit_cast(float16x8_t, (uint32x4_t) {vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_180), __p3_180), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_180), __p3_180), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_180), __p3_180), vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __s2_180), __p3_180)})); \ __ret_180; \ }) #else @@ -36413,9 +36349,7 @@ uint32x4_t __reint1_180 = (uint32x4_t) {vgetq_lane_u32(*(uint32x4_t *) &__reint_ float16x8_t __rev0_181; __rev0_181 = __builtin_shufflevector(__s0_181, __s0_181, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev1_181; __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __rev2_181; __rev2_181 = __builtin_shufflevector(__s2_181, __s2_181, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_181 = __rev2_181; \ -uint32x4_t __reint1_181 = (uint32x4_t) {__noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_181, __p3_181), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_181, __p3_181), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_181, __p3_181), __noswap_vgetq_lane_u32(*(uint32x4_t *) &__reint_181, __p3_181)}; \ - __ret_181 = 
__noswap_vcmlaq_rot90_f16(__rev0_181, __rev1_181, *(float16x8_t *) &__reint1_181); \ + __ret_181 = __noswap_vcmlaq_rot90_f16(__rev0_181, __rev1_181, __builtin_bit_cast(float16x8_t, (uint32x4_t) {__noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_181), __p3_181), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_181), __p3_181), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_181), __p3_181), __noswap_vgetq_lane_u32(__builtin_bit_cast(uint32x4_t, __rev2_181), __p3_181)})); \ __ret_181 = __builtin_shufflevector(__ret_181, __ret_181, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_181; \ }) @@ -36541,9 +36475,7 @@ __ai __attribute__((target("v8.3a,neon"))) float32x2_t __noswap_vcmla_f32(float3 float32x2_t __s0_182 = __p0_182; \ float32x2_t __s1_182 = __p1_182; \ float32x2_t __s2_182 = __p2_182; \ -float32x2_t __reint_182 = __s2_182; \ -uint64x1_t __reint1_182 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_182, __p3_182)}; \ - __ret_182 = vcmla_f32(__s0_182, __s1_182, *(float32x2_t *) &__reint1_182); \ + __ret_182 = vcmla_f32(__s0_182, __s1_182, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_182), __p3_182)})); \ __ret_182; \ }) #else @@ -36555,9 +36487,7 @@ uint64x1_t __reint1_182 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x2_t __rev0_183; __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 1, 0); \ float32x2_t __rev1_183; __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 1, 0); \ float32x2_t __rev2_183; __rev2_183 = __builtin_shufflevector(__s2_183, __s2_183, 1, 0); \ -float32x2_t __reint_183 = __rev2_183; \ -uint64x1_t __reint1_183 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_183, __p3_183)}; \ - __ret_183 = __noswap_vcmla_f32(__rev0_183, __rev1_183, *(float32x2_t *) &__reint1_183); \ + __ret_183 = __noswap_vcmla_f32(__rev0_183, __rev1_183, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_183), __p3_183)})); \ __ret_183 = __builtin_shufflevector(__ret_183, __ret_183, 1, 0); \ __ret_183; \ }) @@ -36569,9 +36499,7 @@ uint64x1_t __reint1_183 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x4_t __s0_184 = __p0_184; \ float32x4_t __s1_184 = __p1_184; \ float32x2_t __s2_184 = __p2_184; \ -float32x2_t __reint_184 = __s2_184; \ -uint64x2_t __reint1_184 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_184, __p3_184), vget_lane_u64(*(uint64x1_t *) &__reint_184, __p3_184)}; \ - __ret_184 = vcmlaq_f32(__s0_184, __s1_184, *(float32x4_t *) &__reint1_184); \ + __ret_184 = vcmlaq_f32(__s0_184, __s1_184, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_184), __p3_184), vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_184), __p3_184)})); \ __ret_184; \ }) #else @@ -36583,9 +36511,7 @@ uint64x2_t __reint1_184 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x4_t __rev0_185; __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 3, 2, 1, 0); \ float32x4_t __rev1_185; __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 3, 2, 1, 0); \ float32x2_t __rev2_185; __rev2_185 = __builtin_shufflevector(__s2_185, __s2_185, 1, 0); \ -float32x2_t __reint_185 = __rev2_185; \ -uint64x2_t __reint1_185 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_185, __p3_185), vget_lane_u64(*(uint64x1_t *) &__reint_185, __p3_185)}; \ - __ret_185 = __noswap_vcmlaq_f32(__rev0_185, __rev1_185, *(float32x4_t *) &__reint1_185); \ + __ret_185 = __noswap_vcmlaq_f32(__rev0_185, 
__rev1_185, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_185), __p3_185), vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_185), __p3_185)})); \ __ret_185 = __builtin_shufflevector(__ret_185, __ret_185, 3, 2, 1, 0); \ __ret_185; \ }) @@ -36597,9 +36523,7 @@ uint64x2_t __reint1_185 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x2_t __s0_186 = __p0_186; \ float32x2_t __s1_186 = __p1_186; \ float32x4_t __s2_186 = __p2_186; \ -float32x4_t __reint_186 = __s2_186; \ -uint64x1_t __reint1_186 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_186, __p3_186)}; \ - __ret_186 = vcmla_f32(__s0_186, __s1_186, *(float32x2_t *) &__reint1_186); \ + __ret_186 = vcmla_f32(__s0_186, __s1_186, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_186), __p3_186)})); \ __ret_186; \ }) #else @@ -36611,9 +36535,7 @@ uint64x1_t __reint1_186 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_ float32x2_t __rev0_187; __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 1, 0); \ float32x2_t __rev1_187; __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 1, 0); \ float32x4_t __rev2_187; __rev2_187 = __builtin_shufflevector(__s2_187, __s2_187, 3, 2, 1, 0); \ -float32x4_t __reint_187 = __rev2_187; \ -uint64x1_t __reint1_187 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_187, __p3_187)}; \ - __ret_187 = __noswap_vcmla_f32(__rev0_187, __rev1_187, *(float32x2_t *) &__reint1_187); \ + __ret_187 = __noswap_vcmla_f32(__rev0_187, __rev1_187, __builtin_bit_cast(float32x2_t, (uint64x1_t) {__noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_187), __p3_187)})); \ __ret_187 = __builtin_shufflevector(__ret_187, __ret_187, 1, 0); \ __ret_187; \ }) @@ -36625,9 +36547,7 @@ uint64x1_t __reint1_187 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) float32x4_t __s0_188 = __p0_188; \ float32x4_t __s1_188 = __p1_188; \ float32x4_t __s2_188 = __p2_188; \ -float32x4_t __reint_188 = __s2_188; \ -uint64x2_t __reint1_188 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_188, __p3_188), vgetq_lane_u64(*(uint64x2_t *) &__reint_188, __p3_188)}; \ - __ret_188 = vcmlaq_f32(__s0_188, __s1_188, *(float32x4_t *) &__reint1_188); \ + __ret_188 = vcmlaq_f32(__s0_188, __s1_188, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_188), __p3_188), vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_188), __p3_188)})); \ __ret_188; \ }) #else @@ -36639,9 +36559,7 @@ uint64x2_t __reint1_188 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_ float32x4_t __rev0_189; __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \ float32x4_t __rev1_189; __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \ float32x4_t __rev2_189; __rev2_189 = __builtin_shufflevector(__s2_189, __s2_189, 3, 2, 1, 0); \ -float32x4_t __reint_189 = __rev2_189; \ -uint64x2_t __reint1_189 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_189, __p3_189), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_189, __p3_189)}; \ - __ret_189 = __noswap_vcmlaq_f32(__rev0_189, __rev1_189, *(float32x4_t *) &__reint1_189); \ + __ret_189 = __noswap_vcmlaq_f32(__rev0_189, __rev1_189, __builtin_bit_cast(float32x4_t, (uint64x2_t) {__noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_189), __p3_189), __noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_189), __p3_189)})); \ __ret_189 = __builtin_shufflevector(__ret_189, 
__ret_189, 3, 2, 1, 0); \ __ret_189; \ }) @@ -36699,9 +36617,7 @@ __ai __attribute__((target("v8.3a,neon"))) float32x2_t __noswap_vcmla_rot180_f32 float32x2_t __s0_190 = __p0_190; \ float32x2_t __s1_190 = __p1_190; \ float32x2_t __s2_190 = __p2_190; \ -float32x2_t __reint_190 = __s2_190; \ -uint64x1_t __reint1_190 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_190, __p3_190)}; \ - __ret_190 = vcmla_rot180_f32(__s0_190, __s1_190, *(float32x2_t *) &__reint1_190); \ + __ret_190 = vcmla_rot180_f32(__s0_190, __s1_190, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_190), __p3_190)})); \ __ret_190; \ }) #else @@ -36713,9 +36629,7 @@ uint64x1_t __reint1_190 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x2_t __rev0_191; __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \ float32x2_t __rev1_191; __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \ float32x2_t __rev2_191; __rev2_191 = __builtin_shufflevector(__s2_191, __s2_191, 1, 0); \ -float32x2_t __reint_191 = __rev2_191; \ -uint64x1_t __reint1_191 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_191, __p3_191)}; \ - __ret_191 = __noswap_vcmla_rot180_f32(__rev0_191, __rev1_191, *(float32x2_t *) &__reint1_191); \ + __ret_191 = __noswap_vcmla_rot180_f32(__rev0_191, __rev1_191, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_191), __p3_191)})); \ __ret_191 = __builtin_shufflevector(__ret_191, __ret_191, 1, 0); \ __ret_191; \ }) @@ -36727,9 +36641,7 @@ uint64x1_t __reint1_191 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x4_t __s0_192 = __p0_192; \ float32x4_t __s1_192 = __p1_192; \ float32x2_t __s2_192 = __p2_192; \ -float32x2_t __reint_192 = __s2_192; \ -uint64x2_t __reint1_192 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_192, __p3_192), vget_lane_u64(*(uint64x1_t *) &__reint_192, __p3_192)}; \ - __ret_192 = vcmlaq_rot180_f32(__s0_192, __s1_192, *(float32x4_t *) &__reint1_192); \ + __ret_192 = vcmlaq_rot180_f32(__s0_192, __s1_192, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_192), __p3_192), vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_192), __p3_192)})); \ __ret_192; \ }) #else @@ -36741,9 +36653,7 @@ uint64x2_t __reint1_192 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x4_t __rev0_193; __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 3, 2, 1, 0); \ float32x4_t __rev1_193; __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 3, 2, 1, 0); \ float32x2_t __rev2_193; __rev2_193 = __builtin_shufflevector(__s2_193, __s2_193, 1, 0); \ -float32x2_t __reint_193 = __rev2_193; \ -uint64x2_t __reint1_193 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_193, __p3_193), vget_lane_u64(*(uint64x1_t *) &__reint_193, __p3_193)}; \ - __ret_193 = __noswap_vcmlaq_rot180_f32(__rev0_193, __rev1_193, *(float32x4_t *) &__reint1_193); \ + __ret_193 = __noswap_vcmlaq_rot180_f32(__rev0_193, __rev1_193, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_193), __p3_193), vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_193), __p3_193)})); \ __ret_193 = __builtin_shufflevector(__ret_193, __ret_193, 3, 2, 1, 0); \ __ret_193; \ }) @@ -36755,9 +36665,7 @@ uint64x2_t __reint1_193 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x2_t __s0_194 = __p0_194; \ float32x2_t __s1_194 = __p1_194; \ float32x4_t __s2_194 = __p2_194; \ -float32x4_t 
__reint_194 = __s2_194; \ -uint64x1_t __reint1_194 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_194, __p3_194)}; \ - __ret_194 = vcmla_rot180_f32(__s0_194, __s1_194, *(float32x2_t *) &__reint1_194); \ + __ret_194 = vcmla_rot180_f32(__s0_194, __s1_194, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_194), __p3_194)})); \ __ret_194; \ }) #else @@ -36769,9 +36677,7 @@ uint64x1_t __reint1_194 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_ float32x2_t __rev0_195; __rev0_195 = __builtin_shufflevector(__s0_195, __s0_195, 1, 0); \ float32x2_t __rev1_195; __rev1_195 = __builtin_shufflevector(__s1_195, __s1_195, 1, 0); \ float32x4_t __rev2_195; __rev2_195 = __builtin_shufflevector(__s2_195, __s2_195, 3, 2, 1, 0); \ -float32x4_t __reint_195 = __rev2_195; \ -uint64x1_t __reint1_195 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_195, __p3_195)}; \ - __ret_195 = __noswap_vcmla_rot180_f32(__rev0_195, __rev1_195, *(float32x2_t *) &__reint1_195); \ + __ret_195 = __noswap_vcmla_rot180_f32(__rev0_195, __rev1_195, __builtin_bit_cast(float32x2_t, (uint64x1_t) {__noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_195), __p3_195)})); \ __ret_195 = __builtin_shufflevector(__ret_195, __ret_195, 1, 0); \ __ret_195; \ }) @@ -36783,9 +36689,7 @@ uint64x1_t __reint1_195 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) float32x4_t __s0_196 = __p0_196; \ float32x4_t __s1_196 = __p1_196; \ float32x4_t __s2_196 = __p2_196; \ -float32x4_t __reint_196 = __s2_196; \ -uint64x2_t __reint1_196 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_196, __p3_196), vgetq_lane_u64(*(uint64x2_t *) &__reint_196, __p3_196)}; \ - __ret_196 = vcmlaq_rot180_f32(__s0_196, __s1_196, *(float32x4_t *) &__reint1_196); \ + __ret_196 = vcmlaq_rot180_f32(__s0_196, __s1_196, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_196), __p3_196), vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_196), __p3_196)})); \ __ret_196; \ }) #else @@ -36797,9 +36701,7 @@ uint64x2_t __reint1_196 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_ float32x4_t __rev0_197; __rev0_197 = __builtin_shufflevector(__s0_197, __s0_197, 3, 2, 1, 0); \ float32x4_t __rev1_197; __rev1_197 = __builtin_shufflevector(__s1_197, __s1_197, 3, 2, 1, 0); \ float32x4_t __rev2_197; __rev2_197 = __builtin_shufflevector(__s2_197, __s2_197, 3, 2, 1, 0); \ -float32x4_t __reint_197 = __rev2_197; \ -uint64x2_t __reint1_197 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_197, __p3_197), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_197, __p3_197)}; \ - __ret_197 = __noswap_vcmlaq_rot180_f32(__rev0_197, __rev1_197, *(float32x4_t *) &__reint1_197); \ + __ret_197 = __noswap_vcmlaq_rot180_f32(__rev0_197, __rev1_197, __builtin_bit_cast(float32x4_t, (uint64x2_t) {__noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_197), __p3_197), __noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_197), __p3_197)})); \ __ret_197 = __builtin_shufflevector(__ret_197, __ret_197, 3, 2, 1, 0); \ __ret_197; \ }) @@ -36857,9 +36759,7 @@ __ai __attribute__((target("v8.3a,neon"))) float32x2_t __noswap_vcmla_rot270_f32 float32x2_t __s0_198 = __p0_198; \ float32x2_t __s1_198 = __p1_198; \ float32x2_t __s2_198 = __p2_198; \ -float32x2_t __reint_198 = __s2_198; \ -uint64x1_t __reint1_198 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_198, __p3_198)}; \ - __ret_198 = vcmla_rot270_f32(__s0_198, __s1_198, *(float32x2_t *) 
&__reint1_198); \ + __ret_198 = vcmla_rot270_f32(__s0_198, __s1_198, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_198), __p3_198)})); \ __ret_198; \ }) #else @@ -36871,9 +36771,7 @@ uint64x1_t __reint1_198 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x2_t __rev0_199; __rev0_199 = __builtin_shufflevector(__s0_199, __s0_199, 1, 0); \ float32x2_t __rev1_199; __rev1_199 = __builtin_shufflevector(__s1_199, __s1_199, 1, 0); \ float32x2_t __rev2_199; __rev2_199 = __builtin_shufflevector(__s2_199, __s2_199, 1, 0); \ -float32x2_t __reint_199 = __rev2_199; \ -uint64x1_t __reint1_199 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_199, __p3_199)}; \ - __ret_199 = __noswap_vcmla_rot270_f32(__rev0_199, __rev1_199, *(float32x2_t *) &__reint1_199); \ + __ret_199 = __noswap_vcmla_rot270_f32(__rev0_199, __rev1_199, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_199), __p3_199)})); \ __ret_199 = __builtin_shufflevector(__ret_199, __ret_199, 1, 0); \ __ret_199; \ }) @@ -36885,9 +36783,7 @@ uint64x1_t __reint1_199 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_1 float32x4_t __s0_200 = __p0_200; \ float32x4_t __s1_200 = __p1_200; \ float32x2_t __s2_200 = __p2_200; \ -float32x2_t __reint_200 = __s2_200; \ -uint64x2_t __reint1_200 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_200, __p3_200), vget_lane_u64(*(uint64x1_t *) &__reint_200, __p3_200)}; \ - __ret_200 = vcmlaq_rot270_f32(__s0_200, __s1_200, *(float32x4_t *) &__reint1_200); \ + __ret_200 = vcmlaq_rot270_f32(__s0_200, __s1_200, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_200), __p3_200), vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_200), __p3_200)})); \ __ret_200; \ }) #else @@ -36899,9 +36795,7 @@ uint64x2_t __reint1_200 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_2 float32x4_t __rev0_201; __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \ float32x4_t __rev1_201; __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \ float32x2_t __rev2_201; __rev2_201 = __builtin_shufflevector(__s2_201, __s2_201, 1, 0); \ -float32x2_t __reint_201 = __rev2_201; \ -uint64x2_t __reint1_201 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_201, __p3_201), vget_lane_u64(*(uint64x1_t *) &__reint_201, __p3_201)}; \ - __ret_201 = __noswap_vcmlaq_rot270_f32(__rev0_201, __rev1_201, *(float32x4_t *) &__reint1_201); \ + __ret_201 = __noswap_vcmlaq_rot270_f32(__rev0_201, __rev1_201, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_201), __p3_201), vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_201), __p3_201)})); \ __ret_201 = __builtin_shufflevector(__ret_201, __ret_201, 3, 2, 1, 0); \ __ret_201; \ }) @@ -36913,9 +36807,7 @@ uint64x2_t __reint1_201 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_2 float32x2_t __s0_202 = __p0_202; \ float32x2_t __s1_202 = __p1_202; \ float32x4_t __s2_202 = __p2_202; \ -float32x4_t __reint_202 = __s2_202; \ -uint64x1_t __reint1_202 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_202, __p3_202)}; \ - __ret_202 = vcmla_rot270_f32(__s0_202, __s1_202, *(float32x2_t *) &__reint1_202); \ + __ret_202 = vcmla_rot270_f32(__s0_202, __s1_202, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_202), __p3_202)})); \ __ret_202; \ }) #else @@ -36927,9 +36819,7 @@ uint64x1_t __reint1_202 = (uint64x1_t) 
{vgetq_lane_u64(*(uint64x2_t *) &__reint_ float32x2_t __rev0_203; __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \ float32x2_t __rev1_203; __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \ float32x4_t __rev2_203; __rev2_203 = __builtin_shufflevector(__s2_203, __s2_203, 3, 2, 1, 0); \ -float32x4_t __reint_203 = __rev2_203; \ -uint64x1_t __reint1_203 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_203, __p3_203)}; \ - __ret_203 = __noswap_vcmla_rot270_f32(__rev0_203, __rev1_203, *(float32x2_t *) &__reint1_203); \ + __ret_203 = __noswap_vcmla_rot270_f32(__rev0_203, __rev1_203, __builtin_bit_cast(float32x2_t, (uint64x1_t) {__noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_203), __p3_203)})); \ __ret_203 = __builtin_shufflevector(__ret_203, __ret_203, 1, 0); \ __ret_203; \ }) @@ -36941,9 +36831,7 @@ uint64x1_t __reint1_203 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) float32x4_t __s0_204 = __p0_204; \ float32x4_t __s1_204 = __p1_204; \ float32x4_t __s2_204 = __p2_204; \ -float32x4_t __reint_204 = __s2_204; \ -uint64x2_t __reint1_204 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_204, __p3_204), vgetq_lane_u64(*(uint64x2_t *) &__reint_204, __p3_204)}; \ - __ret_204 = vcmlaq_rot270_f32(__s0_204, __s1_204, *(float32x4_t *) &__reint1_204); \ + __ret_204 = vcmlaq_rot270_f32(__s0_204, __s1_204, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_204), __p3_204), vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_204), __p3_204)})); \ __ret_204; \ }) #else @@ -36955,9 +36843,7 @@ uint64x2_t __reint1_204 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_ float32x4_t __rev0_205; __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 3, 2, 1, 0); \ float32x4_t __rev1_205; __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 3, 2, 1, 0); \ float32x4_t __rev2_205; __rev2_205 = __builtin_shufflevector(__s2_205, __s2_205, 3, 2, 1, 0); \ -float32x4_t __reint_205 = __rev2_205; \ -uint64x2_t __reint1_205 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_205, __p3_205), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_205, __p3_205)}; \ - __ret_205 = __noswap_vcmlaq_rot270_f32(__rev0_205, __rev1_205, *(float32x4_t *) &__reint1_205); \ + __ret_205 = __noswap_vcmlaq_rot270_f32(__rev0_205, __rev1_205, __builtin_bit_cast(float32x4_t, (uint64x2_t) {__noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_205), __p3_205), __noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_205), __p3_205)})); \ __ret_205 = __builtin_shufflevector(__ret_205, __ret_205, 3, 2, 1, 0); \ __ret_205; \ }) @@ -37015,9 +36901,7 @@ __ai __attribute__((target("v8.3a,neon"))) float32x2_t __noswap_vcmla_rot90_f32( float32x2_t __s0_206 = __p0_206; \ float32x2_t __s1_206 = __p1_206; \ float32x2_t __s2_206 = __p2_206; \ -float32x2_t __reint_206 = __s2_206; \ -uint64x1_t __reint1_206 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_206, __p3_206)}; \ - __ret_206 = vcmla_rot90_f32(__s0_206, __s1_206, *(float32x2_t *) &__reint1_206); \ + __ret_206 = vcmla_rot90_f32(__s0_206, __s1_206, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_206), __p3_206)})); \ __ret_206; \ }) #else @@ -37029,9 +36913,7 @@ uint64x1_t __reint1_206 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_2 float32x2_t __rev0_207; __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 1, 0); \ float32x2_t __rev1_207; __rev1_207 = 
__builtin_shufflevector(__s1_207, __s1_207, 1, 0); \ float32x2_t __rev2_207; __rev2_207 = __builtin_shufflevector(__s2_207, __s2_207, 1, 0); \ -float32x2_t __reint_207 = __rev2_207; \ -uint64x1_t __reint1_207 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_207, __p3_207)}; \ - __ret_207 = __noswap_vcmla_rot90_f32(__rev0_207, __rev1_207, *(float32x2_t *) &__reint1_207); \ + __ret_207 = __noswap_vcmla_rot90_f32(__rev0_207, __rev1_207, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_207), __p3_207)})); \ __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 1, 0); \ __ret_207; \ }) @@ -37043,9 +36925,7 @@ uint64x1_t __reint1_207 = (uint64x1_t) {vget_lane_u64(*(uint64x1_t *) &__reint_2 float32x4_t __s0_208 = __p0_208; \ float32x4_t __s1_208 = __p1_208; \ float32x2_t __s2_208 = __p2_208; \ -float32x2_t __reint_208 = __s2_208; \ -uint64x2_t __reint1_208 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_208, __p3_208), vget_lane_u64(*(uint64x1_t *) &__reint_208, __p3_208)}; \ - __ret_208 = vcmlaq_rot90_f32(__s0_208, __s1_208, *(float32x4_t *) &__reint1_208); \ + __ret_208 = vcmlaq_rot90_f32(__s0_208, __s1_208, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_208), __p3_208), vget_lane_u64(__builtin_bit_cast(uint64x1_t, __s2_208), __p3_208)})); \ __ret_208; \ }) #else @@ -37057,9 +36937,7 @@ uint64x2_t __reint1_208 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_2 float32x4_t __rev0_209; __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 3, 2, 1, 0); \ float32x4_t __rev1_209; __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 3, 2, 1, 0); \ float32x2_t __rev2_209; __rev2_209 = __builtin_shufflevector(__s2_209, __s2_209, 1, 0); \ -float32x2_t __reint_209 = __rev2_209; \ -uint64x2_t __reint1_209 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_209, __p3_209), vget_lane_u64(*(uint64x1_t *) &__reint_209, __p3_209)}; \ - __ret_209 = __noswap_vcmlaq_rot90_f32(__rev0_209, __rev1_209, *(float32x4_t *) &__reint1_209); \ + __ret_209 = __noswap_vcmlaq_rot90_f32(__rev0_209, __rev1_209, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_209), __p3_209), vget_lane_u64(__builtin_bit_cast(uint64x1_t, __rev2_209), __p3_209)})); \ __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \ __ret_209; \ }) @@ -37071,9 +36949,7 @@ uint64x2_t __reint1_209 = (uint64x2_t) {vget_lane_u64(*(uint64x1_t *) &__reint_2 float32x2_t __s0_210 = __p0_210; \ float32x2_t __s1_210 = __p1_210; \ float32x4_t __s2_210 = __p2_210; \ -float32x4_t __reint_210 = __s2_210; \ -uint64x1_t __reint1_210 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_210, __p3_210)}; \ - __ret_210 = vcmla_rot90_f32(__s0_210, __s1_210, *(float32x2_t *) &__reint1_210); \ + __ret_210 = vcmla_rot90_f32(__s0_210, __s1_210, __builtin_bit_cast(float32x2_t, (uint64x1_t) {vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_210), __p3_210)})); \ __ret_210; \ }) #else @@ -37085,9 +36961,7 @@ uint64x1_t __reint1_210 = (uint64x1_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_ float32x2_t __rev0_211; __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 1, 0); \ float32x2_t __rev1_211; __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 1, 0); \ float32x4_t __rev2_211; __rev2_211 = __builtin_shufflevector(__s2_211, __s2_211, 3, 2, 1, 0); \ -float32x4_t __reint_211 = __rev2_211; \ -uint64x1_t __reint1_211 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) 
&__reint_211, __p3_211)}; \ - __ret_211 = __noswap_vcmla_rot90_f32(__rev0_211, __rev1_211, *(float32x2_t *) &__reint1_211); \ + __ret_211 = __noswap_vcmla_rot90_f32(__rev0_211, __rev1_211, __builtin_bit_cast(float32x2_t, (uint64x1_t) {__noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_211), __p3_211)})); \ __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 1, 0); \ __ret_211; \ }) @@ -37099,9 +36973,7 @@ uint64x1_t __reint1_211 = (uint64x1_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) float32x4_t __s0_212 = __p0_212; \ float32x4_t __s1_212 = __p1_212; \ float32x4_t __s2_212 = __p2_212; \ -float32x4_t __reint_212 = __s2_212; \ -uint64x2_t __reint1_212 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_212, __p3_212), vgetq_lane_u64(*(uint64x2_t *) &__reint_212, __p3_212)}; \ - __ret_212 = vcmlaq_rot90_f32(__s0_212, __s1_212, *(float32x4_t *) &__reint1_212); \ + __ret_212 = vcmlaq_rot90_f32(__s0_212, __s1_212, __builtin_bit_cast(float32x4_t, (uint64x2_t) {vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_212), __p3_212), vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __s2_212), __p3_212)})); \ __ret_212; \ }) #else @@ -37113,9 +36985,7 @@ uint64x2_t __reint1_212 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_ float32x4_t __rev0_213; __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \ float32x4_t __rev1_213; __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \ float32x4_t __rev2_213; __rev2_213 = __builtin_shufflevector(__s2_213, __s2_213, 3, 2, 1, 0); \ -float32x4_t __reint_213 = __rev2_213; \ -uint64x2_t __reint1_213 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_213, __p3_213), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_213, __p3_213)}; \ - __ret_213 = __noswap_vcmlaq_rot90_f32(__rev0_213, __rev1_213, *(float32x4_t *) &__reint1_213); \ + __ret_213 = __noswap_vcmlaq_rot90_f32(__rev0_213, __rev1_213, __builtin_bit_cast(float32x4_t, (uint64x2_t) {__noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_213), __p3_213), __noswap_vgetq_lane_u64(__builtin_bit_cast(uint64x2_t, __rev2_213), __p3_213)})); \ __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 3, 2, 1, 0); \ __ret_213; \ }) @@ -41248,6 +41118,2252 @@ __ai __attribute__((target("neon"))) float32x2_t vfms_f32(float32x2_t __p0, floa } #endif +#endif +#if defined(__aarch64__) +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt1_bf16_mf8_fpm(mfloat8x8_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + __ret = (bfloat16x8_t) __builtin_neon_vcvt1_bf16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt1_bf16_mf8_fpm(mfloat8x8_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + mfloat8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (bfloat16x8_t) __builtin_neon_vcvt1_bf16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt1_f16_mf8_fpm(mfloat8x8_t __p0, fpm_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvt1_f16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt1_f16_mf8_fpm(mfloat8x8_t __p0, fpm_t __p1) { + float16x8_t __ret; + mfloat8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vcvt1_f16_mf8_fpm(__rev0, 
__p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt1_high_bf16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + __ret = (bfloat16x8_t) __builtin_neon_vcvt1_high_bf16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt1_high_bf16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + mfloat8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (bfloat16x8_t) __builtin_neon_vcvt1_high_bf16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt1_high_f16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvt1_high_f16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt1_high_f16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + float16x8_t __ret; + mfloat8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vcvt1_high_f16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt1_low_bf16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + __ret = (bfloat16x8_t) __builtin_neon_vcvt1_low_bf16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt1_low_bf16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + mfloat8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (bfloat16x8_t) __builtin_neon_vcvt1_low_bf16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt1_low_f16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvt1_low_f16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt1_low_f16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + float16x8_t __ret; + mfloat8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vcvt1_low_f16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt2_bf16_mf8_fpm(mfloat8x8_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + __ret = (bfloat16x8_t) __builtin_neon_vcvt2_bf16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt2_bf16_mf8_fpm(mfloat8x8_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + mfloat8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (bfloat16x8_t) __builtin_neon_vcvt2_bf16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef 
__LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt2_f16_mf8_fpm(mfloat8x8_t __p0, fpm_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvt2_f16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt2_f16_mf8_fpm(mfloat8x8_t __p0, fpm_t __p1) { + float16x8_t __ret; + mfloat8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vcvt2_f16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt2_high_bf16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + __ret = (bfloat16x8_t) __builtin_neon_vcvt2_high_bf16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt2_high_bf16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + mfloat8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (bfloat16x8_t) __builtin_neon_vcvt2_high_bf16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt2_high_f16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvt2_high_f16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt2_high_f16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + float16x8_t __ret; + mfloat8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vcvt2_high_f16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt2_low_bf16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + __ret = (bfloat16x8_t) __builtin_neon_vcvt2_low_bf16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) bfloat16x8_t vcvt2_low_bf16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + bfloat16x8_t __ret; + mfloat8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (bfloat16x8_t) __builtin_neon_vcvt2_low_bf16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt2_low_f16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvt2_low_f16_mf8_fpm(__p0, __p1); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float16x8_t vcvt2_low_f16_mf8_fpm(mfloat8x16_t __p0, fpm_t __p1) { + float16x8_t __ret; + mfloat8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vcvt2_low_f16_mf8_fpm(__rev0, __p1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) mfloat8x16_t vcvt_high_mf8_f32_fpm(mfloat8x8_t __p0, float32x4_t __p1, 
float32x4_t __p2, fpm_t __p3) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t) __builtin_neon_vcvt_high_mf8_f32_fpm(__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) mfloat8x16_t vcvt_high_mf8_f32_fpm(mfloat8x8_t __p0, float32x4_t __p1, float32x4_t __p2, fpm_t __p3) { + mfloat8x16_t __ret; + mfloat8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + __ret = (mfloat8x16_t) __builtin_neon_vcvt_high_mf8_f32_fpm(__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) mfloat8x16_t vcvtq_mf8_f16_fpm(float16x8_t __p0, float16x8_t __p1, fpm_t __p2) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t) __builtin_neon_vcvtq_mf8_f16_fpm((int8x16_t)__p0, (int8x16_t)__p1, __p2); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) mfloat8x16_t vcvtq_mf8_f16_fpm(float16x8_t __p0, float16x8_t __p1, fpm_t __p2) { + mfloat8x16_t __ret; + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (mfloat8x16_t) __builtin_neon_vcvtq_mf8_f16_fpm((int8x16_t)__rev0, (int8x16_t)__rev1, __p2); + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) mfloat8x8_t vcvt_mf8_f16_fpm(float16x4_t __p0, float16x4_t __p1, fpm_t __p2) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t) __builtin_neon_vcvt_mf8_f16_fpm((int8x8_t)__p0, (int8x8_t)__p1, __p2); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) mfloat8x8_t vcvt_mf8_f16_fpm(float16x4_t __p0, float16x4_t __p1, fpm_t __p2) { + mfloat8x8_t __ret; + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + __ret = (mfloat8x8_t) __builtin_neon_vcvt_mf8_f16_fpm((int8x8_t)__rev0, (int8x8_t)__rev1, __p2); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) mfloat8x8_t vcvt_mf8_f32_fpm(float32x4_t __p0, float32x4_t __p1, fpm_t __p2) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t) __builtin_neon_vcvt_mf8_f32_fpm(__p0, __p1, __p2); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) mfloat8x8_t vcvt_mf8_f32_fpm(float32x4_t __p0, float32x4_t __p1, fpm_t __p2) { + mfloat8x8_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + __ret = (mfloat8x8_t) __builtin_neon_vcvt_mf8_f32_fpm(__rev0, __rev1, __p2); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float32x2_t vscale_f32(float32x2_t __p0, int32x2_t __p1) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vscale_f32((int8x8_t)__p0, (int8x8_t)__p1, 9); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float32x2_t vscale_f32(float32x2_t 
__p0, int32x2_t __p1) { + float32x2_t __ret; + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + int32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + __ret = (float32x2_t) __builtin_neon_vscale_f32((int8x8_t)__rev0, (int8x8_t)__rev1, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float16x4_t vscale_f16(float16x4_t __p0, int16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vscale_f16((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float16x4_t vscale_f16(float16x4_t __p0, int16x4_t __p1) { + float16x4_t __ret; + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + __ret = (float16x4_t) __builtin_neon_vscale_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float64x2_t vscaleq_f64(float64x2_t __p0, int64x2_t __p1) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vscaleq_f64((int8x16_t)__p0, (int8x16_t)__p1, 42); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float64x2_t vscaleq_f64(float64x2_t __p0, int64x2_t __p1) { + float64x2_t __ret; + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + int64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + __ret = (float64x2_t) __builtin_neon_vscaleq_f64((int8x16_t)__rev0, (int8x16_t)__rev1, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float32x4_t vscaleq_f32(float32x4_t __p0, int32x4_t __p1) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vscaleq_f32((int8x16_t)__p0, (int8x16_t)__p1, 41); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float32x4_t vscaleq_f32(float32x4_t __p0, int32x4_t __p1) { + float32x4_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + __ret = (float32x4_t) __builtin_neon_vscaleq_f32((int8x16_t)__rev0, (int8x16_t)__rev1, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8,neon"))) float16x8_t vscaleq_f16(float16x8_t __p0, int16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vscaleq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai __attribute__((target("fp8,neon"))) float16x8_t vscaleq_f16(float16x8_t __p0, int16x8_t __p1) { + float16x8_t __ret; + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vscaleq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8dot2,neon"))) float16x8_t vdotq_f16_mf8_fpm(float16x8_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float16x8_t __ret; + __ret = (float16x8_t) 
__builtin_neon_vdotq_f16_mf8_fpm((int8x16_t)__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8dot2,neon"))) float16x8_t vdotq_f16_mf8_fpm(float16x8_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float16x8_t __ret; + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vdotq_f16_mf8_fpm((int8x16_t)__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8dot2,neon"))) float16x4_t vdot_f16_mf8_fpm(float16x4_t __p0, mfloat8x8_t __p1, mfloat8x8_t __p2, fpm_t __p3) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vdot_f16_mf8_fpm((int8x8_t)__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8dot2,neon"))) float16x4_t vdot_f16_mf8_fpm(float16x4_t __p0, mfloat8x8_t __p1, mfloat8x8_t __p2, fpm_t __p3) { + float16x4_t __ret; + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + mfloat8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x4_t) __builtin_neon_vdot_f16_mf8_fpm((int8x8_t)__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdotq_lane_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float16x8_t) __builtin_neon_vdotq_lane_f16_mf8_fpm((int8x16_t)__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vdotq_lane_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vdotq_lane_f16_mf8_fpm((int8x16_t)__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdot_lane_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x4_t __ret; \ + float16x4_t __s0 = __p0; \ + mfloat8x8_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float16x4_t) __builtin_neon_vdot_lane_f16_mf8_fpm((int8x8_t)__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vdot_lane_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x4_t __ret; \ + float16x4_t __s0 = __p0; \ + mfloat8x8_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, 
__s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x4_t) __builtin_neon_vdot_lane_f16_mf8_fpm((int8x8_t)__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdotq_laneq_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float16x8_t) __builtin_neon_vdotq_laneq_f16_mf8_fpm((int8x16_t)__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vdotq_laneq_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vdotq_laneq_f16_mf8_fpm((int8x16_t)__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdot_laneq_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x4_t __ret; \ + float16x4_t __s0 = __p0; \ + mfloat8x8_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float16x4_t) __builtin_neon_vdot_laneq_f16_mf8_fpm((int8x8_t)__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vdot_laneq_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x4_t __ret; \ + float16x4_t __s0 = __p0; \ + mfloat8x8_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x4_t) __builtin_neon_vdot_laneq_f16_mf8_fpm((int8x8_t)__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8dot4,neon"))) float32x4_t vdotq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vdotq_f32_mf8_fpm(__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8dot4,neon"))) float32x4_t vdotq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float32x4_t) __builtin_neon_vdotq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ 
+__ai __attribute__((target("fp8dot4,neon"))) float32x2_t vdot_f32_mf8_fpm(float32x2_t __p0, mfloat8x8_t __p1, mfloat8x8_t __p2, fpm_t __p3) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vdot_f32_mf8_fpm(__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8dot4,neon"))) float32x2_t vdot_f32_mf8_fpm(float32x2_t __p0, mfloat8x8_t __p1, mfloat8x8_t __p2, fpm_t __p3) { + float32x2_t __ret; + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + mfloat8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float32x2_t) __builtin_neon_vdot_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdotq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vdotq_lane_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vdotq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vdotq_lane_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdot_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x2_t __ret; \ + float32x2_t __s0 = __p0; \ + mfloat8x8_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x2_t) __builtin_neon_vdot_lane_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vdot_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x2_t __ret; \ + float32x2_t __s0 = __p0; \ + mfloat8x8_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \ + mfloat8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x2_t) __builtin_neon_vdot_lane_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdotq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vdotq_laneq_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vdotq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = 
__builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vdotq_laneq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdot_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x2_t __ret; \ + float32x2_t __s0 = __p0; \ + mfloat8x8_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x2_t) __builtin_neon_vdot_laneq_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vdot_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x2_t __ret; \ + float32x2_t __s0 = __p0; \ + mfloat8x8_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \ + mfloat8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x2_t) __builtin_neon_vdot_laneq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8fma,neon"))) float16x8_t vmlalbq_f16_mf8_fpm(float16x8_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmlalbq_f16_mf8_fpm((int8x16_t)__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8fma,neon"))) float16x8_t vmlalbq_f16_mf8_fpm(float16x8_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float16x8_t __ret; + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vmlalbq_f16_mf8_fpm((int8x16_t)__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlalbq_lane_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float16x8_t) __builtin_neon_vmlalbq_lane_f16_mf8_fpm((int8x16_t)__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlalbq_lane_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vmlalbq_lane_f16_mf8_fpm((int8x16_t)__rev0, 
__rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlalbq_laneq_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float16x8_t) __builtin_neon_vmlalbq_laneq_f16_mf8_fpm((int8x16_t)__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlalbq_laneq_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vmlalbq_laneq_f16_mf8_fpm((int8x16_t)__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8fma,neon"))) float32x4_t vmlallbbq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vmlallbbq_f32_mf8_fpm(__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8fma,neon"))) float32x4_t vmlallbbq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float32x4_t) __builtin_neon_vmlallbbq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlallbbq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vmlallbbq_lane_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlallbbq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vmlallbbq_lane_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlallbbq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + 
mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vmlallbbq_laneq_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlallbbq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vmlallbbq_laneq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8fma,neon"))) float32x4_t vmlallbtq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vmlallbtq_f32_mf8_fpm(__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8fma,neon"))) float32x4_t vmlallbtq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float32x4_t) __builtin_neon_vmlallbtq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlallbtq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vmlallbtq_lane_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlallbtq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vmlallbtq_lane_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlallbtq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vmlallbtq_laneq_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlallbtq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + 
fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vmlallbtq_laneq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8fma,neon"))) float32x4_t vmlalltbq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vmlalltbq_f32_mf8_fpm(__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8fma,neon"))) float32x4_t vmlalltbq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float32x4_t) __builtin_neon_vmlalltbq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlalltbq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vmlalltbq_lane_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlalltbq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vmlalltbq_lane_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlalltbq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vmlalltbq_laneq_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlalltbq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = 
(float32x4_t) __builtin_neon_vmlalltbq_laneq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8fma,neon"))) float32x4_t vmlallttq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vmlallttq_f32_mf8_fpm(__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8fma,neon"))) float32x4_t vmlallttq_f32_mf8_fpm(float32x4_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float32x4_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float32x4_t) __builtin_neon_vmlallttq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlallttq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vmlallttq_lane_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlallttq_lane_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vmlallttq_lane_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlallttq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float32x4_t) __builtin_neon_vmlallttq_laneq_f32_mf8_fpm(__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlallttq_laneq_f32_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float32x4_t __ret; \ + float32x4_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float32x4_t) __builtin_neon_vmlallttq_laneq_f32_mf8_fpm(__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("fp8fma,neon"))) float16x8_t vmlaltq_f16_mf8_fpm(float16x8_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float16x8_t 
__ret; + __ret = (float16x8_t) __builtin_neon_vmlaltq_f16_mf8_fpm((int8x16_t)__p0, __p1, __p2, __p3); + return __ret; +} +#else +__ai __attribute__((target("fp8fma,neon"))) float16x8_t vmlaltq_f16_mf8_fpm(float16x8_t __p0, mfloat8x16_t __p1, mfloat8x16_t __p2, fpm_t __p3) { + float16x8_t __ret; + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vmlaltq_f16_mf8_fpm((int8x16_t)__rev0, __rev1, __rev2, __p3); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlaltq_lane_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float16x8_t) __builtin_neon_vmlaltq_lane_f16_mf8_fpm((int8x16_t)__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlaltq_lane_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x8_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vmlaltq_lane_f16_mf8_fpm((int8x16_t)__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmlaltq_laneq_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + __ret = (float16x8_t) __builtin_neon_vmlaltq_laneq_f16_mf8_fpm((int8x16_t)__s0, __s1, __s2, __p3, __s4); \ + __ret; \ +}) +#else +#define vmlaltq_laneq_f16_mf8_fpm(__p0, __p1, __p2, __p3, __p4) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + mfloat8x16_t __s1 = __p1; \ + mfloat8x16_t __s2 = __p2; \ + fpm_t __s4 = __p4; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + mfloat8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vmlaltq_laneq_f16_mf8_fpm((int8x16_t)__rev0, __rev1, __rev2, __p3, __s4); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_lane_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (poly8x16_t) __builtin_neon_vluti2_lane_p8((int8x8_t)__s0, (int8x8_t)__s1, __p2, 36); \ + __ret; \ +}) +#else +#define vluti2_lane_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + poly8x8_t __rev0; __rev0 
= __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly8x16_t) __builtin_neon_vluti2_lane_p8((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 36); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (poly8x16_t) __builtin_neon_vluti2q_lane_p8((int8x16_t)__s0, (int8x8_t)__s1, __p2, 36); \ + __ret; \ +}) +#else +#define vluti2q_lane_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + poly8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly8x16_t) __builtin_neon_vluti2q_lane_p8((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 36); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (uint8x16_t) __builtin_neon_vluti2q_lane_u8((int8x16_t)__s0, (int8x8_t)__s1, __p2, 48); \ + __ret; \ +}) +#else +#define vluti2q_lane_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint8x16_t) __builtin_neon_vluti2q_lane_u8((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 48); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (int8x16_t) __builtin_neon_vluti2q_lane_s8((int8x16_t)__s0, (int8x8_t)__s1, __p2, 32); \ + __ret; \ +}) +#else +#define vluti2q_lane_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + int8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int8x16_t) __builtin_neon_vluti2q_lane_s8((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 32); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_lane_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (uint8x16_t) __builtin_neon_vluti2_lane_u8((int8x8_t)__s0, (int8x8_t)__s1, __p2, 48); \ + __ret; \ +}) +#else +#define vluti2_lane_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint8x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = 
__builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint8x16_t) __builtin_neon_vluti2_lane_u8((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 48); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_lane_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (int8x16_t) __builtin_neon_vluti2_lane_s8((int8x8_t)__s0, (int8x8_t)__s1, __p2, 32); \ + __ret; \ +}) +#else +#define vluti2_lane_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + int8x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int8x16_t) __builtin_neon_vluti2_lane_s8((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 32); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_lane_p16(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (poly16x8_t) __builtin_neon_vluti2_lane_p16((int8x8_t)__s0, (int8x8_t)__s1, __p2, 37); \ + __ret; \ +}) +#else +#define vluti2_lane_p16(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + poly16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly16x8_t) __builtin_neon_vluti2_lane_p16((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 37); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (poly16x8_t) __builtin_neon_vluti2q_lane_p16((int8x16_t)__s0, (int8x8_t)__s1, __p2, 37); \ + __ret; \ +}) +#else +#define vluti2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + poly16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly16x8_t) __builtin_neon_vluti2q_lane_p16((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 37); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (uint16x8_t) __builtin_neon_vluti2q_lane_u16((int8x16_t)__s0, (int8x8_t)__s1, __p2, 49); \ + __ret; \ +}) +#else +#define vluti2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint16x8_t) __builtin_neon_vluti2q_lane_u16((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) 
+#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (float16x8_t) __builtin_neon_vluti2q_lane_f16((int8x16_t)__s0, (int8x8_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vluti2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vluti2q_lane_f16((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (int16x8_t) __builtin_neon_vluti2q_lane_s16((int8x16_t)__s0, (int8x8_t)__s1, __p2, 33); \ + __ret; \ +}) +#else +#define vluti2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + int16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int16x8_t) __builtin_neon_vluti2q_lane_s16((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_lane_u16(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (uint16x8_t) __builtin_neon_vluti2_lane_u16((int8x8_t)__s0, (int8x8_t)__s1, __p2, 49); \ + __ret; \ +}) +#else +#define vluti2_lane_u16(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint16x8_t) __builtin_neon_vluti2_lane_u16((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (float16x8_t) __builtin_neon_vluti2_lane_f16((int8x8_t)__s0, (int8x8_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vluti2_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vluti2_lane_f16((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_lane_s16(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (int16x8_t) __builtin_neon_vluti2_lane_s16((int8x8_t)__s0, (int8x8_t)__s1, __p2, 33); \ + __ret; \ +}) +#else +#define 
vluti2_lane_s16(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + int16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int16x8_t) __builtin_neon_vluti2_lane_s16((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_laneq_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (poly8x16_t) __builtin_neon_vluti2_laneq_p8((int8x8_t)__s0, (int8x16_t)__s1, __p2, 36); \ + __ret; \ +}) +#else +#define vluti2_laneq_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + poly8x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly8x16_t) __builtin_neon_vluti2_laneq_p8((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 36); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_laneq_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (poly8x16_t) __builtin_neon_vluti2q_laneq_p8((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \ + __ret; \ +}) +#else +#define vluti2q_laneq_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + poly8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly8x16_t) __builtin_neon_vluti2q_laneq_p8((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_laneq_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (uint8x16_t) __builtin_neon_vluti2q_laneq_u8((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \ + __ret; \ +}) +#else +#define vluti2q_laneq_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint8x16_t) __builtin_neon_vluti2q_laneq_u8((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_laneq_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (int8x16_t) __builtin_neon_vluti2q_laneq_s8((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \ + __ret; \ +}) +#else +#define vluti2q_laneq_s8(__p0, __p1, __p2) __extension__ ({ 
\ + int8x16_t __ret; \ + int8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + int8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int8x16_t) __builtin_neon_vluti2q_laneq_s8((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_laneq_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (uint8x16_t) __builtin_neon_vluti2_laneq_u8((int8x8_t)__s0, (int8x16_t)__s1, __p2, 48); \ + __ret; \ +}) +#else +#define vluti2_laneq_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint8x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint8x16_t) __builtin_neon_vluti2_laneq_u8((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 48); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_laneq_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (int8x16_t) __builtin_neon_vluti2_laneq_s8((int8x8_t)__s0, (int8x16_t)__s1, __p2, 32); \ + __ret; \ +}) +#else +#define vluti2_laneq_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + int8x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int8x16_t) __builtin_neon_vluti2_laneq_s8((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 32); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_laneq_p16(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (poly16x8_t) __builtin_neon_vluti2_laneq_p16((int8x8_t)__s0, (int8x16_t)__s1, __p2, 37); \ + __ret; \ +}) +#else +#define vluti2_laneq_p16(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + poly16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly16x8_t) __builtin_neon_vluti2_laneq_p16((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 37); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_laneq_p16(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (poly16x8_t) __builtin_neon_vluti2q_laneq_p16((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \ + __ret; \ +}) +#else +#define vluti2q_laneq_p16(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x8_t __s0 = __p0; \ + 
uint8x16_t __s1 = __p1; \ + poly16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly16x8_t) __builtin_neon_vluti2q_laneq_p16((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 37); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_laneq_u16(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (uint16x8_t) __builtin_neon_vluti2q_laneq_u16((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \ + __ret; \ +}) +#else +#define vluti2q_laneq_u16(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint16x8_t) __builtin_neon_vluti2q_laneq_u16((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (float16x8_t) __builtin_neon_vluti2q_laneq_f16((int8x16_t)__s0, (int8x16_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vluti2q_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vluti2q_laneq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_laneq_s16(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (int16x8_t) __builtin_neon_vluti2q_laneq_s16((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \ + __ret; \ +}) +#else +#define vluti2q_laneq_s16(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + int16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int16x8_t) __builtin_neon_vluti2q_laneq_s16((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_laneq_u16(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (uint16x8_t) __builtin_neon_vluti2_laneq_u16((int8x8_t)__s0, (int8x16_t)__s1, __p2, 49); \ + __ret; \ +}) +#else +#define vluti2_laneq_u16(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x16_t 
__rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint16x8_t) __builtin_neon_vluti2_laneq_u16((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (float16x8_t) __builtin_neon_vluti2_laneq_f16((int8x8_t)__s0, (int8x16_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vluti2_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vluti2_laneq_f16((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_laneq_s16(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (int16x8_t) __builtin_neon_vluti2_laneq_s16((int8x8_t)__s0, (int8x16_t)__s1, __p2, 33); \ + __ret; \ +}) +#else +#define vluti2_laneq_s16(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + int16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int16x8_t) __builtin_neon_vluti2_laneq_s16((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (poly8x16_t) __builtin_neon_vluti4q_lane_p8((int8x16_t)__s0, (int8x8_t)__s1, __p2, 36); \ + __ret; \ +}) +#else +#define vluti4q_lane_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + poly8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly8x16_t) __builtin_neon_vluti4q_lane_p8((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 36); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (uint8x16_t) __builtin_neon_vluti4q_lane_u8((int8x16_t)__s0, (int8x8_t)__s1, __p2, 48); \ + __ret; \ +}) +#else +#define vluti4q_lane_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint8x16_t) 
__builtin_neon_vluti4q_lane_u8((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 48); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (int8x16_t) __builtin_neon_vluti4q_lane_s8((int8x16_t)__s0, (int8x8_t)__s1, __p2, 32); \ + __ret; \ +}) +#else +#define vluti4q_lane_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x16_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + int8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int8x16_t) __builtin_neon_vluti4q_lane_s8((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 32); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_lane_p16_x2(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (poly16x8_t) __builtin_neon_vluti4q_lane_p16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x8_t)__s1, __p2, 37); \ + __ret; \ +}) +#else +#define vluti4q_lane_p16_x2(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + poly16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly16x8_t) __builtin_neon_vluti4q_lane_p16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, __p2, 37); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_lane_u16_x2(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (uint16x8_t) __builtin_neon_vluti4q_lane_u16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x8_t)__s1, __p2, 49); \ + __ret; \ +}) +#else +#define vluti4q_lane_u16_x2(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint16x8_t) __builtin_neon_vluti4q_lane_u16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, __p2, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_lane_f16_x2(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (float16x8_t) __builtin_neon_vluti4q_lane_f16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x8_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vluti4q_lane_f16_x2(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + 
float16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + float16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vluti4q_lane_f16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_lane_s16_x2(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (int16x8_t) __builtin_neon_vluti4q_lane_s16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x8_t)__s1, __p2, 33); \ + __ret; \ +}) +#else +#define vluti4q_lane_s16_x2(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + int16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int16x8_t) __builtin_neon_vluti4q_lane_s16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, __p2, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_laneq_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (poly8x16_t) __builtin_neon_vluti4q_laneq_p8((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \ + __ret; \ +}) +#else +#define vluti4q_laneq_p8(__p0, __p1, __p2) __extension__ ({ \ + poly8x16_t __ret; \ + poly8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + poly8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly8x16_t) __builtin_neon_vluti4q_laneq_p8((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 36); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_laneq_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (uint8x16_t) __builtin_neon_vluti4q_laneq_u8((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \ + __ret; \ +}) +#else +#define vluti4q_laneq_u8(__p0, __p1, __p2) __extension__ ({ \ + uint8x16_t __ret; \ + uint8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint8x16_t) __builtin_neon_vluti4q_laneq_u8((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_laneq_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t 
__ret; \ + int8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (int8x16_t) __builtin_neon_vluti4q_laneq_s8((int8x16_t)__s0, (int8x16_t)__s1, __p2, 32); \ + __ret; \ +}) +#else +#define vluti4q_laneq_s8(__p0, __p1, __p2) __extension__ ({ \ + int8x16_t __ret; \ + int8x16_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + int8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int8x16_t) __builtin_neon_vluti4q_laneq_s8((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 32); \ + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_laneq_p16_x2(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (poly16x8_t) __builtin_neon_vluti4q_laneq_p16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x16_t)__s1, __p2, 37); \ + __ret; \ +}) +#else +#define vluti4q_laneq_p16_x2(__p0, __p1, __p2) __extension__ ({ \ + poly16x8_t __ret; \ + poly16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + poly16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (poly16x8_t) __builtin_neon_vluti4q_laneq_p16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, __p2, 37); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_laneq_u16_x2(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (uint16x8_t) __builtin_neon_vluti4q_laneq_u16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x16_t)__s1, __p2, 49); \ + __ret; \ +}) +#else +#define vluti4q_laneq_u16_x2(__p0, __p1, __p2) __extension__ ({ \ + uint16x8_t __ret; \ + uint16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (uint16x8_t) __builtin_neon_vluti4q_laneq_u16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, __p2, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_laneq_f16_x2(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (float16x8_t) __builtin_neon_vluti4q_laneq_f16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x16_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vluti4q_laneq_f16_x2(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __ret; \ + float16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + float16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = 
__builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (float16x8_t) __builtin_neon_vluti4q_laneq_f16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_laneq_s16_x2(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (int16x8_t) __builtin_neon_vluti4q_laneq_s16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x16_t)__s1, __p2, 33); \ + __ret; \ +}) +#else +#define vluti4q_laneq_s16_x2(__p0, __p1, __p2) __extension__ ({ \ + int16x8_t __ret; \ + int16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + int16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (int16x8_t) __builtin_neon_vluti4q_laneq_s16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, __p2, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (bfloat16x8_t) __builtin_neon_vluti2q_lane_bf16((int8x16_t)__s0, (int8x8_t)__s1, __p2, 43); \ + __ret; \ +}) +#else +#define vluti2q_lane_bf16(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x8_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + bfloat16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (bfloat16x8_t) __builtin_neon_vluti2q_lane_bf16((int8x16_t)__rev0, (int8x8_t)__rev1, __p2, 43); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_lane_bf16(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (bfloat16x8_t) __builtin_neon_vluti2_lane_bf16((int8x8_t)__s0, (int8x8_t)__s1, __p2, 43); \ + __ret; \ +}) +#else +#define vluti2_lane_bf16(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x4_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + bfloat16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (bfloat16x8_t) __builtin_neon_vluti2_lane_bf16((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 43); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2q_laneq_bf16(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (bfloat16x8_t) __builtin_neon_vluti2q_laneq_bf16((int8x16_t)__s0, (int8x16_t)__s1, __p2, 43); \ + __ret; \ +}) +#else +#define vluti2q_laneq_bf16(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + 
bfloat16x8_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + bfloat16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (bfloat16x8_t) __builtin_neon_vluti2q_laneq_bf16((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 43); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti2_laneq_bf16(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (bfloat16x8_t) __builtin_neon_vluti2_laneq_bf16((int8x8_t)__s0, (int8x16_t)__s1, __p2, 43); \ + __ret; \ +}) +#else +#define vluti2_laneq_bf16(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + bfloat16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (bfloat16x8_t) __builtin_neon_vluti2_laneq_bf16((int8x8_t)__rev0, (int8x16_t)__rev1, __p2, 43); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_lane_bf16_x2(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + __ret = (bfloat16x8_t) __builtin_neon_vluti4q_lane_bf16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x8_t)__s1, __p2, 43); \ + __ret; \ +}) +#else +#define vluti4q_lane_bf16_x2(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x8x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + bfloat16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (bfloat16x8_t) __builtin_neon_vluti4q_lane_bf16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x8_t)__rev1, __p2, 43); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vluti4q_laneq_bf16_x2(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + __ret = (bfloat16x8_t) __builtin_neon_vluti4q_laneq_bf16_x2((int8x16_t)__s0.val[0], (int8x16_t)__s0.val[1], (int8x16_t)__s1, __p2, 43); \ + __ret; \ +}) +#else +#define vluti4q_laneq_bf16_x2(__p0, __p1, __p2) __extension__ ({ \ + bfloat16x8_t __ret; \ + bfloat16x8x2_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + bfloat16x8x2_t __rev0; \ + __rev0.val[0] = __builtin_shufflevector(__s0.val[0], __s0.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev0.val[1] = __builtin_shufflevector(__s0.val[1], __s0.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret = (bfloat16x8_t) __builtin_neon_vluti4q_laneq_bf16_x2((int8x16_t)__rev0.val[0], (int8x16_t)__rev0.val[1], (int8x16_t)__rev1, __p2, 43); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float64x2_t 
vamaxq_f64(float64x2_t __p0, float64x2_t __p1) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vamaxq_f64((int8x16_t)__p0, (int8x16_t)__p1, 42); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float64x2_t vamaxq_f64(float64x2_t __p0, float64x2_t __p1) { + float64x2_t __ret; + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + __ret = (float64x2_t) __builtin_neon_vamaxq_f64((int8x16_t)__rev0, (int8x16_t)__rev1, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float32x4_t vamaxq_f32(float32x4_t __p0, float32x4_t __p1) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vamaxq_f32((int8x16_t)__p0, (int8x16_t)__p1, 41); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float32x4_t vamaxq_f32(float32x4_t __p0, float32x4_t __p1) { + float32x4_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + __ret = (float32x4_t) __builtin_neon_vamaxq_f32((int8x16_t)__rev0, (int8x16_t)__rev1, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float16x8_t vamaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vamaxq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float16x8_t vamaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vamaxq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float32x2_t vamax_f32(float32x2_t __p0, float32x2_t __p1) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vamax_f32((int8x8_t)__p0, (int8x8_t)__p1, 9); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float32x2_t vamax_f32(float32x2_t __p0, float32x2_t __p1) { + float32x2_t __ret; + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + __ret = (float32x2_t) __builtin_neon_vamax_f32((int8x8_t)__rev0, (int8x8_t)__rev1, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float16x4_t vamax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vamax_f16((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float16x4_t vamax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + __ret = (float16x4_t) __builtin_neon_vamax_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = 
__builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float64x2_t vaminq_f64(float64x2_t __p0, float64x2_t __p1) { + float64x2_t __ret; + __ret = (float64x2_t) __builtin_neon_vaminq_f64((int8x16_t)__p0, (int8x16_t)__p1, 42); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float64x2_t vaminq_f64(float64x2_t __p0, float64x2_t __p1) { + float64x2_t __ret; + float64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float64x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + __ret = (float64x2_t) __builtin_neon_vaminq_f64((int8x16_t)__rev0, (int8x16_t)__rev1, 42); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float32x4_t vaminq_f32(float32x4_t __p0, float32x4_t __p1) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vaminq_f32((int8x16_t)__p0, (int8x16_t)__p1, 41); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float32x4_t vaminq_f32(float32x4_t __p0, float32x4_t __p1) { + float32x4_t __ret; + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + __ret = (float32x4_t) __builtin_neon_vaminq_f32((int8x16_t)__rev0, (int8x16_t)__rev1, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float16x8_t vaminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vaminq_f16((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float16x8_t vaminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + __ret = (float16x8_t) __builtin_neon_vaminq_f16((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float32x2_t vamin_f32(float32x2_t __p0, float32x2_t __p1) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vamin_f32((int8x8_t)__p0, (int8x8_t)__p1, 9); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float32x2_t vamin_f32(float32x2_t __p0, float32x2_t __p1) { + float32x2_t __ret; + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + __ret = (float32x2_t) __builtin_neon_vamin_f32((int8x8_t)__rev0, (int8x8_t)__rev1, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai __attribute__((target("neon,faminmax"))) float16x4_t vamin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vamin_f16((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai __attribute__((target("neon,faminmax"))) float16x4_t vamin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = 
__builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + __ret = (float16x4_t) __builtin_neon_vamin_f16((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + #endif #if defined(__aarch64__) || defined(__arm64ec__) __ai __attribute__((target("aes,neon"))) poly128_t vmull_p64(poly64_t __p0, poly64_t __p1) { @@ -41255,27 +43371,6 @@ __ai __attribute__((target("aes,neon"))) poly128_t vmull_p64(poly64_t __p0, poly __ret = (poly128_t) __builtin_neon_vmull_p64(__p0, __p1); return __ret; } -#ifdef __LITTLE_ENDIAN__ -__ai __attribute__((target("bf16,neon"))) bfloat16x8_t __a64_vcvtq_low_bf16_f32(float32x4_t __p0) { - bfloat16x8_t __ret; - __ret = (bfloat16x8_t) __builtin_neon___a64_vcvtq_low_bf16_f32((int8x16_t)__p0, 43); - return __ret; -} -#else -__ai __attribute__((target("bf16,neon"))) bfloat16x8_t __a64_vcvtq_low_bf16_f32(float32x4_t __p0) { - bfloat16x8_t __ret; - float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - __ret = (bfloat16x8_t) __builtin_neon___a64_vcvtq_low_bf16_f32((int8x16_t)__rev0, 43); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -__ai __attribute__((target("bf16,neon"))) bfloat16x8_t __noswap___a64_vcvtq_low_bf16_f32(float32x4_t __p0) { - bfloat16x8_t __ret; - __ret = (bfloat16x8_t) __builtin_neon___a64_vcvtq_low_bf16_f32((int8x16_t)__p0, 43); - return __ret; -} -#endif - #ifdef __LITTLE_ENDIAN__ #define vcopyq_lane_bf16(__p0_230, __p1_230, __p2_230, __p3_230) __extension__ ({ \ bfloat16x8_t __ret_230; \ @@ -41363,14 +43458,14 @@ __ai __attribute__((target("bf16,neon"))) bfloat16x8_t __noswap___a64_vcvtq_low_ #ifdef __LITTLE_ENDIAN__ __ai __attribute__((target("bf16,neon"))) bfloat16x4_t vcvt_bf16_f32(float32x4_t __p0) { bfloat16x4_t __ret; - __ret = vget_low_bf16(__a64_vcvtq_low_bf16_f32(__p0)); + __ret = (bfloat16x4_t) __builtin_neon_vcvt_bf16_f32((int8x16_t)__p0, 11); return __ret; } #else __ai __attribute__((target("bf16,neon"))) bfloat16x4_t vcvt_bf16_f32(float32x4_t __p0) { bfloat16x4_t __ret; float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - __ret = __noswap_vget_low_bf16(__noswap___a64_vcvtq_low_bf16_f32(__rev0)); + __ret = (bfloat16x4_t) __builtin_neon_vcvt_bf16_f32((int8x16_t)__rev0, 11); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } @@ -41396,14 +43491,14 @@ __ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_high_bf16_f32(bfloa #ifdef __LITTLE_ENDIAN__ __ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_low_bf16_f32(float32x4_t __p0) { bfloat16x8_t __ret; - __ret = __a64_vcvtq_low_bf16_f32(__p0); + __ret = (bfloat16x8_t) __builtin_neon_vcvtq_low_bf16_f32((int8x16_t)__p0, 43); return __ret; } #else __ai __attribute__((target("bf16,neon"))) bfloat16x8_t vcvtq_low_bf16_f32(float32x4_t __p0) { bfloat16x8_t __ret; float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - __ret = __noswap___a64_vcvtq_low_bf16_f32(__rev0); + __ret = (bfloat16x8_t) __builtin_neon_vcvtq_low_bf16_f32((int8x16_t)__rev0, 43); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); return __ret; } @@ -41705,9 +43800,7 @@ __ai __attribute__((target("bf16,neon"))) bfloat16x4_t vreinterpret_bf16_s16(int uint32x4_t __s0_238 = __p0_238; \ uint8x16_t __s1_238 = __p1_238; \ uint8x16_t __s2_238 = __p2_238; \ -uint8x16_t __reint_238 = __s2_238; \ -uint32x4_t __reint1_238 = splatq_laneq_u32(*(uint32x4_t *) &__reint_238, __p3_238); \ - __ret_238 = 
vdotq_u32(__s0_238, __s1_238, *(uint8x16_t *) &__reint1_238); \ + __ret_238 = vdotq_u32(__s0_238, __s1_238, __builtin_bit_cast(uint8x16_t, splatq_laneq_u32(__builtin_bit_cast(uint32x4_t, __s2_238), __p3_238))); \ __ret_238; \ }) #else @@ -41719,9 +43812,7 @@ uint32x4_t __reint1_238 = splatq_laneq_u32(*(uint32x4_t *) &__reint_238, __p3_23 uint32x4_t __rev0_239; __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 3, 2, 1, 0); \ uint8x16_t __rev1_239; __rev1_239 = __builtin_shufflevector(__s1_239, __s1_239, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ uint8x16_t __rev2_239; __rev2_239 = __builtin_shufflevector(__s2_239, __s2_239, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ -uint8x16_t __reint_239 = __rev2_239; \ -uint32x4_t __reint1_239 = __noswap_splatq_laneq_u32(*(uint32x4_t *) &__reint_239, __p3_239); \ - __ret_239 = __noswap_vdotq_u32(__rev0_239, __rev1_239, *(uint8x16_t *) &__reint1_239); \ + __ret_239 = __noswap_vdotq_u32(__rev0_239, __rev1_239, __builtin_bit_cast(uint8x16_t, __noswap_splatq_laneq_u32(__builtin_bit_cast(uint32x4_t, __rev2_239), __p3_239))); \ __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 3, 2, 1, 0); \ __ret_239; \ }) @@ -41733,9 +43824,7 @@ uint32x4_t __reint1_239 = __noswap_splatq_laneq_u32(*(uint32x4_t *) &__reint_239 int32x4_t __s0_240 = __p0_240; \ int8x16_t __s1_240 = __p1_240; \ int8x16_t __s2_240 = __p2_240; \ -int8x16_t __reint_240 = __s2_240; \ -int32x4_t __reint1_240 = splatq_laneq_s32(*(int32x4_t *) &__reint_240, __p3_240); \ - __ret_240 = vdotq_s32(__s0_240, __s1_240, *(int8x16_t *) &__reint1_240); \ + __ret_240 = vdotq_s32(__s0_240, __s1_240, __builtin_bit_cast(int8x16_t, splatq_laneq_s32(__builtin_bit_cast(int32x4_t, __s2_240), __p3_240))); \ __ret_240; \ }) #else @@ -41747,9 +43836,7 @@ int32x4_t __reint1_240 = splatq_laneq_s32(*(int32x4_t *) &__reint_240, __p3_240) int32x4_t __rev0_241; __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 3, 2, 1, 0); \ int8x16_t __rev1_241; __rev1_241 = __builtin_shufflevector(__s1_241, __s1_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ int8x16_t __rev2_241; __rev2_241 = __builtin_shufflevector(__s2_241, __s2_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ -int8x16_t __reint_241 = __rev2_241; \ -int32x4_t __reint1_241 = __noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_241, __p3_241); \ - __ret_241 = __noswap_vdotq_s32(__rev0_241, __rev1_241, *(int8x16_t *) &__reint1_241); \ + __ret_241 = __noswap_vdotq_s32(__rev0_241, __rev1_241, __builtin_bit_cast(int8x16_t, __noswap_splatq_laneq_s32(__builtin_bit_cast(int32x4_t, __rev2_241), __p3_241))); \ __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 3, 2, 1, 0); \ __ret_241; \ }) @@ -41761,9 +43848,7 @@ int32x4_t __reint1_241 = __noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_241, uint32x2_t __s0_242 = __p0_242; \ uint8x8_t __s1_242 = __p1_242; \ uint8x16_t __s2_242 = __p2_242; \ -uint8x16_t __reint_242 = __s2_242; \ -uint32x2_t __reint1_242 = splat_laneq_u32(*(uint32x4_t *) &__reint_242, __p3_242); \ - __ret_242 = vdot_u32(__s0_242, __s1_242, *(uint8x8_t *) &__reint1_242); \ + __ret_242 = vdot_u32(__s0_242, __s1_242, __builtin_bit_cast(uint8x8_t, splat_laneq_u32(__builtin_bit_cast(uint32x4_t, __s2_242), __p3_242))); \ __ret_242; \ }) #else @@ -41775,9 +43860,7 @@ uint32x2_t __reint1_242 = splat_laneq_u32(*(uint32x4_t *) &__reint_242, __p3_242 uint32x2_t __rev0_243; __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 1, 0); \ uint8x8_t __rev1_243; __rev1_243 = 
__builtin_shufflevector(__s1_243, __s1_243, 7, 6, 5, 4, 3, 2, 1, 0); \ uint8x16_t __rev2_243; __rev2_243 = __builtin_shufflevector(__s2_243, __s2_243, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ -uint8x16_t __reint_243 = __rev2_243; \ -uint32x2_t __reint1_243 = __noswap_splat_laneq_u32(*(uint32x4_t *) &__reint_243, __p3_243); \ - __ret_243 = __noswap_vdot_u32(__rev0_243, __rev1_243, *(uint8x8_t *) &__reint1_243); \ + __ret_243 = __noswap_vdot_u32(__rev0_243, __rev1_243, __builtin_bit_cast(uint8x8_t, __noswap_splat_laneq_u32(__builtin_bit_cast(uint32x4_t, __rev2_243), __p3_243))); \ __ret_243 = __builtin_shufflevector(__ret_243, __ret_243, 1, 0); \ __ret_243; \ }) @@ -41789,9 +43872,7 @@ uint32x2_t __reint1_243 = __noswap_splat_laneq_u32(*(uint32x4_t *) &__reint_243, int32x2_t __s0_244 = __p0_244; \ int8x8_t __s1_244 = __p1_244; \ int8x16_t __s2_244 = __p2_244; \ -int8x16_t __reint_244 = __s2_244; \ -int32x2_t __reint1_244 = splat_laneq_s32(*(int32x4_t *) &__reint_244, __p3_244); \ - __ret_244 = vdot_s32(__s0_244, __s1_244, *(int8x8_t *) &__reint1_244); \ + __ret_244 = vdot_s32(__s0_244, __s1_244, __builtin_bit_cast(int8x8_t, splat_laneq_s32(__builtin_bit_cast(int32x4_t, __s2_244), __p3_244))); \ __ret_244; \ }) #else @@ -41803,9 +43884,7 @@ int32x2_t __reint1_244 = splat_laneq_s32(*(int32x4_t *) &__reint_244, __p3_244); int32x2_t __rev0_245; __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 1, 0); \ int8x8_t __rev1_245; __rev1_245 = __builtin_shufflevector(__s1_245, __s1_245, 7, 6, 5, 4, 3, 2, 1, 0); \ int8x16_t __rev2_245; __rev2_245 = __builtin_shufflevector(__s2_245, __s2_245, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ -int8x16_t __reint_245 = __rev2_245; \ -int32x2_t __reint1_245 = __noswap_splat_laneq_s32(*(int32x4_t *) &__reint_245, __p3_245); \ - __ret_245 = __noswap_vdot_s32(__rev0_245, __rev1_245, *(int8x8_t *) &__reint1_245); \ + __ret_245 = __noswap_vdot_s32(__rev0_245, __rev1_245, __builtin_bit_cast(int8x8_t, __noswap_splat_laneq_s32(__builtin_bit_cast(int32x4_t, __rev2_245), __p3_245))); \ __ret_245 = __builtin_shufflevector(__ret_245, __ret_245, 1, 0); \ __ret_245; \ }) @@ -43018,8 +45097,7 @@ __ai __attribute__((target("fullfp16,neon"))) float16x4_t vsqrt_f16(float16x4_t int32x4_t __s0_270 = __p0_270; \ int8x16_t __s1_270 = __p1_270; \ uint8x16_t __s2_270 = __p2_270; \ -uint8x16_t __reint_270 = __s2_270; \ - __ret_270 = vusdotq_s32(__s0_270, (uint8x16_t)(splatq_laneq_s32(*(int32x4_t *) &__reint_270, __p3_270)), __s1_270); \ + __ret_270 = vusdotq_s32(__s0_270, (uint8x16_t)(splatq_laneq_s32(__builtin_bit_cast(int32x4_t, __s2_270), __p3_270)), __s1_270); \ __ret_270; \ }) #else @@ -43031,8 +45109,7 @@ uint8x16_t __reint_270 = __s2_270; \ int32x4_t __rev0_271; __rev0_271 = __builtin_shufflevector(__s0_271, __s0_271, 3, 2, 1, 0); \ int8x16_t __rev1_271; __rev1_271 = __builtin_shufflevector(__s1_271, __s1_271, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ uint8x16_t __rev2_271; __rev2_271 = __builtin_shufflevector(__s2_271, __s2_271, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ -uint8x16_t __reint_271 = __rev2_271; \ - __ret_271 = __noswap_vusdotq_s32(__rev0_271, (uint8x16_t)(__noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_271, __p3_271)), __rev1_271); \ + __ret_271 = __noswap_vusdotq_s32(__rev0_271, (uint8x16_t)(__noswap_splatq_laneq_s32(__builtin_bit_cast(int32x4_t, __rev2_271), __p3_271)), __rev1_271); \ __ret_271 = __builtin_shufflevector(__ret_271, __ret_271, 3, 2, 1, 0); \ __ret_271; \ }) @@ -43044,8 +45121,7 @@ 
uint8x16_t __reint_271 = __rev2_271; \ int32x2_t __s0_272 = __p0_272; \ int8x8_t __s1_272 = __p1_272; \ uint8x16_t __s2_272 = __p2_272; \ -uint8x16_t __reint_272 = __s2_272; \ - __ret_272 = vusdot_s32(__s0_272, (uint8x8_t)(splat_laneq_s32(*(int32x4_t *) &__reint_272, __p3_272)), __s1_272); \ + __ret_272 = vusdot_s32(__s0_272, (uint8x8_t)(splat_laneq_s32(__builtin_bit_cast(int32x4_t, __s2_272), __p3_272)), __s1_272); \ __ret_272; \ }) #else @@ -43057,8 +45133,7 @@ uint8x16_t __reint_272 = __s2_272; \ int32x2_t __rev0_273; __rev0_273 = __builtin_shufflevector(__s0_273, __s0_273, 1, 0); \ int8x8_t __rev1_273; __rev1_273 = __builtin_shufflevector(__s1_273, __s1_273, 7, 6, 5, 4, 3, 2, 1, 0); \ uint8x16_t __rev2_273; __rev2_273 = __builtin_shufflevector(__s2_273, __s2_273, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ -uint8x16_t __reint_273 = __rev2_273; \ - __ret_273 = __noswap_vusdot_s32(__rev0_273, (uint8x8_t)(__noswap_splat_laneq_s32(*(int32x4_t *) &__reint_273, __p3_273)), __rev1_273); \ + __ret_273 = __noswap_vusdot_s32(__rev0_273, (uint8x8_t)(__noswap_splat_laneq_s32(__builtin_bit_cast(int32x4_t, __rev2_273), __p3_273)), __rev1_273); \ __ret_273 = __builtin_shufflevector(__ret_273, __ret_273, 1, 0); \ __ret_273; \ }) @@ -43070,8 +45145,7 @@ uint8x16_t __reint_273 = __rev2_273; \ int32x4_t __s0_274 = __p0_274; \ uint8x16_t __s1_274 = __p1_274; \ int8x16_t __s2_274 = __p2_274; \ -int8x16_t __reint_274 = __s2_274; \ - __ret_274 = vusdotq_s32(__s0_274, __s1_274, (int8x16_t)(splatq_laneq_s32(*(int32x4_t *) &__reint_274, __p3_274))); \ + __ret_274 = vusdotq_s32(__s0_274, __s1_274, (int8x16_t)(splatq_laneq_s32(__builtin_bit_cast(int32x4_t, __s2_274), __p3_274))); \ __ret_274; \ }) #else @@ -43083,8 +45157,7 @@ int8x16_t __reint_274 = __s2_274; \ int32x4_t __rev0_275; __rev0_275 = __builtin_shufflevector(__s0_275, __s0_275, 3, 2, 1, 0); \ uint8x16_t __rev1_275; __rev1_275 = __builtin_shufflevector(__s1_275, __s1_275, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ int8x16_t __rev2_275; __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ -int8x16_t __reint_275 = __rev2_275; \ - __ret_275 = __noswap_vusdotq_s32(__rev0_275, __rev1_275, (int8x16_t)(__noswap_splatq_laneq_s32(*(int32x4_t *) &__reint_275, __p3_275))); \ + __ret_275 = __noswap_vusdotq_s32(__rev0_275, __rev1_275, (int8x16_t)(__noswap_splatq_laneq_s32(__builtin_bit_cast(int32x4_t, __rev2_275), __p3_275))); \ __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 3, 2, 1, 0); \ __ret_275; \ }) @@ -43096,8 +45169,7 @@ int8x16_t __reint_275 = __rev2_275; \ int32x2_t __s0_276 = __p0_276; \ uint8x8_t __s1_276 = __p1_276; \ int8x16_t __s2_276 = __p2_276; \ -int8x16_t __reint_276 = __s2_276; \ - __ret_276 = vusdot_s32(__s0_276, __s1_276, (int8x8_t)(splat_laneq_s32(*(int32x4_t *) &__reint_276, __p3_276))); \ + __ret_276 = vusdot_s32(__s0_276, __s1_276, (int8x8_t)(splat_laneq_s32(__builtin_bit_cast(int32x4_t, __s2_276), __p3_276))); \ __ret_276; \ }) #else @@ -43109,8 +45181,7 @@ int8x16_t __reint_276 = __s2_276; \ int32x2_t __rev0_277; __rev0_277 = __builtin_shufflevector(__s0_277, __s0_277, 1, 0); \ uint8x8_t __rev1_277; __rev1_277 = __builtin_shufflevector(__s1_277, __s1_277, 7, 6, 5, 4, 3, 2, 1, 0); \ int8x16_t __rev2_277; __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ -int8x16_t __reint_277 = __rev2_277; \ - __ret_277 = __noswap_vusdot_s32(__rev0_277, __rev1_277, 
(int8x8_t)(__noswap_splat_laneq_s32(*(int32x4_t *) &__reint_277, __p3_277))); \ + __ret_277 = __noswap_vusdot_s32(__rev0_277, __rev1_277, (int8x8_t)(__noswap_splat_laneq_s32(__builtin_bit_cast(int32x4_t, __rev2_277), __p3_277))); \ __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 1, 0); \ __ret_277; \ }) @@ -57773,6 +59844,11 @@ __ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s64(int64x1_t __p __ret = (poly8x8_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_mf8(mfloat8x8_t __p0) { + poly8x8_t __ret; + __ret = (poly8x8_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) poly8x8_t vreinterpret_p8_s16(int16x4_t __p0) { poly8x8_t __ret; __ret = (poly8x8_t)(__p0); @@ -57838,6 +59914,11 @@ __ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_s64(int64x1_t _ __ret = (poly64x1_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_mf8(mfloat8x8_t __p0) { + poly64x1_t __ret; + __ret = (poly64x1_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) poly64x1_t vreinterpret_p64_s16(int16x4_t __p0) { poly64x1_t __ret; __ret = (poly64x1_t)(__p0); @@ -57903,6 +59984,11 @@ __ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s64(int64x1_t _ __ret = (poly16x4_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_mf8(mfloat8x8_t __p0) { + poly16x4_t __ret; + __ret = (poly16x4_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) poly16x4_t vreinterpret_p16_s16(int16x4_t __p0) { poly16x4_t __ret; __ret = (poly16x4_t)(__p0); @@ -57973,6 +60059,11 @@ __ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s64(int64x2_t _ __ret = (poly8x16_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_mf8(mfloat8x16_t __p0) { + poly8x16_t __ret; + __ret = (poly8x16_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) poly8x16_t vreinterpretq_p8_s16(int16x8_t __p0) { poly8x16_t __ret; __ret = (poly8x16_t)(__p0); @@ -58043,6 +60134,11 @@ __ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_s64(int64x2_t __ret = (poly128_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_mf8(mfloat8x16_t __p0) { + poly128_t __ret; + __ret = (poly128_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) poly128_t vreinterpretq_p128_s16(int16x8_t __p0) { poly128_t __ret; __ret = (poly128_t)(__p0); @@ -58113,6 +60209,11 @@ __ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_s64(int64x2_t __ret = (poly64x2_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_mf8(mfloat8x16_t __p0) { + poly64x2_t __ret; + __ret = (poly64x2_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) poly64x2_t vreinterpretq_p64_s16(int16x8_t __p0) { poly64x2_t __ret; __ret = (poly64x2_t)(__p0); @@ -58183,6 +60284,11 @@ __ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s64(int64x2_t __ret = (poly16x8_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_mf8(mfloat8x16_t __p0) { + poly16x8_t __ret; + __ret = (poly16x8_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) poly16x8_t vreinterpretq_p16_s16(int16x8_t __p0) { poly16x8_t __ret; __ret = (poly16x8_t)(__p0); @@ -58253,6 +60359,11 @@ __ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s64(int64x2_t _ __ret = (uint8x16_t)(__p0); return __ret; } 
+__ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_mf8(mfloat8x16_t __p0) { + uint8x16_t __ret; + __ret = (uint8x16_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint8x16_t vreinterpretq_u8_s16(int16x8_t __p0) { uint8x16_t __ret; __ret = (uint8x16_t)(__p0); @@ -58323,6 +60434,11 @@ __ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s64(int64x2_t __ret = (uint32x4_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_mf8(mfloat8x16_t __p0) { + uint32x4_t __ret; + __ret = (uint32x4_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint32x4_t vreinterpretq_u32_s16(int16x8_t __p0) { uint32x4_t __ret; __ret = (uint32x4_t)(__p0); @@ -58393,6 +60509,11 @@ __ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s64(int64x2_t __ret = (uint64x2_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_mf8(mfloat8x16_t __p0) { + uint64x2_t __ret; + __ret = (uint64x2_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint64x2_t vreinterpretq_u64_s16(int16x8_t __p0) { uint64x2_t __ret; __ret = (uint64x2_t)(__p0); @@ -58463,6 +60584,11 @@ __ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s64(int64x2_t __ret = (uint16x8_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_mf8(mfloat8x16_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint16x8_t vreinterpretq_u16_s16(int16x8_t __p0) { uint16x8_t __ret; __ret = (uint16x8_t)(__p0); @@ -58533,6 +60659,11 @@ __ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_s64(int64x2_t __ __ret = (int8x16_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_mf8(mfloat8x16_t __p0) { + int8x16_t __ret; + __ret = (int8x16_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) int8x16_t vreinterpretq_s8_s16(int16x8_t __p0) { int8x16_t __ret; __ret = (int8x16_t)(__p0); @@ -58603,6 +60734,11 @@ __ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_s64(int64x2_t __ret = (float64x2_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_mf8(mfloat8x16_t __p0) { + float64x2_t __ret; + __ret = (float64x2_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) float64x2_t vreinterpretq_f64_s16(int16x8_t __p0) { float64x2_t __ret; __ret = (float64x2_t)(__p0); @@ -58673,6 +60809,11 @@ __ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s64(int64x2_t __ret = (float32x4_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_mf8(mfloat8x16_t __p0) { + float32x4_t __ret; + __ret = (float32x4_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) float32x4_t vreinterpretq_f32_s16(int16x8_t __p0) { float32x4_t __ret; __ret = (float32x4_t)(__p0); @@ -58743,6 +60884,11 @@ __ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s64(int64x2_t __ret = (float16x8_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_mf8(mfloat8x16_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) float16x8_t vreinterpretq_f16_s16(int16x8_t __p0) { float16x8_t __ret; __ret = (float16x8_t)(__p0); @@ -58813,6 +60959,11 @@ __ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_s64(int64x2_t _ __ret = (int32x4_t)(__p0); 
return __ret; } +__ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_mf8(mfloat8x16_t __p0) { + int32x4_t __ret; + __ret = (int32x4_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) int32x4_t vreinterpretq_s32_s16(int16x8_t __p0) { int32x4_t __ret; __ret = (int32x4_t)(__p0); @@ -58883,11 +61034,91 @@ __ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_s32(int32x4_t _ __ret = (int64x2_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_mf8(mfloat8x16_t __p0) { + int64x2_t __ret; + __ret = (int64x2_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) int64x2_t vreinterpretq_s64_s16(int16x8_t __p0) { int64x2_t __ret; __ret = (int64x2_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_p8(poly8x16_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_p128(poly128_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_p64(poly64x2_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_p16(poly16x8_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_u8(uint8x16_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_u32(uint32x4_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_u64(uint64x2_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_u16(uint16x8_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_s8(int8x16_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_f64(float64x2_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_f32(float32x4_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_f16(float16x8_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_s32(int32x4_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_s64(int64x2_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x16_t vreinterpretq_mf8_s16(int16x8_t __p0) { + mfloat8x16_t __ret; + __ret = (mfloat8x16_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_p8(poly8x16_t __p0) { int16x8_t __ret; __ret = (int16x8_t)(__p0); @@ -58958,6 +61189,11 @@ __ai __attribute__((target("neon"))) int16x8_t vreinterpretq_s16_s64(int64x2_t _ __ret = (int16x8_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) 
int16x8_t vreinterpretq_s16_mf8(mfloat8x16_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_p8(poly8x8_t __p0) { uint8x8_t __ret; __ret = (uint8x8_t)(__p0); @@ -59018,6 +61254,11 @@ __ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s64(int64x1_t __p __ret = (uint8x8_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_mf8(mfloat8x8_t __p0) { + uint8x8_t __ret; + __ret = (uint8x8_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint8x8_t vreinterpret_u8_s16(int16x4_t __p0) { uint8x8_t __ret; __ret = (uint8x8_t)(__p0); @@ -59083,6 +61324,11 @@ __ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s64(int64x1_t _ __ret = (uint32x2_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_mf8(mfloat8x8_t __p0) { + uint32x2_t __ret; + __ret = (uint32x2_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint32x2_t vreinterpret_u32_s16(int16x4_t __p0) { uint32x2_t __ret; __ret = (uint32x2_t)(__p0); @@ -59148,6 +61394,11 @@ __ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s64(int64x1_t _ __ret = (uint64x1_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_mf8(mfloat8x8_t __p0) { + uint64x1_t __ret; + __ret = (uint64x1_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint64x1_t vreinterpret_u64_s16(int16x4_t __p0) { uint64x1_t __ret; __ret = (uint64x1_t)(__p0); @@ -59213,6 +61464,11 @@ __ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s64(int64x1_t _ __ret = (uint16x4_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_mf8(mfloat8x8_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint16x4_t vreinterpret_u16_s16(int16x4_t __p0) { uint16x4_t __ret; __ret = (uint16x4_t)(__p0); @@ -59278,6 +61534,11 @@ __ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_s64(int64x1_t __p0 __ret = (int8x8_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_mf8(mfloat8x8_t __p0) { + int8x8_t __ret; + __ret = (int8x8_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) int8x8_t vreinterpret_s8_s16(int16x4_t __p0) { int8x8_t __ret; __ret = (int8x8_t)(__p0); @@ -59343,6 +61604,11 @@ __ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_s64(int64x1_t __ret = (float64x1_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_mf8(mfloat8x8_t __p0) { + float64x1_t __ret; + __ret = (float64x1_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) float64x1_t vreinterpret_f64_s16(int16x4_t __p0) { float64x1_t __ret; __ret = (float64x1_t)(__p0); @@ -59408,6 +61674,11 @@ __ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s64(int64x1_t __ret = (float32x2_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_mf8(mfloat8x8_t __p0) { + float32x2_t __ret; + __ret = (float32x2_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) float32x2_t vreinterpret_f32_s16(int16x4_t __p0) { float32x2_t __ret; __ret = (float32x2_t)(__p0); @@ -59473,6 +61744,11 @@ __ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s64(int64x1_t __ret = (float16x4_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) float16x4_t 
vreinterpret_f16_mf8(mfloat8x8_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) float16x4_t vreinterpret_f16_s16(int16x4_t __p0) { float16x4_t __ret; __ret = (float16x4_t)(__p0); @@ -59538,6 +61814,11 @@ __ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_s64(int64x1_t __ __ret = (int32x2_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_mf8(mfloat8x8_t __p0) { + int32x2_t __ret; + __ret = (int32x2_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) int32x2_t vreinterpret_s32_s16(int16x4_t __p0) { int32x2_t __ret; __ret = (int32x2_t)(__p0); @@ -59603,11 +61884,86 @@ __ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_s32(int32x2_t __ __ret = (int64x1_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_mf8(mfloat8x8_t __p0) { + int64x1_t __ret; + __ret = (int64x1_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) int64x1_t vreinterpret_s64_s16(int16x4_t __p0) { int64x1_t __ret; __ret = (int64x1_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_p8(poly8x8_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_p64(poly64x1_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_p16(poly16x4_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_u8(uint8x8_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_u32(uint32x2_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_u64(uint64x1_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_u16(uint16x4_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_s8(int8x8_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_f64(float64x1_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_f32(float32x2_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_f16(float16x4_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_s32(int32x2_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_s64(int64x1_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} +__ai __attribute__((target("neon"))) mfloat8x8_t vreinterpret_mf8_s16(int16x4_t __p0) { + mfloat8x8_t __ret; + __ret = (mfloat8x8_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_p8(poly8x8_t __p0) { int16x4_t __ret; __ret = (int16x4_t)(__p0); @@ -59673,6 +62029,11 @@ __ai 
__attribute__((target("neon"))) int16x4_t vreinterpret_s16_s64(int64x1_t __ __ret = (int16x4_t)(__p0); return __ret; } +__ai __attribute__((target("neon"))) int16x4_t vreinterpret_s16_mf8(mfloat8x8_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t)(__p0); + return __ret; +} __ai __attribute__((target("neon"))) uint64_t vrshld_u64(uint64_t __p0, int64_t __p1) { uint64_t __ret; __ret = (uint64_t) __builtin_neon_vrshld_u64(__p0, __p1); @@ -65616,106 +67977,6 @@ __ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_f64(float64x2_t __ __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai __attribute__((target("v8.3a,neon"))) float64x2_t __noswap_vcmlaq_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { - float64x2_t __ret; - __ret = (float64x2_t) __builtin_neon_vcmlaq_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); - return __ret; -} -#endif - -__ai __attribute__((target("v8.3a,neon"))) float64x1_t vcmla_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { - float64x1_t __ret; - __ret = (float64x1_t) __builtin_neon_vcmla_f64((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); - return __ret; -} -#define vcmla_lane_f64(__p0_792, __p1_792, __p2_792, __p3_792) __extension__ ({ \ - float64x1_t __ret_792; \ - float64x1_t __s0_792 = __p0_792; \ - float64x1_t __s1_792 = __p1_792; \ - float64x1_t __s2_792 = __p2_792; \ -float64x1_t __reint_792 = __s2_792; \ -uint64x2_t __reint1_792 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_792, __p3_792), vgetq_lane_u64(*(uint64x2_t *) &__reint_792, __p3_792)}; \ - __ret_792 = vcmla_f64(__s0_792, __s1_792, *(float64x1_t *) &__reint1_792); \ - __ret_792; \ -}) -#ifdef __LITTLE_ENDIAN__ -#define vcmlaq_lane_f64(__p0_793, __p1_793, __p2_793, __p3_793) __extension__ ({ \ - float64x2_t __ret_793; \ - float64x2_t __s0_793 = __p0_793; \ - float64x2_t __s1_793 = __p1_793; \ - float64x1_t __s2_793 = __p2_793; \ -float64x1_t __reint_793 = __s2_793; \ -uint64x2_t __reint1_793 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_793, __p3_793), vgetq_lane_u64(*(uint64x2_t *) &__reint_793, __p3_793)}; \ - __ret_793 = vcmlaq_f64(__s0_793, __s1_793, *(float64x2_t *) &__reint1_793); \ - __ret_793; \ -}) -#else -#define vcmlaq_lane_f64(__p0_794, __p1_794, __p2_794, __p3_794) __extension__ ({ \ - float64x2_t __ret_794; \ - float64x2_t __s0_794 = __p0_794; \ - float64x2_t __s1_794 = __p1_794; \ - float64x1_t __s2_794 = __p2_794; \ - float64x2_t __rev0_794; __rev0_794 = __builtin_shufflevector(__s0_794, __s0_794, 1, 0); \ - float64x2_t __rev1_794; __rev1_794 = __builtin_shufflevector(__s1_794, __s1_794, 1, 0); \ -float64x1_t __reint_794 = __s2_794; \ -uint64x2_t __reint1_794 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_794, __p3_794), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_794, __p3_794)}; \ - __ret_794 = __noswap_vcmlaq_f64(__rev0_794, __rev1_794, *(float64x2_t *) &__reint1_794); \ - __ret_794 = __builtin_shufflevector(__ret_794, __ret_794, 1, 0); \ - __ret_794; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcmla_laneq_f64(__p0_795, __p1_795, __p2_795, __p3_795) __extension__ ({ \ - float64x1_t __ret_795; \ - float64x1_t __s0_795 = __p0_795; \ - float64x1_t __s1_795 = __p1_795; \ - float64x2_t __s2_795 = __p2_795; \ -float64x2_t __reint_795 = __s2_795; \ -uint64x2_t __reint1_795 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_795, __p3_795), vgetq_lane_u64(*(uint64x2_t *) &__reint_795, __p3_795)}; \ - __ret_795 = vcmla_f64(__s0_795, __s1_795, *(float64x1_t *) 
&__reint1_795); \ - __ret_795; \ -}) -#else -#define vcmla_laneq_f64(__p0_796, __p1_796, __p2_796, __p3_796) __extension__ ({ \ - float64x1_t __ret_796; \ - float64x1_t __s0_796 = __p0_796; \ - float64x1_t __s1_796 = __p1_796; \ - float64x2_t __s2_796 = __p2_796; \ - float64x2_t __rev2_796; __rev2_796 = __builtin_shufflevector(__s2_796, __s2_796, 1, 0); \ -float64x2_t __reint_796 = __rev2_796; \ -uint64x2_t __reint1_796 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_796, __p3_796), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_796, __p3_796)}; \ - __ret_796 = vcmla_f64(__s0_796, __s1_796, *(float64x1_t *) &__reint1_796); \ - __ret_796; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcmlaq_laneq_f64(__p0_797, __p1_797, __p2_797, __p3_797) __extension__ ({ \ - float64x2_t __ret_797; \ - float64x2_t __s0_797 = __p0_797; \ - float64x2_t __s1_797 = __p1_797; \ - float64x2_t __s2_797 = __p2_797; \ -float64x2_t __reint_797 = __s2_797; \ -uint64x2_t __reint1_797 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_797, __p3_797), vgetq_lane_u64(*(uint64x2_t *) &__reint_797, __p3_797)}; \ - __ret_797 = vcmlaq_f64(__s0_797, __s1_797, *(float64x2_t *) &__reint1_797); \ - __ret_797; \ -}) -#else -#define vcmlaq_laneq_f64(__p0_798, __p1_798, __p2_798, __p3_798) __extension__ ({ \ - float64x2_t __ret_798; \ - float64x2_t __s0_798 = __p0_798; \ - float64x2_t __s1_798 = __p1_798; \ - float64x2_t __s2_798 = __p2_798; \ - float64x2_t __rev0_798; __rev0_798 = __builtin_shufflevector(__s0_798, __s0_798, 1, 0); \ - float64x2_t __rev1_798; __rev1_798 = __builtin_shufflevector(__s1_798, __s1_798, 1, 0); \ - float64x2_t __rev2_798; __rev2_798 = __builtin_shufflevector(__s2_798, __s2_798, 1, 0); \ -float64x2_t __reint_798 = __rev2_798; \ -uint64x2_t __reint1_798 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_798, __p3_798), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_798, __p3_798)}; \ - __ret_798 = __noswap_vcmlaq_f64(__rev0_798, __rev1_798, *(float64x2_t *) &__reint1_798); \ - __ret_798 = __builtin_shufflevector(__ret_798, __ret_798, 1, 0); \ - __ret_798; \ -}) #endif #ifdef __LITTLE_ENDIAN__ @@ -65734,106 +67995,6 @@ __ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot180_f64(float64 __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai __attribute__((target("v8.3a,neon"))) float64x2_t __noswap_vcmlaq_rot180_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { - float64x2_t __ret; - __ret = (float64x2_t) __builtin_neon_vcmlaq_rot180_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); - return __ret; -} -#endif - -__ai __attribute__((target("v8.3a,neon"))) float64x1_t vcmla_rot180_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { - float64x1_t __ret; - __ret = (float64x1_t) __builtin_neon_vcmla_rot180_f64((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); - return __ret; -} -#define vcmla_rot180_lane_f64(__p0_799, __p1_799, __p2_799, __p3_799) __extension__ ({ \ - float64x1_t __ret_799; \ - float64x1_t __s0_799 = __p0_799; \ - float64x1_t __s1_799 = __p1_799; \ - float64x1_t __s2_799 = __p2_799; \ -float64x1_t __reint_799 = __s2_799; \ -uint64x2_t __reint1_799 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_799, __p3_799), vgetq_lane_u64(*(uint64x2_t *) &__reint_799, __p3_799)}; \ - __ret_799 = vcmla_rot180_f64(__s0_799, __s1_799, *(float64x1_t *) &__reint1_799); \ - __ret_799; \ -}) -#ifdef __LITTLE_ENDIAN__ -#define vcmlaq_rot180_lane_f64(__p0_800, __p1_800, __p2_800, __p3_800) __extension__ 
({ \ - float64x2_t __ret_800; \ - float64x2_t __s0_800 = __p0_800; \ - float64x2_t __s1_800 = __p1_800; \ - float64x1_t __s2_800 = __p2_800; \ -float64x1_t __reint_800 = __s2_800; \ -uint64x2_t __reint1_800 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_800, __p3_800), vgetq_lane_u64(*(uint64x2_t *) &__reint_800, __p3_800)}; \ - __ret_800 = vcmlaq_rot180_f64(__s0_800, __s1_800, *(float64x2_t *) &__reint1_800); \ - __ret_800; \ -}) -#else -#define vcmlaq_rot180_lane_f64(__p0_801, __p1_801, __p2_801, __p3_801) __extension__ ({ \ - float64x2_t __ret_801; \ - float64x2_t __s0_801 = __p0_801; \ - float64x2_t __s1_801 = __p1_801; \ - float64x1_t __s2_801 = __p2_801; \ - float64x2_t __rev0_801; __rev0_801 = __builtin_shufflevector(__s0_801, __s0_801, 1, 0); \ - float64x2_t __rev1_801; __rev1_801 = __builtin_shufflevector(__s1_801, __s1_801, 1, 0); \ -float64x1_t __reint_801 = __s2_801; \ -uint64x2_t __reint1_801 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_801, __p3_801), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_801, __p3_801)}; \ - __ret_801 = __noswap_vcmlaq_rot180_f64(__rev0_801, __rev1_801, *(float64x2_t *) &__reint1_801); \ - __ret_801 = __builtin_shufflevector(__ret_801, __ret_801, 1, 0); \ - __ret_801; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcmla_rot180_laneq_f64(__p0_802, __p1_802, __p2_802, __p3_802) __extension__ ({ \ - float64x1_t __ret_802; \ - float64x1_t __s0_802 = __p0_802; \ - float64x1_t __s1_802 = __p1_802; \ - float64x2_t __s2_802 = __p2_802; \ -float64x2_t __reint_802 = __s2_802; \ -uint64x2_t __reint1_802 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_802, __p3_802), vgetq_lane_u64(*(uint64x2_t *) &__reint_802, __p3_802)}; \ - __ret_802 = vcmla_rot180_f64(__s0_802, __s1_802, *(float64x1_t *) &__reint1_802); \ - __ret_802; \ -}) -#else -#define vcmla_rot180_laneq_f64(__p0_803, __p1_803, __p2_803, __p3_803) __extension__ ({ \ - float64x1_t __ret_803; \ - float64x1_t __s0_803 = __p0_803; \ - float64x1_t __s1_803 = __p1_803; \ - float64x2_t __s2_803 = __p2_803; \ - float64x2_t __rev2_803; __rev2_803 = __builtin_shufflevector(__s2_803, __s2_803, 1, 0); \ -float64x2_t __reint_803 = __rev2_803; \ -uint64x2_t __reint1_803 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_803, __p3_803), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_803, __p3_803)}; \ - __ret_803 = vcmla_rot180_f64(__s0_803, __s1_803, *(float64x1_t *) &__reint1_803); \ - __ret_803; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcmlaq_rot180_laneq_f64(__p0_804, __p1_804, __p2_804, __p3_804) __extension__ ({ \ - float64x2_t __ret_804; \ - float64x2_t __s0_804 = __p0_804; \ - float64x2_t __s1_804 = __p1_804; \ - float64x2_t __s2_804 = __p2_804; \ -float64x2_t __reint_804 = __s2_804; \ -uint64x2_t __reint1_804 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_804, __p3_804), vgetq_lane_u64(*(uint64x2_t *) &__reint_804, __p3_804)}; \ - __ret_804 = vcmlaq_rot180_f64(__s0_804, __s1_804, *(float64x2_t *) &__reint1_804); \ - __ret_804; \ -}) -#else -#define vcmlaq_rot180_laneq_f64(__p0_805, __p1_805, __p2_805, __p3_805) __extension__ ({ \ - float64x2_t __ret_805; \ - float64x2_t __s0_805 = __p0_805; \ - float64x2_t __s1_805 = __p1_805; \ - float64x2_t __s2_805 = __p2_805; \ - float64x2_t __rev0_805; __rev0_805 = __builtin_shufflevector(__s0_805, __s0_805, 1, 0); \ - float64x2_t __rev1_805; __rev1_805 = __builtin_shufflevector(__s1_805, __s1_805, 1, 0); \ - float64x2_t __rev2_805; __rev2_805 = __builtin_shufflevector(__s2_805, __s2_805, 1, 
0); \ -float64x2_t __reint_805 = __rev2_805; \ -uint64x2_t __reint1_805 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_805, __p3_805), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_805, __p3_805)}; \ - __ret_805 = __noswap_vcmlaq_rot180_f64(__rev0_805, __rev1_805, *(float64x2_t *) &__reint1_805); \ - __ret_805 = __builtin_shufflevector(__ret_805, __ret_805, 1, 0); \ - __ret_805; \ -}) #endif #ifdef __LITTLE_ENDIAN__ @@ -65852,106 +68013,6 @@ __ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot270_f64(float64 __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai __attribute__((target("v8.3a,neon"))) float64x2_t __noswap_vcmlaq_rot270_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { - float64x2_t __ret; - __ret = (float64x2_t) __builtin_neon_vcmlaq_rot270_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); - return __ret; -} -#endif - -__ai __attribute__((target("v8.3a,neon"))) float64x1_t vcmla_rot270_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { - float64x1_t __ret; - __ret = (float64x1_t) __builtin_neon_vcmla_rot270_f64((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); - return __ret; -} -#define vcmla_rot270_lane_f64(__p0_806, __p1_806, __p2_806, __p3_806) __extension__ ({ \ - float64x1_t __ret_806; \ - float64x1_t __s0_806 = __p0_806; \ - float64x1_t __s1_806 = __p1_806; \ - float64x1_t __s2_806 = __p2_806; \ -float64x1_t __reint_806 = __s2_806; \ -uint64x2_t __reint1_806 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_806, __p3_806), vgetq_lane_u64(*(uint64x2_t *) &__reint_806, __p3_806)}; \ - __ret_806 = vcmla_rot270_f64(__s0_806, __s1_806, *(float64x1_t *) &__reint1_806); \ - __ret_806; \ -}) -#ifdef __LITTLE_ENDIAN__ -#define vcmlaq_rot270_lane_f64(__p0_807, __p1_807, __p2_807, __p3_807) __extension__ ({ \ - float64x2_t __ret_807; \ - float64x2_t __s0_807 = __p0_807; \ - float64x2_t __s1_807 = __p1_807; \ - float64x1_t __s2_807 = __p2_807; \ -float64x1_t __reint_807 = __s2_807; \ -uint64x2_t __reint1_807 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_807, __p3_807), vgetq_lane_u64(*(uint64x2_t *) &__reint_807, __p3_807)}; \ - __ret_807 = vcmlaq_rot270_f64(__s0_807, __s1_807, *(float64x2_t *) &__reint1_807); \ - __ret_807; \ -}) -#else -#define vcmlaq_rot270_lane_f64(__p0_808, __p1_808, __p2_808, __p3_808) __extension__ ({ \ - float64x2_t __ret_808; \ - float64x2_t __s0_808 = __p0_808; \ - float64x2_t __s1_808 = __p1_808; \ - float64x1_t __s2_808 = __p2_808; \ - float64x2_t __rev0_808; __rev0_808 = __builtin_shufflevector(__s0_808, __s0_808, 1, 0); \ - float64x2_t __rev1_808; __rev1_808 = __builtin_shufflevector(__s1_808, __s1_808, 1, 0); \ -float64x1_t __reint_808 = __s2_808; \ -uint64x2_t __reint1_808 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_808, __p3_808), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_808, __p3_808)}; \ - __ret_808 = __noswap_vcmlaq_rot270_f64(__rev0_808, __rev1_808, *(float64x2_t *) &__reint1_808); \ - __ret_808 = __builtin_shufflevector(__ret_808, __ret_808, 1, 0); \ - __ret_808; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcmla_rot270_laneq_f64(__p0_809, __p1_809, __p2_809, __p3_809) __extension__ ({ \ - float64x1_t __ret_809; \ - float64x1_t __s0_809 = __p0_809; \ - float64x1_t __s1_809 = __p1_809; \ - float64x2_t __s2_809 = __p2_809; \ -float64x2_t __reint_809 = __s2_809; \ -uint64x2_t __reint1_809 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_809, __p3_809), vgetq_lane_u64(*(uint64x2_t *) &__reint_809, 
__p3_809)}; \ - __ret_809 = vcmla_rot270_f64(__s0_809, __s1_809, *(float64x1_t *) &__reint1_809); \ - __ret_809; \ -}) -#else -#define vcmla_rot270_laneq_f64(__p0_810, __p1_810, __p2_810, __p3_810) __extension__ ({ \ - float64x1_t __ret_810; \ - float64x1_t __s0_810 = __p0_810; \ - float64x1_t __s1_810 = __p1_810; \ - float64x2_t __s2_810 = __p2_810; \ - float64x2_t __rev2_810; __rev2_810 = __builtin_shufflevector(__s2_810, __s2_810, 1, 0); \ -float64x2_t __reint_810 = __rev2_810; \ -uint64x2_t __reint1_810 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_810, __p3_810), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_810, __p3_810)}; \ - __ret_810 = vcmla_rot270_f64(__s0_810, __s1_810, *(float64x1_t *) &__reint1_810); \ - __ret_810; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcmlaq_rot270_laneq_f64(__p0_811, __p1_811, __p2_811, __p3_811) __extension__ ({ \ - float64x2_t __ret_811; \ - float64x2_t __s0_811 = __p0_811; \ - float64x2_t __s1_811 = __p1_811; \ - float64x2_t __s2_811 = __p2_811; \ -float64x2_t __reint_811 = __s2_811; \ -uint64x2_t __reint1_811 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_811, __p3_811), vgetq_lane_u64(*(uint64x2_t *) &__reint_811, __p3_811)}; \ - __ret_811 = vcmlaq_rot270_f64(__s0_811, __s1_811, *(float64x2_t *) &__reint1_811); \ - __ret_811; \ -}) -#else -#define vcmlaq_rot270_laneq_f64(__p0_812, __p1_812, __p2_812, __p3_812) __extension__ ({ \ - float64x2_t __ret_812; \ - float64x2_t __s0_812 = __p0_812; \ - float64x2_t __s1_812 = __p1_812; \ - float64x2_t __s2_812 = __p2_812; \ - float64x2_t __rev0_812; __rev0_812 = __builtin_shufflevector(__s0_812, __s0_812, 1, 0); \ - float64x2_t __rev1_812; __rev1_812 = __builtin_shufflevector(__s1_812, __s1_812, 1, 0); \ - float64x2_t __rev2_812; __rev2_812 = __builtin_shufflevector(__s2_812, __s2_812, 1, 0); \ -float64x2_t __reint_812 = __rev2_812; \ -uint64x2_t __reint1_812 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_812, __p3_812), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_812, __p3_812)}; \ - __ret_812 = __noswap_vcmlaq_rot270_f64(__rev0_812, __rev1_812, *(float64x2_t *) &__reint1_812); \ - __ret_812 = __builtin_shufflevector(__ret_812, __ret_812, 1, 0); \ - __ret_812; \ -}) #endif #ifdef __LITTLE_ENDIAN__ @@ -65970,106 +68031,6 @@ __ai __attribute__((target("v8.3a,neon"))) float64x2_t vcmlaq_rot90_f64(float64x __ret = __builtin_shufflevector(__ret, __ret, 1, 0); return __ret; } -__ai __attribute__((target("v8.3a,neon"))) float64x2_t __noswap_vcmlaq_rot90_f64(float64x2_t __p0, float64x2_t __p1, float64x2_t __p2) { - float64x2_t __ret; - __ret = (float64x2_t) __builtin_neon_vcmlaq_rot90_f64((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 42); - return __ret; -} -#endif - -__ai __attribute__((target("v8.3a,neon"))) float64x1_t vcmla_rot90_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) { - float64x1_t __ret; - __ret = (float64x1_t) __builtin_neon_vcmla_rot90_f64((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 10); - return __ret; -} -#define vcmla_rot90_lane_f64(__p0_813, __p1_813, __p2_813, __p3_813) __extension__ ({ \ - float64x1_t __ret_813; \ - float64x1_t __s0_813 = __p0_813; \ - float64x1_t __s1_813 = __p1_813; \ - float64x1_t __s2_813 = __p2_813; \ -float64x1_t __reint_813 = __s2_813; \ -uint64x2_t __reint1_813 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_813, __p3_813), vgetq_lane_u64(*(uint64x2_t *) &__reint_813, __p3_813)}; \ - __ret_813 = vcmla_rot90_f64(__s0_813, __s1_813, *(float64x1_t *) &__reint1_813); \ - __ret_813; 
\ -}) -#ifdef __LITTLE_ENDIAN__ -#define vcmlaq_rot90_lane_f64(__p0_814, __p1_814, __p2_814, __p3_814) __extension__ ({ \ - float64x2_t __ret_814; \ - float64x2_t __s0_814 = __p0_814; \ - float64x2_t __s1_814 = __p1_814; \ - float64x1_t __s2_814 = __p2_814; \ -float64x1_t __reint_814 = __s2_814; \ -uint64x2_t __reint1_814 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_814, __p3_814), vgetq_lane_u64(*(uint64x2_t *) &__reint_814, __p3_814)}; \ - __ret_814 = vcmlaq_rot90_f64(__s0_814, __s1_814, *(float64x2_t *) &__reint1_814); \ - __ret_814; \ -}) -#else -#define vcmlaq_rot90_lane_f64(__p0_815, __p1_815, __p2_815, __p3_815) __extension__ ({ \ - float64x2_t __ret_815; \ - float64x2_t __s0_815 = __p0_815; \ - float64x2_t __s1_815 = __p1_815; \ - float64x1_t __s2_815 = __p2_815; \ - float64x2_t __rev0_815; __rev0_815 = __builtin_shufflevector(__s0_815, __s0_815, 1, 0); \ - float64x2_t __rev1_815; __rev1_815 = __builtin_shufflevector(__s1_815, __s1_815, 1, 0); \ -float64x1_t __reint_815 = __s2_815; \ -uint64x2_t __reint1_815 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_815, __p3_815), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_815, __p3_815)}; \ - __ret_815 = __noswap_vcmlaq_rot90_f64(__rev0_815, __rev1_815, *(float64x2_t *) &__reint1_815); \ - __ret_815 = __builtin_shufflevector(__ret_815, __ret_815, 1, 0); \ - __ret_815; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcmla_rot90_laneq_f64(__p0_816, __p1_816, __p2_816, __p3_816) __extension__ ({ \ - float64x1_t __ret_816; \ - float64x1_t __s0_816 = __p0_816; \ - float64x1_t __s1_816 = __p1_816; \ - float64x2_t __s2_816 = __p2_816; \ -float64x2_t __reint_816 = __s2_816; \ -uint64x2_t __reint1_816 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_816, __p3_816), vgetq_lane_u64(*(uint64x2_t *) &__reint_816, __p3_816)}; \ - __ret_816 = vcmla_rot90_f64(__s0_816, __s1_816, *(float64x1_t *) &__reint1_816); \ - __ret_816; \ -}) -#else -#define vcmla_rot90_laneq_f64(__p0_817, __p1_817, __p2_817, __p3_817) __extension__ ({ \ - float64x1_t __ret_817; \ - float64x1_t __s0_817 = __p0_817; \ - float64x1_t __s1_817 = __p1_817; \ - float64x2_t __s2_817 = __p2_817; \ - float64x2_t __rev2_817; __rev2_817 = __builtin_shufflevector(__s2_817, __s2_817, 1, 0); \ -float64x2_t __reint_817 = __rev2_817; \ -uint64x2_t __reint1_817 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_817, __p3_817), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_817, __p3_817)}; \ - __ret_817 = vcmla_rot90_f64(__s0_817, __s1_817, *(float64x1_t *) &__reint1_817); \ - __ret_817; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcmlaq_rot90_laneq_f64(__p0_818, __p1_818, __p2_818, __p3_818) __extension__ ({ \ - float64x2_t __ret_818; \ - float64x2_t __s0_818 = __p0_818; \ - float64x2_t __s1_818 = __p1_818; \ - float64x2_t __s2_818 = __p2_818; \ -float64x2_t __reint_818 = __s2_818; \ -uint64x2_t __reint1_818 = (uint64x2_t) {vgetq_lane_u64(*(uint64x2_t *) &__reint_818, __p3_818), vgetq_lane_u64(*(uint64x2_t *) &__reint_818, __p3_818)}; \ - __ret_818 = vcmlaq_rot90_f64(__s0_818, __s1_818, *(float64x2_t *) &__reint1_818); \ - __ret_818; \ -}) -#else -#define vcmlaq_rot90_laneq_f64(__p0_819, __p1_819, __p2_819, __p3_819) __extension__ ({ \ - float64x2_t __ret_819; \ - float64x2_t __s0_819 = __p0_819; \ - float64x2_t __s1_819 = __p1_819; \ - float64x2_t __s2_819 = __p2_819; \ - float64x2_t __rev0_819; __rev0_819 = __builtin_shufflevector(__s0_819, __s0_819, 1, 0); \ - float64x2_t __rev1_819; __rev1_819 = __builtin_shufflevector(__s1_819, 
__s1_819, 1, 0); \ - float64x2_t __rev2_819; __rev2_819 = __builtin_shufflevector(__s2_819, __s2_819, 1, 0); \ -float64x2_t __reint_819 = __rev2_819; \ -uint64x2_t __reint1_819 = (uint64x2_t) {__noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_819, __p3_819), __noswap_vgetq_lane_u64(*(uint64x2_t *) &__reint_819, __p3_819)}; \ - __ret_819 = __noswap_vcmlaq_rot90_f64(__rev0_819, __rev1_819, *(float64x2_t *) &__reint1_819); \ - __ret_819 = __builtin_shufflevector(__ret_819, __ret_819, 1, 0); \ - __ret_819; \ -}) #endif #ifdef __LITTLE_ENDIAN__ @@ -66286,237 +68247,215 @@ __ai __attribute__((target("v8.5a,neon"))) float64x1_t vrnd64z_f64(float64x1_t _ } #endif #ifdef __LITTLE_ENDIAN__ -#define vbfdotq_lane_f32(__p0_820, __p1_820, __p2_820, __p3_820) __extension__ ({ \ - float32x4_t __ret_820; \ - float32x4_t __s0_820 = __p0_820; \ - bfloat16x8_t __s1_820 = __p1_820; \ - bfloat16x4_t __s2_820 = __p2_820; \ -bfloat16x4_t __reint_820 = __s2_820; \ -float32x4_t __reint1_820 = splatq_lane_f32(*(float32x2_t *) &__reint_820, __p3_820); \ - __ret_820 = vbfdotq_f32(__s0_820, __s1_820, *(bfloat16x8_t *) &__reint1_820); \ - __ret_820; \ +#define vbfdotq_lane_f32(__p0_792, __p1_792, __p2_792, __p3_792) __extension__ ({ \ + float32x4_t __ret_792; \ + float32x4_t __s0_792 = __p0_792; \ + bfloat16x8_t __s1_792 = __p1_792; \ + bfloat16x4_t __s2_792 = __p2_792; \ + __ret_792 = vbfdotq_f32(__s0_792, __s1_792, __builtin_bit_cast(bfloat16x8_t, splatq_lane_f32(__builtin_bit_cast(float32x2_t, __s2_792), __p3_792))); \ + __ret_792; \ }) #else -#define vbfdotq_lane_f32(__p0_821, __p1_821, __p2_821, __p3_821) __extension__ ({ \ - float32x4_t __ret_821; \ - float32x4_t __s0_821 = __p0_821; \ - bfloat16x8_t __s1_821 = __p1_821; \ - bfloat16x4_t __s2_821 = __p2_821; \ - float32x4_t __rev0_821; __rev0_821 = __builtin_shufflevector(__s0_821, __s0_821, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_821; __rev1_821 = __builtin_shufflevector(__s1_821, __s1_821, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x4_t __rev2_821; __rev2_821 = __builtin_shufflevector(__s2_821, __s2_821, 3, 2, 1, 0); \ -bfloat16x4_t __reint_821 = __rev2_821; \ -float32x4_t __reint1_821 = __noswap_splatq_lane_f32(*(float32x2_t *) &__reint_821, __p3_821); \ - __ret_821 = __noswap_vbfdotq_f32(__rev0_821, __rev1_821, *(bfloat16x8_t *) &__reint1_821); \ - __ret_821 = __builtin_shufflevector(__ret_821, __ret_821, 3, 2, 1, 0); \ - __ret_821; \ +#define vbfdotq_lane_f32(__p0_793, __p1_793, __p2_793, __p3_793) __extension__ ({ \ + float32x4_t __ret_793; \ + float32x4_t __s0_793 = __p0_793; \ + bfloat16x8_t __s1_793 = __p1_793; \ + bfloat16x4_t __s2_793 = __p2_793; \ + float32x4_t __rev0_793; __rev0_793 = __builtin_shufflevector(__s0_793, __s0_793, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_793; __rev1_793 = __builtin_shufflevector(__s1_793, __s1_793, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x4_t __rev2_793; __rev2_793 = __builtin_shufflevector(__s2_793, __s2_793, 3, 2, 1, 0); \ + __ret_793 = __noswap_vbfdotq_f32(__rev0_793, __rev1_793, __builtin_bit_cast(bfloat16x8_t, __noswap_splatq_lane_f32(__builtin_bit_cast(float32x2_t, __rev2_793), __p3_793))); \ + __ret_793 = __builtin_shufflevector(__ret_793, __ret_793, 3, 2, 1, 0); \ + __ret_793; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfdot_lane_f32(__p0_822, __p1_822, __p2_822, __p3_822) __extension__ ({ \ - float32x2_t __ret_822; \ - float32x2_t __s0_822 = __p0_822; \ - bfloat16x4_t __s1_822 = __p1_822; \ - bfloat16x4_t __s2_822 = __p2_822; \ -bfloat16x4_t __reint_822 = __s2_822; \ -float32x2_t __reint1_822 = splat_lane_f32(*(float32x2_t 
*) &__reint_822, __p3_822); \ - __ret_822 = vbfdot_f32(__s0_822, __s1_822, *(bfloat16x4_t *) &__reint1_822); \ - __ret_822; \ +#define vbfdot_lane_f32(__p0_794, __p1_794, __p2_794, __p3_794) __extension__ ({ \ + float32x2_t __ret_794; \ + float32x2_t __s0_794 = __p0_794; \ + bfloat16x4_t __s1_794 = __p1_794; \ + bfloat16x4_t __s2_794 = __p2_794; \ + __ret_794 = vbfdot_f32(__s0_794, __s1_794, __builtin_bit_cast(bfloat16x4_t, splat_lane_f32(__builtin_bit_cast(float32x2_t, __s2_794), __p3_794))); \ + __ret_794; \ }) #else -#define vbfdot_lane_f32(__p0_823, __p1_823, __p2_823, __p3_823) __extension__ ({ \ - float32x2_t __ret_823; \ - float32x2_t __s0_823 = __p0_823; \ - bfloat16x4_t __s1_823 = __p1_823; \ - bfloat16x4_t __s2_823 = __p2_823; \ - float32x2_t __rev0_823; __rev0_823 = __builtin_shufflevector(__s0_823, __s0_823, 1, 0); \ - bfloat16x4_t __rev1_823; __rev1_823 = __builtin_shufflevector(__s1_823, __s1_823, 3, 2, 1, 0); \ - bfloat16x4_t __rev2_823; __rev2_823 = __builtin_shufflevector(__s2_823, __s2_823, 3, 2, 1, 0); \ -bfloat16x4_t __reint_823 = __rev2_823; \ -float32x2_t __reint1_823 = __noswap_splat_lane_f32(*(float32x2_t *) &__reint_823, __p3_823); \ - __ret_823 = __noswap_vbfdot_f32(__rev0_823, __rev1_823, *(bfloat16x4_t *) &__reint1_823); \ - __ret_823 = __builtin_shufflevector(__ret_823, __ret_823, 1, 0); \ - __ret_823; \ +#define vbfdot_lane_f32(__p0_795, __p1_795, __p2_795, __p3_795) __extension__ ({ \ + float32x2_t __ret_795; \ + float32x2_t __s0_795 = __p0_795; \ + bfloat16x4_t __s1_795 = __p1_795; \ + bfloat16x4_t __s2_795 = __p2_795; \ + float32x2_t __rev0_795; __rev0_795 = __builtin_shufflevector(__s0_795, __s0_795, 1, 0); \ + bfloat16x4_t __rev1_795; __rev1_795 = __builtin_shufflevector(__s1_795, __s1_795, 3, 2, 1, 0); \ + bfloat16x4_t __rev2_795; __rev2_795 = __builtin_shufflevector(__s2_795, __s2_795, 3, 2, 1, 0); \ + __ret_795 = __noswap_vbfdot_f32(__rev0_795, __rev1_795, __builtin_bit_cast(bfloat16x4_t, __noswap_splat_lane_f32(__builtin_bit_cast(float32x2_t, __rev2_795), __p3_795))); \ + __ret_795 = __builtin_shufflevector(__ret_795, __ret_795, 1, 0); \ + __ret_795; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfdotq_laneq_f32(__p0_824, __p1_824, __p2_824, __p3_824) __extension__ ({ \ - float32x4_t __ret_824; \ - float32x4_t __s0_824 = __p0_824; \ - bfloat16x8_t __s1_824 = __p1_824; \ - bfloat16x8_t __s2_824 = __p2_824; \ -bfloat16x8_t __reint_824 = __s2_824; \ -float32x4_t __reint1_824 = splatq_laneq_f32(*(float32x4_t *) &__reint_824, __p3_824); \ - __ret_824 = vbfdotq_f32(__s0_824, __s1_824, *(bfloat16x8_t *) &__reint1_824); \ - __ret_824; \ +#define vbfdotq_laneq_f32(__p0_796, __p1_796, __p2_796, __p3_796) __extension__ ({ \ + float32x4_t __ret_796; \ + float32x4_t __s0_796 = __p0_796; \ + bfloat16x8_t __s1_796 = __p1_796; \ + bfloat16x8_t __s2_796 = __p2_796; \ + __ret_796 = vbfdotq_f32(__s0_796, __s1_796, __builtin_bit_cast(bfloat16x8_t, splatq_laneq_f32(__builtin_bit_cast(float32x4_t, __s2_796), __p3_796))); \ + __ret_796; \ }) #else -#define vbfdotq_laneq_f32(__p0_825, __p1_825, __p2_825, __p3_825) __extension__ ({ \ - float32x4_t __ret_825; \ - float32x4_t __s0_825 = __p0_825; \ - bfloat16x8_t __s1_825 = __p1_825; \ - bfloat16x8_t __s2_825 = __p2_825; \ - float32x4_t __rev0_825; __rev0_825 = __builtin_shufflevector(__s0_825, __s0_825, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_825; __rev1_825 = __builtin_shufflevector(__s1_825, __s1_825, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x8_t __rev2_825; __rev2_825 = __builtin_shufflevector(__s2_825, __s2_825, 7, 6, 5, 4, 3, 
2, 1, 0); \ -bfloat16x8_t __reint_825 = __rev2_825; \ -float32x4_t __reint1_825 = __noswap_splatq_laneq_f32(*(float32x4_t *) &__reint_825, __p3_825); \ - __ret_825 = __noswap_vbfdotq_f32(__rev0_825, __rev1_825, *(bfloat16x8_t *) &__reint1_825); \ - __ret_825 = __builtin_shufflevector(__ret_825, __ret_825, 3, 2, 1, 0); \ - __ret_825; \ +#define vbfdotq_laneq_f32(__p0_797, __p1_797, __p2_797, __p3_797) __extension__ ({ \ + float32x4_t __ret_797; \ + float32x4_t __s0_797 = __p0_797; \ + bfloat16x8_t __s1_797 = __p1_797; \ + bfloat16x8_t __s2_797 = __p2_797; \ + float32x4_t __rev0_797; __rev0_797 = __builtin_shufflevector(__s0_797, __s0_797, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_797; __rev1_797 = __builtin_shufflevector(__s1_797, __s1_797, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x8_t __rev2_797; __rev2_797 = __builtin_shufflevector(__s2_797, __s2_797, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_797 = __noswap_vbfdotq_f32(__rev0_797, __rev1_797, __builtin_bit_cast(bfloat16x8_t, __noswap_splatq_laneq_f32(__builtin_bit_cast(float32x4_t, __rev2_797), __p3_797))); \ + __ret_797 = __builtin_shufflevector(__ret_797, __ret_797, 3, 2, 1, 0); \ + __ret_797; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfdot_laneq_f32(__p0_826, __p1_826, __p2_826, __p3_826) __extension__ ({ \ - float32x2_t __ret_826; \ - float32x2_t __s0_826 = __p0_826; \ - bfloat16x4_t __s1_826 = __p1_826; \ - bfloat16x8_t __s2_826 = __p2_826; \ -bfloat16x8_t __reint_826 = __s2_826; \ -float32x2_t __reint1_826 = splat_laneq_f32(*(float32x4_t *) &__reint_826, __p3_826); \ - __ret_826 = vbfdot_f32(__s0_826, __s1_826, *(bfloat16x4_t *) &__reint1_826); \ - __ret_826; \ +#define vbfdot_laneq_f32(__p0_798, __p1_798, __p2_798, __p3_798) __extension__ ({ \ + float32x2_t __ret_798; \ + float32x2_t __s0_798 = __p0_798; \ + bfloat16x4_t __s1_798 = __p1_798; \ + bfloat16x8_t __s2_798 = __p2_798; \ + __ret_798 = vbfdot_f32(__s0_798, __s1_798, __builtin_bit_cast(bfloat16x4_t, splat_laneq_f32(__builtin_bit_cast(float32x4_t, __s2_798), __p3_798))); \ + __ret_798; \ }) #else -#define vbfdot_laneq_f32(__p0_827, __p1_827, __p2_827, __p3_827) __extension__ ({ \ - float32x2_t __ret_827; \ - float32x2_t __s0_827 = __p0_827; \ - bfloat16x4_t __s1_827 = __p1_827; \ - bfloat16x8_t __s2_827 = __p2_827; \ - float32x2_t __rev0_827; __rev0_827 = __builtin_shufflevector(__s0_827, __s0_827, 1, 0); \ - bfloat16x4_t __rev1_827; __rev1_827 = __builtin_shufflevector(__s1_827, __s1_827, 3, 2, 1, 0); \ - bfloat16x8_t __rev2_827; __rev2_827 = __builtin_shufflevector(__s2_827, __s2_827, 7, 6, 5, 4, 3, 2, 1, 0); \ -bfloat16x8_t __reint_827 = __rev2_827; \ -float32x2_t __reint1_827 = __noswap_splat_laneq_f32(*(float32x4_t *) &__reint_827, __p3_827); \ - __ret_827 = __noswap_vbfdot_f32(__rev0_827, __rev1_827, *(bfloat16x4_t *) &__reint1_827); \ - __ret_827 = __builtin_shufflevector(__ret_827, __ret_827, 1, 0); \ - __ret_827; \ +#define vbfdot_laneq_f32(__p0_799, __p1_799, __p2_799, __p3_799) __extension__ ({ \ + float32x2_t __ret_799; \ + float32x2_t __s0_799 = __p0_799; \ + bfloat16x4_t __s1_799 = __p1_799; \ + bfloat16x8_t __s2_799 = __p2_799; \ + float32x2_t __rev0_799; __rev0_799 = __builtin_shufflevector(__s0_799, __s0_799, 1, 0); \ + bfloat16x4_t __rev1_799; __rev1_799 = __builtin_shufflevector(__s1_799, __s1_799, 3, 2, 1, 0); \ + bfloat16x8_t __rev2_799; __rev2_799 = __builtin_shufflevector(__s2_799, __s2_799, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_799 = __noswap_vbfdot_f32(__rev0_799, __rev1_799, __builtin_bit_cast(bfloat16x4_t, __noswap_splat_laneq_f32(__builtin_bit_cast(float32x4_t, 
__rev2_799), __p3_799))); \ + __ret_799 = __builtin_shufflevector(__ret_799, __ret_799, 1, 0); \ + __ret_799; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfmlalbq_lane_f32(__p0_828, __p1_828, __p2_828, __p3_828) __extension__ ({ \ - float32x4_t __ret_828; \ - float32x4_t __s0_828 = __p0_828; \ - bfloat16x8_t __s1_828 = __p1_828; \ - bfloat16x4_t __s2_828 = __p2_828; \ - __ret_828 = vbfmlalbq_f32(__s0_828, __s1_828, (bfloat16x8_t) {vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828), vget_lane_bf16(__s2_828, __p3_828)}); \ - __ret_828; \ +#define vbfmlalbq_lane_f32(__p0_800, __p1_800, __p2_800, __p3_800) __extension__ ({ \ + float32x4_t __ret_800; \ + float32x4_t __s0_800 = __p0_800; \ + bfloat16x8_t __s1_800 = __p1_800; \ + bfloat16x4_t __s2_800 = __p2_800; \ + __ret_800 = vbfmlalbq_f32(__s0_800, __s1_800, (bfloat16x8_t) {vget_lane_bf16(__s2_800, __p3_800), vget_lane_bf16(__s2_800, __p3_800), vget_lane_bf16(__s2_800, __p3_800), vget_lane_bf16(__s2_800, __p3_800), vget_lane_bf16(__s2_800, __p3_800), vget_lane_bf16(__s2_800, __p3_800), vget_lane_bf16(__s2_800, __p3_800), vget_lane_bf16(__s2_800, __p3_800)}); \ + __ret_800; \ }) #else -#define vbfmlalbq_lane_f32(__p0_829, __p1_829, __p2_829, __p3_829) __extension__ ({ \ - float32x4_t __ret_829; \ - float32x4_t __s0_829 = __p0_829; \ - bfloat16x8_t __s1_829 = __p1_829; \ - bfloat16x4_t __s2_829 = __p2_829; \ - float32x4_t __rev0_829; __rev0_829 = __builtin_shufflevector(__s0_829, __s0_829, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_829; __rev1_829 = __builtin_shufflevector(__s1_829, __s1_829, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x4_t __rev2_829; __rev2_829 = __builtin_shufflevector(__s2_829, __s2_829, 3, 2, 1, 0); \ - __ret_829 = __noswap_vbfmlalbq_f32(__rev0_829, __rev1_829, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829), __noswap_vget_lane_bf16(__rev2_829, __p3_829)}); \ - __ret_829 = __builtin_shufflevector(__ret_829, __ret_829, 3, 2, 1, 0); \ - __ret_829; \ +#define vbfmlalbq_lane_f32(__p0_801, __p1_801, __p2_801, __p3_801) __extension__ ({ \ + float32x4_t __ret_801; \ + float32x4_t __s0_801 = __p0_801; \ + bfloat16x8_t __s1_801 = __p1_801; \ + bfloat16x4_t __s2_801 = __p2_801; \ + float32x4_t __rev0_801; __rev0_801 = __builtin_shufflevector(__s0_801, __s0_801, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_801; __rev1_801 = __builtin_shufflevector(__s1_801, __s1_801, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x4_t __rev2_801; __rev2_801 = __builtin_shufflevector(__s2_801, __s2_801, 3, 2, 1, 0); \ + __ret_801 = __noswap_vbfmlalbq_f32(__rev0_801, __rev1_801, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_801, __p3_801), __noswap_vget_lane_bf16(__rev2_801, __p3_801), __noswap_vget_lane_bf16(__rev2_801, __p3_801), __noswap_vget_lane_bf16(__rev2_801, __p3_801), __noswap_vget_lane_bf16(__rev2_801, __p3_801), __noswap_vget_lane_bf16(__rev2_801, __p3_801), __noswap_vget_lane_bf16(__rev2_801, __p3_801), __noswap_vget_lane_bf16(__rev2_801, __p3_801)}); \ + __ret_801 = __builtin_shufflevector(__ret_801, __ret_801, 3, 2, 1, 0); \ + __ret_801; \ }) #endif #ifdef __LITTLE_ENDIAN__ 
-#define vbfmlalbq_laneq_f32(__p0_830, __p1_830, __p2_830, __p3_830) __extension__ ({ \ - float32x4_t __ret_830; \ - float32x4_t __s0_830 = __p0_830; \ - bfloat16x8_t __s1_830 = __p1_830; \ - bfloat16x8_t __s2_830 = __p2_830; \ - __ret_830 = vbfmlalbq_f32(__s0_830, __s1_830, (bfloat16x8_t) {vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830), vgetq_lane_bf16(__s2_830, __p3_830)}); \ - __ret_830; \ +#define vbfmlalbq_laneq_f32(__p0_802, __p1_802, __p2_802, __p3_802) __extension__ ({ \ + float32x4_t __ret_802; \ + float32x4_t __s0_802 = __p0_802; \ + bfloat16x8_t __s1_802 = __p1_802; \ + bfloat16x8_t __s2_802 = __p2_802; \ + __ret_802 = vbfmlalbq_f32(__s0_802, __s1_802, (bfloat16x8_t) {vgetq_lane_bf16(__s2_802, __p3_802), vgetq_lane_bf16(__s2_802, __p3_802), vgetq_lane_bf16(__s2_802, __p3_802), vgetq_lane_bf16(__s2_802, __p3_802), vgetq_lane_bf16(__s2_802, __p3_802), vgetq_lane_bf16(__s2_802, __p3_802), vgetq_lane_bf16(__s2_802, __p3_802), vgetq_lane_bf16(__s2_802, __p3_802)}); \ + __ret_802; \ }) #else -#define vbfmlalbq_laneq_f32(__p0_831, __p1_831, __p2_831, __p3_831) __extension__ ({ \ - float32x4_t __ret_831; \ - float32x4_t __s0_831 = __p0_831; \ - bfloat16x8_t __s1_831 = __p1_831; \ - bfloat16x8_t __s2_831 = __p2_831; \ - float32x4_t __rev0_831; __rev0_831 = __builtin_shufflevector(__s0_831, __s0_831, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_831; __rev1_831 = __builtin_shufflevector(__s1_831, __s1_831, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x8_t __rev2_831; __rev2_831 = __builtin_shufflevector(__s2_831, __s2_831, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_831 = __noswap_vbfmlalbq_f32(__rev0_831, __rev1_831, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831), __noswap_vgetq_lane_bf16(__rev2_831, __p3_831)}); \ - __ret_831 = __builtin_shufflevector(__ret_831, __ret_831, 3, 2, 1, 0); \ - __ret_831; \ +#define vbfmlalbq_laneq_f32(__p0_803, __p1_803, __p2_803, __p3_803) __extension__ ({ \ + float32x4_t __ret_803; \ + float32x4_t __s0_803 = __p0_803; \ + bfloat16x8_t __s1_803 = __p1_803; \ + bfloat16x8_t __s2_803 = __p2_803; \ + float32x4_t __rev0_803; __rev0_803 = __builtin_shufflevector(__s0_803, __s0_803, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_803; __rev1_803 = __builtin_shufflevector(__s1_803, __s1_803, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x8_t __rev2_803; __rev2_803 = __builtin_shufflevector(__s2_803, __s2_803, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_803 = __noswap_vbfmlalbq_f32(__rev0_803, __rev1_803, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_803, __p3_803), __noswap_vgetq_lane_bf16(__rev2_803, __p3_803), __noswap_vgetq_lane_bf16(__rev2_803, __p3_803), __noswap_vgetq_lane_bf16(__rev2_803, __p3_803), __noswap_vgetq_lane_bf16(__rev2_803, __p3_803), __noswap_vgetq_lane_bf16(__rev2_803, __p3_803), __noswap_vgetq_lane_bf16(__rev2_803, __p3_803), __noswap_vgetq_lane_bf16(__rev2_803, __p3_803)}); \ + __ret_803 = __builtin_shufflevector(__ret_803, __ret_803, 3, 2, 1, 0); \ + __ret_803; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfmlaltq_lane_f32(__p0_832, __p1_832, __p2_832, __p3_832) __extension__ ({ \ - 
float32x4_t __ret_832; \ - float32x4_t __s0_832 = __p0_832; \ - bfloat16x8_t __s1_832 = __p1_832; \ - bfloat16x4_t __s2_832 = __p2_832; \ - __ret_832 = vbfmlaltq_f32(__s0_832, __s1_832, (bfloat16x8_t) {vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832), vget_lane_bf16(__s2_832, __p3_832)}); \ - __ret_832; \ +#define vbfmlaltq_lane_f32(__p0_804, __p1_804, __p2_804, __p3_804) __extension__ ({ \ + float32x4_t __ret_804; \ + float32x4_t __s0_804 = __p0_804; \ + bfloat16x8_t __s1_804 = __p1_804; \ + bfloat16x4_t __s2_804 = __p2_804; \ + __ret_804 = vbfmlaltq_f32(__s0_804, __s1_804, (bfloat16x8_t) {vget_lane_bf16(__s2_804, __p3_804), vget_lane_bf16(__s2_804, __p3_804), vget_lane_bf16(__s2_804, __p3_804), vget_lane_bf16(__s2_804, __p3_804), vget_lane_bf16(__s2_804, __p3_804), vget_lane_bf16(__s2_804, __p3_804), vget_lane_bf16(__s2_804, __p3_804), vget_lane_bf16(__s2_804, __p3_804)}); \ + __ret_804; \ }) #else -#define vbfmlaltq_lane_f32(__p0_833, __p1_833, __p2_833, __p3_833) __extension__ ({ \ - float32x4_t __ret_833; \ - float32x4_t __s0_833 = __p0_833; \ - bfloat16x8_t __s1_833 = __p1_833; \ - bfloat16x4_t __s2_833 = __p2_833; \ - float32x4_t __rev0_833; __rev0_833 = __builtin_shufflevector(__s0_833, __s0_833, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_833; __rev1_833 = __builtin_shufflevector(__s1_833, __s1_833, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x4_t __rev2_833; __rev2_833 = __builtin_shufflevector(__s2_833, __s2_833, 3, 2, 1, 0); \ - __ret_833 = __noswap_vbfmlaltq_f32(__rev0_833, __rev1_833, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833), __noswap_vget_lane_bf16(__rev2_833, __p3_833)}); \ - __ret_833 = __builtin_shufflevector(__ret_833, __ret_833, 3, 2, 1, 0); \ - __ret_833; \ +#define vbfmlaltq_lane_f32(__p0_805, __p1_805, __p2_805, __p3_805) __extension__ ({ \ + float32x4_t __ret_805; \ + float32x4_t __s0_805 = __p0_805; \ + bfloat16x8_t __s1_805 = __p1_805; \ + bfloat16x4_t __s2_805 = __p2_805; \ + float32x4_t __rev0_805; __rev0_805 = __builtin_shufflevector(__s0_805, __s0_805, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_805; __rev1_805 = __builtin_shufflevector(__s1_805, __s1_805, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x4_t __rev2_805; __rev2_805 = __builtin_shufflevector(__s2_805, __s2_805, 3, 2, 1, 0); \ + __ret_805 = __noswap_vbfmlaltq_f32(__rev0_805, __rev1_805, (bfloat16x8_t) {__noswap_vget_lane_bf16(__rev2_805, __p3_805), __noswap_vget_lane_bf16(__rev2_805, __p3_805), __noswap_vget_lane_bf16(__rev2_805, __p3_805), __noswap_vget_lane_bf16(__rev2_805, __p3_805), __noswap_vget_lane_bf16(__rev2_805, __p3_805), __noswap_vget_lane_bf16(__rev2_805, __p3_805), __noswap_vget_lane_bf16(__rev2_805, __p3_805), __noswap_vget_lane_bf16(__rev2_805, __p3_805)}); \ + __ret_805 = __builtin_shufflevector(__ret_805, __ret_805, 3, 2, 1, 0); \ + __ret_805; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vbfmlaltq_laneq_f32(__p0_834, __p1_834, __p2_834, __p3_834) __extension__ ({ \ - float32x4_t __ret_834; \ - float32x4_t __s0_834 = __p0_834; \ - bfloat16x8_t __s1_834 = __p1_834; \ - bfloat16x8_t __s2_834 = __p2_834; \ - __ret_834 
= vbfmlaltq_f32(__s0_834, __s1_834, (bfloat16x8_t) {vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834), vgetq_lane_bf16(__s2_834, __p3_834)}); \ - __ret_834; \ +#define vbfmlaltq_laneq_f32(__p0_806, __p1_806, __p2_806, __p3_806) __extension__ ({ \ + float32x4_t __ret_806; \ + float32x4_t __s0_806 = __p0_806; \ + bfloat16x8_t __s1_806 = __p1_806; \ + bfloat16x8_t __s2_806 = __p2_806; \ + __ret_806 = vbfmlaltq_f32(__s0_806, __s1_806, (bfloat16x8_t) {vgetq_lane_bf16(__s2_806, __p3_806), vgetq_lane_bf16(__s2_806, __p3_806), vgetq_lane_bf16(__s2_806, __p3_806), vgetq_lane_bf16(__s2_806, __p3_806), vgetq_lane_bf16(__s2_806, __p3_806), vgetq_lane_bf16(__s2_806, __p3_806), vgetq_lane_bf16(__s2_806, __p3_806), vgetq_lane_bf16(__s2_806, __p3_806)}); \ + __ret_806; \ }) #else -#define vbfmlaltq_laneq_f32(__p0_835, __p1_835, __p2_835, __p3_835) __extension__ ({ \ - float32x4_t __ret_835; \ - float32x4_t __s0_835 = __p0_835; \ - bfloat16x8_t __s1_835 = __p1_835; \ - bfloat16x8_t __s2_835 = __p2_835; \ - float32x4_t __rev0_835; __rev0_835 = __builtin_shufflevector(__s0_835, __s0_835, 3, 2, 1, 0); \ - bfloat16x8_t __rev1_835; __rev1_835 = __builtin_shufflevector(__s1_835, __s1_835, 7, 6, 5, 4, 3, 2, 1, 0); \ - bfloat16x8_t __rev2_835; __rev2_835 = __builtin_shufflevector(__s2_835, __s2_835, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_835 = __noswap_vbfmlaltq_f32(__rev0_835, __rev1_835, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835), __noswap_vgetq_lane_bf16(__rev2_835, __p3_835)}); \ - __ret_835 = __builtin_shufflevector(__ret_835, __ret_835, 3, 2, 1, 0); \ - __ret_835; \ +#define vbfmlaltq_laneq_f32(__p0_807, __p1_807, __p2_807, __p3_807) __extension__ ({ \ + float32x4_t __ret_807; \ + float32x4_t __s0_807 = __p0_807; \ + bfloat16x8_t __s1_807 = __p1_807; \ + bfloat16x8_t __s2_807 = __p2_807; \ + float32x4_t __rev0_807; __rev0_807 = __builtin_shufflevector(__s0_807, __s0_807, 3, 2, 1, 0); \ + bfloat16x8_t __rev1_807; __rev1_807 = __builtin_shufflevector(__s1_807, __s1_807, 7, 6, 5, 4, 3, 2, 1, 0); \ + bfloat16x8_t __rev2_807; __rev2_807 = __builtin_shufflevector(__s2_807, __s2_807, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_807 = __noswap_vbfmlaltq_f32(__rev0_807, __rev1_807, (bfloat16x8_t) {__noswap_vgetq_lane_bf16(__rev2_807, __p3_807), __noswap_vgetq_lane_bf16(__rev2_807, __p3_807), __noswap_vgetq_lane_bf16(__rev2_807, __p3_807), __noswap_vgetq_lane_bf16(__rev2_807, __p3_807), __noswap_vgetq_lane_bf16(__rev2_807, __p3_807), __noswap_vgetq_lane_bf16(__rev2_807, __p3_807), __noswap_vgetq_lane_bf16(__rev2_807, __p3_807), __noswap_vgetq_lane_bf16(__rev2_807, __p3_807)}); \ + __ret_807 = __builtin_shufflevector(__ret_807, __ret_807, 3, 2, 1, 0); \ + __ret_807; \ }) #endif #ifdef __LITTLE_ENDIAN__ -__ai __attribute__((target("bf16,neon"))) float32x4_t vcvt_f32_bf16(bfloat16x4_t __p0_836) { - float32x4_t __ret_836; -bfloat16x4_t __reint_836 = __p0_836; -int32x4_t __reint1_836 = vshll_n_s16(*(int16x4_t *) &__reint_836, 16); - __ret_836 = *(float32x4_t *) &__reint1_836; - return __ret_836; +__ai 
__attribute__((target("bf16,neon"))) float32x4_t vcvt_f32_bf16(bfloat16x4_t __p0_808) { + float32x4_t __ret_808; + __ret_808 = __builtin_bit_cast(float32x4_t, vshll_n_u16(__builtin_bit_cast(uint16x4_t, __p0_808), 16)); + return __ret_808; } #else -__ai __attribute__((target("bf16,neon"))) float32x4_t vcvt_f32_bf16(bfloat16x4_t __p0_837) { - float32x4_t __ret_837; - bfloat16x4_t __rev0_837; __rev0_837 = __builtin_shufflevector(__p0_837, __p0_837, 3, 2, 1, 0); -bfloat16x4_t __reint_837 = __rev0_837; -int32x4_t __reint1_837 = __noswap_vshll_n_s16(*(int16x4_t *) &__reint_837, 16); - __ret_837 = *(float32x4_t *) &__reint1_837; - __ret_837 = __builtin_shufflevector(__ret_837, __ret_837, 3, 2, 1, 0); - return __ret_837; +__ai __attribute__((target("bf16,neon"))) float32x4_t vcvt_f32_bf16(bfloat16x4_t __p0_809) { + float32x4_t __ret_809; + bfloat16x4_t __rev0_809; __rev0_809 = __builtin_shufflevector(__p0_809, __p0_809, 3, 2, 1, 0); + __ret_809 = __builtin_bit_cast(float32x4_t, __noswap_vshll_n_u16(__builtin_bit_cast(uint16x4_t, __rev0_809), 16)); + __ret_809 = __builtin_shufflevector(__ret_809, __ret_809, 3, 2, 1, 0); + return __ret_809; } -__ai __attribute__((target("bf16,neon"))) float32x4_t __noswap_vcvt_f32_bf16(bfloat16x4_t __p0_838) { - float32x4_t __ret_838; -bfloat16x4_t __reint_838 = __p0_838; -int32x4_t __reint1_838 = __noswap_vshll_n_s16(*(int16x4_t *) &__reint_838, 16); - __ret_838 = *(float32x4_t *) &__reint1_838; - return __ret_838; +__ai __attribute__((target("bf16,neon"))) float32x4_t __noswap_vcvt_f32_bf16(bfloat16x4_t __p0_810) { + float32x4_t __ret_810; + __ret_810 = __builtin_bit_cast(float32x4_t, __noswap_vshll_n_u16(__builtin_bit_cast(uint16x4_t, __p0_810), 16)); + return __ret_810; } #endif @@ -66553,260 +68492,236 @@ __ai __attribute__((target("bf16,neon"))) float32x4_t vcvtq_low_f32_bf16(bfloat1 #endif #ifdef __LITTLE_ENDIAN__ -#define vdotq_lane_u32(__p0_839, __p1_839, __p2_839, __p3_839) __extension__ ({ \ - uint32x4_t __ret_839; \ - uint32x4_t __s0_839 = __p0_839; \ - uint8x16_t __s1_839 = __p1_839; \ - uint8x8_t __s2_839 = __p2_839; \ -uint8x8_t __reint_839 = __s2_839; \ -uint32x4_t __reint1_839 = splatq_lane_u32(*(uint32x2_t *) &__reint_839, __p3_839); \ - __ret_839 = vdotq_u32(__s0_839, __s1_839, *(uint8x16_t *) &__reint1_839); \ - __ret_839; \ +#define vdotq_lane_u32(__p0_811, __p1_811, __p2_811, __p3_811) __extension__ ({ \ + uint32x4_t __ret_811; \ + uint32x4_t __s0_811 = __p0_811; \ + uint8x16_t __s1_811 = __p1_811; \ + uint8x8_t __s2_811 = __p2_811; \ + __ret_811 = vdotq_u32(__s0_811, __s1_811, __builtin_bit_cast(uint8x16_t, splatq_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_811), __p3_811))); \ + __ret_811; \ }) #else -#define vdotq_lane_u32(__p0_840, __p1_840, __p2_840, __p3_840) __extension__ ({ \ - uint32x4_t __ret_840; \ - uint32x4_t __s0_840 = __p0_840; \ - uint8x16_t __s1_840 = __p1_840; \ - uint8x8_t __s2_840 = __p2_840; \ - uint32x4_t __rev0_840; __rev0_840 = __builtin_shufflevector(__s0_840, __s0_840, 3, 2, 1, 0); \ - uint8x16_t __rev1_840; __rev1_840 = __builtin_shufflevector(__s1_840, __s1_840, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_840; __rev2_840 = __builtin_shufflevector(__s2_840, __s2_840, 7, 6, 5, 4, 3, 2, 1, 0); \ -uint8x8_t __reint_840 = __rev2_840; \ -uint32x4_t __reint1_840 = __noswap_splatq_lane_u32(*(uint32x2_t *) &__reint_840, __p3_840); \ - __ret_840 = __noswap_vdotq_u32(__rev0_840, __rev1_840, *(uint8x16_t *) &__reint1_840); \ - __ret_840 = __builtin_shufflevector(__ret_840, __ret_840, 3, 2, 1, 0); 
\ - __ret_840; \ +#define vdotq_lane_u32(__p0_812, __p1_812, __p2_812, __p3_812) __extension__ ({ \ + uint32x4_t __ret_812; \ + uint32x4_t __s0_812 = __p0_812; \ + uint8x16_t __s1_812 = __p1_812; \ + uint8x8_t __s2_812 = __p2_812; \ + uint32x4_t __rev0_812; __rev0_812 = __builtin_shufflevector(__s0_812, __s0_812, 3, 2, 1, 0); \ + uint8x16_t __rev1_812; __rev1_812 = __builtin_shufflevector(__s1_812, __s1_812, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_812; __rev2_812 = __builtin_shufflevector(__s2_812, __s2_812, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_812 = __noswap_vdotq_u32(__rev0_812, __rev1_812, __builtin_bit_cast(uint8x16_t, __noswap_splatq_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_812), __p3_812))); \ + __ret_812 = __builtin_shufflevector(__ret_812, __ret_812, 3, 2, 1, 0); \ + __ret_812; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdotq_lane_s32(__p0_841, __p1_841, __p2_841, __p3_841) __extension__ ({ \ - int32x4_t __ret_841; \ - int32x4_t __s0_841 = __p0_841; \ - int8x16_t __s1_841 = __p1_841; \ - int8x8_t __s2_841 = __p2_841; \ -int8x8_t __reint_841 = __s2_841; \ -int32x4_t __reint1_841 = splatq_lane_s32(*(int32x2_t *) &__reint_841, __p3_841); \ - __ret_841 = vdotq_s32(__s0_841, __s1_841, *(int8x16_t *) &__reint1_841); \ - __ret_841; \ +#define vdotq_lane_s32(__p0_813, __p1_813, __p2_813, __p3_813) __extension__ ({ \ + int32x4_t __ret_813; \ + int32x4_t __s0_813 = __p0_813; \ + int8x16_t __s1_813 = __p1_813; \ + int8x8_t __s2_813 = __p2_813; \ + __ret_813 = vdotq_s32(__s0_813, __s1_813, __builtin_bit_cast(int8x16_t, splatq_lane_s32(__builtin_bit_cast(int32x2_t, __s2_813), __p3_813))); \ + __ret_813; \ }) #else -#define vdotq_lane_s32(__p0_842, __p1_842, __p2_842, __p3_842) __extension__ ({ \ - int32x4_t __ret_842; \ - int32x4_t __s0_842 = __p0_842; \ - int8x16_t __s1_842 = __p1_842; \ - int8x8_t __s2_842 = __p2_842; \ - int32x4_t __rev0_842; __rev0_842 = __builtin_shufflevector(__s0_842, __s0_842, 3, 2, 1, 0); \ - int8x16_t __rev1_842; __rev1_842 = __builtin_shufflevector(__s1_842, __s1_842, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_842; __rev2_842 = __builtin_shufflevector(__s2_842, __s2_842, 7, 6, 5, 4, 3, 2, 1, 0); \ -int8x8_t __reint_842 = __rev2_842; \ -int32x4_t __reint1_842 = __noswap_splatq_lane_s32(*(int32x2_t *) &__reint_842, __p3_842); \ - __ret_842 = __noswap_vdotq_s32(__rev0_842, __rev1_842, *(int8x16_t *) &__reint1_842); \ - __ret_842 = __builtin_shufflevector(__ret_842, __ret_842, 3, 2, 1, 0); \ - __ret_842; \ +#define vdotq_lane_s32(__p0_814, __p1_814, __p2_814, __p3_814) __extension__ ({ \ + int32x4_t __ret_814; \ + int32x4_t __s0_814 = __p0_814; \ + int8x16_t __s1_814 = __p1_814; \ + int8x8_t __s2_814 = __p2_814; \ + int32x4_t __rev0_814; __rev0_814 = __builtin_shufflevector(__s0_814, __s0_814, 3, 2, 1, 0); \ + int8x16_t __rev1_814; __rev1_814 = __builtin_shufflevector(__s1_814, __s1_814, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_814; __rev2_814 = __builtin_shufflevector(__s2_814, __s2_814, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_814 = __noswap_vdotq_s32(__rev0_814, __rev1_814, __builtin_bit_cast(int8x16_t, __noswap_splatq_lane_s32(__builtin_bit_cast(int32x2_t, __rev2_814), __p3_814))); \ + __ret_814 = __builtin_shufflevector(__ret_814, __ret_814, 3, 2, 1, 0); \ + __ret_814; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdot_lane_u32(__p0_843, __p1_843, __p2_843, __p3_843) __extension__ ({ \ - uint32x2_t __ret_843; \ - uint32x2_t __s0_843 = __p0_843; \ - uint8x8_t 
__s1_843 = __p1_843; \ - uint8x8_t __s2_843 = __p2_843; \ -uint8x8_t __reint_843 = __s2_843; \ -uint32x2_t __reint1_843 = splat_lane_u32(*(uint32x2_t *) &__reint_843, __p3_843); \ - __ret_843 = vdot_u32(__s0_843, __s1_843, *(uint8x8_t *) &__reint1_843); \ - __ret_843; \ +#define vdot_lane_u32(__p0_815, __p1_815, __p2_815, __p3_815) __extension__ ({ \ + uint32x2_t __ret_815; \ + uint32x2_t __s0_815 = __p0_815; \ + uint8x8_t __s1_815 = __p1_815; \ + uint8x8_t __s2_815 = __p2_815; \ + __ret_815 = vdot_u32(__s0_815, __s1_815, __builtin_bit_cast(uint8x8_t, splat_lane_u32(__builtin_bit_cast(uint32x2_t, __s2_815), __p3_815))); \ + __ret_815; \ }) #else -#define vdot_lane_u32(__p0_844, __p1_844, __p2_844, __p3_844) __extension__ ({ \ - uint32x2_t __ret_844; \ - uint32x2_t __s0_844 = __p0_844; \ - uint8x8_t __s1_844 = __p1_844; \ - uint8x8_t __s2_844 = __p2_844; \ - uint32x2_t __rev0_844; __rev0_844 = __builtin_shufflevector(__s0_844, __s0_844, 1, 0); \ - uint8x8_t __rev1_844; __rev1_844 = __builtin_shufflevector(__s1_844, __s1_844, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_844; __rev2_844 = __builtin_shufflevector(__s2_844, __s2_844, 7, 6, 5, 4, 3, 2, 1, 0); \ -uint8x8_t __reint_844 = __rev2_844; \ -uint32x2_t __reint1_844 = __noswap_splat_lane_u32(*(uint32x2_t *) &__reint_844, __p3_844); \ - __ret_844 = __noswap_vdot_u32(__rev0_844, __rev1_844, *(uint8x8_t *) &__reint1_844); \ - __ret_844 = __builtin_shufflevector(__ret_844, __ret_844, 1, 0); \ - __ret_844; \ +#define vdot_lane_u32(__p0_816, __p1_816, __p2_816, __p3_816) __extension__ ({ \ + uint32x2_t __ret_816; \ + uint32x2_t __s0_816 = __p0_816; \ + uint8x8_t __s1_816 = __p1_816; \ + uint8x8_t __s2_816 = __p2_816; \ + uint32x2_t __rev0_816; __rev0_816 = __builtin_shufflevector(__s0_816, __s0_816, 1, 0); \ + uint8x8_t __rev1_816; __rev1_816 = __builtin_shufflevector(__s1_816, __s1_816, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_816; __rev2_816 = __builtin_shufflevector(__s2_816, __s2_816, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_816 = __noswap_vdot_u32(__rev0_816, __rev1_816, __builtin_bit_cast(uint8x8_t, __noswap_splat_lane_u32(__builtin_bit_cast(uint32x2_t, __rev2_816), __p3_816))); \ + __ret_816 = __builtin_shufflevector(__ret_816, __ret_816, 1, 0); \ + __ret_816; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vdot_lane_s32(__p0_845, __p1_845, __p2_845, __p3_845) __extension__ ({ \ - int32x2_t __ret_845; \ - int32x2_t __s0_845 = __p0_845; \ - int8x8_t __s1_845 = __p1_845; \ - int8x8_t __s2_845 = __p2_845; \ -int8x8_t __reint_845 = __s2_845; \ -int32x2_t __reint1_845 = splat_lane_s32(*(int32x2_t *) &__reint_845, __p3_845); \ - __ret_845 = vdot_s32(__s0_845, __s1_845, *(int8x8_t *) &__reint1_845); \ - __ret_845; \ +#define vdot_lane_s32(__p0_817, __p1_817, __p2_817, __p3_817) __extension__ ({ \ + int32x2_t __ret_817; \ + int32x2_t __s0_817 = __p0_817; \ + int8x8_t __s1_817 = __p1_817; \ + int8x8_t __s2_817 = __p2_817; \ + __ret_817 = vdot_s32(__s0_817, __s1_817, __builtin_bit_cast(int8x8_t, splat_lane_s32(__builtin_bit_cast(int32x2_t, __s2_817), __p3_817))); \ + __ret_817; \ }) #else -#define vdot_lane_s32(__p0_846, __p1_846, __p2_846, __p3_846) __extension__ ({ \ - int32x2_t __ret_846; \ - int32x2_t __s0_846 = __p0_846; \ - int8x8_t __s1_846 = __p1_846; \ - int8x8_t __s2_846 = __p2_846; \ - int32x2_t __rev0_846; __rev0_846 = __builtin_shufflevector(__s0_846, __s0_846, 1, 0); \ - int8x8_t __rev1_846; __rev1_846 = __builtin_shufflevector(__s1_846, __s1_846, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_846; __rev2_846 = 
__builtin_shufflevector(__s2_846, __s2_846, 7, 6, 5, 4, 3, 2, 1, 0); \ -int8x8_t __reint_846 = __rev2_846; \ -int32x2_t __reint1_846 = __noswap_splat_lane_s32(*(int32x2_t *) &__reint_846, __p3_846); \ - __ret_846 = __noswap_vdot_s32(__rev0_846, __rev1_846, *(int8x8_t *) &__reint1_846); \ - __ret_846 = __builtin_shufflevector(__ret_846, __ret_846, 1, 0); \ - __ret_846; \ +#define vdot_lane_s32(__p0_818, __p1_818, __p2_818, __p3_818) __extension__ ({ \ + int32x2_t __ret_818; \ + int32x2_t __s0_818 = __p0_818; \ + int8x8_t __s1_818 = __p1_818; \ + int8x8_t __s2_818 = __p2_818; \ + int32x2_t __rev0_818; __rev0_818 = __builtin_shufflevector(__s0_818, __s0_818, 1, 0); \ + int8x8_t __rev1_818; __rev1_818 = __builtin_shufflevector(__s1_818, __s1_818, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_818; __rev2_818 = __builtin_shufflevector(__s2_818, __s2_818, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_818 = __noswap_vdot_s32(__rev0_818, __rev1_818, __builtin_bit_cast(int8x8_t, __noswap_splat_lane_s32(__builtin_bit_cast(int32x2_t, __rev2_818), __p3_818))); \ + __ret_818 = __builtin_shufflevector(__ret_818, __ret_818, 1, 0); \ + __ret_818; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_lane_f16(__p0_847, __p1_847, __p2_847) __extension__ ({ \ - float16x8_t __ret_847; \ - float16x8_t __s0_847 = __p0_847; \ - float16x4_t __s1_847 = __p1_847; \ - __ret_847 = __s0_847 * splatq_lane_f16(__s1_847, __p2_847); \ - __ret_847; \ +#define vmulq_lane_f16(__p0_819, __p1_819, __p2_819) __extension__ ({ \ + float16x8_t __ret_819; \ + float16x8_t __s0_819 = __p0_819; \ + float16x4_t __s1_819 = __p1_819; \ + __ret_819 = __s0_819 * splatq_lane_f16(__s1_819, __p2_819); \ + __ret_819; \ }) #else -#define vmulq_lane_f16(__p0_848, __p1_848, __p2_848) __extension__ ({ \ - float16x8_t __ret_848; \ - float16x8_t __s0_848 = __p0_848; \ - float16x4_t __s1_848 = __p1_848; \ - float16x8_t __rev0_848; __rev0_848 = __builtin_shufflevector(__s0_848, __s0_848, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev1_848; __rev1_848 = __builtin_shufflevector(__s1_848, __s1_848, 3, 2, 1, 0); \ - __ret_848 = __rev0_848 * __noswap_splatq_lane_f16(__rev1_848, __p2_848); \ - __ret_848 = __builtin_shufflevector(__ret_848, __ret_848, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_848; \ +#define vmulq_lane_f16(__p0_820, __p1_820, __p2_820) __extension__ ({ \ + float16x8_t __ret_820; \ + float16x8_t __s0_820 = __p0_820; \ + float16x4_t __s1_820 = __p1_820; \ + float16x8_t __rev0_820; __rev0_820 = __builtin_shufflevector(__s0_820, __s0_820, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev1_820; __rev1_820 = __builtin_shufflevector(__s1_820, __s1_820, 3, 2, 1, 0); \ + __ret_820 = __rev0_820 * __noswap_splatq_lane_f16(__rev1_820, __p2_820); \ + __ret_820 = __builtin_shufflevector(__ret_820, __ret_820, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_820; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmul_lane_f16(__p0_849, __p1_849, __p2_849) __extension__ ({ \ - float16x4_t __ret_849; \ - float16x4_t __s0_849 = __p0_849; \ - float16x4_t __s1_849 = __p1_849; \ - __ret_849 = __s0_849 * splat_lane_f16(__s1_849, __p2_849); \ - __ret_849; \ +#define vmul_lane_f16(__p0_821, __p1_821, __p2_821) __extension__ ({ \ + float16x4_t __ret_821; \ + float16x4_t __s0_821 = __p0_821; \ + float16x4_t __s1_821 = __p1_821; \ + __ret_821 = __s0_821 * splat_lane_f16(__s1_821, __p2_821); \ + __ret_821; \ }) #else -#define vmul_lane_f16(__p0_850, __p1_850, __p2_850) __extension__ ({ \ - float16x4_t __ret_850; \ - float16x4_t __s0_850 = __p0_850; \ - float16x4_t __s1_850 = __p1_850; \ - float16x4_t __rev0_850; 
__rev0_850 = __builtin_shufflevector(__s0_850, __s0_850, 3, 2, 1, 0); \ - float16x4_t __rev1_850; __rev1_850 = __builtin_shufflevector(__s1_850, __s1_850, 3, 2, 1, 0); \ - __ret_850 = __rev0_850 * __noswap_splat_lane_f16(__rev1_850, __p2_850); \ - __ret_850 = __builtin_shufflevector(__ret_850, __ret_850, 3, 2, 1, 0); \ - __ret_850; \ +#define vmul_lane_f16(__p0_822, __p1_822, __p2_822) __extension__ ({ \ + float16x4_t __ret_822; \ + float16x4_t __s0_822 = __p0_822; \ + float16x4_t __s1_822 = __p1_822; \ + float16x4_t __rev0_822; __rev0_822 = __builtin_shufflevector(__s0_822, __s0_822, 3, 2, 1, 0); \ + float16x4_t __rev1_822; __rev1_822 = __builtin_shufflevector(__s1_822, __s1_822, 3, 2, 1, 0); \ + __ret_822 = __rev0_822 * __noswap_splat_lane_f16(__rev1_822, __p2_822); \ + __ret_822 = __builtin_shufflevector(__ret_822, __ret_822, 3, 2, 1, 0); \ + __ret_822; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vsudotq_lane_s32(__p0_851, __p1_851, __p2_851, __p3_851) __extension__ ({ \ - int32x4_t __ret_851; \ - int32x4_t __s0_851 = __p0_851; \ - int8x16_t __s1_851 = __p1_851; \ - uint8x8_t __s2_851 = __p2_851; \ -uint8x8_t __reint_851 = __s2_851; \ - __ret_851 = vusdotq_s32(__s0_851, (uint8x16_t)(splatq_lane_s32(*(int32x2_t *) &__reint_851, __p3_851)), __s1_851); \ - __ret_851; \ +#define vsudotq_lane_s32(__p0_823, __p1_823, __p2_823, __p3_823) __extension__ ({ \ + int32x4_t __ret_823; \ + int32x4_t __s0_823 = __p0_823; \ + int8x16_t __s1_823 = __p1_823; \ + uint8x8_t __s2_823 = __p2_823; \ + __ret_823 = vusdotq_s32(__s0_823, (uint8x16_t)(splatq_lane_s32(__builtin_bit_cast(int32x2_t, __s2_823), __p3_823)), __s1_823); \ + __ret_823; \ }) #else -#define vsudotq_lane_s32(__p0_852, __p1_852, __p2_852, __p3_852) __extension__ ({ \ - int32x4_t __ret_852; \ - int32x4_t __s0_852 = __p0_852; \ - int8x16_t __s1_852 = __p1_852; \ - uint8x8_t __s2_852 = __p2_852; \ - int32x4_t __rev0_852; __rev0_852 = __builtin_shufflevector(__s0_852, __s0_852, 3, 2, 1, 0); \ - int8x16_t __rev1_852; __rev1_852 = __builtin_shufflevector(__s1_852, __s1_852, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_852; __rev2_852 = __builtin_shufflevector(__s2_852, __s2_852, 7, 6, 5, 4, 3, 2, 1, 0); \ -uint8x8_t __reint_852 = __rev2_852; \ - __ret_852 = __noswap_vusdotq_s32(__rev0_852, (uint8x16_t)(__noswap_splatq_lane_s32(*(int32x2_t *) &__reint_852, __p3_852)), __rev1_852); \ - __ret_852 = __builtin_shufflevector(__ret_852, __ret_852, 3, 2, 1, 0); \ - __ret_852; \ +#define vsudotq_lane_s32(__p0_824, __p1_824, __p2_824, __p3_824) __extension__ ({ \ + int32x4_t __ret_824; \ + int32x4_t __s0_824 = __p0_824; \ + int8x16_t __s1_824 = __p1_824; \ + uint8x8_t __s2_824 = __p2_824; \ + int32x4_t __rev0_824; __rev0_824 = __builtin_shufflevector(__s0_824, __s0_824, 3, 2, 1, 0); \ + int8x16_t __rev1_824; __rev1_824 = __builtin_shufflevector(__s1_824, __s1_824, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_824; __rev2_824 = __builtin_shufflevector(__s2_824, __s2_824, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_824 = __noswap_vusdotq_s32(__rev0_824, (uint8x16_t)(__noswap_splatq_lane_s32(__builtin_bit_cast(int32x2_t, __rev2_824), __p3_824)), __rev1_824); \ + __ret_824 = __builtin_shufflevector(__ret_824, __ret_824, 3, 2, 1, 0); \ + __ret_824; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vsudot_lane_s32(__p0_853, __p1_853, __p2_853, __p3_853) __extension__ ({ \ - int32x2_t __ret_853; \ - int32x2_t __s0_853 = __p0_853; \ - int8x8_t __s1_853 = __p1_853; \ - uint8x8_t __s2_853 = __p2_853; \ -uint8x8_t 
__reint_853 = __s2_853; \ - __ret_853 = vusdot_s32(__s0_853, (uint8x8_t)(splat_lane_s32(*(int32x2_t *) &__reint_853, __p3_853)), __s1_853); \ - __ret_853; \ +#define vsudot_lane_s32(__p0_825, __p1_825, __p2_825, __p3_825) __extension__ ({ \ + int32x2_t __ret_825; \ + int32x2_t __s0_825 = __p0_825; \ + int8x8_t __s1_825 = __p1_825; \ + uint8x8_t __s2_825 = __p2_825; \ + __ret_825 = vusdot_s32(__s0_825, (uint8x8_t)(splat_lane_s32(__builtin_bit_cast(int32x2_t, __s2_825), __p3_825)), __s1_825); \ + __ret_825; \ }) #else -#define vsudot_lane_s32(__p0_854, __p1_854, __p2_854, __p3_854) __extension__ ({ \ - int32x2_t __ret_854; \ - int32x2_t __s0_854 = __p0_854; \ - int8x8_t __s1_854 = __p1_854; \ - uint8x8_t __s2_854 = __p2_854; \ - int32x2_t __rev0_854; __rev0_854 = __builtin_shufflevector(__s0_854, __s0_854, 1, 0); \ - int8x8_t __rev1_854; __rev1_854 = __builtin_shufflevector(__s1_854, __s1_854, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_854; __rev2_854 = __builtin_shufflevector(__s2_854, __s2_854, 7, 6, 5, 4, 3, 2, 1, 0); \ -uint8x8_t __reint_854 = __rev2_854; \ - __ret_854 = __noswap_vusdot_s32(__rev0_854, (uint8x8_t)(__noswap_splat_lane_s32(*(int32x2_t *) &__reint_854, __p3_854)), __rev1_854); \ - __ret_854 = __builtin_shufflevector(__ret_854, __ret_854, 1, 0); \ - __ret_854; \ +#define vsudot_lane_s32(__p0_826, __p1_826, __p2_826, __p3_826) __extension__ ({ \ + int32x2_t __ret_826; \ + int32x2_t __s0_826 = __p0_826; \ + int8x8_t __s1_826 = __p1_826; \ + uint8x8_t __s2_826 = __p2_826; \ + int32x2_t __rev0_826; __rev0_826 = __builtin_shufflevector(__s0_826, __s0_826, 1, 0); \ + int8x8_t __rev1_826; __rev1_826 = __builtin_shufflevector(__s1_826, __s1_826, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_826; __rev2_826 = __builtin_shufflevector(__s2_826, __s2_826, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_826 = __noswap_vusdot_s32(__rev0_826, (uint8x8_t)(__noswap_splat_lane_s32(__builtin_bit_cast(int32x2_t, __rev2_826), __p3_826)), __rev1_826); \ + __ret_826 = __builtin_shufflevector(__ret_826, __ret_826, 1, 0); \ + __ret_826; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vusdotq_lane_s32(__p0_855, __p1_855, __p2_855, __p3_855) __extension__ ({ \ - int32x4_t __ret_855; \ - int32x4_t __s0_855 = __p0_855; \ - uint8x16_t __s1_855 = __p1_855; \ - int8x8_t __s2_855 = __p2_855; \ -int8x8_t __reint_855 = __s2_855; \ - __ret_855 = vusdotq_s32(__s0_855, __s1_855, (int8x16_t)(splatq_lane_s32(*(int32x2_t *) &__reint_855, __p3_855))); \ - __ret_855; \ +#define vusdotq_lane_s32(__p0_827, __p1_827, __p2_827, __p3_827) __extension__ ({ \ + int32x4_t __ret_827; \ + int32x4_t __s0_827 = __p0_827; \ + uint8x16_t __s1_827 = __p1_827; \ + int8x8_t __s2_827 = __p2_827; \ + __ret_827 = vusdotq_s32(__s0_827, __s1_827, (int8x16_t)(splatq_lane_s32(__builtin_bit_cast(int32x2_t, __s2_827), __p3_827))); \ + __ret_827; \ }) #else -#define vusdotq_lane_s32(__p0_856, __p1_856, __p2_856, __p3_856) __extension__ ({ \ - int32x4_t __ret_856; \ - int32x4_t __s0_856 = __p0_856; \ - uint8x16_t __s1_856 = __p1_856; \ - int8x8_t __s2_856 = __p2_856; \ - int32x4_t __rev0_856; __rev0_856 = __builtin_shufflevector(__s0_856, __s0_856, 3, 2, 1, 0); \ - uint8x16_t __rev1_856; __rev1_856 = __builtin_shufflevector(__s1_856, __s1_856, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_856; __rev2_856 = __builtin_shufflevector(__s2_856, __s2_856, 7, 6, 5, 4, 3, 2, 1, 0); \ -int8x8_t __reint_856 = __rev2_856; \ - __ret_856 = __noswap_vusdotq_s32(__rev0_856, __rev1_856, (int8x16_t)(__noswap_splatq_lane_s32(*(int32x2_t *) 
&__reint_856, __p3_856))); \ - __ret_856 = __builtin_shufflevector(__ret_856, __ret_856, 3, 2, 1, 0); \ - __ret_856; \ +#define vusdotq_lane_s32(__p0_828, __p1_828, __p2_828, __p3_828) __extension__ ({ \ + int32x4_t __ret_828; \ + int32x4_t __s0_828 = __p0_828; \ + uint8x16_t __s1_828 = __p1_828; \ + int8x8_t __s2_828 = __p2_828; \ + int32x4_t __rev0_828; __rev0_828 = __builtin_shufflevector(__s0_828, __s0_828, 3, 2, 1, 0); \ + uint8x16_t __rev1_828; __rev1_828 = __builtin_shufflevector(__s1_828, __s1_828, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_828; __rev2_828 = __builtin_shufflevector(__s2_828, __s2_828, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_828 = __noswap_vusdotq_s32(__rev0_828, __rev1_828, (int8x16_t)(__noswap_splatq_lane_s32(__builtin_bit_cast(int32x2_t, __rev2_828), __p3_828))); \ + __ret_828 = __builtin_shufflevector(__ret_828, __ret_828, 3, 2, 1, 0); \ + __ret_828; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vusdot_lane_s32(__p0_857, __p1_857, __p2_857, __p3_857) __extension__ ({ \ - int32x2_t __ret_857; \ - int32x2_t __s0_857 = __p0_857; \ - uint8x8_t __s1_857 = __p1_857; \ - int8x8_t __s2_857 = __p2_857; \ -int8x8_t __reint_857 = __s2_857; \ - __ret_857 = vusdot_s32(__s0_857, __s1_857, (int8x8_t)(splat_lane_s32(*(int32x2_t *) &__reint_857, __p3_857))); \ - __ret_857; \ +#define vusdot_lane_s32(__p0_829, __p1_829, __p2_829, __p3_829) __extension__ ({ \ + int32x2_t __ret_829; \ + int32x2_t __s0_829 = __p0_829; \ + uint8x8_t __s1_829 = __p1_829; \ + int8x8_t __s2_829 = __p2_829; \ + __ret_829 = vusdot_s32(__s0_829, __s1_829, (int8x8_t)(splat_lane_s32(__builtin_bit_cast(int32x2_t, __s2_829), __p3_829))); \ + __ret_829; \ }) #else -#define vusdot_lane_s32(__p0_858, __p1_858, __p2_858, __p3_858) __extension__ ({ \ - int32x2_t __ret_858; \ - int32x2_t __s0_858 = __p0_858; \ - uint8x8_t __s1_858 = __p1_858; \ - int8x8_t __s2_858 = __p2_858; \ - int32x2_t __rev0_858; __rev0_858 = __builtin_shufflevector(__s0_858, __s0_858, 1, 0); \ - uint8x8_t __rev1_858; __rev1_858 = __builtin_shufflevector(__s1_858, __s1_858, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_858; __rev2_858 = __builtin_shufflevector(__s2_858, __s2_858, 7, 6, 5, 4, 3, 2, 1, 0); \ -int8x8_t __reint_858 = __rev2_858; \ - __ret_858 = __noswap_vusdot_s32(__rev0_858, __rev1_858, (int8x8_t)(__noswap_splat_lane_s32(*(int32x2_t *) &__reint_858, __p3_858))); \ - __ret_858 = __builtin_shufflevector(__ret_858, __ret_858, 1, 0); \ - __ret_858; \ +#define vusdot_lane_s32(__p0_830, __p1_830, __p2_830, __p3_830) __extension__ ({ \ + int32x2_t __ret_830; \ + int32x2_t __s0_830 = __p0_830; \ + uint8x8_t __s1_830 = __p1_830; \ + int8x8_t __s2_830 = __p2_830; \ + int32x2_t __rev0_830; __rev0_830 = __builtin_shufflevector(__s0_830, __s0_830, 1, 0); \ + uint8x8_t __rev1_830; __rev1_830 = __builtin_shufflevector(__s1_830, __s1_830, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_830; __rev2_830 = __builtin_shufflevector(__s2_830, __s2_830, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_830 = __noswap_vusdot_s32(__rev0_830, __rev1_830, (int8x8_t)(__noswap_splat_lane_s32(__builtin_bit_cast(int32x2_t, __rev2_830), __p3_830))); \ + __ret_830 = __builtin_shufflevector(__ret_830, __ret_830, 1, 0); \ + __ret_830; \ }) #endif @@ -67363,60 +69278,48 @@ __ai __attribute__((target("neon"))) int32x4_t vaddw_s16(int32x4_t __p0, int16x4 #endif #ifdef __LITTLE_ENDIAN__ -#define vget_lane_f16(__p0_859, __p1_859) __extension__ ({ \ - float16_t __ret_859; \ - float16x4_t __s0_859 = __p0_859; \ -float16x4_t __reint_859 = __s0_859; \ -int16_t 
__reint1_859 = vget_lane_s16(*(int16x4_t *) &__reint_859, __p1_859); \ - __ret_859 = *(float16_t *) &__reint1_859; \ - __ret_859; \ +#define vget_lane_f16(__p0_831, __p1_831) __extension__ ({ \ + float16_t __ret_831; \ + float16x4_t __s0_831 = __p0_831; \ + __ret_831 = __builtin_bit_cast(float16_t, vget_lane_s16(__builtin_bit_cast(int16x4_t, __s0_831), __p1_831)); \ + __ret_831; \ }) #else -#define vget_lane_f16(__p0_860, __p1_860) __extension__ ({ \ - float16_t __ret_860; \ - float16x4_t __s0_860 = __p0_860; \ - float16x4_t __rev0_860; __rev0_860 = __builtin_shufflevector(__s0_860, __s0_860, 3, 2, 1, 0); \ -float16x4_t __reint_860 = __rev0_860; \ -int16_t __reint1_860 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_860, __p1_860); \ - __ret_860 = *(float16_t *) &__reint1_860; \ - __ret_860; \ +#define vget_lane_f16(__p0_832, __p1_832) __extension__ ({ \ + float16_t __ret_832; \ + float16x4_t __s0_832 = __p0_832; \ + float16x4_t __rev0_832; __rev0_832 = __builtin_shufflevector(__s0_832, __s0_832, 3, 2, 1, 0); \ + __ret_832 = __builtin_bit_cast(float16_t, __noswap_vget_lane_s16(__builtin_bit_cast(int16x4_t, __rev0_832), __p1_832)); \ + __ret_832; \ }) -#define __noswap_vget_lane_f16(__p0_861, __p1_861) __extension__ ({ \ - float16_t __ret_861; \ - float16x4_t __s0_861 = __p0_861; \ -float16x4_t __reint_861 = __s0_861; \ -int16_t __reint1_861 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_861, __p1_861); \ - __ret_861 = *(float16_t *) &__reint1_861; \ - __ret_861; \ +#define __noswap_vget_lane_f16(__p0_833, __p1_833) __extension__ ({ \ + float16_t __ret_833; \ + float16x4_t __s0_833 = __p0_833; \ + __ret_833 = __builtin_bit_cast(float16_t, __noswap_vget_lane_s16(__builtin_bit_cast(int16x4_t, __s0_833), __p1_833)); \ + __ret_833; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vgetq_lane_f16(__p0_862, __p1_862) __extension__ ({ \ - float16_t __ret_862; \ - float16x8_t __s0_862 = __p0_862; \ -float16x8_t __reint_862 = __s0_862; \ -int16_t __reint1_862 = vgetq_lane_s16(*(int16x8_t *) &__reint_862, __p1_862); \ - __ret_862 = *(float16_t *) &__reint1_862; \ - __ret_862; \ +#define vgetq_lane_f16(__p0_834, __p1_834) __extension__ ({ \ + float16_t __ret_834; \ + float16x8_t __s0_834 = __p0_834; \ + __ret_834 = __builtin_bit_cast(float16_t, vgetq_lane_s16(__builtin_bit_cast(int16x8_t, __s0_834), __p1_834)); \ + __ret_834; \ }) #else -#define vgetq_lane_f16(__p0_863, __p1_863) __extension__ ({ \ - float16_t __ret_863; \ - float16x8_t __s0_863 = __p0_863; \ - float16x8_t __rev0_863; __rev0_863 = __builtin_shufflevector(__s0_863, __s0_863, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16x8_t __reint_863 = __rev0_863; \ -int16_t __reint1_863 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_863, __p1_863); \ - __ret_863 = *(float16_t *) &__reint1_863; \ - __ret_863; \ +#define vgetq_lane_f16(__p0_835, __p1_835) __extension__ ({ \ + float16_t __ret_835; \ + float16x8_t __s0_835 = __p0_835; \ + float16x8_t __rev0_835; __rev0_835 = __builtin_shufflevector(__s0_835, __s0_835, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_835 = __builtin_bit_cast(float16_t, __noswap_vgetq_lane_s16(__builtin_bit_cast(int16x8_t, __rev0_835), __p1_835)); \ + __ret_835; \ }) -#define __noswap_vgetq_lane_f16(__p0_864, __p1_864) __extension__ ({ \ - float16_t __ret_864; \ - float16x8_t __s0_864 = __p0_864; \ -float16x8_t __reint_864 = __s0_864; \ -int16_t __reint1_864 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_864, __p1_864); \ - __ret_864 = *(float16_t *) &__reint1_864; \ - __ret_864; \ +#define __noswap_vgetq_lane_f16(__p0_836, __p1_836) 
__extension__ ({ \ + float16_t __ret_836; \ + float16x8_t __s0_836 = __p0_836; \ + __ret_836 = __builtin_bit_cast(float16_t, __noswap_vgetq_lane_s16(__builtin_bit_cast(int16x8_t, __s0_836), __p1_836)); \ + __ret_836; \ }) #endif @@ -67559,98 +69462,98 @@ __ai __attribute__((target("neon"))) int32x4_t __noswap_vmlal_s16(int32x4_t __p0 #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_lane_u32(__p0_865, __p1_865, __p2_865, __p3_865) __extension__ ({ \ - uint64x2_t __ret_865; \ - uint64x2_t __s0_865 = __p0_865; \ - uint32x2_t __s1_865 = __p1_865; \ - uint32x2_t __s2_865 = __p2_865; \ - __ret_865 = __s0_865 + vmull_u32(__s1_865, splat_lane_u32(__s2_865, __p3_865)); \ - __ret_865; \ +#define vmlal_lane_u32(__p0_837, __p1_837, __p2_837, __p3_837) __extension__ ({ \ + uint64x2_t __ret_837; \ + uint64x2_t __s0_837 = __p0_837; \ + uint32x2_t __s1_837 = __p1_837; \ + uint32x2_t __s2_837 = __p2_837; \ + __ret_837 = __s0_837 + vmull_u32(__s1_837, splat_lane_u32(__s2_837, __p3_837)); \ + __ret_837; \ }) #else -#define vmlal_lane_u32(__p0_866, __p1_866, __p2_866, __p3_866) __extension__ ({ \ - uint64x2_t __ret_866; \ - uint64x2_t __s0_866 = __p0_866; \ - uint32x2_t __s1_866 = __p1_866; \ - uint32x2_t __s2_866 = __p2_866; \ - uint64x2_t __rev0_866; __rev0_866 = __builtin_shufflevector(__s0_866, __s0_866, 1, 0); \ - uint32x2_t __rev1_866; __rev1_866 = __builtin_shufflevector(__s1_866, __s1_866, 1, 0); \ - uint32x2_t __rev2_866; __rev2_866 = __builtin_shufflevector(__s2_866, __s2_866, 1, 0); \ - __ret_866 = __rev0_866 + __noswap_vmull_u32(__rev1_866, __noswap_splat_lane_u32(__rev2_866, __p3_866)); \ - __ret_866 = __builtin_shufflevector(__ret_866, __ret_866, 1, 0); \ - __ret_866; \ +#define vmlal_lane_u32(__p0_838, __p1_838, __p2_838, __p3_838) __extension__ ({ \ + uint64x2_t __ret_838; \ + uint64x2_t __s0_838 = __p0_838; \ + uint32x2_t __s1_838 = __p1_838; \ + uint32x2_t __s2_838 = __p2_838; \ + uint64x2_t __rev0_838; __rev0_838 = __builtin_shufflevector(__s0_838, __s0_838, 1, 0); \ + uint32x2_t __rev1_838; __rev1_838 = __builtin_shufflevector(__s1_838, __s1_838, 1, 0); \ + uint32x2_t __rev2_838; __rev2_838 = __builtin_shufflevector(__s2_838, __s2_838, 1, 0); \ + __ret_838 = __rev0_838 + __noswap_vmull_u32(__rev1_838, __noswap_splat_lane_u32(__rev2_838, __p3_838)); \ + __ret_838 = __builtin_shufflevector(__ret_838, __ret_838, 1, 0); \ + __ret_838; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_lane_u16(__p0_867, __p1_867, __p2_867, __p3_867) __extension__ ({ \ - uint32x4_t __ret_867; \ - uint32x4_t __s0_867 = __p0_867; \ - uint16x4_t __s1_867 = __p1_867; \ - uint16x4_t __s2_867 = __p2_867; \ - __ret_867 = __s0_867 + vmull_u16(__s1_867, splat_lane_u16(__s2_867, __p3_867)); \ - __ret_867; \ +#define vmlal_lane_u16(__p0_839, __p1_839, __p2_839, __p3_839) __extension__ ({ \ + uint32x4_t __ret_839; \ + uint32x4_t __s0_839 = __p0_839; \ + uint16x4_t __s1_839 = __p1_839; \ + uint16x4_t __s2_839 = __p2_839; \ + __ret_839 = __s0_839 + vmull_u16(__s1_839, splat_lane_u16(__s2_839, __p3_839)); \ + __ret_839; \ }) #else -#define vmlal_lane_u16(__p0_868, __p1_868, __p2_868, __p3_868) __extension__ ({ \ - uint32x4_t __ret_868; \ - uint32x4_t __s0_868 = __p0_868; \ - uint16x4_t __s1_868 = __p1_868; \ - uint16x4_t __s2_868 = __p2_868; \ - uint32x4_t __rev0_868; __rev0_868 = __builtin_shufflevector(__s0_868, __s0_868, 3, 2, 1, 0); \ - uint16x4_t __rev1_868; __rev1_868 = __builtin_shufflevector(__s1_868, __s1_868, 3, 2, 1, 0); \ - uint16x4_t __rev2_868; __rev2_868 = __builtin_shufflevector(__s2_868, __s2_868, 3, 2, 1, 0); 
\ - __ret_868 = __rev0_868 + __noswap_vmull_u16(__rev1_868, __noswap_splat_lane_u16(__rev2_868, __p3_868)); \ - __ret_868 = __builtin_shufflevector(__ret_868, __ret_868, 3, 2, 1, 0); \ - __ret_868; \ +#define vmlal_lane_u16(__p0_840, __p1_840, __p2_840, __p3_840) __extension__ ({ \ + uint32x4_t __ret_840; \ + uint32x4_t __s0_840 = __p0_840; \ + uint16x4_t __s1_840 = __p1_840; \ + uint16x4_t __s2_840 = __p2_840; \ + uint32x4_t __rev0_840; __rev0_840 = __builtin_shufflevector(__s0_840, __s0_840, 3, 2, 1, 0); \ + uint16x4_t __rev1_840; __rev1_840 = __builtin_shufflevector(__s1_840, __s1_840, 3, 2, 1, 0); \ + uint16x4_t __rev2_840; __rev2_840 = __builtin_shufflevector(__s2_840, __s2_840, 3, 2, 1, 0); \ + __ret_840 = __rev0_840 + __noswap_vmull_u16(__rev1_840, __noswap_splat_lane_u16(__rev2_840, __p3_840)); \ + __ret_840 = __builtin_shufflevector(__ret_840, __ret_840, 3, 2, 1, 0); \ + __ret_840; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_lane_s32(__p0_869, __p1_869, __p2_869, __p3_869) __extension__ ({ \ - int64x2_t __ret_869; \ - int64x2_t __s0_869 = __p0_869; \ - int32x2_t __s1_869 = __p1_869; \ - int32x2_t __s2_869 = __p2_869; \ - __ret_869 = __s0_869 + vmull_s32(__s1_869, splat_lane_s32(__s2_869, __p3_869)); \ - __ret_869; \ +#define vmlal_lane_s32(__p0_841, __p1_841, __p2_841, __p3_841) __extension__ ({ \ + int64x2_t __ret_841; \ + int64x2_t __s0_841 = __p0_841; \ + int32x2_t __s1_841 = __p1_841; \ + int32x2_t __s2_841 = __p2_841; \ + __ret_841 = __s0_841 + vmull_s32(__s1_841, splat_lane_s32(__s2_841, __p3_841)); \ + __ret_841; \ }) #else -#define vmlal_lane_s32(__p0_870, __p1_870, __p2_870, __p3_870) __extension__ ({ \ - int64x2_t __ret_870; \ - int64x2_t __s0_870 = __p0_870; \ - int32x2_t __s1_870 = __p1_870; \ - int32x2_t __s2_870 = __p2_870; \ - int64x2_t __rev0_870; __rev0_870 = __builtin_shufflevector(__s0_870, __s0_870, 1, 0); \ - int32x2_t __rev1_870; __rev1_870 = __builtin_shufflevector(__s1_870, __s1_870, 1, 0); \ - int32x2_t __rev2_870; __rev2_870 = __builtin_shufflevector(__s2_870, __s2_870, 1, 0); \ - __ret_870 = __rev0_870 + __noswap_vmull_s32(__rev1_870, __noswap_splat_lane_s32(__rev2_870, __p3_870)); \ - __ret_870 = __builtin_shufflevector(__ret_870, __ret_870, 1, 0); \ - __ret_870; \ +#define vmlal_lane_s32(__p0_842, __p1_842, __p2_842, __p3_842) __extension__ ({ \ + int64x2_t __ret_842; \ + int64x2_t __s0_842 = __p0_842; \ + int32x2_t __s1_842 = __p1_842; \ + int32x2_t __s2_842 = __p2_842; \ + int64x2_t __rev0_842; __rev0_842 = __builtin_shufflevector(__s0_842, __s0_842, 1, 0); \ + int32x2_t __rev1_842; __rev1_842 = __builtin_shufflevector(__s1_842, __s1_842, 1, 0); \ + int32x2_t __rev2_842; __rev2_842 = __builtin_shufflevector(__s2_842, __s2_842, 1, 0); \ + __ret_842 = __rev0_842 + __noswap_vmull_s32(__rev1_842, __noswap_splat_lane_s32(__rev2_842, __p3_842)); \ + __ret_842 = __builtin_shufflevector(__ret_842, __ret_842, 1, 0); \ + __ret_842; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlal_lane_s16(__p0_871, __p1_871, __p2_871, __p3_871) __extension__ ({ \ - int32x4_t __ret_871; \ - int32x4_t __s0_871 = __p0_871; \ - int16x4_t __s1_871 = __p1_871; \ - int16x4_t __s2_871 = __p2_871; \ - __ret_871 = __s0_871 + vmull_s16(__s1_871, splat_lane_s16(__s2_871, __p3_871)); \ - __ret_871; \ +#define vmlal_lane_s16(__p0_843, __p1_843, __p2_843, __p3_843) __extension__ ({ \ + int32x4_t __ret_843; \ + int32x4_t __s0_843 = __p0_843; \ + int16x4_t __s1_843 = __p1_843; \ + int16x4_t __s2_843 = __p2_843; \ + __ret_843 = __s0_843 + vmull_s16(__s1_843, splat_lane_s16(__s2_843, 
__p3_843)); \ + __ret_843; \ }) #else -#define vmlal_lane_s16(__p0_872, __p1_872, __p2_872, __p3_872) __extension__ ({ \ - int32x4_t __ret_872; \ - int32x4_t __s0_872 = __p0_872; \ - int16x4_t __s1_872 = __p1_872; \ - int16x4_t __s2_872 = __p2_872; \ - int32x4_t __rev0_872; __rev0_872 = __builtin_shufflevector(__s0_872, __s0_872, 3, 2, 1, 0); \ - int16x4_t __rev1_872; __rev1_872 = __builtin_shufflevector(__s1_872, __s1_872, 3, 2, 1, 0); \ - int16x4_t __rev2_872; __rev2_872 = __builtin_shufflevector(__s2_872, __s2_872, 3, 2, 1, 0); \ - __ret_872 = __rev0_872 + __noswap_vmull_s16(__rev1_872, __noswap_splat_lane_s16(__rev2_872, __p3_872)); \ - __ret_872 = __builtin_shufflevector(__ret_872, __ret_872, 3, 2, 1, 0); \ - __ret_872; \ +#define vmlal_lane_s16(__p0_844, __p1_844, __p2_844, __p3_844) __extension__ ({ \ + int32x4_t __ret_844; \ + int32x4_t __s0_844 = __p0_844; \ + int16x4_t __s1_844 = __p1_844; \ + int16x4_t __s2_844 = __p2_844; \ + int32x4_t __rev0_844; __rev0_844 = __builtin_shufflevector(__s0_844, __s0_844, 3, 2, 1, 0); \ + int16x4_t __rev1_844; __rev1_844 = __builtin_shufflevector(__s1_844, __s1_844, 3, 2, 1, 0); \ + int16x4_t __rev2_844; __rev2_844 = __builtin_shufflevector(__s2_844, __s2_844, 3, 2, 1, 0); \ + __ret_844 = __rev0_844 + __noswap_vmull_s16(__rev1_844, __noswap_splat_lane_s16(__rev2_844, __p3_844)); \ + __ret_844 = __builtin_shufflevector(__ret_844, __ret_844, 3, 2, 1, 0); \ + __ret_844; \ }) #endif @@ -67881,98 +69784,98 @@ __ai __attribute__((target("neon"))) int32x4_t __noswap_vmlsl_s16(int32x4_t __p0 #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_lane_u32(__p0_873, __p1_873, __p2_873, __p3_873) __extension__ ({ \ - uint64x2_t __ret_873; \ - uint64x2_t __s0_873 = __p0_873; \ - uint32x2_t __s1_873 = __p1_873; \ - uint32x2_t __s2_873 = __p2_873; \ - __ret_873 = __s0_873 - vmull_u32(__s1_873, splat_lane_u32(__s2_873, __p3_873)); \ - __ret_873; \ +#define vmlsl_lane_u32(__p0_845, __p1_845, __p2_845, __p3_845) __extension__ ({ \ + uint64x2_t __ret_845; \ + uint64x2_t __s0_845 = __p0_845; \ + uint32x2_t __s1_845 = __p1_845; \ + uint32x2_t __s2_845 = __p2_845; \ + __ret_845 = __s0_845 - vmull_u32(__s1_845, splat_lane_u32(__s2_845, __p3_845)); \ + __ret_845; \ }) #else -#define vmlsl_lane_u32(__p0_874, __p1_874, __p2_874, __p3_874) __extension__ ({ \ - uint64x2_t __ret_874; \ - uint64x2_t __s0_874 = __p0_874; \ - uint32x2_t __s1_874 = __p1_874; \ - uint32x2_t __s2_874 = __p2_874; \ - uint64x2_t __rev0_874; __rev0_874 = __builtin_shufflevector(__s0_874, __s0_874, 1, 0); \ - uint32x2_t __rev1_874; __rev1_874 = __builtin_shufflevector(__s1_874, __s1_874, 1, 0); \ - uint32x2_t __rev2_874; __rev2_874 = __builtin_shufflevector(__s2_874, __s2_874, 1, 0); \ - __ret_874 = __rev0_874 - __noswap_vmull_u32(__rev1_874, __noswap_splat_lane_u32(__rev2_874, __p3_874)); \ - __ret_874 = __builtin_shufflevector(__ret_874, __ret_874, 1, 0); \ - __ret_874; \ +#define vmlsl_lane_u32(__p0_846, __p1_846, __p2_846, __p3_846) __extension__ ({ \ + uint64x2_t __ret_846; \ + uint64x2_t __s0_846 = __p0_846; \ + uint32x2_t __s1_846 = __p1_846; \ + uint32x2_t __s2_846 = __p2_846; \ + uint64x2_t __rev0_846; __rev0_846 = __builtin_shufflevector(__s0_846, __s0_846, 1, 0); \ + uint32x2_t __rev1_846; __rev1_846 = __builtin_shufflevector(__s1_846, __s1_846, 1, 0); \ + uint32x2_t __rev2_846; __rev2_846 = __builtin_shufflevector(__s2_846, __s2_846, 1, 0); \ + __ret_846 = __rev0_846 - __noswap_vmull_u32(__rev1_846, __noswap_splat_lane_u32(__rev2_846, __p3_846)); \ + __ret_846 = __builtin_shufflevector(__ret_846, 
__ret_846, 1, 0); \ + __ret_846; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_lane_u16(__p0_875, __p1_875, __p2_875, __p3_875) __extension__ ({ \ - uint32x4_t __ret_875; \ - uint32x4_t __s0_875 = __p0_875; \ - uint16x4_t __s1_875 = __p1_875; \ - uint16x4_t __s2_875 = __p2_875; \ - __ret_875 = __s0_875 - vmull_u16(__s1_875, splat_lane_u16(__s2_875, __p3_875)); \ - __ret_875; \ +#define vmlsl_lane_u16(__p0_847, __p1_847, __p2_847, __p3_847) __extension__ ({ \ + uint32x4_t __ret_847; \ + uint32x4_t __s0_847 = __p0_847; \ + uint16x4_t __s1_847 = __p1_847; \ + uint16x4_t __s2_847 = __p2_847; \ + __ret_847 = __s0_847 - vmull_u16(__s1_847, splat_lane_u16(__s2_847, __p3_847)); \ + __ret_847; \ }) #else -#define vmlsl_lane_u16(__p0_876, __p1_876, __p2_876, __p3_876) __extension__ ({ \ - uint32x4_t __ret_876; \ - uint32x4_t __s0_876 = __p0_876; \ - uint16x4_t __s1_876 = __p1_876; \ - uint16x4_t __s2_876 = __p2_876; \ - uint32x4_t __rev0_876; __rev0_876 = __builtin_shufflevector(__s0_876, __s0_876, 3, 2, 1, 0); \ - uint16x4_t __rev1_876; __rev1_876 = __builtin_shufflevector(__s1_876, __s1_876, 3, 2, 1, 0); \ - uint16x4_t __rev2_876; __rev2_876 = __builtin_shufflevector(__s2_876, __s2_876, 3, 2, 1, 0); \ - __ret_876 = __rev0_876 - __noswap_vmull_u16(__rev1_876, __noswap_splat_lane_u16(__rev2_876, __p3_876)); \ - __ret_876 = __builtin_shufflevector(__ret_876, __ret_876, 3, 2, 1, 0); \ - __ret_876; \ +#define vmlsl_lane_u16(__p0_848, __p1_848, __p2_848, __p3_848) __extension__ ({ \ + uint32x4_t __ret_848; \ + uint32x4_t __s0_848 = __p0_848; \ + uint16x4_t __s1_848 = __p1_848; \ + uint16x4_t __s2_848 = __p2_848; \ + uint32x4_t __rev0_848; __rev0_848 = __builtin_shufflevector(__s0_848, __s0_848, 3, 2, 1, 0); \ + uint16x4_t __rev1_848; __rev1_848 = __builtin_shufflevector(__s1_848, __s1_848, 3, 2, 1, 0); \ + uint16x4_t __rev2_848; __rev2_848 = __builtin_shufflevector(__s2_848, __s2_848, 3, 2, 1, 0); \ + __ret_848 = __rev0_848 - __noswap_vmull_u16(__rev1_848, __noswap_splat_lane_u16(__rev2_848, __p3_848)); \ + __ret_848 = __builtin_shufflevector(__ret_848, __ret_848, 3, 2, 1, 0); \ + __ret_848; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_lane_s32(__p0_877, __p1_877, __p2_877, __p3_877) __extension__ ({ \ - int64x2_t __ret_877; \ - int64x2_t __s0_877 = __p0_877; \ - int32x2_t __s1_877 = __p1_877; \ - int32x2_t __s2_877 = __p2_877; \ - __ret_877 = __s0_877 - vmull_s32(__s1_877, splat_lane_s32(__s2_877, __p3_877)); \ - __ret_877; \ +#define vmlsl_lane_s32(__p0_849, __p1_849, __p2_849, __p3_849) __extension__ ({ \ + int64x2_t __ret_849; \ + int64x2_t __s0_849 = __p0_849; \ + int32x2_t __s1_849 = __p1_849; \ + int32x2_t __s2_849 = __p2_849; \ + __ret_849 = __s0_849 - vmull_s32(__s1_849, splat_lane_s32(__s2_849, __p3_849)); \ + __ret_849; \ }) #else -#define vmlsl_lane_s32(__p0_878, __p1_878, __p2_878, __p3_878) __extension__ ({ \ - int64x2_t __ret_878; \ - int64x2_t __s0_878 = __p0_878; \ - int32x2_t __s1_878 = __p1_878; \ - int32x2_t __s2_878 = __p2_878; \ - int64x2_t __rev0_878; __rev0_878 = __builtin_shufflevector(__s0_878, __s0_878, 1, 0); \ - int32x2_t __rev1_878; __rev1_878 = __builtin_shufflevector(__s1_878, __s1_878, 1, 0); \ - int32x2_t __rev2_878; __rev2_878 = __builtin_shufflevector(__s2_878, __s2_878, 1, 0); \ - __ret_878 = __rev0_878 - __noswap_vmull_s32(__rev1_878, __noswap_splat_lane_s32(__rev2_878, __p3_878)); \ - __ret_878 = __builtin_shufflevector(__ret_878, __ret_878, 1, 0); \ - __ret_878; \ +#define vmlsl_lane_s32(__p0_850, __p1_850, __p2_850, __p3_850) __extension__ ({ \ + 
int64x2_t __ret_850; \ + int64x2_t __s0_850 = __p0_850; \ + int32x2_t __s1_850 = __p1_850; \ + int32x2_t __s2_850 = __p2_850; \ + int64x2_t __rev0_850; __rev0_850 = __builtin_shufflevector(__s0_850, __s0_850, 1, 0); \ + int32x2_t __rev1_850; __rev1_850 = __builtin_shufflevector(__s1_850, __s1_850, 1, 0); \ + int32x2_t __rev2_850; __rev2_850 = __builtin_shufflevector(__s2_850, __s2_850, 1, 0); \ + __ret_850 = __rev0_850 - __noswap_vmull_s32(__rev1_850, __noswap_splat_lane_s32(__rev2_850, __p3_850)); \ + __ret_850 = __builtin_shufflevector(__ret_850, __ret_850, 1, 0); \ + __ret_850; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmlsl_lane_s16(__p0_879, __p1_879, __p2_879, __p3_879) __extension__ ({ \ - int32x4_t __ret_879; \ - int32x4_t __s0_879 = __p0_879; \ - int16x4_t __s1_879 = __p1_879; \ - int16x4_t __s2_879 = __p2_879; \ - __ret_879 = __s0_879 - vmull_s16(__s1_879, splat_lane_s16(__s2_879, __p3_879)); \ - __ret_879; \ +#define vmlsl_lane_s16(__p0_851, __p1_851, __p2_851, __p3_851) __extension__ ({ \ + int32x4_t __ret_851; \ + int32x4_t __s0_851 = __p0_851; \ + int16x4_t __s1_851 = __p1_851; \ + int16x4_t __s2_851 = __p2_851; \ + __ret_851 = __s0_851 - vmull_s16(__s1_851, splat_lane_s16(__s2_851, __p3_851)); \ + __ret_851; \ }) #else -#define vmlsl_lane_s16(__p0_880, __p1_880, __p2_880, __p3_880) __extension__ ({ \ - int32x4_t __ret_880; \ - int32x4_t __s0_880 = __p0_880; \ - int16x4_t __s1_880 = __p1_880; \ - int16x4_t __s2_880 = __p2_880; \ - int32x4_t __rev0_880; __rev0_880 = __builtin_shufflevector(__s0_880, __s0_880, 3, 2, 1, 0); \ - int16x4_t __rev1_880; __rev1_880 = __builtin_shufflevector(__s1_880, __s1_880, 3, 2, 1, 0); \ - int16x4_t __rev2_880; __rev2_880 = __builtin_shufflevector(__s2_880, __s2_880, 3, 2, 1, 0); \ - __ret_880 = __rev0_880 - __noswap_vmull_s16(__rev1_880, __noswap_splat_lane_s16(__rev2_880, __p3_880)); \ - __ret_880 = __builtin_shufflevector(__ret_880, __ret_880, 3, 2, 1, 0); \ - __ret_880; \ +#define vmlsl_lane_s16(__p0_852, __p1_852, __p2_852, __p3_852) __extension__ ({ \ + int32x4_t __ret_852; \ + int32x4_t __s0_852 = __p0_852; \ + int16x4_t __s1_852 = __p1_852; \ + int16x4_t __s2_852 = __p2_852; \ + int32x4_t __rev0_852; __rev0_852 = __builtin_shufflevector(__s0_852, __s0_852, 3, 2, 1, 0); \ + int16x4_t __rev1_852; __rev1_852 = __builtin_shufflevector(__s1_852, __s1_852, 3, 2, 1, 0); \ + int16x4_t __rev2_852; __rev2_852 = __builtin_shufflevector(__s2_852, __s2_852, 3, 2, 1, 0); \ + __ret_852 = __rev0_852 - __noswap_vmull_s16(__rev1_852, __noswap_splat_lane_s16(__rev2_852, __p3_852)); \ + __ret_852 = __builtin_shufflevector(__ret_852, __ret_852, 3, 2, 1, 0); \ + __ret_852; \ }) #endif @@ -68065,54 +69968,42 @@ __ai __attribute__((target("neon"))) int32x4_t __noswap_vmlsl_n_s16(int32x4_t __ #endif #ifdef __LITTLE_ENDIAN__ -#define vset_lane_f16(__p0_881, __p1_881, __p2_881) __extension__ ({ \ - float16x4_t __ret_881; \ - float16_t __s0_881 = __p0_881; \ - float16x4_t __s1_881 = __p1_881; \ -float16_t __reint_881 = __s0_881; \ -float16x4_t __reint1_881 = __s1_881; \ -int16x4_t __reint2_881 = vset_lane_s16(*(int16_t *) &__reint_881, *(int16x4_t *) &__reint1_881, __p2_881); \ - __ret_881 = *(float16x4_t *) &__reint2_881; \ - __ret_881; \ +#define vset_lane_f16(__p0_853, __p1_853, __p2_853) __extension__ ({ \ + float16x4_t __ret_853; \ + float16_t __s0_853 = __p0_853; \ + float16x4_t __s1_853 = __p1_853; \ + __ret_853 = __builtin_bit_cast(float16x4_t, vset_lane_s16(__builtin_bit_cast(int16_t, __s0_853), __builtin_bit_cast(int16x4_t, __s1_853), __p2_853)); \ + 
__ret_853; \ }) #else -#define vset_lane_f16(__p0_882, __p1_882, __p2_882) __extension__ ({ \ - float16x4_t __ret_882; \ - float16_t __s0_882 = __p0_882; \ - float16x4_t __s1_882 = __p1_882; \ - float16x4_t __rev1_882; __rev1_882 = __builtin_shufflevector(__s1_882, __s1_882, 3, 2, 1, 0); \ -float16_t __reint_882 = __s0_882; \ -float16x4_t __reint1_882 = __rev1_882; \ -int16x4_t __reint2_882 = __noswap_vset_lane_s16(*(int16_t *) &__reint_882, *(int16x4_t *) &__reint1_882, __p2_882); \ - __ret_882 = *(float16x4_t *) &__reint2_882; \ - __ret_882 = __builtin_shufflevector(__ret_882, __ret_882, 3, 2, 1, 0); \ - __ret_882; \ +#define vset_lane_f16(__p0_854, __p1_854, __p2_854) __extension__ ({ \ + float16x4_t __ret_854; \ + float16_t __s0_854 = __p0_854; \ + float16x4_t __s1_854 = __p1_854; \ + float16x4_t __rev1_854; __rev1_854 = __builtin_shufflevector(__s1_854, __s1_854, 3, 2, 1, 0); \ + __ret_854 = __builtin_bit_cast(float16x4_t, __noswap_vset_lane_s16(__builtin_bit_cast(int16_t, __s0_854), __builtin_bit_cast(int16x4_t, __rev1_854), __p2_854)); \ + __ret_854 = __builtin_shufflevector(__ret_854, __ret_854, 3, 2, 1, 0); \ + __ret_854; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vsetq_lane_f16(__p0_883, __p1_883, __p2_883) __extension__ ({ \ - float16x8_t __ret_883; \ - float16_t __s0_883 = __p0_883; \ - float16x8_t __s1_883 = __p1_883; \ -float16_t __reint_883 = __s0_883; \ -float16x8_t __reint1_883 = __s1_883; \ -int16x8_t __reint2_883 = vsetq_lane_s16(*(int16_t *) &__reint_883, *(int16x8_t *) &__reint1_883, __p2_883); \ - __ret_883 = *(float16x8_t *) &__reint2_883; \ - __ret_883; \ +#define vsetq_lane_f16(__p0_855, __p1_855, __p2_855) __extension__ ({ \ + float16x8_t __ret_855; \ + float16_t __s0_855 = __p0_855; \ + float16x8_t __s1_855 = __p1_855; \ + __ret_855 = __builtin_bit_cast(float16x8_t, vsetq_lane_s16(__builtin_bit_cast(int16_t, __s0_855), __builtin_bit_cast(int16x8_t, __s1_855), __p2_855)); \ + __ret_855; \ }) #else -#define vsetq_lane_f16(__p0_884, __p1_884, __p2_884) __extension__ ({ \ - float16x8_t __ret_884; \ - float16_t __s0_884 = __p0_884; \ - float16x8_t __s1_884 = __p1_884; \ - float16x8_t __rev1_884; __rev1_884 = __builtin_shufflevector(__s1_884, __s1_884, 7, 6, 5, 4, 3, 2, 1, 0); \ -float16_t __reint_884 = __s0_884; \ -float16x8_t __reint1_884 = __rev1_884; \ -int16x8_t __reint2_884 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_884, *(int16x8_t *) &__reint1_884, __p2_884); \ - __ret_884 = *(float16x8_t *) &__reint2_884; \ - __ret_884 = __builtin_shufflevector(__ret_884, __ret_884, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_884; \ +#define vsetq_lane_f16(__p0_856, __p1_856, __p2_856) __extension__ ({ \ + float16x8_t __ret_856; \ + float16_t __s0_856 = __p0_856; \ + float16x8_t __s1_856 = __p1_856; \ + float16x8_t __rev1_856; __rev1_856 = __builtin_shufflevector(__s1_856, __s1_856, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_856 = __builtin_bit_cast(float16x8_t, __noswap_vsetq_lane_s16(__builtin_bit_cast(int16_t, __s0_856), __builtin_bit_cast(int16x8_t, __rev1_856), __p2_856)); \ + __ret_856 = __builtin_shufflevector(__ret_856, __ret_856, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_856; \ }) #endif @@ -68134,427 +70025,427 @@ __ai __attribute__((target("aes,neon"))) poly128_t vmull_high_p64(poly64x2_t __p #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_lane_high_f16(__p0_885, __p1_885, __p2_885, __p3_885) __extension__ ({ \ +#define vfmlalq_lane_high_f16(__p0_857, __p1_857, __p2_857, __p3_857) __extension__ ({ \ + float32x4_t __ret_857; \ + float32x4_t __s0_857 = __p0_857; \ + float16x8_t 
__s1_857 = __p1_857; \ + float16x4_t __s2_857 = __p2_857; \ + __ret_857 = vfmlalq_high_f16(__s0_857, __s1_857, (float16x8_t) {vget_lane_f16(__s2_857, __p3_857), vget_lane_f16(__s2_857, __p3_857), vget_lane_f16(__s2_857, __p3_857), vget_lane_f16(__s2_857, __p3_857), vget_lane_f16(__s2_857, __p3_857), vget_lane_f16(__s2_857, __p3_857), vget_lane_f16(__s2_857, __p3_857), vget_lane_f16(__s2_857, __p3_857)}); \ + __ret_857; \ +}) +#else +#define vfmlalq_lane_high_f16(__p0_858, __p1_858, __p2_858, __p3_858) __extension__ ({ \ + float32x4_t __ret_858; \ + float32x4_t __s0_858 = __p0_858; \ + float16x8_t __s1_858 = __p1_858; \ + float16x4_t __s2_858 = __p2_858; \ + float32x4_t __rev0_858; __rev0_858 = __builtin_shufflevector(__s0_858, __s0_858, 3, 2, 1, 0); \ + float16x8_t __rev1_858; __rev1_858 = __builtin_shufflevector(__s1_858, __s1_858, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_858; __rev2_858 = __builtin_shufflevector(__s2_858, __s2_858, 3, 2, 1, 0); \ + __ret_858 = __noswap_vfmlalq_high_f16(__rev0_858, __rev1_858, (float16x8_t) {__noswap_vget_lane_f16(__rev2_858, __p3_858), __noswap_vget_lane_f16(__rev2_858, __p3_858), __noswap_vget_lane_f16(__rev2_858, __p3_858), __noswap_vget_lane_f16(__rev2_858, __p3_858), __noswap_vget_lane_f16(__rev2_858, __p3_858), __noswap_vget_lane_f16(__rev2_858, __p3_858), __noswap_vget_lane_f16(__rev2_858, __p3_858), __noswap_vget_lane_f16(__rev2_858, __p3_858)}); \ + __ret_858 = __builtin_shufflevector(__ret_858, __ret_858, 3, 2, 1, 0); \ + __ret_858; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlal_lane_high_f16(__p0_859, __p1_859, __p2_859, __p3_859) __extension__ ({ \ + float32x2_t __ret_859; \ + float32x2_t __s0_859 = __p0_859; \ + float16x4_t __s1_859 = __p1_859; \ + float16x4_t __s2_859 = __p2_859; \ + __ret_859 = vfmlal_high_f16(__s0_859, __s1_859, (float16x4_t) {vget_lane_f16(__s2_859, __p3_859), vget_lane_f16(__s2_859, __p3_859), vget_lane_f16(__s2_859, __p3_859), vget_lane_f16(__s2_859, __p3_859)}); \ + __ret_859; \ +}) +#else +#define vfmlal_lane_high_f16(__p0_860, __p1_860, __p2_860, __p3_860) __extension__ ({ \ + float32x2_t __ret_860; \ + float32x2_t __s0_860 = __p0_860; \ + float16x4_t __s1_860 = __p1_860; \ + float16x4_t __s2_860 = __p2_860; \ + float32x2_t __rev0_860; __rev0_860 = __builtin_shufflevector(__s0_860, __s0_860, 1, 0); \ + float16x4_t __rev1_860; __rev1_860 = __builtin_shufflevector(__s1_860, __s1_860, 3, 2, 1, 0); \ + float16x4_t __rev2_860; __rev2_860 = __builtin_shufflevector(__s2_860, __s2_860, 3, 2, 1, 0); \ + __ret_860 = __noswap_vfmlal_high_f16(__rev0_860, __rev1_860, (float16x4_t) {__noswap_vget_lane_f16(__rev2_860, __p3_860), __noswap_vget_lane_f16(__rev2_860, __p3_860), __noswap_vget_lane_f16(__rev2_860, __p3_860), __noswap_vget_lane_f16(__rev2_860, __p3_860)}); \ + __ret_860 = __builtin_shufflevector(__ret_860, __ret_860, 1, 0); \ + __ret_860; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlalq_lane_low_f16(__p0_861, __p1_861, __p2_861, __p3_861) __extension__ ({ \ + float32x4_t __ret_861; \ + float32x4_t __s0_861 = __p0_861; \ + float16x8_t __s1_861 = __p1_861; \ + float16x4_t __s2_861 = __p2_861; \ + __ret_861 = vfmlalq_low_f16(__s0_861, __s1_861, (float16x8_t) {vget_lane_f16(__s2_861, __p3_861), vget_lane_f16(__s2_861, __p3_861), vget_lane_f16(__s2_861, __p3_861), vget_lane_f16(__s2_861, __p3_861), vget_lane_f16(__s2_861, __p3_861), vget_lane_f16(__s2_861, __p3_861), vget_lane_f16(__s2_861, __p3_861), vget_lane_f16(__s2_861, __p3_861)}); \ + __ret_861; \ +}) +#else +#define 
vfmlalq_lane_low_f16(__p0_862, __p1_862, __p2_862, __p3_862) __extension__ ({ \ + float32x4_t __ret_862; \ + float32x4_t __s0_862 = __p0_862; \ + float16x8_t __s1_862 = __p1_862; \ + float16x4_t __s2_862 = __p2_862; \ + float32x4_t __rev0_862; __rev0_862 = __builtin_shufflevector(__s0_862, __s0_862, 3, 2, 1, 0); \ + float16x8_t __rev1_862; __rev1_862 = __builtin_shufflevector(__s1_862, __s1_862, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_862; __rev2_862 = __builtin_shufflevector(__s2_862, __s2_862, 3, 2, 1, 0); \ + __ret_862 = __noswap_vfmlalq_low_f16(__rev0_862, __rev1_862, (float16x8_t) {__noswap_vget_lane_f16(__rev2_862, __p3_862), __noswap_vget_lane_f16(__rev2_862, __p3_862), __noswap_vget_lane_f16(__rev2_862, __p3_862), __noswap_vget_lane_f16(__rev2_862, __p3_862), __noswap_vget_lane_f16(__rev2_862, __p3_862), __noswap_vget_lane_f16(__rev2_862, __p3_862), __noswap_vget_lane_f16(__rev2_862, __p3_862), __noswap_vget_lane_f16(__rev2_862, __p3_862)}); \ + __ret_862 = __builtin_shufflevector(__ret_862, __ret_862, 3, 2, 1, 0); \ + __ret_862; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlal_lane_low_f16(__p0_863, __p1_863, __p2_863, __p3_863) __extension__ ({ \ + float32x2_t __ret_863; \ + float32x2_t __s0_863 = __p0_863; \ + float16x4_t __s1_863 = __p1_863; \ + float16x4_t __s2_863 = __p2_863; \ + __ret_863 = vfmlal_low_f16(__s0_863, __s1_863, (float16x4_t) {vget_lane_f16(__s2_863, __p3_863), vget_lane_f16(__s2_863, __p3_863), vget_lane_f16(__s2_863, __p3_863), vget_lane_f16(__s2_863, __p3_863)}); \ + __ret_863; \ +}) +#else +#define vfmlal_lane_low_f16(__p0_864, __p1_864, __p2_864, __p3_864) __extension__ ({ \ + float32x2_t __ret_864; \ + float32x2_t __s0_864 = __p0_864; \ + float16x4_t __s1_864 = __p1_864; \ + float16x4_t __s2_864 = __p2_864; \ + float32x2_t __rev0_864; __rev0_864 = __builtin_shufflevector(__s0_864, __s0_864, 1, 0); \ + float16x4_t __rev1_864; __rev1_864 = __builtin_shufflevector(__s1_864, __s1_864, 3, 2, 1, 0); \ + float16x4_t __rev2_864; __rev2_864 = __builtin_shufflevector(__s2_864, __s2_864, 3, 2, 1, 0); \ + __ret_864 = __noswap_vfmlal_low_f16(__rev0_864, __rev1_864, (float16x4_t) {__noswap_vget_lane_f16(__rev2_864, __p3_864), __noswap_vget_lane_f16(__rev2_864, __p3_864), __noswap_vget_lane_f16(__rev2_864, __p3_864), __noswap_vget_lane_f16(__rev2_864, __p3_864)}); \ + __ret_864 = __builtin_shufflevector(__ret_864, __ret_864, 1, 0); \ + __ret_864; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlalq_laneq_high_f16(__p0_865, __p1_865, __p2_865, __p3_865) __extension__ ({ \ + float32x4_t __ret_865; \ + float32x4_t __s0_865 = __p0_865; \ + float16x8_t __s1_865 = __p1_865; \ + float16x8_t __s2_865 = __p2_865; \ + __ret_865 = vfmlalq_high_f16(__s0_865, __s1_865, (float16x8_t) {vgetq_lane_f16(__s2_865, __p3_865), vgetq_lane_f16(__s2_865, __p3_865), vgetq_lane_f16(__s2_865, __p3_865), vgetq_lane_f16(__s2_865, __p3_865), vgetq_lane_f16(__s2_865, __p3_865), vgetq_lane_f16(__s2_865, __p3_865), vgetq_lane_f16(__s2_865, __p3_865), vgetq_lane_f16(__s2_865, __p3_865)}); \ + __ret_865; \ +}) +#else +#define vfmlalq_laneq_high_f16(__p0_866, __p1_866, __p2_866, __p3_866) __extension__ ({ \ + float32x4_t __ret_866; \ + float32x4_t __s0_866 = __p0_866; \ + float16x8_t __s1_866 = __p1_866; \ + float16x8_t __s2_866 = __p2_866; \ + float32x4_t __rev0_866; __rev0_866 = __builtin_shufflevector(__s0_866, __s0_866, 3, 2, 1, 0); \ + float16x8_t __rev1_866; __rev1_866 = __builtin_shufflevector(__s1_866, __s1_866, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_866; __rev2_866 
= __builtin_shufflevector(__s2_866, __s2_866, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_866 = __noswap_vfmlalq_high_f16(__rev0_866, __rev1_866, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_866, __p3_866), __noswap_vgetq_lane_f16(__rev2_866, __p3_866), __noswap_vgetq_lane_f16(__rev2_866, __p3_866), __noswap_vgetq_lane_f16(__rev2_866, __p3_866), __noswap_vgetq_lane_f16(__rev2_866, __p3_866), __noswap_vgetq_lane_f16(__rev2_866, __p3_866), __noswap_vgetq_lane_f16(__rev2_866, __p3_866), __noswap_vgetq_lane_f16(__rev2_866, __p3_866)}); \ + __ret_866 = __builtin_shufflevector(__ret_866, __ret_866, 3, 2, 1, 0); \ + __ret_866; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlal_laneq_high_f16(__p0_867, __p1_867, __p2_867, __p3_867) __extension__ ({ \ + float32x2_t __ret_867; \ + float32x2_t __s0_867 = __p0_867; \ + float16x4_t __s1_867 = __p1_867; \ + float16x8_t __s2_867 = __p2_867; \ + __ret_867 = vfmlal_high_f16(__s0_867, __s1_867, (float16x4_t) {vgetq_lane_f16(__s2_867, __p3_867), vgetq_lane_f16(__s2_867, __p3_867), vgetq_lane_f16(__s2_867, __p3_867), vgetq_lane_f16(__s2_867, __p3_867)}); \ + __ret_867; \ +}) +#else +#define vfmlal_laneq_high_f16(__p0_868, __p1_868, __p2_868, __p3_868) __extension__ ({ \ + float32x2_t __ret_868; \ + float32x2_t __s0_868 = __p0_868; \ + float16x4_t __s1_868 = __p1_868; \ + float16x8_t __s2_868 = __p2_868; \ + float32x2_t __rev0_868; __rev0_868 = __builtin_shufflevector(__s0_868, __s0_868, 1, 0); \ + float16x4_t __rev1_868; __rev1_868 = __builtin_shufflevector(__s1_868, __s1_868, 3, 2, 1, 0); \ + float16x8_t __rev2_868; __rev2_868 = __builtin_shufflevector(__s2_868, __s2_868, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_868 = __noswap_vfmlal_high_f16(__rev0_868, __rev1_868, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_868, __p3_868), __noswap_vgetq_lane_f16(__rev2_868, __p3_868), __noswap_vgetq_lane_f16(__rev2_868, __p3_868), __noswap_vgetq_lane_f16(__rev2_868, __p3_868)}); \ + __ret_868 = __builtin_shufflevector(__ret_868, __ret_868, 1, 0); \ + __ret_868; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlalq_laneq_low_f16(__p0_869, __p1_869, __p2_869, __p3_869) __extension__ ({ \ + float32x4_t __ret_869; \ + float32x4_t __s0_869 = __p0_869; \ + float16x8_t __s1_869 = __p1_869; \ + float16x8_t __s2_869 = __p2_869; \ + __ret_869 = vfmlalq_low_f16(__s0_869, __s1_869, (float16x8_t) {vgetq_lane_f16(__s2_869, __p3_869), vgetq_lane_f16(__s2_869, __p3_869), vgetq_lane_f16(__s2_869, __p3_869), vgetq_lane_f16(__s2_869, __p3_869), vgetq_lane_f16(__s2_869, __p3_869), vgetq_lane_f16(__s2_869, __p3_869), vgetq_lane_f16(__s2_869, __p3_869), vgetq_lane_f16(__s2_869, __p3_869)}); \ + __ret_869; \ +}) +#else +#define vfmlalq_laneq_low_f16(__p0_870, __p1_870, __p2_870, __p3_870) __extension__ ({ \ + float32x4_t __ret_870; \ + float32x4_t __s0_870 = __p0_870; \ + float16x8_t __s1_870 = __p1_870; \ + float16x8_t __s2_870 = __p2_870; \ + float32x4_t __rev0_870; __rev0_870 = __builtin_shufflevector(__s0_870, __s0_870, 3, 2, 1, 0); \ + float16x8_t __rev1_870; __rev1_870 = __builtin_shufflevector(__s1_870, __s1_870, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_870; __rev2_870 = __builtin_shufflevector(__s2_870, __s2_870, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_870 = __noswap_vfmlalq_low_f16(__rev0_870, __rev1_870, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_870, __p3_870), __noswap_vgetq_lane_f16(__rev2_870, __p3_870), __noswap_vgetq_lane_f16(__rev2_870, __p3_870), __noswap_vgetq_lane_f16(__rev2_870, __p3_870), __noswap_vgetq_lane_f16(__rev2_870, __p3_870), 
__noswap_vgetq_lane_f16(__rev2_870, __p3_870), __noswap_vgetq_lane_f16(__rev2_870, __p3_870), __noswap_vgetq_lane_f16(__rev2_870, __p3_870)}); \ + __ret_870 = __builtin_shufflevector(__ret_870, __ret_870, 3, 2, 1, 0); \ + __ret_870; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlal_laneq_low_f16(__p0_871, __p1_871, __p2_871, __p3_871) __extension__ ({ \ + float32x2_t __ret_871; \ + float32x2_t __s0_871 = __p0_871; \ + float16x4_t __s1_871 = __p1_871; \ + float16x8_t __s2_871 = __p2_871; \ + __ret_871 = vfmlal_low_f16(__s0_871, __s1_871, (float16x4_t) {vgetq_lane_f16(__s2_871, __p3_871), vgetq_lane_f16(__s2_871, __p3_871), vgetq_lane_f16(__s2_871, __p3_871), vgetq_lane_f16(__s2_871, __p3_871)}); \ + __ret_871; \ +}) +#else +#define vfmlal_laneq_low_f16(__p0_872, __p1_872, __p2_872, __p3_872) __extension__ ({ \ + float32x2_t __ret_872; \ + float32x2_t __s0_872 = __p0_872; \ + float16x4_t __s1_872 = __p1_872; \ + float16x8_t __s2_872 = __p2_872; \ + float32x2_t __rev0_872; __rev0_872 = __builtin_shufflevector(__s0_872, __s0_872, 1, 0); \ + float16x4_t __rev1_872; __rev1_872 = __builtin_shufflevector(__s1_872, __s1_872, 3, 2, 1, 0); \ + float16x8_t __rev2_872; __rev2_872 = __builtin_shufflevector(__s2_872, __s2_872, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_872 = __noswap_vfmlal_low_f16(__rev0_872, __rev1_872, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_872, __p3_872), __noswap_vgetq_lane_f16(__rev2_872, __p3_872), __noswap_vgetq_lane_f16(__rev2_872, __p3_872), __noswap_vgetq_lane_f16(__rev2_872, __p3_872)}); \ + __ret_872 = __builtin_shufflevector(__ret_872, __ret_872, 1, 0); \ + __ret_872; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlslq_lane_high_f16(__p0_873, __p1_873, __p2_873, __p3_873) __extension__ ({ \ + float32x4_t __ret_873; \ + float32x4_t __s0_873 = __p0_873; \ + float16x8_t __s1_873 = __p1_873; \ + float16x4_t __s2_873 = __p2_873; \ + __ret_873 = vfmlslq_high_f16(__s0_873, __s1_873, (float16x8_t) {vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873), vget_lane_f16(__s2_873, __p3_873)}); \ + __ret_873; \ +}) +#else +#define vfmlslq_lane_high_f16(__p0_874, __p1_874, __p2_874, __p3_874) __extension__ ({ \ + float32x4_t __ret_874; \ + float32x4_t __s0_874 = __p0_874; \ + float16x8_t __s1_874 = __p1_874; \ + float16x4_t __s2_874 = __p2_874; \ + float32x4_t __rev0_874; __rev0_874 = __builtin_shufflevector(__s0_874, __s0_874, 3, 2, 1, 0); \ + float16x8_t __rev1_874; __rev1_874 = __builtin_shufflevector(__s1_874, __s1_874, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_874; __rev2_874 = __builtin_shufflevector(__s2_874, __s2_874, 3, 2, 1, 0); \ + __ret_874 = __noswap_vfmlslq_high_f16(__rev0_874, __rev1_874, (float16x8_t) {__noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874), __noswap_vget_lane_f16(__rev2_874, __p3_874)}); \ + __ret_874 = __builtin_shufflevector(__ret_874, __ret_874, 3, 2, 1, 0); \ + __ret_874; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlsl_lane_high_f16(__p0_875, __p1_875, __p2_875, __p3_875) __extension__ ({ \ + float32x2_t __ret_875; \ + float32x2_t __s0_875 = __p0_875; \ + float16x4_t __s1_875 
= __p1_875; \ + float16x4_t __s2_875 = __p2_875; \ + __ret_875 = vfmlsl_high_f16(__s0_875, __s1_875, (float16x4_t) {vget_lane_f16(__s2_875, __p3_875), vget_lane_f16(__s2_875, __p3_875), vget_lane_f16(__s2_875, __p3_875), vget_lane_f16(__s2_875, __p3_875)}); \ + __ret_875; \ +}) +#else +#define vfmlsl_lane_high_f16(__p0_876, __p1_876, __p2_876, __p3_876) __extension__ ({ \ + float32x2_t __ret_876; \ + float32x2_t __s0_876 = __p0_876; \ + float16x4_t __s1_876 = __p1_876; \ + float16x4_t __s2_876 = __p2_876; \ + float32x2_t __rev0_876; __rev0_876 = __builtin_shufflevector(__s0_876, __s0_876, 1, 0); \ + float16x4_t __rev1_876; __rev1_876 = __builtin_shufflevector(__s1_876, __s1_876, 3, 2, 1, 0); \ + float16x4_t __rev2_876; __rev2_876 = __builtin_shufflevector(__s2_876, __s2_876, 3, 2, 1, 0); \ + __ret_876 = __noswap_vfmlsl_high_f16(__rev0_876, __rev1_876, (float16x4_t) {__noswap_vget_lane_f16(__rev2_876, __p3_876), __noswap_vget_lane_f16(__rev2_876, __p3_876), __noswap_vget_lane_f16(__rev2_876, __p3_876), __noswap_vget_lane_f16(__rev2_876, __p3_876)}); \ + __ret_876 = __builtin_shufflevector(__ret_876, __ret_876, 1, 0); \ + __ret_876; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlslq_lane_low_f16(__p0_877, __p1_877, __p2_877, __p3_877) __extension__ ({ \ + float32x4_t __ret_877; \ + float32x4_t __s0_877 = __p0_877; \ + float16x8_t __s1_877 = __p1_877; \ + float16x4_t __s2_877 = __p2_877; \ + __ret_877 = vfmlslq_low_f16(__s0_877, __s1_877, (float16x8_t) {vget_lane_f16(__s2_877, __p3_877), vget_lane_f16(__s2_877, __p3_877), vget_lane_f16(__s2_877, __p3_877), vget_lane_f16(__s2_877, __p3_877), vget_lane_f16(__s2_877, __p3_877), vget_lane_f16(__s2_877, __p3_877), vget_lane_f16(__s2_877, __p3_877), vget_lane_f16(__s2_877, __p3_877)}); \ + __ret_877; \ +}) +#else +#define vfmlslq_lane_low_f16(__p0_878, __p1_878, __p2_878, __p3_878) __extension__ ({ \ + float32x4_t __ret_878; \ + float32x4_t __s0_878 = __p0_878; \ + float16x8_t __s1_878 = __p1_878; \ + float16x4_t __s2_878 = __p2_878; \ + float32x4_t __rev0_878; __rev0_878 = __builtin_shufflevector(__s0_878, __s0_878, 3, 2, 1, 0); \ + float16x8_t __rev1_878; __rev1_878 = __builtin_shufflevector(__s1_878, __s1_878, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_878; __rev2_878 = __builtin_shufflevector(__s2_878, __s2_878, 3, 2, 1, 0); \ + __ret_878 = __noswap_vfmlslq_low_f16(__rev0_878, __rev1_878, (float16x8_t) {__noswap_vget_lane_f16(__rev2_878, __p3_878), __noswap_vget_lane_f16(__rev2_878, __p3_878), __noswap_vget_lane_f16(__rev2_878, __p3_878), __noswap_vget_lane_f16(__rev2_878, __p3_878), __noswap_vget_lane_f16(__rev2_878, __p3_878), __noswap_vget_lane_f16(__rev2_878, __p3_878), __noswap_vget_lane_f16(__rev2_878, __p3_878), __noswap_vget_lane_f16(__rev2_878, __p3_878)}); \ + __ret_878 = __builtin_shufflevector(__ret_878, __ret_878, 3, 2, 1, 0); \ + __ret_878; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlsl_lane_low_f16(__p0_879, __p1_879, __p2_879, __p3_879) __extension__ ({ \ + float32x2_t __ret_879; \ + float32x2_t __s0_879 = __p0_879; \ + float16x4_t __s1_879 = __p1_879; \ + float16x4_t __s2_879 = __p2_879; \ + __ret_879 = vfmlsl_low_f16(__s0_879, __s1_879, (float16x4_t) {vget_lane_f16(__s2_879, __p3_879), vget_lane_f16(__s2_879, __p3_879), vget_lane_f16(__s2_879, __p3_879), vget_lane_f16(__s2_879, __p3_879)}); \ + __ret_879; \ +}) +#else +#define vfmlsl_lane_low_f16(__p0_880, __p1_880, __p2_880, __p3_880) __extension__ ({ \ + float32x2_t __ret_880; \ + float32x2_t __s0_880 = __p0_880; \ + float16x4_t __s1_880 = __p1_880; \ 
+ float16x4_t __s2_880 = __p2_880; \ + float32x2_t __rev0_880; __rev0_880 = __builtin_shufflevector(__s0_880, __s0_880, 1, 0); \ + float16x4_t __rev1_880; __rev1_880 = __builtin_shufflevector(__s1_880, __s1_880, 3, 2, 1, 0); \ + float16x4_t __rev2_880; __rev2_880 = __builtin_shufflevector(__s2_880, __s2_880, 3, 2, 1, 0); \ + __ret_880 = __noswap_vfmlsl_low_f16(__rev0_880, __rev1_880, (float16x4_t) {__noswap_vget_lane_f16(__rev2_880, __p3_880), __noswap_vget_lane_f16(__rev2_880, __p3_880), __noswap_vget_lane_f16(__rev2_880, __p3_880), __noswap_vget_lane_f16(__rev2_880, __p3_880)}); \ + __ret_880 = __builtin_shufflevector(__ret_880, __ret_880, 1, 0); \ + __ret_880; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlslq_laneq_high_f16(__p0_881, __p1_881, __p2_881, __p3_881) __extension__ ({ \ + float32x4_t __ret_881; \ + float32x4_t __s0_881 = __p0_881; \ + float16x8_t __s1_881 = __p1_881; \ + float16x8_t __s2_881 = __p2_881; \ + __ret_881 = vfmlslq_high_f16(__s0_881, __s1_881, (float16x8_t) {vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881), vgetq_lane_f16(__s2_881, __p3_881)}); \ + __ret_881; \ +}) +#else +#define vfmlslq_laneq_high_f16(__p0_882, __p1_882, __p2_882, __p3_882) __extension__ ({ \ + float32x4_t __ret_882; \ + float32x4_t __s0_882 = __p0_882; \ + float16x8_t __s1_882 = __p1_882; \ + float16x8_t __s2_882 = __p2_882; \ + float32x4_t __rev0_882; __rev0_882 = __builtin_shufflevector(__s0_882, __s0_882, 3, 2, 1, 0); \ + float16x8_t __rev1_882; __rev1_882 = __builtin_shufflevector(__s1_882, __s1_882, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_882; __rev2_882 = __builtin_shufflevector(__s2_882, __s2_882, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_882 = __noswap_vfmlslq_high_f16(__rev0_882, __rev1_882, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882), __noswap_vgetq_lane_f16(__rev2_882, __p3_882)}); \ + __ret_882 = __builtin_shufflevector(__ret_882, __ret_882, 3, 2, 1, 0); \ + __ret_882; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlsl_laneq_high_f16(__p0_883, __p1_883, __p2_883, __p3_883) __extension__ ({ \ + float32x2_t __ret_883; \ + float32x2_t __s0_883 = __p0_883; \ + float16x4_t __s1_883 = __p1_883; \ + float16x8_t __s2_883 = __p2_883; \ + __ret_883 = vfmlsl_high_f16(__s0_883, __s1_883, (float16x4_t) {vgetq_lane_f16(__s2_883, __p3_883), vgetq_lane_f16(__s2_883, __p3_883), vgetq_lane_f16(__s2_883, __p3_883), vgetq_lane_f16(__s2_883, __p3_883)}); \ + __ret_883; \ +}) +#else +#define vfmlsl_laneq_high_f16(__p0_884, __p1_884, __p2_884, __p3_884) __extension__ ({ \ + float32x2_t __ret_884; \ + float32x2_t __s0_884 = __p0_884; \ + float16x4_t __s1_884 = __p1_884; \ + float16x8_t __s2_884 = __p2_884; \ + float32x2_t __rev0_884; __rev0_884 = __builtin_shufflevector(__s0_884, __s0_884, 1, 0); \ + float16x4_t __rev1_884; __rev1_884 = __builtin_shufflevector(__s1_884, __s1_884, 3, 2, 1, 0); \ + float16x8_t __rev2_884; __rev2_884 = __builtin_shufflevector(__s2_884, __s2_884, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_884 = __noswap_vfmlsl_high_f16(__rev0_884, __rev1_884, (float16x4_t) 
{__noswap_vgetq_lane_f16(__rev2_884, __p3_884), __noswap_vgetq_lane_f16(__rev2_884, __p3_884), __noswap_vgetq_lane_f16(__rev2_884, __p3_884), __noswap_vgetq_lane_f16(__rev2_884, __p3_884)}); \ + __ret_884 = __builtin_shufflevector(__ret_884, __ret_884, 1, 0); \ + __ret_884; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmlslq_laneq_low_f16(__p0_885, __p1_885, __p2_885, __p3_885) __extension__ ({ \ float32x4_t __ret_885; \ float32x4_t __s0_885 = __p0_885; \ float16x8_t __s1_885 = __p1_885; \ - float16x4_t __s2_885 = __p2_885; \ - __ret_885 = vfmlalq_high_f16(__s0_885, __s1_885, (float16x8_t) {vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885), vget_lane_f16(__s2_885, __p3_885)}); \ + float16x8_t __s2_885 = __p2_885; \ + __ret_885 = vfmlslq_low_f16(__s0_885, __s1_885, (float16x8_t) {vgetq_lane_f16(__s2_885, __p3_885), vgetq_lane_f16(__s2_885, __p3_885), vgetq_lane_f16(__s2_885, __p3_885), vgetq_lane_f16(__s2_885, __p3_885), vgetq_lane_f16(__s2_885, __p3_885), vgetq_lane_f16(__s2_885, __p3_885), vgetq_lane_f16(__s2_885, __p3_885), vgetq_lane_f16(__s2_885, __p3_885)}); \ __ret_885; \ }) #else -#define vfmlalq_lane_high_f16(__p0_886, __p1_886, __p2_886, __p3_886) __extension__ ({ \ +#define vfmlslq_laneq_low_f16(__p0_886, __p1_886, __p2_886, __p3_886) __extension__ ({ \ float32x4_t __ret_886; \ float32x4_t __s0_886 = __p0_886; \ float16x8_t __s1_886 = __p1_886; \ - float16x4_t __s2_886 = __p2_886; \ + float16x8_t __s2_886 = __p2_886; \ float32x4_t __rev0_886; __rev0_886 = __builtin_shufflevector(__s0_886, __s0_886, 3, 2, 1, 0); \ float16x8_t __rev1_886; __rev1_886 = __builtin_shufflevector(__s1_886, __s1_886, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_886; __rev2_886 = __builtin_shufflevector(__s2_886, __s2_886, 3, 2, 1, 0); \ - __ret_886 = __noswap_vfmlalq_high_f16(__rev0_886, __rev1_886, (float16x8_t) {__noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886), __noswap_vget_lane_f16(__rev2_886, __p3_886)}); \ + float16x8_t __rev2_886; __rev2_886 = __builtin_shufflevector(__s2_886, __s2_886, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_886 = __noswap_vfmlslq_low_f16(__rev0_886, __rev1_886, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_886, __p3_886), __noswap_vgetq_lane_f16(__rev2_886, __p3_886), __noswap_vgetq_lane_f16(__rev2_886, __p3_886), __noswap_vgetq_lane_f16(__rev2_886, __p3_886), __noswap_vgetq_lane_f16(__rev2_886, __p3_886), __noswap_vgetq_lane_f16(__rev2_886, __p3_886), __noswap_vgetq_lane_f16(__rev2_886, __p3_886), __noswap_vgetq_lane_f16(__rev2_886, __p3_886)}); \ __ret_886 = __builtin_shufflevector(__ret_886, __ret_886, 3, 2, 1, 0); \ __ret_886; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_lane_high_f16(__p0_887, __p1_887, __p2_887, __p3_887) __extension__ ({ \ +#define vfmlsl_laneq_low_f16(__p0_887, __p1_887, __p2_887, __p3_887) __extension__ ({ \ float32x2_t __ret_887; \ float32x2_t __s0_887 = __p0_887; \ float16x4_t __s1_887 = __p1_887; \ - float16x4_t __s2_887 = __p2_887; \ - __ret_887 = vfmlal_high_f16(__s0_887, __s1_887, (float16x4_t) {vget_lane_f16(__s2_887, __p3_887), vget_lane_f16(__s2_887, __p3_887), 
vget_lane_f16(__s2_887, __p3_887), vget_lane_f16(__s2_887, __p3_887)}); \ + float16x8_t __s2_887 = __p2_887; \ + __ret_887 = vfmlsl_low_f16(__s0_887, __s1_887, (float16x4_t) {vgetq_lane_f16(__s2_887, __p3_887), vgetq_lane_f16(__s2_887, __p3_887), vgetq_lane_f16(__s2_887, __p3_887), vgetq_lane_f16(__s2_887, __p3_887)}); \ __ret_887; \ }) #else -#define vfmlal_lane_high_f16(__p0_888, __p1_888, __p2_888, __p3_888) __extension__ ({ \ +#define vfmlsl_laneq_low_f16(__p0_888, __p1_888, __p2_888, __p3_888) __extension__ ({ \ float32x2_t __ret_888; \ float32x2_t __s0_888 = __p0_888; \ float16x4_t __s1_888 = __p1_888; \ - float16x4_t __s2_888 = __p2_888; \ + float16x8_t __s2_888 = __p2_888; \ float32x2_t __rev0_888; __rev0_888 = __builtin_shufflevector(__s0_888, __s0_888, 1, 0); \ float16x4_t __rev1_888; __rev1_888 = __builtin_shufflevector(__s1_888, __s1_888, 3, 2, 1, 0); \ - float16x4_t __rev2_888; __rev2_888 = __builtin_shufflevector(__s2_888, __s2_888, 3, 2, 1, 0); \ - __ret_888 = __noswap_vfmlal_high_f16(__rev0_888, __rev1_888, (float16x4_t) {__noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888), __noswap_vget_lane_f16(__rev2_888, __p3_888)}); \ + float16x8_t __rev2_888; __rev2_888 = __builtin_shufflevector(__s2_888, __s2_888, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_888 = __noswap_vfmlsl_low_f16(__rev0_888, __rev1_888, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_888, __p3_888), __noswap_vgetq_lane_f16(__rev2_888, __p3_888), __noswap_vgetq_lane_f16(__rev2_888, __p3_888), __noswap_vgetq_lane_f16(__rev2_888, __p3_888)}); \ __ret_888 = __builtin_shufflevector(__ret_888, __ret_888, 1, 0); \ __ret_888; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlalq_lane_low_f16(__p0_889, __p1_889, __p2_889, __p3_889) __extension__ ({ \ - float32x4_t __ret_889; \ - float32x4_t __s0_889 = __p0_889; \ - float16x8_t __s1_889 = __p1_889; \ - float16x4_t __s2_889 = __p2_889; \ - __ret_889 = vfmlalq_low_f16(__s0_889, __s1_889, (float16x8_t) {vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889), vget_lane_f16(__s2_889, __p3_889)}); \ +#define vmulh_lane_f16(__p0_889, __p1_889, __p2_889) __extension__ ({ \ + float16_t __ret_889; \ + float16_t __s0_889 = __p0_889; \ + float16x4_t __s1_889 = __p1_889; \ + __ret_889 = __s0_889 * vget_lane_f16(__s1_889, __p2_889); \ __ret_889; \ }) #else -#define vfmlalq_lane_low_f16(__p0_890, __p1_890, __p2_890, __p3_890) __extension__ ({ \ - float32x4_t __ret_890; \ - float32x4_t __s0_890 = __p0_890; \ - float16x8_t __s1_890 = __p1_890; \ - float16x4_t __s2_890 = __p2_890; \ - float32x4_t __rev0_890; __rev0_890 = __builtin_shufflevector(__s0_890, __s0_890, 3, 2, 1, 0); \ - float16x8_t __rev1_890; __rev1_890 = __builtin_shufflevector(__s1_890, __s1_890, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_890; __rev2_890 = __builtin_shufflevector(__s2_890, __s2_890, 3, 2, 1, 0); \ - __ret_890 = __noswap_vfmlalq_low_f16(__rev0_890, __rev1_890, (float16x8_t) {__noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890), __noswap_vget_lane_f16(__rev2_890, __p3_890)}); \ - 
__ret_890 = __builtin_shufflevector(__ret_890, __ret_890, 3, 2, 1, 0); \ +#define vmulh_lane_f16(__p0_890, __p1_890, __p2_890) __extension__ ({ \ + float16_t __ret_890; \ + float16_t __s0_890 = __p0_890; \ + float16x4_t __s1_890 = __p1_890; \ + float16x4_t __rev1_890; __rev1_890 = __builtin_shufflevector(__s1_890, __s1_890, 3, 2, 1, 0); \ + __ret_890 = __s0_890 * __noswap_vget_lane_f16(__rev1_890, __p2_890); \ __ret_890; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmlal_lane_low_f16(__p0_891, __p1_891, __p2_891, __p3_891) __extension__ ({ \ - float32x2_t __ret_891; \ - float32x2_t __s0_891 = __p0_891; \ - float16x4_t __s1_891 = __p1_891; \ - float16x4_t __s2_891 = __p2_891; \ - __ret_891 = vfmlal_low_f16(__s0_891, __s1_891, (float16x4_t) {vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891), vget_lane_f16(__s2_891, __p3_891)}); \ +#define vmulh_laneq_f16(__p0_891, __p1_891, __p2_891) __extension__ ({ \ + float16_t __ret_891; \ + float16_t __s0_891 = __p0_891; \ + float16x8_t __s1_891 = __p1_891; \ + __ret_891 = __s0_891 * vgetq_lane_f16(__s1_891, __p2_891); \ __ret_891; \ }) #else -#define vfmlal_lane_low_f16(__p0_892, __p1_892, __p2_892, __p3_892) __extension__ ({ \ - float32x2_t __ret_892; \ - float32x2_t __s0_892 = __p0_892; \ - float16x4_t __s1_892 = __p1_892; \ - float16x4_t __s2_892 = __p2_892; \ - float32x2_t __rev0_892; __rev0_892 = __builtin_shufflevector(__s0_892, __s0_892, 1, 0); \ - float16x4_t __rev1_892; __rev1_892 = __builtin_shufflevector(__s1_892, __s1_892, 3, 2, 1, 0); \ - float16x4_t __rev2_892; __rev2_892 = __builtin_shufflevector(__s2_892, __s2_892, 3, 2, 1, 0); \ - __ret_892 = __noswap_vfmlal_low_f16(__rev0_892, __rev1_892, (float16x4_t) {__noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892), __noswap_vget_lane_f16(__rev2_892, __p3_892)}); \ - __ret_892 = __builtin_shufflevector(__ret_892, __ret_892, 1, 0); \ +#define vmulh_laneq_f16(__p0_892, __p1_892, __p2_892) __extension__ ({ \ + float16_t __ret_892; \ + float16_t __s0_892 = __p0_892; \ + float16x8_t __s1_892 = __p1_892; \ + float16x8_t __rev1_892; __rev1_892 = __builtin_shufflevector(__s1_892, __s1_892, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_892 = __s0_892 * __noswap_vgetq_lane_f16(__rev1_892, __p2_892); \ __ret_892; \ }) #endif -#ifdef __LITTLE_ENDIAN__ -#define vfmlalq_laneq_high_f16(__p0_893, __p1_893, __p2_893, __p3_893) __extension__ ({ \ - float32x4_t __ret_893; \ - float32x4_t __s0_893 = __p0_893; \ - float16x8_t __s1_893 = __p1_893; \ - float16x8_t __s2_893 = __p2_893; \ - __ret_893 = vfmlalq_high_f16(__s0_893, __s1_893, (float16x8_t) {vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893), vgetq_lane_f16(__s2_893, __p3_893)}); \ - __ret_893; \ -}) -#else -#define vfmlalq_laneq_high_f16(__p0_894, __p1_894, __p2_894, __p3_894) __extension__ ({ \ - float32x4_t __ret_894; \ - float32x4_t __s0_894 = __p0_894; \ - float16x8_t __s1_894 = __p1_894; \ - float16x8_t __s2_894 = __p2_894; \ - float32x4_t __rev0_894; __rev0_894 = __builtin_shufflevector(__s0_894, __s0_894, 3, 2, 1, 0); \ - float16x8_t __rev1_894; __rev1_894 = __builtin_shufflevector(__s1_894, __s1_894, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev2_894; __rev2_894 = __builtin_shufflevector(__s2_894, __s2_894, 7, 6, 5, 4, 3, 2, 
1, 0); \ - __ret_894 = __noswap_vfmlalq_high_f16(__rev0_894, __rev1_894, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894), __noswap_vgetq_lane_f16(__rev2_894, __p3_894)}); \ - __ret_894 = __builtin_shufflevector(__ret_894, __ret_894, 3, 2, 1, 0); \ - __ret_894; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlal_laneq_high_f16(__p0_895, __p1_895, __p2_895, __p3_895) __extension__ ({ \ - float32x2_t __ret_895; \ - float32x2_t __s0_895 = __p0_895; \ - float16x4_t __s1_895 = __p1_895; \ - float16x8_t __s2_895 = __p2_895; \ - __ret_895 = vfmlal_high_f16(__s0_895, __s1_895, (float16x4_t) {vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895), vgetq_lane_f16(__s2_895, __p3_895)}); \ - __ret_895; \ -}) -#else -#define vfmlal_laneq_high_f16(__p0_896, __p1_896, __p2_896, __p3_896) __extension__ ({ \ - float32x2_t __ret_896; \ - float32x2_t __s0_896 = __p0_896; \ - float16x4_t __s1_896 = __p1_896; \ - float16x8_t __s2_896 = __p2_896; \ - float32x2_t __rev0_896; __rev0_896 = __builtin_shufflevector(__s0_896, __s0_896, 1, 0); \ - float16x4_t __rev1_896; __rev1_896 = __builtin_shufflevector(__s1_896, __s1_896, 3, 2, 1, 0); \ - float16x8_t __rev2_896; __rev2_896 = __builtin_shufflevector(__s2_896, __s2_896, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_896 = __noswap_vfmlal_high_f16(__rev0_896, __rev1_896, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896), __noswap_vgetq_lane_f16(__rev2_896, __p3_896)}); \ - __ret_896 = __builtin_shufflevector(__ret_896, __ret_896, 1, 0); \ - __ret_896; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlalq_laneq_low_f16(__p0_897, __p1_897, __p2_897, __p3_897) __extension__ ({ \ - float32x4_t __ret_897; \ - float32x4_t __s0_897 = __p0_897; \ - float16x8_t __s1_897 = __p1_897; \ - float16x8_t __s2_897 = __p2_897; \ - __ret_897 = vfmlalq_low_f16(__s0_897, __s1_897, (float16x8_t) {vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897), vgetq_lane_f16(__s2_897, __p3_897)}); \ - __ret_897; \ -}) -#else -#define vfmlalq_laneq_low_f16(__p0_898, __p1_898, __p2_898, __p3_898) __extension__ ({ \ - float32x4_t __ret_898; \ - float32x4_t __s0_898 = __p0_898; \ - float16x8_t __s1_898 = __p1_898; \ - float16x8_t __s2_898 = __p2_898; \ - float32x4_t __rev0_898; __rev0_898 = __builtin_shufflevector(__s0_898, __s0_898, 3, 2, 1, 0); \ - float16x8_t __rev1_898; __rev1_898 = __builtin_shufflevector(__s1_898, __s1_898, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev2_898; __rev2_898 = __builtin_shufflevector(__s2_898, __s2_898, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_898 = __noswap_vfmlalq_low_f16(__rev0_898, __rev1_898, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), __noswap_vgetq_lane_f16(__rev2_898, __p3_898), 
__noswap_vgetq_lane_f16(__rev2_898, __p3_898)}); \ - __ret_898 = __builtin_shufflevector(__ret_898, __ret_898, 3, 2, 1, 0); \ - __ret_898; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlal_laneq_low_f16(__p0_899, __p1_899, __p2_899, __p3_899) __extension__ ({ \ - float32x2_t __ret_899; \ - float32x2_t __s0_899 = __p0_899; \ - float16x4_t __s1_899 = __p1_899; \ - float16x8_t __s2_899 = __p2_899; \ - __ret_899 = vfmlal_low_f16(__s0_899, __s1_899, (float16x4_t) {vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899), vgetq_lane_f16(__s2_899, __p3_899)}); \ - __ret_899; \ -}) -#else -#define vfmlal_laneq_low_f16(__p0_900, __p1_900, __p2_900, __p3_900) __extension__ ({ \ - float32x2_t __ret_900; \ - float32x2_t __s0_900 = __p0_900; \ - float16x4_t __s1_900 = __p1_900; \ - float16x8_t __s2_900 = __p2_900; \ - float32x2_t __rev0_900; __rev0_900 = __builtin_shufflevector(__s0_900, __s0_900, 1, 0); \ - float16x4_t __rev1_900; __rev1_900 = __builtin_shufflevector(__s1_900, __s1_900, 3, 2, 1, 0); \ - float16x8_t __rev2_900; __rev2_900 = __builtin_shufflevector(__s2_900, __s2_900, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_900 = __noswap_vfmlal_low_f16(__rev0_900, __rev1_900, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900), __noswap_vgetq_lane_f16(__rev2_900, __p3_900)}); \ - __ret_900 = __builtin_shufflevector(__ret_900, __ret_900, 1, 0); \ - __ret_900; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlslq_lane_high_f16(__p0_901, __p1_901, __p2_901, __p3_901) __extension__ ({ \ - float32x4_t __ret_901; \ - float32x4_t __s0_901 = __p0_901; \ - float16x8_t __s1_901 = __p1_901; \ - float16x4_t __s2_901 = __p2_901; \ - __ret_901 = vfmlslq_high_f16(__s0_901, __s1_901, (float16x8_t) {vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901), vget_lane_f16(__s2_901, __p3_901)}); \ - __ret_901; \ -}) -#else -#define vfmlslq_lane_high_f16(__p0_902, __p1_902, __p2_902, __p3_902) __extension__ ({ \ - float32x4_t __ret_902; \ - float32x4_t __s0_902 = __p0_902; \ - float16x8_t __s1_902 = __p1_902; \ - float16x4_t __s2_902 = __p2_902; \ - float32x4_t __rev0_902; __rev0_902 = __builtin_shufflevector(__s0_902, __s0_902, 3, 2, 1, 0); \ - float16x8_t __rev1_902; __rev1_902 = __builtin_shufflevector(__s1_902, __s1_902, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_902; __rev2_902 = __builtin_shufflevector(__s2_902, __s2_902, 3, 2, 1, 0); \ - __ret_902 = __noswap_vfmlslq_high_f16(__rev0_902, __rev1_902, (float16x8_t) {__noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902), __noswap_vget_lane_f16(__rev2_902, __p3_902)}); \ - __ret_902 = __builtin_shufflevector(__ret_902, __ret_902, 3, 2, 1, 0); \ - __ret_902; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlsl_lane_high_f16(__p0_903, __p1_903, __p2_903, __p3_903) __extension__ ({ \ - float32x2_t __ret_903; \ - float32x2_t __s0_903 = __p0_903; \ - float16x4_t __s1_903 = __p1_903; \ - float16x4_t __s2_903 = __p2_903; \ - __ret_903 = vfmlsl_high_f16(__s0_903, 
__s1_903, (float16x4_t) {vget_lane_f16(__s2_903, __p3_903), vget_lane_f16(__s2_903, __p3_903), vget_lane_f16(__s2_903, __p3_903), vget_lane_f16(__s2_903, __p3_903)}); \ - __ret_903; \ -}) -#else -#define vfmlsl_lane_high_f16(__p0_904, __p1_904, __p2_904, __p3_904) __extension__ ({ \ - float32x2_t __ret_904; \ - float32x2_t __s0_904 = __p0_904; \ - float16x4_t __s1_904 = __p1_904; \ - float16x4_t __s2_904 = __p2_904; \ - float32x2_t __rev0_904; __rev0_904 = __builtin_shufflevector(__s0_904, __s0_904, 1, 0); \ - float16x4_t __rev1_904; __rev1_904 = __builtin_shufflevector(__s1_904, __s1_904, 3, 2, 1, 0); \ - float16x4_t __rev2_904; __rev2_904 = __builtin_shufflevector(__s2_904, __s2_904, 3, 2, 1, 0); \ - __ret_904 = __noswap_vfmlsl_high_f16(__rev0_904, __rev1_904, (float16x4_t) {__noswap_vget_lane_f16(__rev2_904, __p3_904), __noswap_vget_lane_f16(__rev2_904, __p3_904), __noswap_vget_lane_f16(__rev2_904, __p3_904), __noswap_vget_lane_f16(__rev2_904, __p3_904)}); \ - __ret_904 = __builtin_shufflevector(__ret_904, __ret_904, 1, 0); \ - __ret_904; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlslq_lane_low_f16(__p0_905, __p1_905, __p2_905, __p3_905) __extension__ ({ \ - float32x4_t __ret_905; \ - float32x4_t __s0_905 = __p0_905; \ - float16x8_t __s1_905 = __p1_905; \ - float16x4_t __s2_905 = __p2_905; \ - __ret_905 = vfmlslq_low_f16(__s0_905, __s1_905, (float16x8_t) {vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905), vget_lane_f16(__s2_905, __p3_905)}); \ - __ret_905; \ -}) -#else -#define vfmlslq_lane_low_f16(__p0_906, __p1_906, __p2_906, __p3_906) __extension__ ({ \ - float32x4_t __ret_906; \ - float32x4_t __s0_906 = __p0_906; \ - float16x8_t __s1_906 = __p1_906; \ - float16x4_t __s2_906 = __p2_906; \ - float32x4_t __rev0_906; __rev0_906 = __builtin_shufflevector(__s0_906, __s0_906, 3, 2, 1, 0); \ - float16x8_t __rev1_906; __rev1_906 = __builtin_shufflevector(__s1_906, __s1_906, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev2_906; __rev2_906 = __builtin_shufflevector(__s2_906, __s2_906, 3, 2, 1, 0); \ - __ret_906 = __noswap_vfmlslq_low_f16(__rev0_906, __rev1_906, (float16x8_t) {__noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906), __noswap_vget_lane_f16(__rev2_906, __p3_906)}); \ - __ret_906 = __builtin_shufflevector(__ret_906, __ret_906, 3, 2, 1, 0); \ - __ret_906; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlsl_lane_low_f16(__p0_907, __p1_907, __p2_907, __p3_907) __extension__ ({ \ - float32x2_t __ret_907; \ - float32x2_t __s0_907 = __p0_907; \ - float16x4_t __s1_907 = __p1_907; \ - float16x4_t __s2_907 = __p2_907; \ - __ret_907 = vfmlsl_low_f16(__s0_907, __s1_907, (float16x4_t) {vget_lane_f16(__s2_907, __p3_907), vget_lane_f16(__s2_907, __p3_907), vget_lane_f16(__s2_907, __p3_907), vget_lane_f16(__s2_907, __p3_907)}); \ - __ret_907; \ -}) -#else -#define vfmlsl_lane_low_f16(__p0_908, __p1_908, __p2_908, __p3_908) __extension__ ({ \ - float32x2_t __ret_908; \ - float32x2_t __s0_908 = __p0_908; \ - float16x4_t __s1_908 = __p1_908; \ - float16x4_t __s2_908 = __p2_908; \ - float32x2_t __rev0_908; __rev0_908 = 
__builtin_shufflevector(__s0_908, __s0_908, 1, 0); \ - float16x4_t __rev1_908; __rev1_908 = __builtin_shufflevector(__s1_908, __s1_908, 3, 2, 1, 0); \ - float16x4_t __rev2_908; __rev2_908 = __builtin_shufflevector(__s2_908, __s2_908, 3, 2, 1, 0); \ - __ret_908 = __noswap_vfmlsl_low_f16(__rev0_908, __rev1_908, (float16x4_t) {__noswap_vget_lane_f16(__rev2_908, __p3_908), __noswap_vget_lane_f16(__rev2_908, __p3_908), __noswap_vget_lane_f16(__rev2_908, __p3_908), __noswap_vget_lane_f16(__rev2_908, __p3_908)}); \ - __ret_908 = __builtin_shufflevector(__ret_908, __ret_908, 1, 0); \ - __ret_908; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlslq_laneq_high_f16(__p0_909, __p1_909, __p2_909, __p3_909) __extension__ ({ \ - float32x4_t __ret_909; \ - float32x4_t __s0_909 = __p0_909; \ - float16x8_t __s1_909 = __p1_909; \ - float16x8_t __s2_909 = __p2_909; \ - __ret_909 = vfmlslq_high_f16(__s0_909, __s1_909, (float16x8_t) {vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909), vgetq_lane_f16(__s2_909, __p3_909)}); \ - __ret_909; \ -}) -#else -#define vfmlslq_laneq_high_f16(__p0_910, __p1_910, __p2_910, __p3_910) __extension__ ({ \ - float32x4_t __ret_910; \ - float32x4_t __s0_910 = __p0_910; \ - float16x8_t __s1_910 = __p1_910; \ - float16x8_t __s2_910 = __p2_910; \ - float32x4_t __rev0_910; __rev0_910 = __builtin_shufflevector(__s0_910, __s0_910, 3, 2, 1, 0); \ - float16x8_t __rev1_910; __rev1_910 = __builtin_shufflevector(__s1_910, __s1_910, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev2_910; __rev2_910 = __builtin_shufflevector(__s2_910, __s2_910, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_910 = __noswap_vfmlslq_high_f16(__rev0_910, __rev1_910, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910), __noswap_vgetq_lane_f16(__rev2_910, __p3_910)}); \ - __ret_910 = __builtin_shufflevector(__ret_910, __ret_910, 3, 2, 1, 0); \ - __ret_910; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlsl_laneq_high_f16(__p0_911, __p1_911, __p2_911, __p3_911) __extension__ ({ \ - float32x2_t __ret_911; \ - float32x2_t __s0_911 = __p0_911; \ - float16x4_t __s1_911 = __p1_911; \ - float16x8_t __s2_911 = __p2_911; \ - __ret_911 = vfmlsl_high_f16(__s0_911, __s1_911, (float16x4_t) {vgetq_lane_f16(__s2_911, __p3_911), vgetq_lane_f16(__s2_911, __p3_911), vgetq_lane_f16(__s2_911, __p3_911), vgetq_lane_f16(__s2_911, __p3_911)}); \ - __ret_911; \ -}) -#else -#define vfmlsl_laneq_high_f16(__p0_912, __p1_912, __p2_912, __p3_912) __extension__ ({ \ - float32x2_t __ret_912; \ - float32x2_t __s0_912 = __p0_912; \ - float16x4_t __s1_912 = __p1_912; \ - float16x8_t __s2_912 = __p2_912; \ - float32x2_t __rev0_912; __rev0_912 = __builtin_shufflevector(__s0_912, __s0_912, 1, 0); \ - float16x4_t __rev1_912; __rev1_912 = __builtin_shufflevector(__s1_912, __s1_912, 3, 2, 1, 0); \ - float16x8_t __rev2_912; __rev2_912 = __builtin_shufflevector(__s2_912, __s2_912, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_912 = __noswap_vfmlsl_high_f16(__rev0_912, __rev1_912, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_912, __p3_912), __noswap_vgetq_lane_f16(__rev2_912, __p3_912), 
__noswap_vgetq_lane_f16(__rev2_912, __p3_912), __noswap_vgetq_lane_f16(__rev2_912, __p3_912)}); \ - __ret_912 = __builtin_shufflevector(__ret_912, __ret_912, 1, 0); \ - __ret_912; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlslq_laneq_low_f16(__p0_913, __p1_913, __p2_913, __p3_913) __extension__ ({ \ - float32x4_t __ret_913; \ - float32x4_t __s0_913 = __p0_913; \ - float16x8_t __s1_913 = __p1_913; \ - float16x8_t __s2_913 = __p2_913; \ - __ret_913 = vfmlslq_low_f16(__s0_913, __s1_913, (float16x8_t) {vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913), vgetq_lane_f16(__s2_913, __p3_913)}); \ - __ret_913; \ -}) -#else -#define vfmlslq_laneq_low_f16(__p0_914, __p1_914, __p2_914, __p3_914) __extension__ ({ \ - float32x4_t __ret_914; \ - float32x4_t __s0_914 = __p0_914; \ - float16x8_t __s1_914 = __p1_914; \ - float16x8_t __s2_914 = __p2_914; \ - float32x4_t __rev0_914; __rev0_914 = __builtin_shufflevector(__s0_914, __s0_914, 3, 2, 1, 0); \ - float16x8_t __rev1_914; __rev1_914 = __builtin_shufflevector(__s1_914, __s1_914, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev2_914; __rev2_914 = __builtin_shufflevector(__s2_914, __s2_914, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_914 = __noswap_vfmlslq_low_f16(__rev0_914, __rev1_914, (float16x8_t) {__noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914), __noswap_vgetq_lane_f16(__rev2_914, __p3_914)}); \ - __ret_914 = __builtin_shufflevector(__ret_914, __ret_914, 3, 2, 1, 0); \ - __ret_914; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmlsl_laneq_low_f16(__p0_915, __p1_915, __p2_915, __p3_915) __extension__ ({ \ - float32x2_t __ret_915; \ - float32x2_t __s0_915 = __p0_915; \ - float16x4_t __s1_915 = __p1_915; \ - float16x8_t __s2_915 = __p2_915; \ - __ret_915 = vfmlsl_low_f16(__s0_915, __s1_915, (float16x4_t) {vgetq_lane_f16(__s2_915, __p3_915), vgetq_lane_f16(__s2_915, __p3_915), vgetq_lane_f16(__s2_915, __p3_915), vgetq_lane_f16(__s2_915, __p3_915)}); \ - __ret_915; \ -}) -#else -#define vfmlsl_laneq_low_f16(__p0_916, __p1_916, __p2_916, __p3_916) __extension__ ({ \ - float32x2_t __ret_916; \ - float32x2_t __s0_916 = __p0_916; \ - float16x4_t __s1_916 = __p1_916; \ - float16x8_t __s2_916 = __p2_916; \ - float32x2_t __rev0_916; __rev0_916 = __builtin_shufflevector(__s0_916, __s0_916, 1, 0); \ - float16x4_t __rev1_916; __rev1_916 = __builtin_shufflevector(__s1_916, __s1_916, 3, 2, 1, 0); \ - float16x8_t __rev2_916; __rev2_916 = __builtin_shufflevector(__s2_916, __s2_916, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_916 = __noswap_vfmlsl_low_f16(__rev0_916, __rev1_916, (float16x4_t) {__noswap_vgetq_lane_f16(__rev2_916, __p3_916), __noswap_vgetq_lane_f16(__rev2_916, __p3_916), __noswap_vgetq_lane_f16(__rev2_916, __p3_916), __noswap_vgetq_lane_f16(__rev2_916, __p3_916)}); \ - __ret_916 = __builtin_shufflevector(__ret_916, __ret_916, 1, 0); \ - __ret_916; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vmulh_lane_f16(__p0_917, __p1_917, __p2_917) __extension__ ({ \ - float16_t __ret_917; \ - float16_t __s0_917 = __p0_917; \ - float16x4_t __s1_917 = __p1_917; \ - __ret_917 = __s0_917 * 
vget_lane_f16(__s1_917, __p2_917); \ - __ret_917; \ -}) -#else -#define vmulh_lane_f16(__p0_918, __p1_918, __p2_918) __extension__ ({ \ - float16_t __ret_918; \ - float16_t __s0_918 = __p0_918; \ - float16x4_t __s1_918 = __p1_918; \ - float16x4_t __rev1_918; __rev1_918 = __builtin_shufflevector(__s1_918, __s1_918, 3, 2, 1, 0); \ - __ret_918 = __s0_918 * __noswap_vget_lane_f16(__rev1_918, __p2_918); \ - __ret_918; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vmulh_laneq_f16(__p0_919, __p1_919, __p2_919) __extension__ ({ \ - float16_t __ret_919; \ - float16_t __s0_919 = __p0_919; \ - float16x8_t __s1_919 = __p1_919; \ - __ret_919 = __s0_919 * vgetq_lane_f16(__s1_919, __p2_919); \ - __ret_919; \ -}) -#else -#define vmulh_laneq_f16(__p0_920, __p1_920, __p2_920) __extension__ ({ \ - float16_t __ret_920; \ - float16_t __s0_920 = __p0_920; \ - float16x8_t __s1_920 = __p1_920; \ - float16x8_t __rev1_920; __rev1_920 = __builtin_shufflevector(__s1_920, __s1_920, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_920 = __s0_920 * __noswap_vgetq_lane_f16(__rev1_920, __p2_920); \ - __ret_920; \ -}) -#endif - #ifdef __LITTLE_ENDIAN__ __ai __attribute__((target("neon"))) uint16x8_t vabdl_high_u8(uint8x16_t __p0, uint8x16_t __p1) { uint16x8_t __ret; @@ -68862,136 +70753,136 @@ __ai __attribute__((target("neon"))) int32x4_t vaddw_high_s16(int32x4_t __p0, in #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p64(__p0_921, __p1_921, __p2_921, __p3_921) __extension__ ({ \ - poly64x2_t __ret_921; \ - poly64x2_t __s0_921 = __p0_921; \ - poly64x1_t __s2_921 = __p2_921; \ - __ret_921 = vsetq_lane_p64(vget_lane_p64(__s2_921, __p3_921), __s0_921, __p1_921); \ - __ret_921; \ +#define vcopyq_lane_p64(__p0_893, __p1_893, __p2_893, __p3_893) __extension__ ({ \ + poly64x2_t __ret_893; \ + poly64x2_t __s0_893 = __p0_893; \ + poly64x1_t __s2_893 = __p2_893; \ + __ret_893 = vsetq_lane_p64(vget_lane_p64(__s2_893, __p3_893), __s0_893, __p1_893); \ + __ret_893; \ }) #else -#define vcopyq_lane_p64(__p0_922, __p1_922, __p2_922, __p3_922) __extension__ ({ \ - poly64x2_t __ret_922; \ - poly64x2_t __s0_922 = __p0_922; \ - poly64x1_t __s2_922 = __p2_922; \ - poly64x2_t __rev0_922; __rev0_922 = __builtin_shufflevector(__s0_922, __s0_922, 1, 0); \ - __ret_922 = __noswap_vsetq_lane_p64(vget_lane_p64(__s2_922, __p3_922), __rev0_922, __p1_922); \ - __ret_922 = __builtin_shufflevector(__ret_922, __ret_922, 1, 0); \ - __ret_922; \ +#define vcopyq_lane_p64(__p0_894, __p1_894, __p2_894, __p3_894) __extension__ ({ \ + poly64x2_t __ret_894; \ + poly64x2_t __s0_894 = __p0_894; \ + poly64x1_t __s2_894 = __p2_894; \ + poly64x2_t __rev0_894; __rev0_894 = __builtin_shufflevector(__s0_894, __s0_894, 1, 0); \ + __ret_894 = __noswap_vsetq_lane_p64(vget_lane_p64(__s2_894, __p3_894), __rev0_894, __p1_894); \ + __ret_894 = __builtin_shufflevector(__ret_894, __ret_894, 1, 0); \ + __ret_894; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_f64(__p0_923, __p1_923, __p2_923, __p3_923) __extension__ ({ \ - float64x2_t __ret_923; \ - float64x2_t __s0_923 = __p0_923; \ - float64x1_t __s2_923 = __p2_923; \ - __ret_923 = vsetq_lane_f64(vget_lane_f64(__s2_923, __p3_923), __s0_923, __p1_923); \ - __ret_923; \ +#define vcopyq_lane_f64(__p0_895, __p1_895, __p2_895, __p3_895) __extension__ ({ \ + float64x2_t __ret_895; \ + float64x2_t __s0_895 = __p0_895; \ + float64x1_t __s2_895 = __p2_895; \ + __ret_895 = vsetq_lane_f64(vget_lane_f64(__s2_895, __p3_895), __s0_895, __p1_895); \ + __ret_895; \ }) #else -#define vcopyq_lane_f64(__p0_924, __p1_924, __p2_924, __p3_924) 
__extension__ ({ \ - float64x2_t __ret_924; \ - float64x2_t __s0_924 = __p0_924; \ - float64x1_t __s2_924 = __p2_924; \ - float64x2_t __rev0_924; __rev0_924 = __builtin_shufflevector(__s0_924, __s0_924, 1, 0); \ - __ret_924 = __noswap_vsetq_lane_f64(vget_lane_f64(__s2_924, __p3_924), __rev0_924, __p1_924); \ - __ret_924 = __builtin_shufflevector(__ret_924, __ret_924, 1, 0); \ - __ret_924; \ +#define vcopyq_lane_f64(__p0_896, __p1_896, __p2_896, __p3_896) __extension__ ({ \ + float64x2_t __ret_896; \ + float64x2_t __s0_896 = __p0_896; \ + float64x1_t __s2_896 = __p2_896; \ + float64x2_t __rev0_896; __rev0_896 = __builtin_shufflevector(__s0_896, __s0_896, 1, 0); \ + __ret_896 = __noswap_vsetq_lane_f64(vget_lane_f64(__s2_896, __p3_896), __rev0_896, __p1_896); \ + __ret_896 = __builtin_shufflevector(__ret_896, __ret_896, 1, 0); \ + __ret_896; \ }) #endif -#define vcopy_lane_p64(__p0_925, __p1_925, __p2_925, __p3_925) __extension__ ({ \ - poly64x1_t __ret_925; \ - poly64x1_t __s0_925 = __p0_925; \ - poly64x1_t __s2_925 = __p2_925; \ - __ret_925 = vset_lane_p64(vget_lane_p64(__s2_925, __p3_925), __s0_925, __p1_925); \ - __ret_925; \ +#define vcopy_lane_p64(__p0_897, __p1_897, __p2_897, __p3_897) __extension__ ({ \ + poly64x1_t __ret_897; \ + poly64x1_t __s0_897 = __p0_897; \ + poly64x1_t __s2_897 = __p2_897; \ + __ret_897 = vset_lane_p64(vget_lane_p64(__s2_897, __p3_897), __s0_897, __p1_897); \ + __ret_897; \ }) -#define vcopy_lane_f64(__p0_926, __p1_926, __p2_926, __p3_926) __extension__ ({ \ - float64x1_t __ret_926; \ - float64x1_t __s0_926 = __p0_926; \ - float64x1_t __s2_926 = __p2_926; \ - __ret_926 = vset_lane_f64(vget_lane_f64(__s2_926, __p3_926), __s0_926, __p1_926); \ - __ret_926; \ +#define vcopy_lane_f64(__p0_898, __p1_898, __p2_898, __p3_898) __extension__ ({ \ + float64x1_t __ret_898; \ + float64x1_t __s0_898 = __p0_898; \ + float64x1_t __s2_898 = __p2_898; \ + __ret_898 = vset_lane_f64(vget_lane_f64(__s2_898, __p3_898), __s0_898, __p1_898); \ + __ret_898; \ }) #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p64(__p0_927, __p1_927, __p2_927, __p3_927) __extension__ ({ \ - poly64x2_t __ret_927; \ - poly64x2_t __s0_927 = __p0_927; \ - poly64x2_t __s2_927 = __p2_927; \ - __ret_927 = vsetq_lane_p64(vgetq_lane_p64(__s2_927, __p3_927), __s0_927, __p1_927); \ - __ret_927; \ +#define vcopyq_laneq_p64(__p0_899, __p1_899, __p2_899, __p3_899) __extension__ ({ \ + poly64x2_t __ret_899; \ + poly64x2_t __s0_899 = __p0_899; \ + poly64x2_t __s2_899 = __p2_899; \ + __ret_899 = vsetq_lane_p64(vgetq_lane_p64(__s2_899, __p3_899), __s0_899, __p1_899); \ + __ret_899; \ }) #else -#define vcopyq_laneq_p64(__p0_928, __p1_928, __p2_928, __p3_928) __extension__ ({ \ - poly64x2_t __ret_928; \ - poly64x2_t __s0_928 = __p0_928; \ - poly64x2_t __s2_928 = __p2_928; \ - poly64x2_t __rev0_928; __rev0_928 = __builtin_shufflevector(__s0_928, __s0_928, 1, 0); \ - poly64x2_t __rev2_928; __rev2_928 = __builtin_shufflevector(__s2_928, __s2_928, 1, 0); \ - __ret_928 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_928, __p3_928), __rev0_928, __p1_928); \ - __ret_928 = __builtin_shufflevector(__ret_928, __ret_928, 1, 0); \ - __ret_928; \ +#define vcopyq_laneq_p64(__p0_900, __p1_900, __p2_900, __p3_900) __extension__ ({ \ + poly64x2_t __ret_900; \ + poly64x2_t __s0_900 = __p0_900; \ + poly64x2_t __s2_900 = __p2_900; \ + poly64x2_t __rev0_900; __rev0_900 = __builtin_shufflevector(__s0_900, __s0_900, 1, 0); \ + poly64x2_t __rev2_900; __rev2_900 = __builtin_shufflevector(__s2_900, __s2_900, 1, 0); \ + __ret_900 = 
__noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_900, __p3_900), __rev0_900, __p1_900); \ + __ret_900 = __builtin_shufflevector(__ret_900, __ret_900, 1, 0); \ + __ret_900; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_f64(__p0_929, __p1_929, __p2_929, __p3_929) __extension__ ({ \ - float64x2_t __ret_929; \ - float64x2_t __s0_929 = __p0_929; \ - float64x2_t __s2_929 = __p2_929; \ - __ret_929 = vsetq_lane_f64(vgetq_lane_f64(__s2_929, __p3_929), __s0_929, __p1_929); \ - __ret_929; \ +#define vcopyq_laneq_f64(__p0_901, __p1_901, __p2_901, __p3_901) __extension__ ({ \ + float64x2_t __ret_901; \ + float64x2_t __s0_901 = __p0_901; \ + float64x2_t __s2_901 = __p2_901; \ + __ret_901 = vsetq_lane_f64(vgetq_lane_f64(__s2_901, __p3_901), __s0_901, __p1_901); \ + __ret_901; \ }) #else -#define vcopyq_laneq_f64(__p0_930, __p1_930, __p2_930, __p3_930) __extension__ ({ \ - float64x2_t __ret_930; \ - float64x2_t __s0_930 = __p0_930; \ - float64x2_t __s2_930 = __p2_930; \ - float64x2_t __rev0_930; __rev0_930 = __builtin_shufflevector(__s0_930, __s0_930, 1, 0); \ - float64x2_t __rev2_930; __rev2_930 = __builtin_shufflevector(__s2_930, __s2_930, 1, 0); \ - __ret_930 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_930, __p3_930), __rev0_930, __p1_930); \ - __ret_930 = __builtin_shufflevector(__ret_930, __ret_930, 1, 0); \ - __ret_930; \ +#define vcopyq_laneq_f64(__p0_902, __p1_902, __p2_902, __p3_902) __extension__ ({ \ + float64x2_t __ret_902; \ + float64x2_t __s0_902 = __p0_902; \ + float64x2_t __s2_902 = __p2_902; \ + float64x2_t __rev0_902; __rev0_902 = __builtin_shufflevector(__s0_902, __s0_902, 1, 0); \ + float64x2_t __rev2_902; __rev2_902 = __builtin_shufflevector(__s2_902, __s2_902, 1, 0); \ + __ret_902 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_902, __p3_902), __rev0_902, __p1_902); \ + __ret_902 = __builtin_shufflevector(__ret_902, __ret_902, 1, 0); \ + __ret_902; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p64(__p0_931, __p1_931, __p2_931, __p3_931) __extension__ ({ \ - poly64x1_t __ret_931; \ - poly64x1_t __s0_931 = __p0_931; \ - poly64x2_t __s2_931 = __p2_931; \ - __ret_931 = vset_lane_p64(vgetq_lane_p64(__s2_931, __p3_931), __s0_931, __p1_931); \ - __ret_931; \ +#define vcopy_laneq_p64(__p0_903, __p1_903, __p2_903, __p3_903) __extension__ ({ \ + poly64x1_t __ret_903; \ + poly64x1_t __s0_903 = __p0_903; \ + poly64x2_t __s2_903 = __p2_903; \ + __ret_903 = vset_lane_p64(vgetq_lane_p64(__s2_903, __p3_903), __s0_903, __p1_903); \ + __ret_903; \ }) #else -#define vcopy_laneq_p64(__p0_932, __p1_932, __p2_932, __p3_932) __extension__ ({ \ - poly64x1_t __ret_932; \ - poly64x1_t __s0_932 = __p0_932; \ - poly64x2_t __s2_932 = __p2_932; \ - poly64x2_t __rev2_932; __rev2_932 = __builtin_shufflevector(__s2_932, __s2_932, 1, 0); \ - __ret_932 = vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_932, __p3_932), __s0_932, __p1_932); \ - __ret_932; \ +#define vcopy_laneq_p64(__p0_904, __p1_904, __p2_904, __p3_904) __extension__ ({ \ + poly64x1_t __ret_904; \ + poly64x1_t __s0_904 = __p0_904; \ + poly64x2_t __s2_904 = __p2_904; \ + poly64x2_t __rev2_904; __rev2_904 = __builtin_shufflevector(__s2_904, __s2_904, 1, 0); \ + __ret_904 = vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_904, __p3_904), __s0_904, __p1_904); \ + __ret_904; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_f64(__p0_933, __p1_933, __p2_933, __p3_933) __extension__ ({ \ - float64x1_t __ret_933; \ - float64x1_t __s0_933 = __p0_933; \ - float64x2_t __s2_933 = __p2_933; \ - __ret_933 = 
vset_lane_f64(vgetq_lane_f64(__s2_933, __p3_933), __s0_933, __p1_933); \ - __ret_933; \ +#define vcopy_laneq_f64(__p0_905, __p1_905, __p2_905, __p3_905) __extension__ ({ \ + float64x1_t __ret_905; \ + float64x1_t __s0_905 = __p0_905; \ + float64x2_t __s2_905 = __p2_905; \ + __ret_905 = vset_lane_f64(vgetq_lane_f64(__s2_905, __p3_905), __s0_905, __p1_905); \ + __ret_905; \ }) #else -#define vcopy_laneq_f64(__p0_934, __p1_934, __p2_934, __p3_934) __extension__ ({ \ - float64x1_t __ret_934; \ - float64x1_t __s0_934 = __p0_934; \ - float64x2_t __s2_934 = __p2_934; \ - float64x2_t __rev2_934; __rev2_934 = __builtin_shufflevector(__s2_934, __s2_934, 1, 0); \ - __ret_934 = vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_934, __p3_934), __s0_934, __p1_934); \ - __ret_934; \ +#define vcopy_laneq_f64(__p0_906, __p1_906, __p2_906, __p3_906) __extension__ ({ \ + float64x1_t __ret_906; \ + float64x1_t __s0_906 = __p0_906; \ + float64x2_t __s2_906 = __p2_906; \ + float64x2_t __rev2_906; __rev2_906 = __builtin_shufflevector(__s2_906, __s2_906, 1, 0); \ + __ret_906 = vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_906, __p3_906), __s0_906, __p1_906); \ + __ret_906; \ }) #endif @@ -69347,38 +71238,38 @@ __ai __attribute__((target("neon"))) int32x4_t vmlsl_high_n_s16(int32x4_t __p0, } #endif -#define vmulx_lane_f64(__p0_935, __p1_935, __p2_935) __extension__ ({ \ - float64x1_t __ret_935; \ - float64x1_t __s0_935 = __p0_935; \ - float64x1_t __s1_935 = __p1_935; \ - float64_t __x_935 = vget_lane_f64(__s0_935, 0); \ - float64_t __y_935 = vget_lane_f64(__s1_935, __p2_935); \ - float64_t __z_935 = vmulxd_f64(__x_935, __y_935); \ - __ret_935 = vset_lane_f64(__z_935, __s0_935, __p2_935); \ - __ret_935; \ +#define vmulx_lane_f64(__p0_907, __p1_907, __p2_907) __extension__ ({ \ + float64x1_t __ret_907; \ + float64x1_t __s0_907 = __p0_907; \ + float64x1_t __s1_907 = __p1_907; \ + float64_t __x_907 = vget_lane_f64(__s0_907, 0); \ + float64_t __y_907 = vget_lane_f64(__s1_907, __p2_907); \ + float64_t __z_907 = vmulxd_f64(__x_907, __y_907); \ + __ret_907 = vset_lane_f64(__z_907, __s0_907, __p2_907); \ + __ret_907; \ }) #ifdef __LITTLE_ENDIAN__ -#define vmulx_laneq_f64(__p0_936, __p1_936, __p2_936) __extension__ ({ \ - float64x1_t __ret_936; \ - float64x1_t __s0_936 = __p0_936; \ - float64x2_t __s1_936 = __p1_936; \ - float64_t __x_936 = vget_lane_f64(__s0_936, 0); \ - float64_t __y_936 = vgetq_lane_f64(__s1_936, __p2_936); \ - float64_t __z_936 = vmulxd_f64(__x_936, __y_936); \ - __ret_936 = vset_lane_f64(__z_936, __s0_936, 0); \ - __ret_936; \ +#define vmulx_laneq_f64(__p0_908, __p1_908, __p2_908) __extension__ ({ \ + float64x1_t __ret_908; \ + float64x1_t __s0_908 = __p0_908; \ + float64x2_t __s1_908 = __p1_908; \ + float64_t __x_908 = vget_lane_f64(__s0_908, 0); \ + float64_t __y_908 = vgetq_lane_f64(__s1_908, __p2_908); \ + float64_t __z_908 = vmulxd_f64(__x_908, __y_908); \ + __ret_908 = vset_lane_f64(__z_908, __s0_908, 0); \ + __ret_908; \ }) #else -#define vmulx_laneq_f64(__p0_937, __p1_937, __p2_937) __extension__ ({ \ - float64x1_t __ret_937; \ - float64x1_t __s0_937 = __p0_937; \ - float64x2_t __s1_937 = __p1_937; \ - float64x2_t __rev1_937; __rev1_937 = __builtin_shufflevector(__s1_937, __s1_937, 1, 0); \ - float64_t __x_937 = vget_lane_f64(__s0_937, 0); \ - float64_t __y_937 = __noswap_vgetq_lane_f64(__rev1_937, __p2_937); \ - float64_t __z_937 = vmulxd_f64(__x_937, __y_937); \ - __ret_937 = vset_lane_f64(__z_937, __s0_937, 0); \ - __ret_937; \ +#define vmulx_laneq_f64(__p0_909, __p1_909, __p2_909) __extension__ 
({ \ + float64x1_t __ret_909; \ + float64x1_t __s0_909 = __p0_909; \ + float64x2_t __s1_909 = __p1_909; \ + float64x2_t __rev1_909; __rev1_909 = __builtin_shufflevector(__s1_909, __s1_909, 1, 0); \ + float64_t __x_909 = vget_lane_f64(__s0_909, 0); \ + float64_t __y_909 = __noswap_vgetq_lane_f64(__rev1_909, __p2_909); \ + float64_t __z_909 = vmulxd_f64(__x_909, __y_909); \ + __ret_909 = vset_lane_f64(__z_909, __s0_909, 0); \ + __ret_909; \ }) #endif diff --git a/lib/include/arm_sme.h b/lib/include/arm_sme.h index cbfea38fe4..19f0191ac5 100644 --- a/lib/include/arm_sme.h +++ b/lib/include/arm_sme.h @@ -35,12 +35,6 @@ __ai bool __arm_has_sme(void) __arm_streaming_compatible { return x0 & (1ULL << 63); } -__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible { - uint64_t x0, x1; - __builtin_arm_get_sme_state(&x0, &x1); - return x0 & 1; -} - void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible; void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible; void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible; @@ -48,6 +42,8 @@ void *__arm_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible; __ai __attribute__((target("sme"))) void svundef_za(void) __arm_streaming_compatible __arm_out("za") { } +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme___arm_in_streaming_mode))) +bool __arm_in_streaming_mode(void); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m))) void svaddha_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m))) @@ -604,6 +600,94 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_ void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svuint8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m))) void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2))) +void svadd_za16_f16_vg1x2(uint32_t, svfloat16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4))) +void svadd_za16_f16_vg1x4(uint32_t, svfloat16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2))) +void svsub_za16_f16_vg1x2(uint32_t, svfloat16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4))) +void svsub_za16_f16_vg1x4(uint32_t, svfloat16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2))) +void svadd_za16_vg1x2(uint32_t, svfloat16x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4))) +void svadd_za16_vg1x4(uint32_t, svfloat16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2))) +void svsub_za16_vg1x2(uint32_t, svfloat16x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4))) +void svsub_za16_vg1x4(uint32_t, svfloat16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2))) +void svadd_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4))) +void svadd_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2))) +void svmla_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t); +__ai 
__attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4))) +void svmla_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2))) +void svmla_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4))) +void svmla_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2))) +void svmla_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4))) +void svmla_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2))) +void svmls_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4))) +void svmls_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2))) +void svmls_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4))) +void svmls_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2))) +void svmls_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4))) +void svmls_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m))) +void svmopa_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m))) +void svmops_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2))) +void svsub_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4))) +void svsub_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2))) +void svadd_za16_vg1x2(uint32_t, svbfloat16x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4))) +void svadd_za16_vg1x4(uint32_t, svbfloat16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2))) +void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4))) +void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2))) +void svmla_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4))) +void svmla_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2))) +void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4))) +void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2))) +void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4))) +void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2))) +void svmls_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4))) +void svmls_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2))) +void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4))) +void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m))) +void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m))) +void svmops_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2))) +void svsub_za16_vg1x2(uint32_t, svbfloat16x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4))) +void svsub_za16_vg1x4(uint32_t, svbfloat16x4_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x2))) void svmla_single_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x4))) @@ -660,22 +744,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_f16_m)) void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_f16_m))) void svmops_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2))) -void svadd_za16_f16_vg1x2(uint32_t, svfloat16x2_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4))) -void svadd_za16_f16_vg1x4(uint32_t, svfloat16x4_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2))) -void svsub_za16_f16_vg1x2(uint32_t, svfloat16x2_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4))) -void svsub_za16_f16_vg1x4(uint32_t, svfloat16x4_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2))) -void svadd_za16_vg1x2(uint32_t, svfloat16x2_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4))) -void svadd_za16_vg1x4(uint32_t, svfloat16x4_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2))) -void svsub_za16_vg1x2(uint32_t, svfloat16x2_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4))) -void svsub_za16_vg1x4(uint32_t, svfloat16x4_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m))) void svmopa_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t); 
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m))) @@ -684,6 +752,138 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m)) void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m))) void svmops_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x2_fpm))) +void svdot_single_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x4_fpm))) +void svdot_single_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x2_fpm))) +void svdot_lane_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x4_fpm))) +void svdot_lane_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x2_fpm))) +void svdot_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x4_fpm))) +void svdot_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x1_fpm))) +void svmla_single_za16_mf8_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x2_fpm))) +void svmla_single_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x4_fpm))) +void svmla_single_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x1_fpm))) +void svmla_lane_za16_mf8_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x2_fpm))) +void svmla_lane_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x4_fpm))) +void svmla_lane_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x2_fpm))) +void svmla_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x4_fpm))) +void svmla_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_mf8_m_fpm))) +void svmopa_za16_mf8_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za16_mf8_vg1x2_fpm))) +void svvdot_lane_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x2_fpm))) +void svdot_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x4_fpm))) +void 
svdot_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x2_fpm))) +void svdot_lane_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x4_fpm))) +void svdot_lane_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x2_fpm))) +void svdot_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x4_fpm))) +void svdot_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x1_fpm))) +void svmla_za16_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x2_fpm))) +void svmla_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x4_fpm))) +void svmla_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x1_fpm))) +void svmla_lane_za16_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x2_fpm))) +void svmla_lane_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x4_fpm))) +void svmla_lane_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x2_fpm))) +void svmla_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x4_fpm))) +void svmla_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_mf8_m_fpm))) +void svmopa_za16_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za16_mf8_vg1x2_fpm))) +void svvdot_lane_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x2_fpm))) +void svdot_single_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x4_fpm))) +void svdot_single_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x2_fpm))) +void svdot_lane_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x4_fpm))) +void svdot_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x2_fpm))) +void svdot_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x4_fpm))) +void svdot_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, 
svmfloat8x4_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x1_fpm))) +void svmla_single_za32_mf8_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x2_fpm))) +void svmla_single_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x4_fpm))) +void svmla_single_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x1_fpm))) +void svmla_lane_za32_mf8_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x2_fpm))) +void svmla_lane_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x4_fpm))) +void svmla_lane_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x2_fpm))) +void svmla_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x4_fpm))) +void svmla_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_mf8_m_fpm))) +void svmopa_za32_mf8_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdotb_lane_za32_mf8_vg1x4_fpm))) +void svvdotb_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdott_lane_za32_mf8_vg1x4_fpm))) +void svvdott_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x2_fpm))) +void svdot_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x4_fpm))) +void svdot_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x2_fpm))) +void svdot_lane_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x4_fpm))) +void svdot_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x2_fpm))) +void svdot_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x4_fpm))) +void svdot_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x1_fpm))) +void svmla_za32_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x2_fpm))) +void svmla_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x4_fpm))) +void svmla_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t); 
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x1_fpm))) +void svmla_lane_za32_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x2_fpm))) +void svmla_lane_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x4_fpm))) +void svmla_lane_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x2_fpm))) +void svmla_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x4_fpm))) +void svmla_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_mf8_m_fpm))) +void svmopa_za32_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdotb_lane_za32_mf8_vg1x4_fpm))) +void svvdotb_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdott_lane_za32_mf8_vg1x4_fpm))) +void svvdott_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m))) void svaddha_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m))) @@ -732,6 +932,106 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m void svusmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m))) void svusmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_zt_u8_x4))) +svuint8x4_t svluti4_zt_u8_x4(uint64_t, svuint8x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_zt_s8_x4))) +svint8x4_t svluti4_zt_s8_x4(uint64_t, svuint8x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u8))) +void svwrite_lane_zt_u8(uint64_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u32))) +void svwrite_lane_zt_u32(uint64_t, svuint32_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u64))) +void svwrite_lane_zt_u64(uint64_t, svuint64_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u16))) +void svwrite_lane_zt_u16(uint64_t, svuint16_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_bf16))) +void svwrite_lane_zt_bf16(uint64_t, svbfloat16_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s8))) +void svwrite_lane_zt_s8(uint64_t, svint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f64))) +void svwrite_lane_zt_f64(uint64_t, svfloat64_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f32))) +void svwrite_lane_zt_f32(uint64_t, svfloat32_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f16))) +void svwrite_lane_zt_f16(uint64_t, svfloat16_t, uint64_t); 
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s32))) +void svwrite_lane_zt_s32(uint64_t, svint32_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s64))) +void svwrite_lane_zt_s64(uint64_t, svint64_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s16))) +void svwrite_lane_zt_s16(uint64_t, svint16_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u8))) +void svwrite_zt_u8(uint64_t, svuint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u32))) +void svwrite_zt_u32(uint64_t, svuint32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u64))) +void svwrite_zt_u64(uint64_t, svuint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u16))) +void svwrite_zt_u16(uint64_t, svuint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_bf16))) +void svwrite_zt_bf16(uint64_t, svbfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s8))) +void svwrite_zt_s8(uint64_t, svint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f64))) +void svwrite_zt_f64(uint64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f32))) +void svwrite_zt_f32(uint64_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f16))) +void svwrite_zt_f16(uint64_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s32))) +void svwrite_zt_s32(uint64_t, svint32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s64))) +void svwrite_zt_s64(uint64_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s16))) +void svwrite_zt_s16(uint64_t, svint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u8))) +void svwrite_lane_zt(uint64_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u32))) +void svwrite_lane_zt(uint64_t, svuint32_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u64))) +void svwrite_lane_zt(uint64_t, svuint64_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u16))) +void svwrite_lane_zt(uint64_t, svuint16_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_bf16))) +void svwrite_lane_zt(uint64_t, svbfloat16_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s8))) +void svwrite_lane_zt(uint64_t, svint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f64))) +void svwrite_lane_zt(uint64_t, svfloat64_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f32))) +void svwrite_lane_zt(uint64_t, svfloat32_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f16))) +void svwrite_lane_zt(uint64_t, svfloat16_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s32))) +void svwrite_lane_zt(uint64_t, svint32_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s64))) +void svwrite_lane_zt(uint64_t, svint64_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s16))) +void svwrite_lane_zt(uint64_t, 
svint16_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u8))) +void svwrite_zt(uint64_t, svuint8_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u32))) +void svwrite_zt(uint64_t, svuint32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u64))) +void svwrite_zt(uint64_t, svuint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u16))) +void svwrite_zt(uint64_t, svuint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_bf16))) +void svwrite_zt(uint64_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s8))) +void svwrite_zt(uint64_t, svint8_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f64))) +void svwrite_zt(uint64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f32))) +void svwrite_zt(uint64_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f16))) +void svwrite_zt(uint64_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s32))) +void svwrite_zt(uint64_t, svint32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s64))) +void svwrite_zt(uint64_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s16))) +void svwrite_zt(uint64_t, svint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_u32_vg1x2))) void svadd_write_single_za32_u32_vg1x2(uint32_t, svuint32x2_t, svuint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_s32_vg1x2))) @@ -2138,78 +2438,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_u8_vg1x void svwrite_za8_vg1x4(uint32_t, svuint8x4_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_s8_vg1x4))) void svwrite_za8_vg1x4(uint32_t, svint8x4_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2))) -void svadd_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4))) -void svadd_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2))) -void svmla_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4))) -void svmla_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2))) -void svmla_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4))) -void svmla_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2))) -void svmla_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4))) -void svmla_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2))) -void svmls_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t); -__ai 
__attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4))) -void svmls_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2))) -void svmls_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4))) -void svmls_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2))) -void svmls_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4))) -void svmls_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m))) -void svmopa_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m))) -void svmops_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2))) -void svsub_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4))) -void svsub_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2))) -void svadd_za16_vg1x2(uint32_t, svbfloat16x2_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4))) -void svadd_za16_vg1x4(uint32_t, svbfloat16x4_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2))) -void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4))) -void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2))) -void svmla_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4))) -void svmla_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2))) -void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4))) -void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2))) -void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4))) -void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2))) -void svmls_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4))) -void svmls_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2))) -void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t); -__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4))) -void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m))) -void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m))) -void svmops_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2))) -void svsub_za16_vg1x2(uint32_t, svbfloat16x2_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4))) -void svsub_za16_vg1x4(uint32_t, svbfloat16x4_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x2))) void svadd_za64_f64_vg1x2(uint32_t, svfloat64x2_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x4))) diff --git a/lib/include/arm_sve.h b/lib/include/arm_sve.h index 87691e03ce..35e4644b60 100644 --- a/lib/include/arm_sve.h +++ b/lib/include/arm_sve.h @@ -38,6 +38,8 @@ typedef __SVFloat16_t svfloat16_t; typedef __SVBfloat16_t svbfloat16_t; #include #include +typedef __SVMfloat8_t svmfloat8_t; + typedef __SVFloat32_t svfloat32_t; typedef __SVFloat64_t svfloat64_t; typedef __clang_svint8x2_t svint8x2_t; @@ -80,6 +82,9 @@ typedef __clang_svboolx4_t svboolx4_t; typedef __clang_svbfloat16x2_t svbfloat16x2_t; typedef __clang_svbfloat16x3_t svbfloat16x3_t; typedef __clang_svbfloat16x4_t svbfloat16x4_t; +typedef __clang_svmfloat8x2_t svmfloat8x2_t; +typedef __clang_svmfloat8x3_t svmfloat8x3_t; +typedef __clang_svmfloat8x4_t svmfloat8x4_t; typedef __SVCount_t svcount_t; enum svpattern @@ -128,6 +133,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8)) svint8_t svreinterpret_s8_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8))) svint8_t svreinterpret_s8_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_mf8))) +svint8_t svreinterpret_s8_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16))) svint8_t svreinterpret_s8_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16))) @@ -152,6 +159,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8)) svuint8_t svreinterpret_u8_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8))) svuint8_t svreinterpret_u8_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_mf8))) +svuint8_t svreinterpret_u8_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16))) svuint8_t svreinterpret_u8_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16))) @@ -172,10 +181,38 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32) svuint8_t svreinterpret_u8_f32(svfloat32_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64))) svuint8_t svreinterpret_u8_f64(svfloat64_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s8))) +svmfloat8_t svreinterpret_mf8_s8(svint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u8))) +svmfloat8_t svreinterpret_mf8_u8(svuint8_t op); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_mf8))) +svmfloat8_t svreinterpret_mf8_mf8(svmfloat8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s16))) +svmfloat8_t svreinterpret_mf8_s16(svint16_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u16))) +svmfloat8_t svreinterpret_mf8_u16(svuint16_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s32))) +svmfloat8_t svreinterpret_mf8_s32(svint32_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u32))) +svmfloat8_t svreinterpret_mf8_u32(svuint32_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s64))) +svmfloat8_t svreinterpret_mf8_s64(svint64_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u64))) +svmfloat8_t svreinterpret_mf8_u64(svuint64_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f16))) +svmfloat8_t svreinterpret_mf8_f16(svfloat16_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_bf16))) +svmfloat8_t svreinterpret_mf8_bf16(svbfloat16_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f32))) +svmfloat8_t svreinterpret_mf8_f32(svfloat32_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f64))) +svmfloat8_t svreinterpret_mf8_f64(svfloat64_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8))) svint16_t svreinterpret_s16_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8))) svint16_t svreinterpret_s16_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_mf8))) +svint16_t svreinterpret_s16_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16))) svint16_t svreinterpret_s16_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16))) @@ -200,6 +237,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8) svuint16_t svreinterpret_u16_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8))) svuint16_t svreinterpret_u16_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_mf8))) +svuint16_t svreinterpret_u16_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16))) svuint16_t svreinterpret_u16_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16))) @@ -224,6 +263,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8) svint32_t svreinterpret_s32_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8))) svint32_t svreinterpret_s32_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_mf8))) +svint32_t svreinterpret_s32_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16))) svint32_t svreinterpret_s32_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16))) @@ -248,6 +289,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8) svuint32_t svreinterpret_u32_s8(svint8_t op); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8))) svuint32_t svreinterpret_u32_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_mf8))) +svuint32_t svreinterpret_u32_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16))) svuint32_t svreinterpret_u32_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16))) @@ -272,6 +315,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8) svint64_t svreinterpret_s64_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8))) svint64_t svreinterpret_s64_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_mf8))) +svint64_t svreinterpret_s64_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16))) svint64_t svreinterpret_s64_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16))) @@ -296,6 +341,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8) svuint64_t svreinterpret_u64_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8))) svuint64_t svreinterpret_u64_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_mf8))) +svuint64_t svreinterpret_u64_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16))) svuint64_t svreinterpret_u64_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16))) @@ -320,6 +367,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8) svfloat16_t svreinterpret_f16_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8))) svfloat16_t svreinterpret_f16_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_mf8))) +svfloat16_t svreinterpret_f16_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16))) svfloat16_t svreinterpret_f16_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16))) @@ -344,6 +393,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8 svbfloat16_t svreinterpret_bf16_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8))) svbfloat16_t svreinterpret_bf16_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_mf8))) +svbfloat16_t svreinterpret_bf16_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16))) svbfloat16_t svreinterpret_bf16_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16))) @@ -368,6 +419,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8) svfloat32_t svreinterpret_f32_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8))) svfloat32_t svreinterpret_f32_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_mf8))) +svfloat32_t svreinterpret_f32_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16))) svfloat32_t svreinterpret_f32_s16(svint16_t 
op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16))) @@ -392,6 +445,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8) svfloat64_t svreinterpret_f64_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8))) svfloat64_t svreinterpret_f64_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_mf8))) +svfloat64_t svreinterpret_f64_mf8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16))) svfloat64_t svreinterpret_f64_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16))) @@ -416,6 +471,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8)) svint8_t svreinterpret_s8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8))) svint8_t svreinterpret_s8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_mf8))) +svint8_t svreinterpret_s8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16))) svint8_t svreinterpret_s8(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16))) @@ -440,6 +497,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8)) svuint8_t svreinterpret_u8(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8))) svuint8_t svreinterpret_u8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_mf8))) +svuint8_t svreinterpret_u8(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16))) svuint8_t svreinterpret_u8(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16))) @@ -460,10 +519,38 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32) svuint8_t svreinterpret_u8(svfloat32_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64))) svuint8_t svreinterpret_u8(svfloat64_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s8))) +svmfloat8_t svreinterpret_mf8(svint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u8))) +svmfloat8_t svreinterpret_mf8(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_mf8))) +svmfloat8_t svreinterpret_mf8(svmfloat8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s16))) +svmfloat8_t svreinterpret_mf8(svint16_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u16))) +svmfloat8_t svreinterpret_mf8(svuint16_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s32))) +svmfloat8_t svreinterpret_mf8(svint32_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u32))) +svmfloat8_t svreinterpret_mf8(svuint32_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s64))) +svmfloat8_t svreinterpret_mf8(svint64_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u64))) +svmfloat8_t svreinterpret_mf8(svuint64_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f16))) +svmfloat8_t svreinterpret_mf8(svfloat16_t op); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_bf16))) +svmfloat8_t svreinterpret_mf8(svbfloat16_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f32))) +svmfloat8_t svreinterpret_mf8(svfloat32_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f64))) +svmfloat8_t svreinterpret_mf8(svfloat64_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8))) svint16_t svreinterpret_s16(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8))) svint16_t svreinterpret_s16(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_mf8))) +svint16_t svreinterpret_s16(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16))) svint16_t svreinterpret_s16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16))) @@ -488,6 +575,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8) svuint16_t svreinterpret_u16(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8))) svuint16_t svreinterpret_u16(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_mf8))) +svuint16_t svreinterpret_u16(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16))) svuint16_t svreinterpret_u16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16))) @@ -512,6 +601,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8) svint32_t svreinterpret_s32(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8))) svint32_t svreinterpret_s32(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_mf8))) +svint32_t svreinterpret_s32(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16))) svint32_t svreinterpret_s32(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16))) @@ -536,6 +627,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8) svuint32_t svreinterpret_u32(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8))) svuint32_t svreinterpret_u32(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_mf8))) +svuint32_t svreinterpret_u32(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16))) svuint32_t svreinterpret_u32(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16))) @@ -560,6 +653,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8) svint64_t svreinterpret_s64(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8))) svint64_t svreinterpret_s64(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_mf8))) +svint64_t svreinterpret_s64(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16))) svint64_t svreinterpret_s64(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16))) @@ -584,6 +679,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8) svuint64_t 
svreinterpret_u64(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8))) svuint64_t svreinterpret_u64(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_mf8))) +svuint64_t svreinterpret_u64(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16))) svuint64_t svreinterpret_u64(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16))) @@ -608,6 +705,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8) svfloat16_t svreinterpret_f16(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8))) svfloat16_t svreinterpret_f16(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_mf8))) +svfloat16_t svreinterpret_f16(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16))) svfloat16_t svreinterpret_f16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16))) @@ -632,6 +731,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8 svbfloat16_t svreinterpret_bf16(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8))) svbfloat16_t svreinterpret_bf16(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_mf8))) +svbfloat16_t svreinterpret_bf16(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16))) svbfloat16_t svreinterpret_bf16(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16))) @@ -656,6 +757,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8) svfloat32_t svreinterpret_f32(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8))) svfloat32_t svreinterpret_f32(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_mf8))) +svfloat32_t svreinterpret_f32(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16))) svfloat32_t svreinterpret_f32(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16))) @@ -680,6 +783,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8) svfloat64_t svreinterpret_f64(svint8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8))) svfloat64_t svreinterpret_f64(svuint8_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_mf8))) +svfloat64_t svreinterpret_f64(svmfloat8_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16))) svfloat64_t svreinterpret_f64(svint16_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16))) @@ -704,6 +809,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x svint8x2_t svreinterpret_s8_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x2))) svint8x2_t svreinterpret_s8_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_mf8_x2))) +svint8x2_t svreinterpret_s8_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x2))) svint8x2_t svreinterpret_s8_s16_x2(svint16x2_t 
op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x2))) @@ -728,6 +835,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x svuint8x2_t svreinterpret_u8_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x2))) svuint8x2_t svreinterpret_u8_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_mf8_x2))) +svuint8x2_t svreinterpret_u8_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x2))) svuint8x2_t svreinterpret_u8_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x2))) @@ -748,10 +857,38 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_ svuint8x2_t svreinterpret_u8_f32_x2(svfloat32x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x2))) svuint8x2_t svreinterpret_u8_f64_x2(svfloat64x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s8_x2))) +svmfloat8x2_t svreinterpret_mf8_s8_x2(svint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u8_x2))) +svmfloat8x2_t svreinterpret_mf8_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_mf8_x2))) +svmfloat8x2_t svreinterpret_mf8_mf8_x2(svmfloat8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s16_x2))) +svmfloat8x2_t svreinterpret_mf8_s16_x2(svint16x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u16_x2))) +svmfloat8x2_t svreinterpret_mf8_u16_x2(svuint16x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s32_x2))) +svmfloat8x2_t svreinterpret_mf8_s32_x2(svint32x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u32_x2))) +svmfloat8x2_t svreinterpret_mf8_u32_x2(svuint32x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s64_x2))) +svmfloat8x2_t svreinterpret_mf8_s64_x2(svint64x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u64_x2))) +svmfloat8x2_t svreinterpret_mf8_u64_x2(svuint64x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f16_x2))) +svmfloat8x2_t svreinterpret_mf8_f16_x2(svfloat16x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_bf16_x2))) +svmfloat8x2_t svreinterpret_mf8_bf16_x2(svbfloat16x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f32_x2))) +svmfloat8x2_t svreinterpret_mf8_f32_x2(svfloat32x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f64_x2))) +svmfloat8x2_t svreinterpret_mf8_f64_x2(svfloat64x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x2))) svint16x2_t svreinterpret_s16_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x2))) svint16x2_t svreinterpret_s16_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_mf8_x2))) +svint16x2_t svreinterpret_s16_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x2))) svint16x2_t svreinterpret_s16_s16_x2(svint16x2_t op); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x2))) @@ -776,6 +913,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_ svuint16x2_t svreinterpret_u16_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x2))) svuint16x2_t svreinterpret_u16_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_mf8_x2))) +svuint16x2_t svreinterpret_u16_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x2))) svuint16x2_t svreinterpret_u16_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x2))) @@ -800,6 +939,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_ svint32x2_t svreinterpret_s32_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x2))) svint32x2_t svreinterpret_s32_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_mf8_x2))) +svint32x2_t svreinterpret_s32_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x2))) svint32x2_t svreinterpret_s32_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x2))) @@ -824,6 +965,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_ svuint32x2_t svreinterpret_u32_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x2))) svuint32x2_t svreinterpret_u32_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_mf8_x2))) +svuint32x2_t svreinterpret_u32_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x2))) svuint32x2_t svreinterpret_u32_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x2))) @@ -848,6 +991,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_ svint64x2_t svreinterpret_s64_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x2))) svint64x2_t svreinterpret_s64_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_mf8_x2))) +svint64x2_t svreinterpret_s64_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x2))) svint64x2_t svreinterpret_s64_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x2))) @@ -872,6 +1017,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_ svuint64x2_t svreinterpret_u64_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x2))) svuint64x2_t svreinterpret_u64_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_mf8_x2))) +svuint64x2_t svreinterpret_u64_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x2))) svuint64x2_t svreinterpret_u64_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x2))) @@ -896,6 +1043,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_ svfloat16x2_t 
svreinterpret_f16_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x2))) svfloat16x2_t svreinterpret_f16_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_mf8_x2))) +svfloat16x2_t svreinterpret_f16_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x2))) svfloat16x2_t svreinterpret_f16_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x2))) @@ -920,6 +1069,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8 svbfloat16x2_t svreinterpret_bf16_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x2))) svbfloat16x2_t svreinterpret_bf16_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_mf8_x2))) +svbfloat16x2_t svreinterpret_bf16_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x2))) svbfloat16x2_t svreinterpret_bf16_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x2))) @@ -944,6 +1095,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_ svfloat32x2_t svreinterpret_f32_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x2))) svfloat32x2_t svreinterpret_f32_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_mf8_x2))) +svfloat32x2_t svreinterpret_f32_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x2))) svfloat32x2_t svreinterpret_f32_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x2))) @@ -968,6 +1121,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_ svfloat64x2_t svreinterpret_f64_s8_x2(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x2))) svfloat64x2_t svreinterpret_f64_u8_x2(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_mf8_x2))) +svfloat64x2_t svreinterpret_f64_mf8_x2(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x2))) svfloat64x2_t svreinterpret_f64_s16_x2(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x2))) @@ -992,6 +1147,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x svint8x2_t svreinterpret_s8(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x2))) svint8x2_t svreinterpret_s8(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_mf8_x2))) +svint8x2_t svreinterpret_s8(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x2))) svint8x2_t svreinterpret_s8(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x2))) @@ -1016,6 +1173,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x svuint8x2_t svreinterpret_u8(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x2))) svuint8x2_t svreinterpret_u8(svuint8x2_t op); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_mf8_x2))) +svuint8x2_t svreinterpret_u8(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x2))) svuint8x2_t svreinterpret_u8(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x2))) @@ -1036,10 +1195,38 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_ svuint8x2_t svreinterpret_u8(svfloat32x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x2))) svuint8x2_t svreinterpret_u8(svfloat64x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s8_x2))) +svmfloat8x2_t svreinterpret_mf8(svint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u8_x2))) +svmfloat8x2_t svreinterpret_mf8(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_mf8_x2))) +svmfloat8x2_t svreinterpret_mf8(svmfloat8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s16_x2))) +svmfloat8x2_t svreinterpret_mf8(svint16x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u16_x2))) +svmfloat8x2_t svreinterpret_mf8(svuint16x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s32_x2))) +svmfloat8x2_t svreinterpret_mf8(svint32x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u32_x2))) +svmfloat8x2_t svreinterpret_mf8(svuint32x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s64_x2))) +svmfloat8x2_t svreinterpret_mf8(svint64x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u64_x2))) +svmfloat8x2_t svreinterpret_mf8(svuint64x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f16_x2))) +svmfloat8x2_t svreinterpret_mf8(svfloat16x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_bf16_x2))) +svmfloat8x2_t svreinterpret_mf8(svbfloat16x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f32_x2))) +svmfloat8x2_t svreinterpret_mf8(svfloat32x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f64_x2))) +svmfloat8x2_t svreinterpret_mf8(svfloat64x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x2))) svint16x2_t svreinterpret_s16(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x2))) svint16x2_t svreinterpret_s16(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_mf8_x2))) +svint16x2_t svreinterpret_s16(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x2))) svint16x2_t svreinterpret_s16(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x2))) @@ -1064,6 +1251,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_ svuint16x2_t svreinterpret_u16(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x2))) svuint16x2_t svreinterpret_u16(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_mf8_x2))) +svuint16x2_t svreinterpret_u16(svmfloat8x2_t op); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x2))) svuint16x2_t svreinterpret_u16(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x2))) @@ -1088,6 +1277,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_ svint32x2_t svreinterpret_s32(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x2))) svint32x2_t svreinterpret_s32(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_mf8_x2))) +svint32x2_t svreinterpret_s32(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x2))) svint32x2_t svreinterpret_s32(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x2))) @@ -1112,6 +1303,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_ svuint32x2_t svreinterpret_u32(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x2))) svuint32x2_t svreinterpret_u32(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_mf8_x2))) +svuint32x2_t svreinterpret_u32(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x2))) svuint32x2_t svreinterpret_u32(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x2))) @@ -1136,6 +1329,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_ svint64x2_t svreinterpret_s64(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x2))) svint64x2_t svreinterpret_s64(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_mf8_x2))) +svint64x2_t svreinterpret_s64(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x2))) svint64x2_t svreinterpret_s64(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x2))) @@ -1160,6 +1355,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_ svuint64x2_t svreinterpret_u64(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x2))) svuint64x2_t svreinterpret_u64(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_mf8_x2))) +svuint64x2_t svreinterpret_u64(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x2))) svuint64x2_t svreinterpret_u64(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x2))) @@ -1184,6 +1381,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_ svfloat16x2_t svreinterpret_f16(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x2))) svfloat16x2_t svreinterpret_f16(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_mf8_x2))) +svfloat16x2_t svreinterpret_f16(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x2))) svfloat16x2_t svreinterpret_f16(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x2))) @@ -1208,6 +1407,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8 svbfloat16x2_t 
svreinterpret_bf16(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x2))) svbfloat16x2_t svreinterpret_bf16(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_mf8_x2))) +svbfloat16x2_t svreinterpret_bf16(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x2))) svbfloat16x2_t svreinterpret_bf16(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x2))) @@ -1232,6 +1433,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_ svfloat32x2_t svreinterpret_f32(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x2))) svfloat32x2_t svreinterpret_f32(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_mf8_x2))) +svfloat32x2_t svreinterpret_f32(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x2))) svfloat32x2_t svreinterpret_f32(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x2))) @@ -1256,6 +1459,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_ svfloat64x2_t svreinterpret_f64(svint8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x2))) svfloat64x2_t svreinterpret_f64(svuint8x2_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_mf8_x2))) +svfloat64x2_t svreinterpret_f64(svmfloat8x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x2))) svfloat64x2_t svreinterpret_f64(svint16x2_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x2))) @@ -1280,6 +1485,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x svint8x3_t svreinterpret_s8_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x3))) svint8x3_t svreinterpret_s8_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_mf8_x3))) +svint8x3_t svreinterpret_s8_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x3))) svint8x3_t svreinterpret_s8_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x3))) @@ -1304,6 +1511,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x svuint8x3_t svreinterpret_u8_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x3))) svuint8x3_t svreinterpret_u8_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_mf8_x3))) +svuint8x3_t svreinterpret_u8_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x3))) svuint8x3_t svreinterpret_u8_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x3))) @@ -1324,10 +1533,38 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_ svuint8x3_t svreinterpret_u8_f32_x3(svfloat32x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x3))) svuint8x3_t svreinterpret_u8_f64_x3(svfloat64x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s8_x3))) 
+svmfloat8x3_t svreinterpret_mf8_s8_x3(svint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u8_x3))) +svmfloat8x3_t svreinterpret_mf8_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_mf8_x3))) +svmfloat8x3_t svreinterpret_mf8_mf8_x3(svmfloat8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s16_x3))) +svmfloat8x3_t svreinterpret_mf8_s16_x3(svint16x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u16_x3))) +svmfloat8x3_t svreinterpret_mf8_u16_x3(svuint16x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s32_x3))) +svmfloat8x3_t svreinterpret_mf8_s32_x3(svint32x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u32_x3))) +svmfloat8x3_t svreinterpret_mf8_u32_x3(svuint32x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s64_x3))) +svmfloat8x3_t svreinterpret_mf8_s64_x3(svint64x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u64_x3))) +svmfloat8x3_t svreinterpret_mf8_u64_x3(svuint64x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f16_x3))) +svmfloat8x3_t svreinterpret_mf8_f16_x3(svfloat16x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_bf16_x3))) +svmfloat8x3_t svreinterpret_mf8_bf16_x3(svbfloat16x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f32_x3))) +svmfloat8x3_t svreinterpret_mf8_f32_x3(svfloat32x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f64_x3))) +svmfloat8x3_t svreinterpret_mf8_f64_x3(svfloat64x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x3))) svint16x3_t svreinterpret_s16_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x3))) svint16x3_t svreinterpret_s16_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_mf8_x3))) +svint16x3_t svreinterpret_s16_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x3))) svint16x3_t svreinterpret_s16_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x3))) @@ -1352,6 +1589,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_ svuint16x3_t svreinterpret_u16_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x3))) svuint16x3_t svreinterpret_u16_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_mf8_x3))) +svuint16x3_t svreinterpret_u16_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x3))) svuint16x3_t svreinterpret_u16_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x3))) @@ -1376,6 +1615,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_ svint32x3_t svreinterpret_s32_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x3))) svint32x3_t svreinterpret_s32_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_mf8_x3))) +svint32x3_t 
svreinterpret_s32_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x3))) svint32x3_t svreinterpret_s32_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x3))) @@ -1400,6 +1641,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_ svuint32x3_t svreinterpret_u32_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x3))) svuint32x3_t svreinterpret_u32_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_mf8_x3))) +svuint32x3_t svreinterpret_u32_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x3))) svuint32x3_t svreinterpret_u32_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x3))) @@ -1424,6 +1667,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_ svint64x3_t svreinterpret_s64_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x3))) svint64x3_t svreinterpret_s64_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_mf8_x3))) +svint64x3_t svreinterpret_s64_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x3))) svint64x3_t svreinterpret_s64_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x3))) @@ -1448,6 +1693,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_ svuint64x3_t svreinterpret_u64_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x3))) svuint64x3_t svreinterpret_u64_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_mf8_x3))) +svuint64x3_t svreinterpret_u64_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x3))) svuint64x3_t svreinterpret_u64_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x3))) @@ -1472,6 +1719,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_ svfloat16x3_t svreinterpret_f16_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x3))) svfloat16x3_t svreinterpret_f16_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_mf8_x3))) +svfloat16x3_t svreinterpret_f16_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x3))) svfloat16x3_t svreinterpret_f16_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x3))) @@ -1496,6 +1745,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8 svbfloat16x3_t svreinterpret_bf16_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x3))) svbfloat16x3_t svreinterpret_bf16_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_mf8_x3))) +svbfloat16x3_t svreinterpret_bf16_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x3))) svbfloat16x3_t svreinterpret_bf16_s16_x3(svint16x3_t op); 
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x3))) @@ -1520,6 +1771,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_ svfloat32x3_t svreinterpret_f32_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x3))) svfloat32x3_t svreinterpret_f32_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_mf8_x3))) +svfloat32x3_t svreinterpret_f32_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x3))) svfloat32x3_t svreinterpret_f32_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x3))) @@ -1544,6 +1797,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_ svfloat64x3_t svreinterpret_f64_s8_x3(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x3))) svfloat64x3_t svreinterpret_f64_u8_x3(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_mf8_x3))) +svfloat64x3_t svreinterpret_f64_mf8_x3(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x3))) svfloat64x3_t svreinterpret_f64_s16_x3(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x3))) @@ -1568,6 +1823,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x svint8x3_t svreinterpret_s8(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x3))) svint8x3_t svreinterpret_s8(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_mf8_x3))) +svint8x3_t svreinterpret_s8(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x3))) svint8x3_t svreinterpret_s8(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x3))) @@ -1592,6 +1849,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x svuint8x3_t svreinterpret_u8(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x3))) svuint8x3_t svreinterpret_u8(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_mf8_x3))) +svuint8x3_t svreinterpret_u8(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x3))) svuint8x3_t svreinterpret_u8(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x3))) @@ -1612,10 +1871,38 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_ svuint8x3_t svreinterpret_u8(svfloat32x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x3))) svuint8x3_t svreinterpret_u8(svfloat64x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s8_x3))) +svmfloat8x3_t svreinterpret_mf8(svint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u8_x3))) +svmfloat8x3_t svreinterpret_mf8(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_mf8_x3))) +svmfloat8x3_t svreinterpret_mf8(svmfloat8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s16_x3))) +svmfloat8x3_t svreinterpret_mf8(svint16x3_t op); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u16_x3))) +svmfloat8x3_t svreinterpret_mf8(svuint16x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s32_x3))) +svmfloat8x3_t svreinterpret_mf8(svint32x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u32_x3))) +svmfloat8x3_t svreinterpret_mf8(svuint32x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s64_x3))) +svmfloat8x3_t svreinterpret_mf8(svint64x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u64_x3))) +svmfloat8x3_t svreinterpret_mf8(svuint64x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f16_x3))) +svmfloat8x3_t svreinterpret_mf8(svfloat16x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_bf16_x3))) +svmfloat8x3_t svreinterpret_mf8(svbfloat16x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f32_x3))) +svmfloat8x3_t svreinterpret_mf8(svfloat32x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f64_x3))) +svmfloat8x3_t svreinterpret_mf8(svfloat64x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x3))) svint16x3_t svreinterpret_s16(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x3))) svint16x3_t svreinterpret_s16(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_mf8_x3))) +svint16x3_t svreinterpret_s16(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x3))) svint16x3_t svreinterpret_s16(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x3))) @@ -1640,6 +1927,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_ svuint16x3_t svreinterpret_u16(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x3))) svuint16x3_t svreinterpret_u16(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_mf8_x3))) +svuint16x3_t svreinterpret_u16(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x3))) svuint16x3_t svreinterpret_u16(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x3))) @@ -1664,6 +1953,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_ svint32x3_t svreinterpret_s32(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x3))) svint32x3_t svreinterpret_s32(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_mf8_x3))) +svint32x3_t svreinterpret_s32(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x3))) svint32x3_t svreinterpret_s32(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x3))) @@ -1688,6 +1979,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_ svuint32x3_t svreinterpret_u32(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x3))) svuint32x3_t svreinterpret_u32(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_mf8_x3))) +svuint32x3_t 
svreinterpret_u32(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x3))) svuint32x3_t svreinterpret_u32(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x3))) @@ -1712,6 +2005,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_ svint64x3_t svreinterpret_s64(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x3))) svint64x3_t svreinterpret_s64(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_mf8_x3))) +svint64x3_t svreinterpret_s64(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x3))) svint64x3_t svreinterpret_s64(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x3))) @@ -1736,6 +2031,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_ svuint64x3_t svreinterpret_u64(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x3))) svuint64x3_t svreinterpret_u64(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_mf8_x3))) +svuint64x3_t svreinterpret_u64(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x3))) svuint64x3_t svreinterpret_u64(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x3))) @@ -1760,6 +2057,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_ svfloat16x3_t svreinterpret_f16(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x3))) svfloat16x3_t svreinterpret_f16(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_mf8_x3))) +svfloat16x3_t svreinterpret_f16(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x3))) svfloat16x3_t svreinterpret_f16(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x3))) @@ -1784,6 +2083,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8 svbfloat16x3_t svreinterpret_bf16(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x3))) svbfloat16x3_t svreinterpret_bf16(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_mf8_x3))) +svbfloat16x3_t svreinterpret_bf16(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x3))) svbfloat16x3_t svreinterpret_bf16(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x3))) @@ -1808,6 +2109,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_ svfloat32x3_t svreinterpret_f32(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x3))) svfloat32x3_t svreinterpret_f32(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_mf8_x3))) +svfloat32x3_t svreinterpret_f32(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x3))) svfloat32x3_t svreinterpret_f32(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x3))) @@ -1832,6 +2135,8 @@ __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_ svfloat64x3_t svreinterpret_f64(svint8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x3))) svfloat64x3_t svreinterpret_f64(svuint8x3_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_mf8_x3))) +svfloat64x3_t svreinterpret_f64(svmfloat8x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x3))) svfloat64x3_t svreinterpret_f64(svint16x3_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x3))) @@ -1856,6 +2161,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x svint8x4_t svreinterpret_s8_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x4))) svint8x4_t svreinterpret_s8_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_mf8_x4))) +svint8x4_t svreinterpret_s8_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x4))) svint8x4_t svreinterpret_s8_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x4))) @@ -1880,6 +2187,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x svuint8x4_t svreinterpret_u8_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x4))) svuint8x4_t svreinterpret_u8_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_mf8_x4))) +svuint8x4_t svreinterpret_u8_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x4))) svuint8x4_t svreinterpret_u8_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x4))) @@ -1900,10 +2209,38 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_ svuint8x4_t svreinterpret_u8_f32_x4(svfloat32x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x4))) svuint8x4_t svreinterpret_u8_f64_x4(svfloat64x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s8_x4))) +svmfloat8x4_t svreinterpret_mf8_s8_x4(svint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u8_x4))) +svmfloat8x4_t svreinterpret_mf8_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_mf8_x4))) +svmfloat8x4_t svreinterpret_mf8_mf8_x4(svmfloat8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s16_x4))) +svmfloat8x4_t svreinterpret_mf8_s16_x4(svint16x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u16_x4))) +svmfloat8x4_t svreinterpret_mf8_u16_x4(svuint16x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s32_x4))) +svmfloat8x4_t svreinterpret_mf8_s32_x4(svint32x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u32_x4))) +svmfloat8x4_t svreinterpret_mf8_u32_x4(svuint32x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s64_x4))) +svmfloat8x4_t svreinterpret_mf8_s64_x4(svint64x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u64_x4))) +svmfloat8x4_t svreinterpret_mf8_u64_x4(svuint64x4_t op); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f16_x4))) +svmfloat8x4_t svreinterpret_mf8_f16_x4(svfloat16x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_bf16_x4))) +svmfloat8x4_t svreinterpret_mf8_bf16_x4(svbfloat16x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f32_x4))) +svmfloat8x4_t svreinterpret_mf8_f32_x4(svfloat32x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f64_x4))) +svmfloat8x4_t svreinterpret_mf8_f64_x4(svfloat64x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x4))) svint16x4_t svreinterpret_s16_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x4))) svint16x4_t svreinterpret_s16_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_mf8_x4))) +svint16x4_t svreinterpret_s16_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x4))) svint16x4_t svreinterpret_s16_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x4))) @@ -1928,6 +2265,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_ svuint16x4_t svreinterpret_u16_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x4))) svuint16x4_t svreinterpret_u16_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_mf8_x4))) +svuint16x4_t svreinterpret_u16_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x4))) svuint16x4_t svreinterpret_u16_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x4))) @@ -1952,6 +2291,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_ svint32x4_t svreinterpret_s32_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x4))) svint32x4_t svreinterpret_s32_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_mf8_x4))) +svint32x4_t svreinterpret_s32_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x4))) svint32x4_t svreinterpret_s32_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x4))) @@ -1976,6 +2317,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_ svuint32x4_t svreinterpret_u32_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x4))) svuint32x4_t svreinterpret_u32_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_mf8_x4))) +svuint32x4_t svreinterpret_u32_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x4))) svuint32x4_t svreinterpret_u32_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x4))) @@ -2000,6 +2343,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_ svint64x4_t svreinterpret_s64_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x4))) svint64x4_t svreinterpret_s64_u8_x4(svuint8x4_t op); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_mf8_x4))) +svint64x4_t svreinterpret_s64_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x4))) svint64x4_t svreinterpret_s64_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x4))) @@ -2024,6 +2369,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_ svuint64x4_t svreinterpret_u64_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x4))) svuint64x4_t svreinterpret_u64_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_mf8_x4))) +svuint64x4_t svreinterpret_u64_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x4))) svuint64x4_t svreinterpret_u64_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x4))) @@ -2048,6 +2395,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_ svfloat16x4_t svreinterpret_f16_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x4))) svfloat16x4_t svreinterpret_f16_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_mf8_x4))) +svfloat16x4_t svreinterpret_f16_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x4))) svfloat16x4_t svreinterpret_f16_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x4))) @@ -2072,6 +2421,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8 svbfloat16x4_t svreinterpret_bf16_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x4))) svbfloat16x4_t svreinterpret_bf16_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_mf8_x4))) +svbfloat16x4_t svreinterpret_bf16_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x4))) svbfloat16x4_t svreinterpret_bf16_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x4))) @@ -2096,6 +2447,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_ svfloat32x4_t svreinterpret_f32_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x4))) svfloat32x4_t svreinterpret_f32_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_mf8_x4))) +svfloat32x4_t svreinterpret_f32_mf8_x4(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x4))) svfloat32x4_t svreinterpret_f32_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x4))) @@ -2120,6 +2473,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_ svfloat64x4_t svreinterpret_f64_s8_x4(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x4))) svfloat64x4_t svreinterpret_f64_u8_x4(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_mf8_x4))) +svfloat64x4_t svreinterpret_f64_mf8_x4(svmfloat8x4_t op); __aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x4))) svfloat64x4_t svreinterpret_f64_s16_x4(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x4))) @@ -2144,6 +2499,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s8_x svint8x4_t svreinterpret_s8(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u8_x4))) svint8x4_t svreinterpret_s8(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_mf8_x4))) +svint8x4_t svreinterpret_s8(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_s16_x4))) svint8x4_t svreinterpret_s8(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s8_u16_x4))) @@ -2168,6 +2525,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s8_x svuint8x4_t svreinterpret_u8(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u8_x4))) svuint8x4_t svreinterpret_u8(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_mf8_x4))) +svuint8x4_t svreinterpret_u8(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_s16_x4))) svuint8x4_t svreinterpret_u8(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_u16_x4))) @@ -2188,10 +2547,38 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f32_ svuint8x4_t svreinterpret_u8(svfloat32x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u8_f64_x4))) svuint8x4_t svreinterpret_u8(svfloat64x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s8_x4))) +svmfloat8x4_t svreinterpret_mf8(svint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u8_x4))) +svmfloat8x4_t svreinterpret_mf8(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_mf8_x4))) +svmfloat8x4_t svreinterpret_mf8(svmfloat8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s16_x4))) +svmfloat8x4_t svreinterpret_mf8(svint16x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u16_x4))) +svmfloat8x4_t svreinterpret_mf8(svuint16x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s32_x4))) +svmfloat8x4_t svreinterpret_mf8(svint32x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u32_x4))) +svmfloat8x4_t svreinterpret_mf8(svuint32x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_s64_x4))) +svmfloat8x4_t svreinterpret_mf8(svint64x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_u64_x4))) +svmfloat8x4_t svreinterpret_mf8(svuint64x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f16_x4))) +svmfloat8x4_t svreinterpret_mf8(svfloat16x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_bf16_x4))) +svmfloat8x4_t svreinterpret_mf8(svbfloat16x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f32_x4))) +svmfloat8x4_t svreinterpret_mf8(svfloat32x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_mf8_f64_x4))) +svmfloat8x4_t 
svreinterpret_mf8(svfloat64x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s8_x4))) svint16x4_t svreinterpret_s16(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u8_x4))) svint16x4_t svreinterpret_s16(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_mf8_x4))) +svint16x4_t svreinterpret_s16(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_s16_x4))) svint16x4_t svreinterpret_s16(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s16_u16_x4))) @@ -2216,6 +2603,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s8_ svuint16x4_t svreinterpret_u16(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u8_x4))) svuint16x4_t svreinterpret_u16(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_mf8_x4))) +svuint16x4_t svreinterpret_u16(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_s16_x4))) svuint16x4_t svreinterpret_u16(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u16_u16_x4))) @@ -2240,6 +2629,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s8_ svint32x4_t svreinterpret_s32(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u8_x4))) svint32x4_t svreinterpret_s32(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_mf8_x4))) +svint32x4_t svreinterpret_s32(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_s16_x4))) svint32x4_t svreinterpret_s32(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s32_u16_x4))) @@ -2264,6 +2655,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s8_ svuint32x4_t svreinterpret_u32(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u8_x4))) svuint32x4_t svreinterpret_u32(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_mf8_x4))) +svuint32x4_t svreinterpret_u32(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_s16_x4))) svuint32x4_t svreinterpret_u32(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u32_u16_x4))) @@ -2288,6 +2681,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s8_ svint64x4_t svreinterpret_s64(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u8_x4))) svint64x4_t svreinterpret_s64(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_mf8_x4))) +svint64x4_t svreinterpret_s64(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_s16_x4))) svint64x4_t svreinterpret_s64(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_s64_u16_x4))) @@ -2312,6 +2707,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s8_ svuint64x4_t svreinterpret_u64(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u8_x4))) svuint64x4_t svreinterpret_u64(svuint8x4_t op); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_mf8_x4))) +svuint64x4_t svreinterpret_u64(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_s16_x4))) svuint64x4_t svreinterpret_u64(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_u64_u16_x4))) @@ -2336,6 +2733,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s8_ svfloat16x4_t svreinterpret_f16(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u8_x4))) svfloat16x4_t svreinterpret_f16(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_mf8_x4))) +svfloat16x4_t svreinterpret_f16(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_s16_x4))) svfloat16x4_t svreinterpret_f16(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f16_u16_x4))) @@ -2360,6 +2759,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s8 svbfloat16x4_t svreinterpret_bf16(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u8_x4))) svbfloat16x4_t svreinterpret_bf16(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_mf8_x4))) +svbfloat16x4_t svreinterpret_bf16(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_s16_x4))) svbfloat16x4_t svreinterpret_bf16(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_bf16_u16_x4))) @@ -2384,6 +2785,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s8_ svfloat32x4_t svreinterpret_f32(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u8_x4))) svfloat32x4_t svreinterpret_f32(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_mf8_x4))) +svfloat32x4_t svreinterpret_f32(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_s16_x4))) svfloat32x4_t svreinterpret_f32(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f32_u16_x4))) @@ -2408,6 +2811,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s8_ svfloat64x4_t svreinterpret_f64(svint8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u8_x4))) svfloat64x4_t svreinterpret_f64(svuint8x4_t op); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_mf8_x4))) +svfloat64x4_t svreinterpret_f64(svmfloat8x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_s16_x4))) svfloat64x4_t svreinterpret_f64(svint16x4_t op); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_reinterpret_f64_u16_x4))) @@ -3956,6 +4361,150 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s64_x4))) svint64x4_t svzipq(svint64x4_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq_s16_x4))) svint16x4_t svzipq(svint16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_x2))) +svfloat64x2_t svamax_f64_x2(svfloat64x2_t, svfloat64x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_x2))) +svfloat32x2_t svamax_f32_x2(svfloat32x2_t, svfloat32x2_t); +__ai 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_x2))) +svfloat16x2_t svamax_f16_x2(svfloat16x2_t, svfloat16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_x4))) +svfloat64x4_t svamax_f64_x4(svfloat64x4_t, svfloat64x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_x4))) +svfloat32x4_t svamax_f32_x4(svfloat32x4_t, svfloat32x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_x4))) +svfloat16x4_t svamax_f16_x4(svfloat16x4_t, svfloat16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_x2))) +svfloat64x2_t svamin_f64_x2(svfloat64x2_t, svfloat64x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_x2))) +svfloat32x2_t svamin_f32_x2(svfloat32x2_t, svfloat32x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_x2))) +svfloat16x2_t svamin_f16_x2(svfloat16x2_t, svfloat16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_x4))) +svfloat64x4_t svamin_f64_x4(svfloat64x4_t, svfloat64x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_x4))) +svfloat32x4_t svamin_f32_x4(svfloat32x4_t, svfloat32x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_x4))) +svfloat16x4_t svamin_f16_x4(svfloat16x4_t, svfloat16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_x2))) +svfloat64x2_t svamax(svfloat64x2_t, svfloat64x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_x2))) +svfloat32x2_t svamax(svfloat32x2_t, svfloat32x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_x2))) +svfloat16x2_t svamax(svfloat16x2_t, svfloat16x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_x4))) +svfloat64x4_t svamax(svfloat64x4_t, svfloat64x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_x4))) +svfloat32x4_t svamax(svfloat32x4_t, svfloat32x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_x4))) +svfloat16x4_t svamax(svfloat16x4_t, svfloat16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_x2))) +svfloat64x2_t svamin(svfloat64x2_t, svfloat64x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_x2))) +svfloat32x2_t svamin(svfloat32x2_t, svfloat32x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_x2))) +svfloat16x2_t svamin(svfloat16x2_t, svfloat16x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_x4))) +svfloat64x4_t svamin(svfloat64x4_t, svfloat64x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_x4))) +svfloat32x4_t svamin(svfloat32x4_t, svfloat32x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_x4))) +svfloat16x4_t svamin(svfloat16x4_t, svfloat16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt1_bf16_mf8_x2_fpm))) +svbfloat16x2_t svcvt1_bf16_mf8_x2_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt1_f16_mf8_x2_fpm))) +svfloat16x2_t svcvt1_f16_mf8_x2_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt2_bf16_mf8_x2_fpm))) +svbfloat16x2_t svcvt2_bf16_mf8_x2_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt2_f16_mf8_x2_fpm))) +svfloat16x2_t svcvt2_f16_mf8_x2_fpm(svmfloat8_t, fpm_t); +__ai 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_mf8_bf16_x2_fpm))) +svmfloat8_t svcvt_mf8_bf16_x2_fpm(svbfloat16x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_mf8_f16_x2_fpm))) +svmfloat8_t svcvt_mf8_f16_x2_fpm(svfloat16x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_mf8_f32_x4_fpm))) +svmfloat8_t svcvt_mf8_f32_x4_fpm(svfloat32x4_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl1_bf16_mf8_x2_fpm))) +svbfloat16x2_t svcvtl1_bf16_mf8_x2_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl1_f16_mf8_x2_fpm))) +svfloat16x2_t svcvtl1_f16_mf8_x2_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl2_bf16_mf8_x2_fpm))) +svbfloat16x2_t svcvtl2_bf16_mf8_x2_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl2_f16_mf8_x2_fpm))) +svfloat16x2_t svcvtl2_f16_mf8_x2_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_mf8_f32_x4_fpm))) +svmfloat8_t svcvtn_mf8_f32_x4_fpm(svfloat32x4_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f64_x2))) +svfloat64x2_t svscale_single_f64_x2(svfloat64x2_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f32_x2))) +svfloat32x2_t svscale_single_f32_x2(svfloat32x2_t, svint32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f16_x2))) +svfloat16x2_t svscale_single_f16_x2(svfloat16x2_t, svint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f64_x4))) +svfloat64x4_t svscale_single_f64_x4(svfloat64x4_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f32_x4))) +svfloat32x4_t svscale_single_f32_x4(svfloat32x4_t, svint32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f16_x4))) +svfloat16x4_t svscale_single_f16_x4(svfloat16x4_t, svint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_x2))) +svfloat64x2_t svscale_f64_x2(svfloat64x2_t, svint64x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_x2))) +svfloat32x2_t svscale_f32_x2(svfloat32x2_t, svint32x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_x2))) +svfloat16x2_t svscale_f16_x2(svfloat16x2_t, svint16x2_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_x4))) +svfloat64x4_t svscale_f64_x4(svfloat64x4_t, svint64x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_x4))) +svfloat32x4_t svscale_f32_x4(svfloat32x4_t, svint32x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_x4))) +svfloat16x4_t svscale_f16_x4(svfloat16x4_t, svint16x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt1_bf16_mf8_x2_fpm))) +svbfloat16x2_t svcvt1_bf16_x2_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt1_f16_mf8_x2_fpm))) +svfloat16x2_t svcvt1_f16_x2_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt2_bf16_mf8_x2_fpm))) +svbfloat16x2_t svcvt2_bf16_x2_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt2_f16_mf8_x2_fpm))) +svfloat16x2_t svcvt2_f16_x2_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_mf8_bf16_x2_fpm))) +svmfloat8_t 
svcvt_mf8_fpm(svbfloat16x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_mf8_f16_x2_fpm))) +svmfloat8_t svcvt_mf8_fpm(svfloat16x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt_mf8_f32_x4_fpm))) +svmfloat8_t svcvt_mf8_fpm(svfloat32x4_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl1_bf16_mf8_x2_fpm))) +svbfloat16x2_t svcvtl1_bf16_x2_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl1_f16_mf8_x2_fpm))) +svfloat16x2_t svcvtl1_f16_x2_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl2_bf16_mf8_x2_fpm))) +svbfloat16x2_t svcvtl2_bf16_x2_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtl2_f16_mf8_x2_fpm))) +svfloat16x2_t svcvtl2_f16_x2_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_mf8_f32_x4_fpm))) +svmfloat8_t svcvtn_mf8_fpm(svfloat32x4_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f64_x2))) +svfloat64x2_t svscale(svfloat64x2_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f32_x2))) +svfloat32x2_t svscale(svfloat32x2_t, svint32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f16_x2))) +svfloat16x2_t svscale(svfloat16x2_t, svint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f64_x4))) +svfloat64x4_t svscale(svfloat64x4_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f32_x4))) +svfloat32x4_t svscale(svfloat32x4_t, svint32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_single_f16_x4))) +svfloat16x4_t svscale(svfloat16x4_t, svint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_x2))) +svfloat64x2_t svscale(svfloat64x2_t, svint64x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_x2))) +svfloat32x2_t svscale(svfloat32x2_t, svint32x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_x2))) +svfloat16x2_t svscale(svfloat16x2_t, svint16x2_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f64_x4))) +svfloat64x4_t svscale(svfloat64x4_t, svint64x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f32_x4))) +svfloat32x4_t svscale(svfloat32x4_t, svint32x4_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svscale_f16_x4))) +svfloat16x4_t svscale(svfloat16x4_t, svint16x4_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_bf16_x2))) svbfloat16x2_t svclamp_single_bf16_x2(svbfloat16x2_t, svbfloat16_t, svbfloat16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svclamp_single_bf16_x4))) @@ -4028,6 +4577,12 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_x2))) svbfloat16x2_t svminnm(svbfloat16x2_t, svbfloat16x2_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svminnm_bf16_x4))) svbfloat16x4_t svminnm(svbfloat16x4_t, svbfloat16x4_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f64))) +float64_t svadda_f64(svbool_t, float64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f32))) +float32_t svadda_f32(svbool_t, float32_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f16))) +float16_t 
svadda_f16(svbool_t, float16_t, svfloat16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u32base_u32offset))) svuint32_t svadrb_u32base_u32offset(svuint32_t, svuint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u64base_u64offset))) @@ -5280,6 +5835,12 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f16))) svfloat16_t svtssel_f16(svfloat16_t, svuint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwrffr))) void svwrffr(svbool_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f64))) +float64_t svadda(svbool_t, float64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f32))) +float32_t svadda(svbool_t, float32_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f16))) +float16_t svadda(svbool_t, float16_t, svfloat16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u32base_u32offset))) svuint32_t svadrb_offset(svuint32_t, svuint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadrb_u64base_u64offset))) @@ -7948,6 +8509,406 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u6 void svstnt1w_scatter_offset(svbool_t, int32_t *, svuint64_t, svint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svstnt1w_scatter_u64offset_u64))) void svstnt1w_scatter_offset(svbool_t, uint32_t *, svuint64_t, svuint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_bf16))) +svbfloat16_t svtbl2_bf16(svbfloat16x2_t, svuint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_bf16))) +svbfloat16_t svtbx_bf16(svbfloat16_t, svbfloat16_t, svuint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16))) +svbool_t svwhilerw_bf16(bfloat16_t const *, bfloat16_t const *); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16))) +svbool_t svwhilewr_bf16(bfloat16_t const *, bfloat16_t const *); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_bf16))) +svbfloat16_t svtbl2(svbfloat16x2_t, svuint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_bf16))) +svbfloat16_t svtbx(svbfloat16_t, svbfloat16_t, svuint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16))) +svbool_t svwhilerw(bfloat16_t const *, bfloat16_t const *); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16))) +svbool_t svwhilewr(bfloat16_t const *, bfloat16_t const *); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f64_m))) +svfloat64_t svamax_n_f64_m(svbool_t, svfloat64_t, float64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f32_m))) +svfloat32_t svamax_n_f32_m(svbool_t, svfloat32_t, float32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f16_m))) +svfloat16_t svamax_n_f16_m(svbool_t, svfloat16_t, float16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f64_x))) +svfloat64_t svamax_n_f64_x(svbool_t, svfloat64_t, float64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f32_x))) +svfloat32_t svamax_n_f32_x(svbool_t, svfloat32_t, float32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f16_x))) +svfloat16_t svamax_n_f16_x(svbool_t, svfloat16_t, float16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f64_z))) +svfloat64_t 
svamax_n_f64_z(svbool_t, svfloat64_t, float64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f32_z))) +svfloat32_t svamax_n_f32_z(svbool_t, svfloat32_t, float32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f16_z))) +svfloat16_t svamax_n_f16_z(svbool_t, svfloat16_t, float16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_m))) +svfloat64_t svamax_f64_m(svbool_t, svfloat64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_m))) +svfloat32_t svamax_f32_m(svbool_t, svfloat32_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_m))) +svfloat16_t svamax_f16_m(svbool_t, svfloat16_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_x))) +svfloat64_t svamax_f64_x(svbool_t, svfloat64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_x))) +svfloat32_t svamax_f32_x(svbool_t, svfloat32_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_x))) +svfloat16_t svamax_f16_x(svbool_t, svfloat16_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_z))) +svfloat64_t svamax_f64_z(svbool_t, svfloat64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_z))) +svfloat32_t svamax_f32_z(svbool_t, svfloat32_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_z))) +svfloat16_t svamax_f16_z(svbool_t, svfloat16_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f64_m))) +svfloat64_t svamin_n_f64_m(svbool_t, svfloat64_t, float64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f32_m))) +svfloat32_t svamin_n_f32_m(svbool_t, svfloat32_t, float32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f16_m))) +svfloat16_t svamin_n_f16_m(svbool_t, svfloat16_t, float16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f64_x))) +svfloat64_t svamin_n_f64_x(svbool_t, svfloat64_t, float64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f32_x))) +svfloat32_t svamin_n_f32_x(svbool_t, svfloat32_t, float32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f16_x))) +svfloat16_t svamin_n_f16_x(svbool_t, svfloat16_t, float16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f64_z))) +svfloat64_t svamin_n_f64_z(svbool_t, svfloat64_t, float64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f32_z))) +svfloat32_t svamin_n_f32_z(svbool_t, svfloat32_t, float32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f16_z))) +svfloat16_t svamin_n_f16_z(svbool_t, svfloat16_t, float16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_m))) +svfloat64_t svamin_f64_m(svbool_t, svfloat64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_m))) +svfloat32_t svamin_f32_m(svbool_t, svfloat32_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_m))) +svfloat16_t svamin_f16_m(svbool_t, svfloat16_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_x))) +svfloat64_t svamin_f64_x(svbool_t, svfloat64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_x))) +svfloat32_t svamin_f32_x(svbool_t, 
svfloat32_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_x))) +svfloat16_t svamin_f16_x(svbool_t, svfloat16_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_z))) +svfloat64_t svamin_f64_z(svbool_t, svfloat64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_z))) +svfloat32_t svamin_f32_z(svbool_t, svfloat32_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_z))) +svfloat16_t svamin_f16_z(svbool_t, svfloat16_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f64_m))) +svfloat64_t svamax_m(svbool_t, svfloat64_t, float64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f32_m))) +svfloat32_t svamax_m(svbool_t, svfloat32_t, float32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f16_m))) +svfloat16_t svamax_m(svbool_t, svfloat16_t, float16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f64_x))) +svfloat64_t svamax_x(svbool_t, svfloat64_t, float64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f32_x))) +svfloat32_t svamax_x(svbool_t, svfloat32_t, float32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f16_x))) +svfloat16_t svamax_x(svbool_t, svfloat16_t, float16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f64_z))) +svfloat64_t svamax_z(svbool_t, svfloat64_t, float64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f32_z))) +svfloat32_t svamax_z(svbool_t, svfloat32_t, float32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_n_f16_z))) +svfloat16_t svamax_z(svbool_t, svfloat16_t, float16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_m))) +svfloat64_t svamax_m(svbool_t, svfloat64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_m))) +svfloat32_t svamax_m(svbool_t, svfloat32_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_m))) +svfloat16_t svamax_m(svbool_t, svfloat16_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_x))) +svfloat64_t svamax_x(svbool_t, svfloat64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_x))) +svfloat32_t svamax_x(svbool_t, svfloat32_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_x))) +svfloat16_t svamax_x(svbool_t, svfloat16_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f64_z))) +svfloat64_t svamax_z(svbool_t, svfloat64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f32_z))) +svfloat32_t svamax_z(svbool_t, svfloat32_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamax_f16_z))) +svfloat16_t svamax_z(svbool_t, svfloat16_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f64_m))) +svfloat64_t svamin_m(svbool_t, svfloat64_t, float64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f32_m))) +svfloat32_t svamin_m(svbool_t, svfloat32_t, float32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f16_m))) +svfloat16_t svamin_m(svbool_t, svfloat16_t, float16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f64_x))) 
+svfloat64_t svamin_x(svbool_t, svfloat64_t, float64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f32_x))) +svfloat32_t svamin_x(svbool_t, svfloat32_t, float32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f16_x))) +svfloat16_t svamin_x(svbool_t, svfloat16_t, float16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f64_z))) +svfloat64_t svamin_z(svbool_t, svfloat64_t, float64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f32_z))) +svfloat32_t svamin_z(svbool_t, svfloat32_t, float32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_n_f16_z))) +svfloat16_t svamin_z(svbool_t, svfloat16_t, float16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_m))) +svfloat64_t svamin_m(svbool_t, svfloat64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_m))) +svfloat32_t svamin_m(svbool_t, svfloat32_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_m))) +svfloat16_t svamin_m(svbool_t, svfloat16_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_x))) +svfloat64_t svamin_x(svbool_t, svfloat64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_x))) +svfloat32_t svamin_x(svbool_t, svfloat32_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_x))) +svfloat16_t svamin_x(svbool_t, svfloat16_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f64_z))) +svfloat64_t svamin_z(svbool_t, svfloat64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f32_z))) +svfloat32_t svamin_z(svbool_t, svfloat32_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svamin_f16_z))) +svfloat16_t svamin_z(svbool_t, svfloat16_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_f16_mf8_fpm))) +svfloat16_t svdot_f16_mf8_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_f16_mf8_fpm))) +svfloat16_t svdot_n_f16_mf8_fpm(svfloat16_t, svmfloat8_t, mfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_f16_mf8_fpm))) +svfloat16_t svdot_lane_f16_mf8_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_f16_mf8_fpm))) +svfloat16_t svdot_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_f16_mf8_fpm))) +svfloat16_t svdot_fpm(svfloat16_t, svmfloat8_t, mfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_f16_mf8_fpm))) +svfloat16_t svdot_lane_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_f32_mf8_fpm))) +svfloat32_t svdot_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_f32_mf8_fpm))) +svfloat32_t svdot_n_f32_mf8_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_f32_mf8_fpm))) +svfloat32_t svdot_lane_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_f32_mf8_fpm))) +svfloat32_t 
svdot_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_n_f32_mf8_fpm))) +svfloat32_t svdot_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdot_lane_f32_mf8_fpm))) +svfloat32_t svdot_lane_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_f16_mf8_fpm))) +svfloat16_t svmlalb_f16_mf8_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_f16_mf8_fpm))) +svfloat16_t svmlalb_n_f16_mf8_fpm(svfloat16_t, svmfloat8_t, mfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_f16_mf8_fpm))) +svfloat16_t svmlalb_lane_f16_mf8_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbb_f32_mf8_fpm))) +svfloat32_t svmlallbb_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbb_n_f32_mf8_fpm))) +svfloat32_t svmlallbb_n_f32_mf8_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbb_lane_f32_mf8_fpm))) +svfloat32_t svmlallbb_lane_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbt_f32_mf8_fpm))) +svfloat32_t svmlallbt_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbt_n_f32_mf8_fpm))) +svfloat32_t svmlallbt_n_f32_mf8_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbt_lane_f32_mf8_fpm))) +svfloat32_t svmlallbt_lane_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltb_f32_mf8_fpm))) +svfloat32_t svmlalltb_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltb_n_f32_mf8_fpm))) +svfloat32_t svmlalltb_n_f32_mf8_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltb_lane_f32_mf8_fpm))) +svfloat32_t svmlalltb_lane_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltt_f32_mf8_fpm))) +svfloat32_t svmlalltt_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltt_n_f32_mf8_fpm))) +svfloat32_t svmlalltt_n_f32_mf8_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltt_lane_f32_mf8_fpm))) +svfloat32_t svmlalltt_lane_f32_mf8_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_f16_mf8_fpm))) +svfloat16_t svmlalt_f16_mf8_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_f16_mf8_fpm))) +svfloat16_t svmlalt_n_f16_mf8_fpm(svfloat16_t, svmfloat8_t, mfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_f16_mf8_fpm))) +svfloat16_t svmlalt_lane_f16_mf8_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_f16_mf8_fpm))) +svfloat16_t svmlalb_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_n_f16_mf8_fpm))) +svfloat16_t svmlalb_fpm(svfloat16_t, svmfloat8_t, mfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalb_lane_f16_mf8_fpm))) +svfloat16_t svmlalb_lane_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbb_f32_mf8_fpm))) +svfloat32_t svmlallbb_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbb_n_f32_mf8_fpm))) +svfloat32_t svmlallbb_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbb_lane_f32_mf8_fpm))) +svfloat32_t svmlallbb_lane_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbt_f32_mf8_fpm))) +svfloat32_t svmlallbt_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbt_n_f32_mf8_fpm))) +svfloat32_t svmlallbt_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlallbt_lane_f32_mf8_fpm))) +svfloat32_t svmlallbt_lane_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltb_f32_mf8_fpm))) +svfloat32_t svmlalltb_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltb_n_f32_mf8_fpm))) +svfloat32_t svmlalltb_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltb_lane_f32_mf8_fpm))) +svfloat32_t svmlalltb_lane_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltt_f32_mf8_fpm))) +svfloat32_t svmlalltt_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltt_n_f32_mf8_fpm))) +svfloat32_t svmlalltt_fpm(svfloat32_t, svmfloat8_t, mfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalltt_lane_f32_mf8_fpm))) +svfloat32_t svmlalltt_lane_fpm(svfloat32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_f16_mf8_fpm))) +svfloat16_t svmlalt_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_n_f16_mf8_fpm))) +svfloat16_t svmlalt_fpm(svfloat16_t, svmfloat8_t, mfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svmlalt_lane_f16_mf8_fpm))) +svfloat16_t svmlalt_lane_fpm(svfloat16_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt1_bf16_mf8_fpm))) +svbfloat16_t svcvt1_bf16_mf8_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt1_f16_mf8_fpm))) +svfloat16_t svcvt1_f16_mf8_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt2_bf16_mf8_fpm))) +svbfloat16_t svcvt2_bf16_mf8_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt2_f16_mf8_fpm))) +svfloat16_t svcvt2_f16_mf8_fpm(svmfloat8_t, fpm_t); +__ai 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt1_bf16_mf8_fpm))) +svbfloat16_t svcvtlt1_bf16_mf8_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt1_f16_mf8_fpm))) +svfloat16_t svcvtlt1_f16_mf8_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt2_bf16_mf8_fpm))) +svbfloat16_t svcvtlt2_bf16_mf8_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt2_f16_mf8_fpm))) +svfloat16_t svcvtlt2_f16_mf8_fpm(svmfloat8_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_mf8_bf16_x2_fpm))) +svmfloat8_t svcvtn_mf8_bf16_x2_fpm(svbfloat16x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_mf8_f16_x2_fpm))) +svmfloat8_t svcvtn_mf8_f16_x2_fpm(svfloat16x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnb_mf8_f32_x2_fpm))) +svmfloat8_t svcvtnb_mf8_f32_x2_fpm(svfloat32x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_mf8_f32_x2_fpm))) +svmfloat8_t svcvtnt_mf8_f32_x2_fpm(svmfloat8_t, svfloat32x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt1_bf16_mf8_fpm))) +svbfloat16_t svcvt1_bf16_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt1_f16_mf8_fpm))) +svfloat16_t svcvt1_f16_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt2_bf16_mf8_fpm))) +svbfloat16_t svcvt2_bf16_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvt2_f16_mf8_fpm))) +svfloat16_t svcvt2_f16_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt1_bf16_mf8_fpm))) +svbfloat16_t svcvtlt1_bf16_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt1_f16_mf8_fpm))) +svfloat16_t svcvtlt1_f16_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt2_bf16_mf8_fpm))) +svbfloat16_t svcvtlt2_bf16_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtlt2_f16_mf8_fpm))) +svfloat16_t svcvtlt2_f16_fpm(svmfloat8_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_mf8_bf16_x2_fpm))) +svmfloat8_t svcvtn_mf8_fpm(svbfloat16x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtn_mf8_f16_x2_fpm))) +svmfloat8_t svcvtn_mf8_fpm(svfloat16x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnb_mf8_f32_x2_fpm))) +svmfloat8_t svcvtnb_mf8_fpm(svfloat32x2_t, fpm_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcvtnt_mf8_f32_x2_fpm))) +svmfloat8_t svcvtnt_mf8_fpm(svmfloat8_t, svfloat32x2_t, fpm_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_bf16))) +svbfloat16_t svluti2_lane_bf16(svbfloat16_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_bf16))) +svbfloat16_t svluti4_lane_bf16(svbfloat16_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_bf16_x2))) +svbfloat16_t svluti4_lane_bf16_x2(svbfloat16x2_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_bf16))) +svbfloat16_t svluti2_lane(svbfloat16_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_bf16))) +svbfloat16_t svluti4_lane(svbfloat16_t, 
svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_bf16_x2))) +svbfloat16_t svluti4_lane(svbfloat16x2_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_u8))) +svuint8_t svluti2_lane_u8(svuint8_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_s8))) +svint8_t svluti2_lane_s8(svint8_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_u16))) +svuint16_t svluti2_lane_u16(svuint16_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_f16))) +svfloat16_t svluti2_lane_f16(svfloat16_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_s16))) +svint16_t svluti2_lane_s16(svint16_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_u8))) +svuint8_t svluti4_lane_u8(svuint8_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_s8))) +svint8_t svluti4_lane_s8(svint8_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_u16))) +svuint16_t svluti4_lane_u16(svuint16_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_f16))) +svfloat16_t svluti4_lane_f16(svfloat16_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_s16))) +svint16_t svluti4_lane_s16(svint16_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_u16_x2))) +svuint16_t svluti4_lane_u16_x2(svuint16x2_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_f16_x2))) +svfloat16_t svluti4_lane_f16_x2(svfloat16x2_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_s16_x2))) +svint16_t svluti4_lane_s16_x2(svint16x2_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_u8))) +svuint8_t svluti2_lane(svuint8_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_s8))) +svint8_t svluti2_lane(svint8_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_u16))) +svuint16_t svluti2_lane(svuint16_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_f16))) +svfloat16_t svluti2_lane(svfloat16_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti2_lane_s16))) +svint16_t svluti2_lane(svint16_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_u8))) +svuint8_t svluti4_lane(svuint8_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_s8))) +svint8_t svluti4_lane(svint8_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_u16))) +svuint16_t svluti4_lane(svuint16_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_f16))) +svfloat16_t svluti4_lane(svfloat16_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_s16))) +svint16_t svluti4_lane(svint16_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_u16_x2))) +svuint16_t 
svluti4_lane(svuint16x2_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_f16_x2))) +svfloat16_t svluti4_lane(svfloat16x2_t, svuint8_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svluti4_lane_s16_x2))) +svint16_t svluti4_lane(svint16x2_t, svuint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesd_u8))) +svuint8_t svaesd_u8(svuint8_t, svuint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaese_u8))) +svuint8_t svaese_u8(svuint8_t, svuint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesimc_u8))) +svuint8_t svaesimc_u8(svuint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesmc_u8))) +svuint8_t svaesmc_u8(svuint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u64))) +svuint64_t svpmullb_pair_n_u64(svuint64_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u64))) +svuint64_t svpmullb_pair_u64(svuint64_t, svuint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u64))) +svuint64_t svpmullt_pair_n_u64(svuint64_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u64))) +svuint64_t svpmullt_pair_u64(svuint64_t, svuint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesd_u8))) +svuint8_t svaesd(svuint8_t, svuint8_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaese_u8))) +svuint8_t svaese(svuint8_t, svuint8_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesimc_u8))) +svuint8_t svaesimc(svuint8_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesmc_u8))) +svuint8_t svaesmc(svuint8_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u64))) +svuint64_t svpmullb_pair(svuint64_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u64))) +svuint64_t svpmullb_pair(svuint64_t, svuint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u64))) +svuint64_t svpmullt_pair(svuint64_t, uint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u64))) +svuint64_t svpmullt_pair(svuint64_t, svuint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_bf16_m))) svbfloat16_t svadd_n_bf16_m(svbool_t, svbfloat16_t, bfloat16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_n_bf16_x))) @@ -8180,54 +9141,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_bf16_x))) svbfloat16_t svsub_x(svbool_t, svbfloat16_t, svbfloat16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsub_bf16_z))) svbfloat16_t svsub_z(svbool_t, svbfloat16_t, svbfloat16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_bf16))) -svbfloat16_t svtbl2_bf16(svbfloat16x2_t, svuint16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_bf16))) -svbfloat16_t svtbx_bf16(svbfloat16_t, svbfloat16_t, svuint16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16))) -svbool_t svwhilerw_bf16(bfloat16_t const *, bfloat16_t const *); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16))) -svbool_t svwhilewr_bf16(bfloat16_t const *, bfloat16_t const *); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbl2_bf16))) -svbfloat16_t svtbl2(svbfloat16x2_t, svuint16_t); -__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbx_bf16))) -svbfloat16_t svtbx(svbfloat16_t, svbfloat16_t, svuint16_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilerw_bf16))) -svbool_t svwhilerw(bfloat16_t const *, bfloat16_t const *); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilewr_bf16))) -svbool_t svwhilewr(bfloat16_t const *, bfloat16_t const *); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesd_u8))) -svuint8_t svaesd_u8(svuint8_t, svuint8_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaese_u8))) -svuint8_t svaese_u8(svuint8_t, svuint8_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesimc_u8))) -svuint8_t svaesimc_u8(svuint8_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesmc_u8))) -svuint8_t svaesmc_u8(svuint8_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u64))) -svuint64_t svpmullb_pair_n_u64(svuint64_t, uint64_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u64))) -svuint64_t svpmullb_pair_u64(svuint64_t, svuint64_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u64))) -svuint64_t svpmullt_pair_n_u64(svuint64_t, uint64_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u64))) -svuint64_t svpmullt_pair_u64(svuint64_t, svuint64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesd_u8))) -svuint8_t svaesd(svuint8_t, svuint8_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaese_u8))) -svuint8_t svaese(svuint8_t, svuint8_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesimc_u8))) -svuint8_t svaesimc(svuint8_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaesmc_u8))) -svuint8_t svaesmc(svuint8_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_n_u64))) -svuint64_t svpmullb_pair(svuint64_t, uint64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullb_pair_u64))) -svuint64_t svpmullb_pair(svuint64_t, svuint64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_n_u64))) -svuint64_t svpmullt_pair(svuint64_t, uint64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmullt_pair_u64))) -svuint64_t svpmullt_pair(svuint64_t, svuint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u8))) svuint8_t svbdep_n_u8(svuint8_t, uint8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svbdep_n_u32))) @@ -8416,6 +9329,8 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s32))) svint32_t svextq_s32(svint32_t, svint32_t, int32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s64))) svint64_t svextq_s64(svint64_t, svint64_t, int32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_mf8))) +svmfloat8_t svextq_mf8(svmfloat8_t, svmfloat8_t, int32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s16))) svint16_t svextq_s16(svint16_t, svint16_t, int32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_u32))) @@ -8831,17 +9746,17 @@ svuint32_t svpmov_u32_z(svbool_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_s32_z))) svint32_t svpmov_s32_z(svbool_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_u64))) -void svst1dq_u64(svbool_t, uint64_t const *, svuint64_t); +void 
svst1dq_u64(svbool_t, uint64_t *, svuint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_f64))) -void svst1dq_f64(svbool_t, float64_t const *, svfloat64_t); +void svst1dq_f64(svbool_t, float64_t *, svfloat64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_s64))) -void svst1dq_s64(svbool_t, int64_t const *, svint64_t); +void svst1dq_s64(svbool_t, int64_t *, svint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_u64))) -void svst1dq_vnum_u64(svbool_t, uint64_t const *, int64_t, svuint64_t); +void svst1dq_vnum_u64(svbool_t, uint64_t *, int64_t, svuint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_f64))) -void svst1dq_vnum_f64(svbool_t, float64_t const *, int64_t, svfloat64_t); +void svst1dq_vnum_f64(svbool_t, float64_t *, int64_t, svfloat64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_s64))) -void svst1dq_vnum_s64(svbool_t, int64_t const *, int64_t, svint64_t); +void svst1dq_vnum_s64(svbool_t, int64_t *, int64_t, svint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u8))) void svst1q_scatter_u64base_u8(svbool_t, svuint64_t, svuint8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u32))) @@ -8910,6 +9825,26 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64ba void svst1q_scatter_u64base_offset_s64(svbool_t, svuint64_t, int64_t, svint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s16))) void svst1q_scatter_u64base_offset_s16(svbool_t, svuint64_t, int64_t, svint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_u32))) +void svst1q_scatter_s64index_u32(svbool_t, uint32_t *, svint64_t, svuint32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_u64))) +void svst1q_scatter_s64index_u64(svbool_t, uint64_t *, svint64_t, svuint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_u16))) +void svst1q_scatter_s64index_u16(svbool_t, uint16_t *, svint64_t, svuint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_bf16))) +void svst1q_scatter_s64index_bf16(svbool_t, bfloat16_t *, svint64_t, svbfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_f64))) +void svst1q_scatter_s64index_f64(svbool_t, float64_t *, svint64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_f32))) +void svst1q_scatter_s64index_f32(svbool_t, float32_t *, svint64_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_f16))) +void svst1q_scatter_s64index_f16(svbool_t, float16_t *, svint64_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_s32))) +void svst1q_scatter_s64index_s32(svbool_t, int32_t *, svint64_t, svint32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_s64))) +void svst1q_scatter_s64index_s64(svbool_t, int64_t *, svint64_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_s16))) +void svst1q_scatter_s64index_s16(svbool_t, int16_t *, svint64_t, svint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u32))) void svst1q_scatter_u64index_u32(svbool_t, 
uint32_t *, svuint64_t, svuint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u64))) @@ -8930,6 +9865,30 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64in void svst1q_scatter_u64index_s64(svbool_t, int64_t *, svuint64_t, svint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_s16))) void svst1q_scatter_u64index_s16(svbool_t, int16_t *, svuint64_t, svint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_u8))) +void svst1q_scatter_s64offset_u8(svbool_t, uint8_t *, svint64_t, svuint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_u32))) +void svst1q_scatter_s64offset_u32(svbool_t, uint32_t *, svint64_t, svuint32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_u64))) +void svst1q_scatter_s64offset_u64(svbool_t, uint64_t *, svint64_t, svuint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_u16))) +void svst1q_scatter_s64offset_u16(svbool_t, uint16_t *, svint64_t, svuint16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_bf16))) +void svst1q_scatter_s64offset_bf16(svbool_t, bfloat16_t *, svint64_t, svbfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_s8))) +void svst1q_scatter_s64offset_s8(svbool_t, int8_t *, svint64_t, svint8_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_f64))) +void svst1q_scatter_s64offset_f64(svbool_t, float64_t *, svint64_t, svfloat64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_f32))) +void svst1q_scatter_s64offset_f32(svbool_t, float32_t *, svint64_t, svfloat32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_f16))) +void svst1q_scatter_s64offset_f16(svbool_t, float16_t *, svint64_t, svfloat16_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_s32))) +void svst1q_scatter_s64offset_s32(svbool_t, int32_t *, svint64_t, svint32_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_s64))) +void svst1q_scatter_s64offset_s64(svbool_t, int64_t *, svint64_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_s16))) +void svst1q_scatter_s64offset_s16(svbool_t, int16_t *, svint64_t, svint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u8))) void svst1q_scatter_u64offset_u8(svbool_t, uint8_t *, svuint64_t, svuint8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u32))) @@ -8955,17 +9914,17 @@ void svst1q_scatter_u64offset_s64(svbool_t, int64_t *, svuint64_t, svint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s16))) void svst1q_scatter_u64offset_s16(svbool_t, int16_t *, svuint64_t, svint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_u32))) -void svst1wq_u32(svbool_t, uint32_t const *, svuint32_t); +void svst1wq_u32(svbool_t, uint32_t *, svuint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_f32))) -void svst1wq_f32(svbool_t, float32_t const *, svfloat32_t); +void svst1wq_f32(svbool_t, float32_t *, svfloat32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_s32))) 
-void svst1wq_s32(svbool_t, int32_t const *, svint32_t); +void svst1wq_s32(svbool_t, int32_t *, svint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_u32))) -void svst1wq_vnum_u32(svbool_t, uint32_t const *, int64_t, svuint32_t); +void svst1wq_vnum_u32(svbool_t, uint32_t *, int64_t, svuint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_f32))) -void svst1wq_vnum_f32(svbool_t, float32_t const *, int64_t, svfloat32_t); +void svst1wq_vnum_f32(svbool_t, float32_t *, int64_t, svfloat32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_s32))) -void svst1wq_vnum_s32(svbool_t, int32_t const *, int64_t, svint32_t); +void svst1wq_vnum_s32(svbool_t, int32_t *, int64_t, svint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u8))) void svst2q_u8(svbool_t, uint8_t const *, svuint8x2_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u32))) @@ -9132,6 +10091,8 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s32))) svint32_t svtblq_s32(svint32_t, svuint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s64))) svint64_t svtblq_s64(svint64_t, svuint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_mf8))) +svmfloat8_t svtblq_mf8(svmfloat8_t, svuint8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s16))) svint16_t svtblq_s16(svint16_t, svuint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u8))) @@ -9156,6 +10117,8 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s32))) svint32_t svtbxq_s32(svint32_t, svint32_t, svuint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s64))) svint64_t svtbxq_s64(svint64_t, svint64_t, svuint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_mf8))) +svmfloat8_t svtbxq_mf8(svmfloat8_t, svmfloat8_t, svuint8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s16))) svint16_t svtbxq_s16(svint16_t, svint16_t, svuint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u8))) @@ -9180,6 +10143,8 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s32))) svint32_t svuzpq1_s32(svint32_t, svint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s64))) svint64_t svuzpq1_s64(svint64_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_mf8))) +svmfloat8_t svuzpq1_mf8(svmfloat8_t, svmfloat8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s16))) svint16_t svuzpq1_s16(svint16_t, svint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u8))) @@ -9204,6 +10169,8 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s32))) svint32_t svuzpq2_s32(svint32_t, svint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s64))) svint64_t svuzpq2_s64(svint64_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_mf8))) +svmfloat8_t svuzpq2_mf8(svmfloat8_t, svmfloat8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s16))) svint16_t svuzpq2_s16(svint16_t, svint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u8))) @@ -9228,6 +10195,8 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s32))) svint32_t svzipq1_s32(svint32_t, svint32_t); __ai 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s64))) svint64_t svzipq1_s64(svint64_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_mf8))) +svmfloat8_t svzipq1_mf8(svmfloat8_t, svmfloat8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s16))) svint16_t svzipq1_s16(svint16_t, svint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u8))) @@ -9252,6 +10221,8 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s32))) svint32_t svzipq2_s32(svint32_t, svint32_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s64))) svint64_t svzipq2_s64(svint64_t, svint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_mf8))) +svmfloat8_t svzipq2_mf8(svmfloat8_t, svmfloat8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s16))) svint16_t svzipq2_s16(svint16_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddqv_u8))) @@ -9330,6 +10301,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s32))) svint32_t svextq(svint32_t, svint32_t, int32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s64))) svint64_t svextq(svint64_t, svint64_t, int32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_mf8))) +svmfloat8_t svextq(svmfloat8_t, svmfloat8_t, int32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svextq_s16))) svint16_t svextq(svint16_t, svint16_t, int32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svld1q_gather_u64base_index_u32))) @@ -9729,17 +10702,17 @@ svuint32_t svpmov_lane_m(svuint32_t, svbool_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpmov_lane_s32_m))) svint32_t svpmov_lane_m(svint32_t, svbool_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_u64))) -void svst1dq(svbool_t, uint64_t const *, svuint64_t); +void svst1dq(svbool_t, uint64_t *, svuint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_f64))) -void svst1dq(svbool_t, float64_t const *, svfloat64_t); +void svst1dq(svbool_t, float64_t *, svfloat64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_s64))) -void svst1dq(svbool_t, int64_t const *, svint64_t); +void svst1dq(svbool_t, int64_t *, svint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_u64))) -void svst1dq_vnum(svbool_t, uint64_t const *, int64_t, svuint64_t); +void svst1dq_vnum(svbool_t, uint64_t *, int64_t, svuint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_f64))) -void svst1dq_vnum(svbool_t, float64_t const *, int64_t, svfloat64_t); +void svst1dq_vnum(svbool_t, float64_t *, int64_t, svfloat64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1dq_vnum_s64))) -void svst1dq_vnum(svbool_t, int64_t const *, int64_t, svint64_t); +void svst1dq_vnum(svbool_t, int64_t *, int64_t, svint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u8))) void svst1q_scatter(svbool_t, svuint64_t, svuint8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_u32))) @@ -9808,6 +10781,26 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64b void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64base_offset_s16))) 
void svst1q_scatter_offset(svbool_t, svuint64_t, int64_t, svint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_u32))) +void svst1q_scatter_index(svbool_t, uint32_t *, svint64_t, svuint32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_u64))) +void svst1q_scatter_index(svbool_t, uint64_t *, svint64_t, svuint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_u16))) +void svst1q_scatter_index(svbool_t, uint16_t *, svint64_t, svuint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_bf16))) +void svst1q_scatter_index(svbool_t, bfloat16_t *, svint64_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_f64))) +void svst1q_scatter_index(svbool_t, float64_t *, svint64_t, svfloat64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_f32))) +void svst1q_scatter_index(svbool_t, float32_t *, svint64_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_f16))) +void svst1q_scatter_index(svbool_t, float16_t *, svint64_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_s32))) +void svst1q_scatter_index(svbool_t, int32_t *, svint64_t, svint32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_s64))) +void svst1q_scatter_index(svbool_t, int64_t *, svint64_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64index_s16))) +void svst1q_scatter_index(svbool_t, int16_t *, svint64_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u32))) void svst1q_scatter_index(svbool_t, uint32_t *, svuint64_t, svuint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_u64))) @@ -9828,6 +10821,30 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64i void svst1q_scatter_index(svbool_t, int64_t *, svuint64_t, svint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64index_s16))) void svst1q_scatter_index(svbool_t, int16_t *, svuint64_t, svint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_u8))) +void svst1q_scatter_offset(svbool_t, uint8_t *, svint64_t, svuint8_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_u32))) +void svst1q_scatter_offset(svbool_t, uint32_t *, svint64_t, svuint32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_u64))) +void svst1q_scatter_offset(svbool_t, uint64_t *, svint64_t, svuint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_u16))) +void svst1q_scatter_offset(svbool_t, uint16_t *, svint64_t, svuint16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_bf16))) +void svst1q_scatter_offset(svbool_t, bfloat16_t *, svint64_t, svbfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_s8))) +void svst1q_scatter_offset(svbool_t, int8_t *, svint64_t, svint8_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_f64))) +void svst1q_scatter_offset(svbool_t, float64_t *, svint64_t, svfloat64_t); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_f32))) +void svst1q_scatter_offset(svbool_t, float32_t *, svint64_t, svfloat32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_f16))) +void svst1q_scatter_offset(svbool_t, float16_t *, svint64_t, svfloat16_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_s32))) +void svst1q_scatter_offset(svbool_t, int32_t *, svint64_t, svint32_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_s64))) +void svst1q_scatter_offset(svbool_t, int64_t *, svint64_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_s64offset_s16))) +void svst1q_scatter_offset(svbool_t, int16_t *, svint64_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u8))) void svst1q_scatter_offset(svbool_t, uint8_t *, svuint64_t, svuint8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_u32))) @@ -9853,17 +10870,17 @@ void svst1q_scatter_offset(svbool_t, int64_t *, svuint64_t, svint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1q_scatter_u64offset_s16))) void svst1q_scatter_offset(svbool_t, int16_t *, svuint64_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_u32))) -void svst1wq(svbool_t, uint32_t const *, svuint32_t); +void svst1wq(svbool_t, uint32_t *, svuint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_f32))) -void svst1wq(svbool_t, float32_t const *, svfloat32_t); +void svst1wq(svbool_t, float32_t *, svfloat32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_s32))) -void svst1wq(svbool_t, int32_t const *, svint32_t); +void svst1wq(svbool_t, int32_t *, svint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_u32))) -void svst1wq_vnum(svbool_t, uint32_t const *, int64_t, svuint32_t); +void svst1wq_vnum(svbool_t, uint32_t *, int64_t, svuint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_f32))) -void svst1wq_vnum(svbool_t, float32_t const *, int64_t, svfloat32_t); +void svst1wq_vnum(svbool_t, float32_t *, int64_t, svfloat32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst1wq_vnum_s32))) -void svst1wq_vnum(svbool_t, int32_t const *, int64_t, svint32_t); +void svst1wq_vnum(svbool_t, int32_t *, int64_t, svint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u8))) void svst2q(svbool_t, uint8_t const *, svuint8x2_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svst2q_u32))) @@ -10030,6 +11047,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s32))) svint32_t svtblq(svint32_t, svuint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s64))) svint64_t svtblq(svint64_t, svuint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_mf8))) +svmfloat8_t svtblq(svmfloat8_t, svuint8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtblq_s16))) svint16_t svtblq(svint16_t, svuint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_u8))) @@ -10054,6 +11073,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s32))) svint32_t svtbxq(svint32_t, svint32_t, svuint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s64))) svint64_t svtbxq(svint64_t, 
svint64_t, svuint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_mf8))) +svmfloat8_t svtbxq(svmfloat8_t, svmfloat8_t, svuint8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtbxq_s16))) svint16_t svtbxq(svint16_t, svint16_t, svuint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_u8))) @@ -10078,6 +11099,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s32))) svint32_t svuzpq1(svint32_t, svint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s64))) svint64_t svuzpq1(svint64_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_mf8))) +svmfloat8_t svuzpq1(svmfloat8_t, svmfloat8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq1_s16))) svint16_t svuzpq1(svint16_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_u8))) @@ -10102,6 +11125,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s32))) svint32_t svuzpq2(svint32_t, svint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s64))) svint64_t svuzpq2(svint64_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_mf8))) +svmfloat8_t svuzpq2(svmfloat8_t, svmfloat8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzpq2_s16))) svint16_t svuzpq2(svint16_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_u8))) @@ -10126,6 +11151,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s32))) svint32_t svzipq1(svint32_t, svint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s64))) svint64_t svzipq1(svint64_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_mf8))) +svmfloat8_t svzipq1(svmfloat8_t, svmfloat8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq1_s16))) svint16_t svzipq1(svint16_t, svint16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_u8))) @@ -10150,6 +11177,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s32))) svint32_t svzipq2(svint32_t, svint32_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s64))) svint64_t svzipq2(svint64_t, svint64_t); +__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_mf8))) +svmfloat8_t svzipq2(svmfloat8_t, svmfloat8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzipq2_s16))) svint16_t svzipq2(svint16_t, svint16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_bf16))) @@ -11522,6 +12551,8 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u8))) svuint8_t svdup_laneq_u8(svuint8_t, uint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s8))) svint8_t svdup_laneq_s8(svint8_t, uint64_t); +__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_mf8))) +svmfloat8_t svdup_laneq_mf8(svmfloat8_t, uint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u64))) svuint64_t svdup_laneq_u64(svuint64_t, uint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_f64))) @@ -11544,6 +12575,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u8))) svuint8_t svdup_laneq(svuint8_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_s8))) svint8_t svdup_laneq(svint8_t, uint64_t); +__aio 
__attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_mf8))) +svmfloat8_t svdup_laneq(svmfloat8_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_u64))) svuint64_t svdup_laneq(svuint64_t, uint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_laneq_f64))) @@ -18424,12 +19457,6 @@ __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s64_z))) svint64_t svadd_s64_z(svbool_t, svint64_t, svint64_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s16_z))) svint16_t svadd_s16_z(svbool_t, svint16_t, svint16_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f64))) -float64_t svadda_f64(svbool_t, float64_t, svfloat64_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f32))) -float32_t svadda_f32(svbool_t, float32_t, svfloat32_t); -__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f16))) -float16_t svadda_f16(svbool_t, float16_t, svfloat16_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s8))) int64_t svaddv_s8(svbool_t, svint8_t); __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s32))) @@ -24810,12 +25837,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s64_z))) svint64_t svadd_z(svbool_t, svint64_t, svint64_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_s16_z))) svint16_t svadd_z(svbool_t, svint16_t, svint16_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f64))) -float64_t svadda(svbool_t, float64_t, svfloat64_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f32))) -float32_t svadda(svbool_t, float32_t, svfloat32_t); -__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadda_f16))) -float16_t svadda(svbool_t, float16_t, svfloat16_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s8))) int64_t svaddv(svbool_t, svint8_t); __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svaddv_s32))) diff --git a/lib/include/arm_vector_types.h b/lib/include/arm_vector_types.h index 8e79d39a60..e73e9c94fb 100644 --- a/lib/include/arm_vector_types.h +++ b/lib/include/arm_vector_types.h @@ -17,9 +17,62 @@ typedef float float32_t; typedef __fp16 float16_t; #if defined(__aarch64__) || defined(__arm64ec__) +typedef __mfp8 mfloat8_t; typedef double float64_t; #endif + +typedef uint64_t fpm_t; + +enum __ARM_FPM_FORMAT { __ARM_FPM_E5M2, __ARM_FPM_E4M3 }; + +enum __ARM_FPM_OVERFLOW { __ARM_FPM_INFNAN, __ARM_FPM_SATURATE }; + +static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__)) +__arm_fpm_init(void) { + return 0; +} + +static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__)) +__arm_set_fpm_src1_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) { + return (__fpm & ~7ull) | (fpm_t)__format; +} + +static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__)) +__arm_set_fpm_src2_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) { + return (__fpm & ~0x38ull) | ((fpm_t)__format << 3u); +} + +static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__)) +__arm_set_fpm_dst_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) { + return (__fpm & ~0x1c0ull) | ((fpm_t)__format << 6u); +} + +static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__)) +__arm_set_fpm_overflow_mul(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) { + return (__fpm & ~0x4000ull) | ((fpm_t)__behaviour << 14u); +} + +static __inline__ fpm_t 
__attribute__((__always_inline__, __nodebug__)) +__arm_set_fpm_overflow_cvt(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) { + return (__fpm & ~0x8000ull) | ((fpm_t)__behaviour << 15u); +} + +static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__)) +__arm_set_fpm_lscale(fpm_t __fpm, uint64_t __scale) { + return (__fpm & ~0x7f0000ull) | (__scale << 16u); +} + +static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__)) +__arm_set_fpm_nscale(fpm_t __fpm, int64_t __scale) { + return (__fpm & ~0xff000000ull) | (((fpm_t)__scale & 0xffu) << 24u); +} + +static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__)) +__arm_set_fpm_lscale2(fpm_t __fpm, uint64_t __scale) { + return (uint32_t)__fpm | (__scale << 32u); +} + typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t; typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t; @@ -36,6 +89,10 @@ typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t; typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t; typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t; typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; +#if defined(__aarch64__) || defined(__arm64ec__) +typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t; +typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t; +#endif typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t; typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t; typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t; @@ -109,6 +166,16 @@ typedef struct uint64x2x2_t { uint64x2_t val[2]; } uint64x2x2_t; +#if defined(__aarch64__) || defined(__arm64ec__) +typedef struct mfloat8x8x2_t { + mfloat8x8_t val[2]; +} mfloat8x8x2_t; + +typedef struct mfloat8x16x2_t { + mfloat8x16_t val[2]; +} mfloat8x16x2_t; + +#endif typedef struct float16x4x2_t { float16x4_t val[2]; } float16x4x2_t; @@ -199,6 +266,16 @@ typedef struct uint64x2x3_t { uint64x2_t val[3]; } uint64x2x3_t; +#if defined(__aarch64__) || defined(__arm64ec__) +typedef struct mfloat8x8x3_t { + mfloat8x8_t val[3]; +} mfloat8x8x3_t; + +typedef struct mfloat8x16x3_t { + mfloat8x16_t val[3]; +} mfloat8x16x3_t; + +#endif typedef struct float16x4x3_t { float16x4_t val[3]; } float16x4x3_t; @@ -289,6 +366,16 @@ typedef struct uint64x2x4_t { uint64x2_t val[4]; } uint64x2x4_t; +#if defined(__aarch64__) || defined(__arm64ec__) +typedef struct mfloat8x8x4_t { + mfloat8x8_t val[4]; +} mfloat8x8x4_t; + +typedef struct mfloat8x16x4_t { + mfloat8x16_t val[4]; +} mfloat8x16x4_t; + +#endif typedef struct float16x4x4_t { float16x4_t val[4]; } float16x4x4_t; diff --git a/lib/include/avx10_2_512bf16intrin.h b/lib/include/avx10_2_512bf16intrin.h new file mode 100644 index 0000000000..ce43ecbcfe --- /dev/null +++ b/lib/include/avx10_2_512bf16intrin.h @@ -0,0 +1,561 @@ +/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2_512BF16INTRIN_H +#define __AVX10_2_512BF16INTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) { + return __builtin_bit_cast(__m512bh, _mm512_setzero_ps()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) { + return (__m512bh)__builtin_ia32_undef512(); +} + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) { + return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh( + __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6, + __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17, + __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22, + __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27, + __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) { + return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25, + bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17, + bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9, + bf8, bf7, bf6, bf5, bf4, bf3, bf2, bf1}; +} + +#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \ + bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19, \ + bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28, \ + bf29, bf30, bf31, bf32) \ + _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26), \ + (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19), \ + (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12), \ + (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), \ + (bf3), (bf2), (bf1)) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_castbf16_ps(__m512bh __a) { + return (__m512)__a; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_castbf16_pd(__m512bh __a) { + return (__m512d)__a; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castbf16_si512(__m512bh __a) { + return (__m512i)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) { + return (__m512bh)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castpd_pbh(__m512d __a) { + return (__m512bh)__a; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castsi512_pbh(__m512i __a) { + return (__m512bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS512 +_mm512_castbf16512_pbh128(__m512bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_castbf16512_pbh256(__m512bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castbf16128_pbh512(__m128bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_castbf16256_pbh512(__m256bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, 
-1, + -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_zextbf16128_pbh512(__m128bh __a) { + return __builtin_shufflevector( + __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_zextbf16256_pbh512(__m256bh __a) { + return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) { + return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), + (__m512i)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_load_pbh(void const *__p) { + return *(const __m512bh *)__p; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m512bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P, + __m512bh __A) { + *(__m512bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P, + __m512bh __A) { + struct __storeu_pbh { + __m512bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W, + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { + return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_add_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)((__v32bf)__A + (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_add_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sub_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)((__v32bf)__A - (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mul_pbh(__m512bh __A, + __m512bh __B) { + return 
(__m512bh)((__v32bf)__A * (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_div_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)((__v32bf)__A / (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_div_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_div_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_max_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)__builtin_ia32_vmaxbf16512((__v32bf)__A, (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_min_pbh(__m512bh __A, + __m512bh __B) { + return (__m512bh)__builtin_ia32_vminbf16512((__v32bf)__A, (__v32bf)__B); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), + (__v32bf)_mm512_setzero_pbh()); +} + +#define _mm512_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A), \ + (__v32bf)(__m512bh)(__B), \ + (int)(__P), (__mmask32) - 1)) + +#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A), \ + (__v32bf)(__m512bh)(__B), \ + (int)(__P), (__mmask32)(__U))) + +#define _mm512_mask_fpclass_pbh_mask(__U, __A, imm) \ + ((__mmask32)__builtin_ia32_vfpclassbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U))) + +#define _mm512_fpclass_pbh_mask(__A, imm) \ + ((__mmask32)__builtin_ia32_vfpclassbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32) - 1)) + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_scalef_pbh(__m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(), + (__mmask32)-1); +} + +static 
__inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh( + __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) { + return (__m512bh)__builtin_ia32_vscalefbf16512_mask( + (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(), + (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vrcpbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrcpbf16512_mask((__v32bf)__A, (__v32bf)__W, + (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrcpbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_getexp_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexpbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexpbf16512_mask( + (__v32bf)__A, (__v32bf)__W, (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vgetexpbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_rsqrt_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask((__v32bf)__A, (__v32bf)__W, + (__mmask32)__U); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask( + (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U); +} + +#define _mm512_reduce_pbh(__A, imm) \ + ((__m512bh)__builtin_ia32_vreducebf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(), \ + (__mmask32) - 1)) + +#define _mm512_mask_reduce_pbh(__W, __U, __A, imm) \ + ((__m512bh)__builtin_ia32_vreducebf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \ + (__mmask32)(__U))) + +#define _mm512_maskz_reduce_pbh(__U, __A, imm) \ + ((__m512bh)__builtin_ia32_vreducebf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \ + (__mmask32)(__U))) + +#define _mm512_roundscale_pbh(__A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \ + (__mmask32) - 1)) + +#define _mm512_mask_roundscale_pbh(__W, __U, __A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \ + (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \ + (__mmask32)(__U))) + +#define _mm512_maskz_roundscale_pbh(__U, __A, imm) \ + ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \ + 
(__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \ + (__mmask32)(__U))) + +#define _mm512_getmant_pbh(__A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)_mm512_undefined_pbh(), (__mmask32) - 1)) + +#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)(__m512bh)(__W), (__mmask32)(__U))) + +#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \ + (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U))) + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) { + return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) { + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, + (__v32bf)_mm512_sqrt_pbh(__A), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B, + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_fmadd_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B, + -(__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsub_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ 
__m512bh __DEFAULT_FN_ATTRS512 +_mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B, + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B, + -(__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh( + __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__A); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsub_pbh( + __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)__C); +} + +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh( + __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) { + return (__m512bh)__builtin_ia32_selectpbf_512( + (__mmask32)__U, + _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), + (__v32bf)_mm512_setzero_pbh()); +} + +#undef __DEFAULT_FN_ATTRS512 + +#endif +#endif diff --git a/lib/include/avx10_2_512convertintrin.h b/lib/include/avx10_2_512convertintrin.h new file mode 100644 index 0000000000..0b5fca5cda --- /dev/null +++ b/lib/include/avx10_2_512convertintrin.h @@ -0,0 +1,320 @@ +/*===--------- avx10_2_512convertintrin.h - AVX10_2_512CONVERT -------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx10_2_512convertintrin.h> directly; include <immintrin.h> instead." +#endif // __IMMINTRIN_H + +#ifdef __SSE2__ + +#ifndef __AVX10_2_512CONVERTINTRIN_H +#define __AVX10_2_512CONVERTINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtx2ps_ph(__m512 __A, + __m512 __B) { + return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask( + (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)(-1), + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtx2ps_ph(__m512h __W, __mmask32 __U, __m512 __A, __m512 __B) { + return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask( + (__v16sf)__A, (__v16sf)__B, (__v32hf)__W, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtx2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) { + return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask( + (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtx_round2ps_ph(A, B, R) \ + ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \ + (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)(-1), (const int)(R))) + +#define _mm512_mask_cvtx_round2ps_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask((__v16sf)(A), (__v16sf)(B), \ + (__v32hf)(W), (__mmask32)(U), \ + (const int)(R))) + +#define _mm512_maskz_cvtx_round2ps_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \ + (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), (const int)(R))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtbiasph_bf8(__m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), + (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_bf8( + __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbiasph_bf8(__mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), + (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtbiassph_bf8(__m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), + (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_bf8( + __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbiassph_bf8(__mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), + (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtbiasph_hf8(__m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), + (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_hf8( + __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + return 
(__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbiasph_hf8(__mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), + (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtbiassph_hf8(__m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(), + (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_hf8( + __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbiassph_hf8(__mmask32 __U, __m512i __A, __m512h __B) { + return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask( + (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(), + (__mmask32)__U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_bf8(__m512h __A, + __m512h __B) { + return (__m512i)__builtin_ia32_vcvt2ph2bf8_512((__v32hf)(__A), + (__v32hf)(__B)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvt2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvt2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B), + (__v64qi)(__m512i)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvts2ph_bf8(__m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_vcvt2ph2bf8s_512((__v32hf)(__A), + (__v32hf)(__B)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvts2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvts2ph_bf8(__A, __B), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvts2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvts2ph_bf8(__A, __B), + (__v64qi)(__m512i)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_hf8(__m512h __A, + __m512h __B) { + return (__m512i)__builtin_ia32_vcvt2ph2hf8_512((__v32hf)(__A), + (__v32hf)(__B)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvt2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvt2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B), + (__v64qi)(__m512i)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvts2ph_hf8(__m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_vcvt2ph2hf8s_512((__v32hf)(__A), + (__v32hf)(__B)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 
+_mm512_mask_cvts2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvts2ph_hf8(__A, __B), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_cvts2ph_hf8(__A, __B), + (__v64qi)(__m512i)_mm512_setzero_si512()); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8(__m256i __A) { + return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( + (__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvthf8(__m512h __W, __mmask32 __U, __m256i __A) { + return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( + (__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvthf8(__mmask32 __U, __m256i __A) { + return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( + (__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_bf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_bf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_bf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsph_bf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsph_bf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtsph_bf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_hf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_hf8(__m256i __W, __mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_hf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsph_hf8(__m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsph_hf8(__m256i __W, __mmask32 __U, __m512h __A) { + return 
(__m256i)__builtin_ia32_vcvtph2hf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtsph_hf8(__mmask32 __U, __m512h __A) { + return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask( + (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtbf8_ph(__m256i __A) { + return _mm512_castsi512_ph(_mm512_slli_epi16(_mm512_cvtepi8_epi16(__A), 8)); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtbf8_ph(__m512h __S, __mmask32 __U, __m256i __A) { + return _mm512_castsi512_ph( + _mm512_mask_slli_epi16((__m512i)__S, __U, _mm512_cvtepi8_epi16(__A), 8)); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtbf8_ph(__mmask32 __U, __m256i __A) { + return _mm512_castsi512_ph( + _mm512_slli_epi16(_mm512_maskz_cvtepi8_epi16(__U, __A), 8)); +} + +#undef __DEFAULT_FN_ATTRS512 + +#endif // __AVX10_2_512CONVERTINTRIN_H +#endif // __SSE2__ diff --git a/lib/include/avx10_2_512minmaxintrin.h b/lib/include/avx10_2_512minmaxintrin.h new file mode 100644 index 0000000000..fbc7fbadbc --- /dev/null +++ b/lib/include/avx10_2_512minmaxintrin.h @@ -0,0 +1,127 @@ +/*===---- avx10_2_512minmaxintrin.h - AVX10_2_512MINMAX intrinsics ---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AVX10_2_512MINMAXINTRIN_H +#define __AVX10_2_512MINMAXINTRIN_H + +#define _mm512_minmax_pbh(A, B, C) \ + ((__m512bh)__builtin_ia32_vminmaxbf16512((__v32bf)(__m512bh)(A), \ + (__v32bf)(__m512bh)(A), (int)(C))) + +#define _mm512_mask_minmax_pbh(W, U, A, B, C) \ + ((__m512bh)__builtin_ia32_selectpbf_512( \ + (__mmask32)(U), \ + (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \ + (__v32bf)(__m512bh)(B), (int)(C)), \ + (__v32bf)(__m512bh)(W))) + +#define _mm512_maskz_minmax_pbh(U, A, B, C) \ + ((__m512bh)__builtin_ia32_selectpbf_512( \ + (__mmask32)(U), \ + (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \ + (__v32bf)(__m512bh)(B), (int)(C)), \ + (__v32bf) __builtin_bit_cast(__m512bh, _mm512_setzero_ps()))) + +#define _mm512_minmax_pd(A, B, C) \ + ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_undefined_pd(), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_minmax_pd(W, U, A, B, C) \ + ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_minmax_pd(U, A, B, C) \ + ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_minmax_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R))) + +#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \ + ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), 
(__mmask8)(U), (int)(R))) + +#define _mm512_maskz_minmax_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R))) + +#define _mm512_minmax_ph(A, B, C) \ + ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_minmax_ph(W, U, A, B, C) \ + ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \ + (__v32hf)(__m512h)(W), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_minmax_ph(U, A, B, C) \ + ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_minmax_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) + +#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \ + (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_minmax_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +#define _mm512_minmax_ps(A, B, C) \ + ((__m512)__builtin_ia32_vminmaxps512_round_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_minmax_ps(W, U, A, B, C) \ + ((__m512)__builtin_ia32_vminmaxps512_round_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_minmax_ps(U, A, B, C) \ + ((__m512)__builtin_ia32_vminmaxps512_round_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_minmax_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_vminmaxps512_round_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R))) + +#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \ + ((__m512)__builtin_ia32_vminmaxps512_round_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_minmax_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_vminmaxps512_round_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R))) +#endif // __AVX10_2_512MINMAXINTRIN_H diff --git a/lib/include/avx10_2_512niintrin.h b/lib/include/avx10_2_512niintrin.h new file mode 100644 index 0000000000..7e614f7740 --- /dev/null +++ b/lib/include/avx10_2_512niintrin.h @@ -0,0 +1,314 @@ +/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2_512NIINTRIN_H +#define __AVX10_2_512NIINTRIN_H + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +/* VNNI FP16 */ +static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_dpph_ps(__m512 __W, + __m512h __A, + __m512h __B) { + return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A, + (__v32hf)__B); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_dpph_ps(__m512 __W, + __mmask16 __U, + __m512h __A, + __m512h __B) { + return (__m512)__builtin_ia32_selectps_512( + (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U, + __m512 __W, + __m512h __A, + __m512h __B) { + return (__m512)__builtin_ia32_selectps_512( + (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +/* VMPSADBW */ +#define _mm512_mpsadbw_epu8(A, B, imm) \ + ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(imm))) + +#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectw_512( \ + (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \ + (__v32hi)(__m512i)(W))) + +#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectw_512( \ + (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \ + (__v32hi)_mm512_setzero_si512())) + +/* VNNI INT8 */ +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32( + __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return 
(__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32( + __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W, + __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32( + __m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32( + __mmask16 __U, __m512i __W, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512( + __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +/* VNNI INT16 */ +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ 
__m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A, + __m512i __B, + __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32( + __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +#undef __DEFAULT_FN_ATTRS 
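/* Editorial usage sketch; not part of the upstream header being vendored here.
 * The VNNI INT8/INT16 intrinsics above share one pattern: the unmasked form
 * calls the __builtin_ia32_vpdp* builtin, which adds a per-lane dot product of
 * byte or word elements into each 32-bit accumulator lane, and the
 * _mask/_maskz forms merge that result with the passthrough or a zero vector
 * via __builtin_ia32_selectd_512. The helper below is a hypothetical example
 * (the name __example_dot_accumulate_s8 is not defined anywhere else); its
 * attribute list simply repeats what __DEFAULT_FN_ATTRS expanded to before the
 * #undef above.
 */
static __inline__ __m512i __attribute__((__always_inline__, __nodebug__,
                                          __target__("avx10.2-512"),
                                          __min_vector_width__(512)))
__example_dot_accumulate_s8(__m512i __acc, __mmask16 __m, __m512i __a,
                            __m512i __b) {
  /* Lanes selected by __m become __acc plus the signed-byte dot product of
   * __a and __b; unselected lanes keep their previous __acc value. */
  return _mm512_mask_dpbssd_epi32(__acc, __m, __a, __b);
}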
+ +#endif /* __SSE2__ */ +#endif /* __AVX10_2_512NIINTRIN_H */ diff --git a/lib/include/avx10_2_512satcvtdsintrin.h b/lib/include/avx10_2_512satcvtdsintrin.h new file mode 100644 index 0000000000..5970ab0331 --- /dev/null +++ b/lib/include/avx10_2_512satcvtdsintrin.h @@ -0,0 +1,303 @@ +/*===----- avx10_2_512satcvtdsintrin.h - AVX10_2_512SATCVTDS intrinsics ----=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __AVX10_2_512SATCVTDSINTRIN_H +#define __AVX10_2_512SATCVTDSINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \ + __min_vector_width__(512))) + +// 512 bit : Double -> Int +static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvttspd_epi32(__m512d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm512_mask_cvttspd_epi32(__m256i __W, __mmask8 __U, __m512d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( + (__v8df)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm512_maskz_cvttspd_epi32(__mmask8 __U, __m512d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm512_cvtts_roundpd_epi32(__A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8si)_mm256_undefined_si256(), \ + (__mmask8) - 1, (const int)(__R))) + +#define _mm512_mask_cvtts_roundpd_epi32(__W, __U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8si)(__m256i)(__W), (__mmask8)(__U), \ + (const int)(__R))) + +#define _mm512_maskz_cvtts_roundpd_epi32(__U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8si)_mm256_setzero_si256(), (__mmask8)(__U), \ + (const int)(__R))) + +// 512 bit : Double -> uInt +static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvttspd_epu32(__m512d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm512_mask_cvttspd_epu32(__m256i __W, __mmask8 __U, __m512d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( + (__v8df)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm512_maskz_cvttspd_epu32(__mmask8 __U, __m512d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm512_cvtts_roundpd_epu32(__A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8si)_mm256_undefined_si256(), \ + (__mmask8) - 1, (const int)(__R))) + +#define _mm512_mask_cvtts_roundpd_epu32(__W, __U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \ + (__v8df)(__m512d)(__A), 
(__v8si)(__m256i)(__W), (__mmask8)(__U), \ + (const int)(__R))) + +#define _mm512_maskz_cvtts_roundpd_epu32(__U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8si)_mm256_setzero_si256(), (__mmask8)(__U), \ + (const int)(__R))) + +// 512 bit : Double -> Long + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttspd_epi64(__m512d __A) { + return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( + (__v8df)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_cvttspd_epi64(__m512i __W, __mmask8 __U, __m512d __A) { + return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( + (__v8df)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_cvttspd_epi64(__mmask8 __U, __m512d __A) { + return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm512_cvtts_roundpd_epi64(__A, __R) \ + ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8di)_mm512_undefined_epi32(), \ + (__mmask8) - 1, (const int)(__R))) + +#define _mm512_mask_cvtts_roundpd_epi64(__W, __U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \ + (const int)(__R))) + +#define _mm512_maskz_cvtts_roundpd_epi64(__U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \ + (const int)(__R))) + +// 512 bit : Double -> ULong + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttspd_epu64(__m512d __A) { + return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( + (__v8df)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_cvttspd_epu64(__m512i __W, __mmask8 __U, __m512d __A) { + return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( + (__v8df)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_cvttspd_epu64(__mmask8 __U, __m512d __A) { + return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm512_cvtts_roundpd_epu64(__A, __R) \ + ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8di)_mm512_undefined_epi32(), \ + (__mmask8) - 1, (const int)(__R))) + +#define _mm512_mask_cvtts_roundpd_epu64(__W, __U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \ + (const int)(__R))) + +#define _mm512_maskz_cvtts_roundpd_epu64(__U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \ + (__v8df)(__m512d)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \ + (const int)(__R))) + +// 512 bit: Float -> int +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epi32(__m512 __A) { + return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( + (__v16sf)(__A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_cvttsps_epi32(__m512i __W, __mmask16 __U, __m512 __A) { + return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( + (__v16sf)(__A), 
(__v16si)(__W), __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_cvttsps_epi32(__mmask16 __U, __m512 __A) { + return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( + (__v16sf)(__A), (__v16si)_mm512_setzero_si512(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm512_cvtts_roundps_epi32(__A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \ + (__v16sf)(__m512)(__A), (__v16si)_mm512_undefined_epi32(), \ + (__mmask16) - 1, (const int)(__R))) + +#define _mm512_mask_cvtts_roundps_epi32(__W, __U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \ + (__v16sf)(__m512)(__A), (__v16si)(__m512i)(__W), (__mmask16)(__U), \ + (const int)(__R))) + +#define _mm512_maskz_cvtts_roundps_epi32(__U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \ + (__v16sf)(__m512)(__A), (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(__U), (const int)(__R))) + +// 512 bit: Float -> uint +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epu32(__m512 __A) { + return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( + (__v16sf)(__A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_cvttsps_epu32(__m512i __W, __mmask16 __U, __m512 __A) { + return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( + (__v16sf)(__A), (__v16si)(__W), __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_cvttsps_epu32(__mmask16 __U, __m512 __A) { + return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( + (__v16sf)(__A), (__v16si)_mm512_setzero_si512(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm512_cvtts_roundps_epu32(__A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \ + (__v16sf)(__m512)(__A), (__v16si)_mm512_undefined_epi32(), \ + (__mmask16) - 1, (const int)(__R))) + +#define _mm512_mask_cvtts_roundps_epu32(__W, __U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \ + (__v16sf)(__m512)(__A), (__v16si)(__m512i)(__W), (__mmask16)(__U), \ + (const int)(__R))) + +#define _mm512_maskz_cvtts_roundps_epu32(__U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \ + (__v16sf)(__m512)(__A), (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(__U), (const int)(__R))) + +// 512 bit : float -> long +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epi64(__m256 __A) { + return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( + (__v8sf)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_cvttsps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { + return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( + (__v8sf)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_cvttsps_epi64(__mmask8 __U, __m256 __A) { + return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm512_cvtts_roundps_epi64(__A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \ + (__v8sf)(__m256)(__A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \ + (const int)(__R))) + +#define _mm512_mask_cvtts_roundps_epi64(__W, __U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \ + (__v8sf)(__m256)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \ + (const int)(__R))) + +#define 
_mm512_maskz_cvtts_roundps_epi64(__U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \ + (__v8sf)(__m256)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \ + (const int)(__R))) + +// 512 bit : float -> ulong +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epu64(__m256 __A) { + return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( + (__v8sf)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_cvttsps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { + return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( + (__v8sf)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_cvttsps_epu64(__mmask8 __U, __m256 __A) { + return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm512_cvtts_roundps_epu64(__A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \ + (__v8sf)(__m256)(__A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \ + (const int)(__R))) + +#define _mm512_mask_cvtts_roundps_epu64(__W, __U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \ + (__v8sf)(__m256)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \ + (const int)(__R))) + +#define _mm512_maskz_cvtts_roundps_epu64(__U, __A, __R) \ + ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \ + (__v8sf)(__m256)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \ + (const int)(__R))) + +#undef __DEFAULT_FN_ATTRS +#endif // __AVX10_2_512SATCVTDSINTRIN_H diff --git a/lib/include/avx10_2_512satcvtintrin.h b/lib/include/avx10_2_512satcvtintrin.h new file mode 100644 index 0000000000..7f41deb521 --- /dev/null +++ b/lib/include/avx10_2_512satcvtintrin.h @@ -0,0 +1,301 @@ +/*===------ avx10_2_512satcvtintrin.h - AVX10_2_512SATCVT intrinsics -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." 
+#endif // __IMMINTRIN_H + +#ifndef __AVX10_2_512SATCVTINTRIN_H +#define __AVX10_2_512SATCVTINTRIN_H + +#define _mm512_ipcvtbf16_epi8(A) \ + ((__m512i)__builtin_ia32_vcvtbf162ibs512((__v32bf)(__m512bh)(A))) + +#define _mm512_mask_ipcvtbf16_epi8(W, U, A) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_ipcvtbf16_epi8(A), \ + (__v32hi)(__m512i)(W))) + +#define _mm512_maskz_ipcvtbf16_epi8(U, A) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_ipcvtbf16_epi8(A), \ + (__v32hi)_mm512_setzero_si512())) + +#define _mm512_ipcvtbf16_epu8(A) \ + ((__m512i)__builtin_ia32_vcvtbf162iubs512((__v32bf)(__m512bh)(A))) + +#define _mm512_mask_ipcvtbf16_epu8(W, U, A) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_ipcvtbf16_epu8(A), \ + (__v32hi)(__m512i)(W))) + +#define _mm512_maskz_ipcvtbf16_epu8(U, A) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_ipcvtbf16_epu8(A), \ + (__v32hi)_mm512_setzero_si512())) + +#define _mm512_ipcvttbf16_epi8(A) \ + ((__m512i)__builtin_ia32_vcvttbf162ibs512((__v32bf)(__m512bh)(A))) + +#define _mm512_mask_ipcvttbf16_epi8(W, U, A) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_ipcvttbf16_epi8(A), \ + (__v32hi)(__m512i)(W))) + +#define _mm512_maskz_ipcvttbf16_epi8(U, A) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_ipcvttbf16_epi8(A), \ + (__v32hi)_mm512_setzero_si512())) + +#define _mm512_ipcvttbf16_epu8(A) \ + ((__m512i)__builtin_ia32_vcvttbf162iubs512((__v32bf)(__m512bh)(A))) + +#define _mm512_mask_ipcvttbf16_epu8(W, U, A) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_ipcvttbf16_epu8(A), \ + (__v32hi)(__m512i)(W))) + +#define _mm512_maskz_ipcvttbf16_epu8(U, A) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_ipcvttbf16_epu8(A), \ + (__v32hi)_mm512_setzero_si512())) + +#define _mm512_ipcvtph_epi8(A) \ + ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_ipcvtph_epi8(W, U, A) \ + ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \ + (__v32hu)(W), (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_ipcvtph_epi8(U, A) \ + ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_ipcvt_roundph_epi8(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \ + (__v32hu)_mm512_setzero_si512(), \ + (__mmask32)-1, (const int)R)) + +#define _mm512_mask_ipcvt_roundph_epi8(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), (const int)R)) + +#define _mm512_maskz_ipcvt_roundph_epi8(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \ + (__v32hu)_mm512_setzero_si512(), \ + (__mmask32)(U), (const int)R)) + +#define _mm512_ipcvtph_epu8(A) \ + ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_ipcvtph_epu8(W, U, A) \ + ((__m512i)__builtin_ia32_vcvtph2iubs512_mask((__v32hf)(__m512h)(A), \ + (__v32hu)(W), (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_ipcvtph_epu8(U, A) \ + ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \ + (__v32hf)(__m512h)(A), 
(__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_ipcvt_roundph_epu8(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \ + (const int)R)) + +#define _mm512_mask_ipcvt_roundph_epu8(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), (const int)R)) + +#define _mm512_maskz_ipcvt_roundph_epu8(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \ + (const int)R)) + +#define _mm512_ipcvtps_epi8(A) \ + ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \ + (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_ipcvtps_epi8(W, U, A) \ + ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \ + (__v16su)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_ipcvtps_epi8(U, A) \ + ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \ + (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_ipcvt_roundps_epi8(A, R) \ + ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \ + (__v16su)_mm512_setzero_si512(), \ + (__mmask16)-1, (const int)R)) + +#define _mm512_mask_ipcvt_roundps_epi8(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \ + (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), (const int)R)) + +#define _mm512_maskz_ipcvt_roundps_epi8(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \ + (__v16su)_mm512_setzero_si512(), \ + (__mmask16)(U), (const int)R)) + +#define _mm512_ipcvtps_epu8(A) \ + ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \ + (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_ipcvtps_epu8(W, U, A) \ + ((__m512i)__builtin_ia32_vcvtps2iubs512_mask((__v16sf)(__m512)(A), \ + (__v16su)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_ipcvtps_epu8(U, A) \ + ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \ + (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_ipcvt_roundps_epu8(A, R) \ + ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \ + (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \ + (const int)R)) + +#define _mm512_mask_ipcvt_roundps_epu8(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \ + (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), (const int)R)) + +#define _mm512_maskz_ipcvt_roundps_epu8(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \ + (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \ + (const int)R)) + +#define _mm512_ipcvttph_epi8(A) \ + ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_ipcvttph_epi8(W, U, A) \ + ((__m512i)__builtin_ia32_vcvttph2ibs512_mask((__v32hf)(__m512h)(A), \ + (__v32hu)(W), (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_ipcvttph_epi8(U, A) \ + ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_ipcvtt_roundph_epi8(A, S) \ + ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \ 
+ (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \ + S)) + +#define _mm512_mask_ipcvtt_roundph_epi8(W, U, A, S) \ + ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), S)) + +#define _mm512_maskz_ipcvtt_roundph_epi8(U, A, S) \ + ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \ + S)) + +#define _mm512_ipcvttph_epu8(A) \ + ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_ipcvttph_epu8(W, U, A) \ + ((__m512i)__builtin_ia32_vcvttph2iubs512_mask((__v32hf)(__m512h)(A), \ + (__v32hu)(W), (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_ipcvttph_epu8(U, A) \ + ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_ipcvtt_roundph_epu8(A, S) \ + ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \ + S)) + +#define _mm512_mask_ipcvtt_roundph_epu8(W, U, A, S) \ + ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), S)) + +#define _mm512_maskz_ipcvtt_roundph_epu8(U, A, S) \ + ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \ + (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \ + S)) + +#define _mm512_ipcvttps_epi8(A) \ + ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_ipcvttps_epi8(W, U, A) \ + ((__m512i)__builtin_ia32_vcvttps2ibs512_mask((__v16sf)(__m512h)(A), \ + (__v16su)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_ipcvttps_epi8(U, A) \ + ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_ipcvtt_roundps_epi8(A, S) \ + ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \ + S)) + +#define _mm512_mask_ipcvtt_roundps_epi8(W, U, A, S) \ + ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)(W), (__mmask16)(U), S)) + +#define _mm512_maskz_ipcvtt_roundps_epi8(U, A, S) \ + ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \ + S)) + +#define _mm512_ipcvttps_epu8(A) \ + ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_ipcvttps_epu8(W, U, A) \ + ((__m512i)__builtin_ia32_vcvttps2iubs512_mask((__v16sf)(__m512h)(A), \ + (__v16su)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_ipcvttps_epu8(U, A) \ + ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_ipcvtt_roundps_epu8(A, S) \ + ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \ + S)) + +#define _mm512_mask_ipcvtt_roundps_epu8(W, U, A, S) \ + ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)(W), 
(__mmask16)(U), S)) + +#define _mm512_maskz_ipcvtt_roundps_epu8(U, A, S) \ + ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \ + (__v16sf)(__m512h)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \ + S)) + +#endif // __AVX10_2_512SATCVTINTRIN_H diff --git a/lib/include/avx10_2bf16intrin.h b/lib/include/avx10_2bf16intrin.h new file mode 100644 index 0000000000..199cc13ff7 --- /dev/null +++ b/lib/include/avx10_2bf16intrin.h @@ -0,0 +1,1085 @@ +/*===-------------- avx10_2bf16intrin.h - AVX10-BF16 intrinsics ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2BF16INTRIN_H +#define __AVX10_2BF16INTRIN_H + +typedef __bf16 __m128bh_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) { + return __builtin_bit_cast(__m256bh, _mm256_setzero_ps()); +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_setzero_pbh(void) { + return __builtin_bit_cast(__m128bh, _mm_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castbf16_ps(__m128bh __a) { + return (__m128)__a; +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_castbf16_ps(__m256bh __a) { + return (__m256)__a; +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_castbf16_pd(__m256bh __a) { + return (__m256d)__a; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castbf16_pd(__m128bh __a) { + return (__m128d)__a; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_castbf16_si128(__m128bh __a) { + return (__m128i)__a; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_castbf16_si256(__m256bh __a) { + return (__m256i)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castps_pbh(__m128 __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_castps_pbh(__m256 __a) { + return (__m256bh)__a; +} + +static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtsbh_bf16(__m128bh __a) { + return __a[0]; +} + +static __inline__ __bf16 __DEFAULT_FN_ATTRS256 +_mm256_cvtsbh_bf16(__m256bh __a) { + return __a[0]; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castpd_pbh(__m128d __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_castpd_pbh(__m256d __a) { + return (__m256bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_castsi128_pbh(__m128i __a) { + return (__m128bh)__a; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_castsi256_pbh(__m256i __a) { + return (__m256bh)__a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_castbf16256_pbh128(__m256bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 
+_mm256_castbf16128_pbh256(__m128bh __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_zextbf16128_pbh256(__m128bh __a) { + return __builtin_shufflevector(__a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_undefined_pbh(void) { + return (__m256bh)__builtin_ia32_undef256(); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_load_sbh(void const *__dp) { + __m128bh src = (__v8bf)_mm_setzero_pbh(); + return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__dp, src, + 1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_load_sbh(__m128bh __W, __mmask8 __U, const void *__A) { + __m128bh src = (__v8bf)__builtin_shufflevector( + (__v8bf)__W, (__v8bf)_mm_setzero_pbh(), 0, 8, 8, 8, 8, 8, 8, 8); + + return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__A, src, + __U & 1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_load_sbh(__mmask8 __U, const void *__A) { + return (__m128bh)__builtin_ia32_loadsbf16128_mask( + (const __v8bf *)__A, (__v8bf)_mm_setzero_pbh(), __U & 1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_load_pbh(void const *__p) { + return *(const __m256bh *)__p; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_load_pbh(void const *__p) { + return *(const __m128bh *)__p; +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m256bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_loadu_pbh(void const *__p) { + struct __loadu_pbh { + __m128bh_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pbh *)__p)->__v; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sbh(void *__dp, + __m128bh __a) { + struct __mm_store_sbh_struct { + __bf16 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_sbh_struct *)__dp)->__u = __a[0]; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sbh(void *__W, + __mmask8 __U, + __m128bh __A) { + __builtin_ia32_storesbf16128_mask((__v8bf *)__W, __A, __U & 1); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_pbh(void *__P, + __m256bh __A) { + *(__m256bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_pbh(void *__P, + __m128bh __A) { + *(__m128bh *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_pbh(void *__P, + __m256bh __A) { + struct __storeu_pbh { + __m256bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_pbh(void *__P, + __m128bh __A) { + struct __storeu_pbh { + __m128bh_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pbh *)__P)->__v = __A; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_move_sbh(__m128bh __a, + __m128bh __b) { + __a[0] = __b[0]; + return __a; +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), __W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + 
return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), + _mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_undefined_pbh(void) { + return (__m128bh)__builtin_ia32_undef128(); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_set_sbh(__bf16 bf) { + return (__v8bf)__builtin_shufflevector( + (__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}, (__v8bf)_mm_setzero_pbh(), 0, 8, + 8, 8, 8, 8, 8, 8); +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_set1_pbh(__bf16 bf) { + return (__m128bh)(__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set1_pbh(__bf16 bf) { + return (__m256bh)(__v16bf){bf, bf, bf, bf, bf, bf, bf, bf, + bf, bf, bf, bf, bf, bf, bf, bf}; +} + +static __inline __m128bh __DEFAULT_FN_ATTRS128 +_mm_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, + __bf16 bf6, __bf16 bf7, __bf16 bf8) { + return (__m128bh)(__v8bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8}; +} + +static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set_pbh( + __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6, + __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12, + __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) { + return (__m256bh)(__v16bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, + bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16}; +} + +#define _mm_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8) \ + _mm_set_pbh((bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), (bf1)) + +#define _mm256_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \ + bf11, bf12, bf13, bf14, bf15, bf16) \ + _mm256_set_pbh((bf16), (bf15), (bf14), (bf13), (bf12), (bf11), (bf10), \ + (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), \ + (bf1)) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_abs_pbh(__m256bh __A) { + return (__m256bh)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), + (__m256i)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_abs_pbh(__m128bh __A) { + return (__m128bh)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_blend_pbh(__mmask8 __U, __m128bh __A, __m128bh __W) { + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, (__v8bf)__W, + (__v8bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { + return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, (__v16bf)__W, + (__v16bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { + return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, + (__v8hi)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { + return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, + (__v16hi)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_permutexvar_pbh(__m128i __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_pbh(__m256i __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_add_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)((__v16bf)__A + (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 
+_mm256_mask_add_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_add_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_add_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_add_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A + (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_add_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_add_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_add_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_add_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sub_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)((__v16bf)__A - (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_sub_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_sub_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sub_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A - (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_sub_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_sub_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_sub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_sub_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mul_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)((__v16bf)__A * (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_mul_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_mul_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mul_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A * (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_mul_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_mul_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_mul_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + 
return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_mul_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_div_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)((__v16bf)__A / (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_div_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_div_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_div_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_div_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_div_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)((__v8bf)__A / (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_div_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_div_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_div_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_div_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_max_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)__builtin_ia32_vmaxbf16256((__v16bf)__A, (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_max_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_max_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)__builtin_ia32_vmaxbf16128((__v8bf)__A, (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_max_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_max_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_min_pbh(__m256bh __A, + __m256bh __B) { + return (__m256bh)__builtin_ia32_vminbf16256((__v16bf)__A, (__v16bf)__B); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_min_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_min_pbh(__m128bh __A, + __m128bh __B) { + 
return (__m128bh)__builtin_ia32_vminbf16128((__v8bf)__A, (__v8bf)__B); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_min_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_min_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomisbf16eq((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomisbf16lt((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomisbf16le((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomisbf16gt((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomisbf16ge((__v8bf)A, (__v8bf)B); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sbh(__m128bh A, + __m128bh B) { + return __builtin_ia32_vcomisbf16neq((__v8bf)A, (__v8bf)B); +} + +#define _mm256_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask16)__builtin_ia32_vcmpbf16256_mask((__v16bf)(__m256bh)(__A), \ + (__v16bf)(__m256bh)(__B), \ + (int)(__P), (__mmask16) - 1)) + +#define _mm256_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask16)__builtin_ia32_vcmpbf16256_mask((__v16bf)(__m256bh)(__A), \ + (__v16bf)(__m256bh)(__B), \ + (int)(__P), (__mmask16)(__U))) + +#define _mm_cmp_pbh_mask(__A, __B, __P) \ + ((__mmask8)__builtin_ia32_vcmpbf16128_mask((__v8bf)(__m128bh)(__A), \ + (__v8bf)(__m128bh)(__B), \ + (int)(__P), (__mmask8) - 1)) + +#define _mm_mask_cmp_pbh_mask(__U, __A, __B, __P) \ + ((__mmask8)__builtin_ia32_vcmpbf16128_mask((__v8bf)(__m128bh)(__A), \ + (__v8bf)(__m128bh)(__B), \ + (int)(__P), (__mmask8)(__U))) + +#define _mm256_mask_fpclass_pbh_mask(__U, __A, imm) \ + ((__mmask16)__builtin_ia32_vfpclassbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16)(__U))) + +#define _mm256_fpclass_pbh_mask(__A, imm) \ + ((__mmask16)__builtin_ia32_vfpclassbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16) - 1)) + +#define _mm_mask_fpclass_pbh_mask(__U, __A, imm) \ + ((__mmask8)__builtin_ia32_vfpclassbf16128_mask((__v8bf)(__m128bh)(__A), \ + (int)(imm), (__mmask8)(__U))) + +#define _mm_fpclass_pbh_mask(__A, imm) \ + ((__mmask8)__builtin_ia32_vfpclassbf16128_mask((__v8bf)(__m128bh)(__A), \ + (int)(imm), (__mmask8) - 1)) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_scalef_pbh(__m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefbf16256_mask( + (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_undefined_pbh(), + (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_scalef_pbh( + __m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefbf16256_mask( + (__v16bf)__A, (__v16bf)__B, (__v16bf)__W, (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_scalef_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) { + return (__m256bh)__builtin_ia32_vscalefbf16256_mask( + (__v16bf)__A, 
(__v16bf)__B, (__v16bf)_mm256_setzero_pbh(), + (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_scalef_pbh(__m128bh __A, + __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)__W, (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128bh)__builtin_ia32_vscalefbf16128_mask( + (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_rcp_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vrcpbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_rcp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrcpbf16256_mask((__v16bf)__A, (__v16bf)__W, + (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_rcp_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrcpbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rcp_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vrcpbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_rcp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrcpbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_rcp_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrcpbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_getexp_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexpbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_getexp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexpbf16256_mask( + (__v16bf)__A, (__v16bf)__W, (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_getexp_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vgetexpbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_getexp_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexpbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexpbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vgetexpbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_rsqrt_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), 
(__mmask16)-1); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_rsqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask((__v16bf)__A, (__v16bf)__W, + (__mmask16)__U); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_rsqrt_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask( + (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rsqrt_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_rsqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask((__v8bf)__A, (__v8bf)__W, + (__mmask8)__U); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask( + (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U); +} + +#define _mm256_reduce_pbh(__A, imm) \ + ((__m256bh)__builtin_ia32_vreducebf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_undefined_pbh(), \ + (__mmask16) - 1)) + +#define _mm256_mask_reduce_pbh(__W, __U, __A, imm) \ + ((__m256bh)__builtin_ia32_vreducebf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W), \ + (__mmask16)(__U))) + +#define _mm256_maskz_reduce_pbh(__U, __A, imm) \ + ((__m256bh)__builtin_ia32_vreducebf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16)(__U))) + +#define _mm_reduce_pbh(__A, imm) \ + ((__m128bh)__builtin_ia32_vreducebf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_undefined_pbh(), \ + (__mmask8) - 1)) + +#define _mm_mask_reduce_pbh(__W, __U, __A, imm) \ + ((__m128bh)__builtin_ia32_vreducebf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W), \ + (__mmask8)(__U))) + +#define _mm_maskz_reduce_pbh(__U, __A, imm) \ + ((__m128bh)__builtin_ia32_vreducebf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8)(__U))) + +#define _mm256_roundscale_pbh(__A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16) - 1)) + +#define _mm256_mask_roundscale_pbh(__W, __U, __A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W), \ + (__mmask16)(__U))) + +#define _mm256_maskz_roundscale_pbh(__U, __A, imm) \ + ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(), \ + (__mmask16)(__U))) + +#define _mm_roundscale_pbh(__A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8) - 1)) + +#define _mm_mask_roundscale_pbh(__W, __U, __A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W), \ + (__mmask8)(__U))) + +#define _mm_maskz_roundscale_pbh(__U, __A, imm) \ + ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(), \ + (__mmask8)(__U))) + +#define _mm256_getmant_pbh(__A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 
2) | (__B)), \ + (__v16bf)_mm256_undefined_pbh(), (__mmask16) - 1)) + +#define _mm256_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v16bf)(__m256bh)(__W), (__mmask16)(__U))) + +#define _mm256_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m256bh)__builtin_ia32_vgetmantbf16256_mask( \ + (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v16bf)_mm256_setzero_pbh(), (__mmask16)(__U))) + +#define _mm_getmant_pbh(__A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)_mm_undefined_pbh(), (__mmask8) - 1)) + +#define _mm_mask_getmant_pbh(__W, __U, __A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)(__m128bh)(__W), (__mmask8)(__U))) + +#define _mm_maskz_getmant_pbh(__U, __A, __B, __C) \ + ((__m128bh)__builtin_ia32_vgetmantbf16128_mask( \ + (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)), \ + (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U))) + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) { + return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_sqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, (__v16bf)_mm256_sqrt_pbh(__A), (__v16bf)__W); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) { + return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, + (__v16bf)_mm256_sqrt_pbh(__A), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) { + return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_sqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)__W); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_fmadd_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + 
return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, + -(__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsub_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fnmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_fnmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, + -(__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_pbh( + __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__A); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_pbh( + __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)__C); +} + +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pbh( + __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) { + return (__m256bh)__builtin_ia32_selectpbf_256( + (__mmask16)__U, + _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), + (__v16bf)_mm256_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmadd_pbh(__m128bh __A, + __m128bh __B, + __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, 
_mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsub_pbh(__m128bh __A, + __m128bh __B, + __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, + -(__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fmsub_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmadd_pbh(__m128bh __A, + __m128bh __B, + __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmsub_pbh(__m128bh __A, + __m128bh __B, + __m128bh __C) { + return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, + -(__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__A); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)__C); +} + +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, 
__m128bh __B, __m128bh __C) { + return (__m128bh)__builtin_ia32_selectpbf_128( + (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C), + (__v8bf)_mm_setzero_pbh()); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif +#endif diff --git a/lib/include/avx10_2convertintrin.h b/lib/include/avx10_2convertintrin.h new file mode 100644 index 0000000000..c67a5b890f --- /dev/null +++ b/lib/include/avx10_2convertintrin.h @@ -0,0 +1,590 @@ +/*===--------------- avx10_2convertintrin.h - AVX10_2CONVERT ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifdef __SSE2__ + +#ifndef __AVX10_2CONVERTINTRIN_H +#define __AVX10_2CONVERTINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(256))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtx2ps_ph(__m128 __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask( + (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)(-1)); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask( + (__v4sf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask( + (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A, + __m256 __B) { + return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask( + (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)(-1), + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) { + return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask( + (__v8sf)__A, (__v8sf)__B, (__v16hf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) { + return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask( + (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm256_cvtx_round2ps_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \ + (__v8sf)(A), (__v8sf)(B), (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (const int)(R))) + +#define _mm256_mask_cvtx_round2ps_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \ + (__v8sf)(A), (__v8sf)(B), (__v16hf)(W), (__mmask16)(U), (const int)(R))) + +#define _mm256_maskz_cvtx_round2ps_ph(U, A, B, R) \ + ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \ + (__v8sf)(A), (__v8sf)(B), (__v16hf)(_mm256_setzero_ph()), \ + (__mmask16)(U), (const int)(R))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
_mm_cvtbiasph_bf8(__m128i __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbiasph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbiasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtbiasph_bf8(__m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), + (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_bf8( + __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbiasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtbiassph_bf8(__m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbiassph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbiassph_bf8(__mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtbiassph_bf8(__m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), + (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_bf8( + __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbiassph_bf8(__mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtbiasph_hf8(__m128i __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbiasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + 
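A minimal usage sketch of the mask/merge convention these AVX10.2 conversion intrinsics follow, assuming a toolchain flag equivalent to -mavx10.2-256 and hardware with AVX10.2 support; the wrapper name and mask value are illustrative only and not part of the vendored header:

    #include <immintrin.h>

    /* Convert two __m128 vectors (4 floats each) into one __m128h of 8 FP16
       lanes. The _mm_mask_* form merges: result lanes whose mask bit is 0
       keep the corresponding lane of `src`; the _mm_maskz_* form would zero
       those lanes instead. */
    static __m128h cvt_low_four(__m128h src, __m128 a, __m128 b) {
      return _mm_mask_cvtx2ps_ph(src, (__mmask8)0x0F, a, b);
    }

The same __W/__U merge-versus-zero pairing recurs in virtually every _mask_/_maskz_ intrinsic added by this patch.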
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbiasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtbiasph_hf8(__m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), + (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_hf8( + __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbiasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtbiassph_hf8(__m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbiassph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbiassph_hf8(__mmask8 __U, __m128i __A, __m128h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask( + (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtbiassph_hf8(__m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(), + (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_hf8( + __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbiassph_hf8(__mmask16 __U, __m256i __A, __m256h __B) { + return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask( + (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(), + (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_bf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvt2ph2bf8_128((__v8hf)(__A), (__v8hf)(__B)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvt2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvt2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B), + (__v16qi)(__m128i)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_bf8(__m256h __A, + __m256h __B) { + return (__m256i)__builtin_ia32_vcvt2ph2bf8_256((__v16hf)(__A), + (__v16hf)(__B)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_cvt2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvt2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), + (__v32qi)(__m256i)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts2ph_bf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvt2ph2bf8s_128((__v8hf)(__A), (__v8hf)(__B)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvts2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvts2ph_bf8(__A, __B), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvts2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvts2ph_bf8(__A, __B), + (__v16qi)(__m128i)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvts2ph_bf8(__m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_vcvt2ph2bf8s_256((__v16hf)(__A), + (__v16hf)(__B)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvts2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvts2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B), + (__v32qi)(__m256i)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_hf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvt2ph2hf8_128((__v8hf)(__A), (__v8hf)(__B)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvt2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvt2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B), + (__v16qi)(__m128i)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_hf8(__m256h __A, + __m256h __B) { + return (__m256i)__builtin_ia32_vcvt2ph2hf8_256((__v16hf)(__A), + (__v16hf)(__B)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvt2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvt2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), + (__v32qi)(__m256i)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts2ph_hf8(__m128h __A, + __m128h __B) { + return (__m128i)__builtin_ia32_vcvt2ph2hf8s_128((__v8hf)(__A), (__v8hf)(__B)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvts2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, 
__m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvts2ph_hf8(__A, __B), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvts2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_cvts2ph_hf8(__A, __B), + (__v16qi)(__m128i)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvts2ph_hf8(__m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_vcvt2ph2hf8s_256((__v16hf)(__A), + (__v16hf)(__B)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvts2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask16)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B), + (__v32qi)(__m256i)_mm256_setzero_si256()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8(__m128i __A) { + return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( + (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvthf8(__m128h __W, + __mmask8 __U, + __m128i __A) { + return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( + (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvthf8(__mmask8 __U, + __m128i __A) { + return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( + (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8(__m128i __A) { + return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( + (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvthf8(__m256h __W, __mmask16 __U, __m128i __A) { + return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( + (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvthf8(__mmask16 __U, __m128i __A) { + return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( + (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_bf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_bf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_bf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtph_bf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_bf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i 
__DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_bf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsph_bf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsph_bf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsph_bf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsph_bf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsph_bf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtsph_bf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_hf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_hf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_hf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtph_hf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_hf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_hf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsph_hf8(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsph_hf8(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsph_hf8(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask( + (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U); 
+} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsph_hf8(__m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsph_hf8(__m128i __W, __mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtsph_hf8(__mmask16 __U, __m256h __A) { + return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask( + (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtbf8_ph(__m128i __A) { + return _mm_castsi128_ph(_mm_slli_epi16(_mm_cvtepi8_epi16(__A), 8)); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtbf8_ph(__m128h __S, __mmask8 __U, __m128i __A) { + return _mm_castsi128_ph( + _mm_mask_slli_epi16((__m128i)__S, __U, _mm_cvtepi8_epi16(__A), 8)); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtbf8_ph(__mmask8 __U, __m128i __A) { + return _mm_castsi128_ph(_mm_slli_epi16(_mm_maskz_cvtepi8_epi16(__U, __A), 8)); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtbf8_ph(__m128i __A) { + return _mm256_castsi256_ph(_mm256_slli_epi16(_mm256_cvtepi8_epi16(__A), 8)); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtbf8_ph(__m256h __S, __mmask16 __U, __m128i __A) { + return _mm256_castsi256_ph( + _mm256_mask_slli_epi16((__m256i)__S, __U, _mm256_cvtepi8_epi16(__A), 8)); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtbf8_ph(__mmask16 __U, __m128i __A) { + return _mm256_castsi256_ph( + _mm256_slli_epi16(_mm256_maskz_cvtepi8_epi16(__U, __A), 8)); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif // __AVX10_2CONVERTINTRIN_H +#endif // __SSE2__ diff --git a/lib/include/avx10_2copyintrin.h b/lib/include/avx10_2copyintrin.h new file mode 100644 index 0000000000..76b8f8ced5 --- /dev/null +++ b/lib/include/avx10_2copyintrin.h @@ -0,0 +1,66 @@ +/*===---- avx10_2copyintrin.h - AVX10.2 Copy intrinsics -------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AVX10_2COPYINTRIN_H +#define __AVX10_2COPYINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) + +/// Constructs a 128-bit integer vector, setting the lower 32 bits to the +/// lower 32 bits of the parameter \a __A; the upper bits are zeoroed. +/// +/// \code{.operation} +/// result[31:0] := __A[31:0] +/// result[MAX:32] := 0 +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVD instruction. +/// +/// \param __A +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector. The lower 32 bits are copied from the +/// parameter \a __A; the upper bits are zeroed. 
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi32(__m128i __A) { + return (__m128i)__builtin_shufflevector( + (__v4si)__A, (__v4si)_mm_setzero_si128(), 0, 4, 4, 4); +} + +/// Constructs a 128-bit integer vector, setting the lower 16 bits to the +/// lower 16 bits of the parameter \a __A; the upper bits are zeoroed. +/// +/// \code{.operation} +/// result[15:0] := __A[15:0] +/// result[MAX:16] := 0 +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVW instruction. +/// +/// \param __A +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector. The lower 16 bits are copied from the +/// parameter \a __A; the upper bits are zeroed. +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi16(__m128i __A) { + return (__m128i)__builtin_shufflevector( + (__v8hi)__A, (__v8hi)_mm_setzero_si128(), 0, 8, 8, 8, 8, 8, 8, 8); +} + +#undef __DEFAULT_FN_ATTRS128 + +#endif // __AVX10_2COPYINTRIN_H diff --git a/lib/include/avx10_2minmaxintrin.h b/lib/include/avx10_2minmaxintrin.h new file mode 100644 index 0000000000..8164d49d89 --- /dev/null +++ b/lib/include/avx10_2minmaxintrin.h @@ -0,0 +1,277 @@ +/*===-------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics -------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AVX10_2MINMAXINTRIN_H +#define __AVX10_2MINMAXINTRIN_H + +#define _mm_minmax_pbh(A, B, C) \ + ((__m128bh)__builtin_ia32_vminmaxbf16128((__m128bh)(__v8bf)(A), \ + (__m128bh)(__v8bf)(B), (int)(C))) + +#define _mm_mask_minmax_pbh(W, U, A, B, C) \ + ((__m128bh)__builtin_ia32_selectpbf_128( \ + (__mmask8)(U), \ + (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \ + (int)(C)), \ + (__v8bf)(W))) + +#define _mm_maskz_minmax_pbh(U, A, B, C) \ + ((__m128bh)__builtin_ia32_selectpbf_128( \ + (__mmask8)(U), \ + (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \ + (int)(C)), \ + (__v8bf) __builtin_bit_cast(__m128bh, _mm_setzero_ps()))) + +#define _mm256_minmax_pbh(A, B, C) \ + ((__m256bh)__builtin_ia32_vminmaxbf16256((__m256bh)(__v16bf)(A), \ + (__m256bh)(__v16bf)(B), (int)(C))) + +#define _mm256_mask_minmax_pbh(W, U, A, B, C) \ + ((__m256bh)__builtin_ia32_selectpbf_256( \ + (__mmask16)(U), \ + (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \ + (__m256bh)(__v16bf)(B), (int)(C)), \ + (__v16bf)(W))) + +#define _mm256_maskz_minmax_pbh(U, A, B, C) \ + ((__m256bh)__builtin_ia32_selectpbf_256( \ + (__mmask16)(U), \ + (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \ + (__m256bh)(__v16bf)(B), (int)(C)), \ + (__v16bf) __builtin_bit_cast(__m256bh, _mm256_setzero_ps()))) + +#define _mm_minmax_pd(A, B, C) \ + ((__m128d)__builtin_ia32_vminmaxpd128_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)-1)) + +#define _mm_mask_minmax_pd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_vminmaxpd128_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_minmax_pd(U, A, B, C) \ + ((__m128d)__builtin_ia32_vminmaxpd128_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)(U))) + +#define 
_mm256_minmax_pd(A, B, C) \ + ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)-1, _MM_FROUND_NO_EXC)) + +#define _mm256_mask_minmax_pd(W, U, A, B, C) \ + ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), (__mmask8)(U), _MM_FROUND_NO_EXC)) + +#define _mm256_maskz_minmax_pd(U, A, B, C) \ + ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)(U), _MM_FROUND_NO_EXC)) + +#define _mm256_minmax_round_pd(A, B, C, R) \ + ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_minmax_round_pd(W, U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_minmax_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R))) + +#define _mm_minmax_ph(A, B, C) \ + ((__m128h)__builtin_ia32_vminmaxph128_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)-1)) + +#define _mm_mask_minmax_ph(W, U, A, B, C) \ + ((__m128h)__builtin_ia32_vminmaxph128_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)(__m128h)(W), (__mmask16)-1)) + +#define _mm_maskz_minmax_ph(U, A, B, C) \ + ((__m128h)__builtin_ia32_vminmaxph128_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U))) + +#define _mm256_minmax_ph(A, B, C) \ + ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, _MM_FROUND_NO_EXC)) + +#define _mm256_mask_minmax_ph(W, U, A, B, C) \ + ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \ + (__v16hf)(__m256h)(W), (__mmask16)(U), _MM_FROUND_NO_EXC)) + +#define _mm256_maskz_minmax_ph(U, A, B, C) \ + ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), _MM_FROUND_NO_EXC)) + +#define _mm256_minmax_round_ph(A, B, C, R) \ + ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \ + (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R))) + +#define _mm256_mask_minmax_round_ph(W, U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (C), \ + (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_minmax_round_ph(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +#define _mm_minmax_ps(A, B, C) \ + ((__m128)__builtin_ia32_vminmaxps128_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) + +#define _mm_mask_minmax_ps(W, U, A, B, C) \ + ((__m128)__builtin_ia32_vminmaxps128_mask( \ + (__v4sf)(__m128)(A), 
(__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_minmax_ps(U, A, B, C) \ + ((__m128)__builtin_ia32_vminmaxps128_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) + +#define _mm256_minmax_ps(A, B, C) \ + ((__m256)__builtin_ia32_vminmaxps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, _MM_FROUND_NO_EXC)) + +#define _mm256_mask_minmax_ps(W, U, A, B, C) \ + ((__m256)__builtin_ia32_vminmaxps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \ + (__mmask8)(U), _MM_FROUND_NO_EXC)) + +#define _mm256_maskz_minmax_ps(U, A, B, C) \ + ((__m256)__builtin_ia32_vminmaxps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), _MM_FROUND_NO_EXC)) + +#define _mm256_minmax_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vminmaxps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_minmax_round_ps(W, U, A, B, C, R) \ + ((__m256)__builtin_ia32_vminmaxps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_minmax_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vminmaxps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R))) + +#define _mm_minmax_sd(A, B, C) \ + ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_minmax_sd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_minmax_sd(U, A, B, C) \ + ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_minmax_round_sd(A, B, C, R) \ + ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_undefined_pd(), (__mmask8)-1, (int)(R))) + +#define _mm_mask_minmax_round_sd(W, U, A, B, C, R) \ + ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_minmax_round_sd(U, A, B, C, R) \ + ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R))) + +#define _mm_minmax_sh(A, B, C) \ + ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)_mm_undefined_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_minmax_sh(W, U, A, B, C) \ + ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_minmax_sh(U, A, B, C) \ + ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define 
_mm_minmax_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) + +#define _mm_mask_minmax_round_sh(W, U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_minmax_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm_minmax_ss(A, B, C) \ + ((__m128)__builtin_ia32_vminmaxss_round_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_undefined_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_minmax_ss(W, U, A, B, C) \ + ((__m128)__builtin_ia32_vminmaxss_round_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_minmax_ss(U, A, B, C) \ + ((__m128)__builtin_ia32_vminmaxss_round_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_minmax_round_ss(A, B, C, R) \ + ((__m128)__builtin_ia32_vminmaxss_round_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_undefined_ps(), (__mmask8)-1, (int)(R))) + +#define _mm_mask_minmax_round_ss(W, U, A, B, C, R) \ + ((__m128)__builtin_ia32_vminmaxss_round_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_minmax_round_ss(U, A, B, C, R) \ + ((__m128)__builtin_ia32_vminmaxss_round_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R))) +#endif // __AVX10_2MINMAXINTRIN_H diff --git a/lib/include/avx10_2niintrin.h b/lib/include/avx10_2niintrin.h new file mode 100644 index 0000000000..c91a7b57c7 --- /dev/null +++ b/lib/include/avx10_2niintrin.h @@ -0,0 +1,2075 @@ +/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifdef __SSE2__ + +#ifndef __AVX10_2NIINTRIN_H +#define __AVX10_2NIINTRIN_H + +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(256))) + +/* VNNI FP16 */ +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W, + __m128h __A, + __m128h __B) { + return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A, + (__v8hf)__B); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U, + __m128 __W, + __m128h __A, + __m128h __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_dpph_ps(__W, __A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W, + __m256h __A, + __m256h __B) { + return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A, + (__v16hf)__B); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) { + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) { + return (__m256)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +/* VMPSADBW */ +#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm) \ + ((__m128i)__builtin_ia32_selectw_128( \ + (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \ + (__v8hi)(__m128i)(W))) + +#define _mm_maskz_mpsadbw_epu8(U, A, B, imm) \ + ((__m128i)__builtin_ia32_selectw_128( \ + (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectw_256( \ + (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \ + (__v16hi)(__m256i)(W))) + +#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectw_256( \ + (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)), \ + (__v16hi)_mm256_setzero_si256())) + +/* VNNI INT8 */ +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), + 
(__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32( + __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32( + __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbuud_epi32(__W, __A, 
__B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128( + __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32( + __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256( + __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +/* VNNI INT16 */ +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32( + __m256i __A, __mmask8 __U, __m256i __B, 
__m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32( + __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, 
__B, __C), (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32( + __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +/* YMM Rounding */ +#define _mm256_add_round_pd(A, B, R) \ + ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(R))) + +#define _mm256_mask_add_round_pd(W, U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_add_round_pd(U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_add_round_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \ + (__v16hf)(__m256h)(B), (int)(R))) + +#define _mm256_mask_add_round_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \ + (__v16hf)(__m256h)(W))) + +#define _mm256_maskz_add_round_ph(U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \ + (__v16hf)_mm256_setzero_ph())) + +#define _mm256_add_round_ps(A, B, R) \ + ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(R))) + +#define _mm256_mask_add_round_ps(W, U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_add_round_ps(U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \ + (__v8sf)_mm256_setzero_ps())) + +#define _mm256_cmp_round_pd_mask(A, B, P, R) \ + ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R) \ + ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cmp_round_ph_mask(A, B, P, R) \ + ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)-1, \ + (int)(R))) + +#define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R) \ + ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)(U), \ + (int)(R))) + +#define _mm256_cmp_round_ps_mask(A, B, P, R) \ + ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R) \ + ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)(U), \ + 
(int)(R))) + +#define _mm256_cvt_roundepi32_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \ + (__v8si)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm256_mask_cvt_roundepi32_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask((__v8si)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepi32_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \ + (__v8si)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundepi32_ps(A, R) \ + ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_cvt_roundepi32_ps(W, U, A, R) \ + ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \ + (__v8si)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepi32_ps(U, A, R) \ + ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundpd_epi32(A, R) \ + ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundpd_epi32(W, U, A, R) \ + ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundpd_epi32(U, A, R) \ + ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundpd_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \ + (__v4df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm256_mask_cvt_roundpd_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask((__v4df)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundpd_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \ + (__v4df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundpd_ps(A, R) \ + ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \ + (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_cvt_roundpd_ps(W, U, A, R) \ + ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \ + (__v4df)(__m256d)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundpd_ps(U, A, R) \ + ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask((__v4df)(__m256d)(A), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundpd_epi64(A, R) \ + ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundpd_epi64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundpd_epi64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundpd_epu32(A, R) \ + ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundpd_epu32(W, U, A, R) \ + ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R))) + 
+#define _mm256_maskz_cvt_roundpd_epu32(U, A, R) \ + ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundpd_epu64(A, R) \ + ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundpd_epu64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundpd_epu64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundph_epi32(A, R) \ + ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \ + (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm256_mask_cvt_roundph_epi32(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundph_epi32(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \ + (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundph_pd(A, R) \ + ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \ + (__v8hf)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)(-1), (int)(R))) + +#define _mm256_mask_cvt_roundph_pd(W, U, A, R) \ + ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask((__v8hf)(A), (__v4df)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundph_pd(U, A, R) \ + ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \ + (__v8hf)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvtx_roundph_ps(A, R) \ + ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \ + (__v8hf)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)(-1), (int)(R))) + +#define _mm256_mask_cvtx_roundph_ps(W, U, A, R) \ + ((__m256)__builtin_ia32_vcvtph2psx256_round_mask((__v8hf)(A), (__v8sf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtx_roundph_ps(U, A, R) \ + ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \ + (__v8hf)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundph_epi64(A, R) \ + ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \ + (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm256_mask_cvt_roundph_epi64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundph_epi64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \ + (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundph_epu32(A, R) \ + ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \ + (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm256_mask_cvt_roundph_epu32(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundph_epu32(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \ + (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundph_epu64(A, R) \ + ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \ + (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm256_mask_cvt_roundph_epu64(W, U, A, R) \ + 
((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundph_epu64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \ + (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundph_epu16(A, R) \ + ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \ + (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm256_mask_cvt_roundph_epu16(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask((__v16hf)(A), (__v16hu)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundph_epu16(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \ + (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \ + (int)(R))) + +#define _mm256_cvt_roundph_epi16(A, R) \ + ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \ + (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm256_mask_cvt_roundph_epi16(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundph_epi16(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \ + (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \ + (int)(R))) + +#define _mm256_cvt_roundps_epi32(A, R) \ + ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundps_epi32(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundps_epi32(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundps_pd(A, R) \ + ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \ + (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundps_pd(W, U, A, R) \ + ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \ + (__v4sf)(__m128)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundps_pd(U, A, R) \ + ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask( \ + (__v4sf)(__m128)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundps_ph(A, I) \ + ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ + (__v8hi)_mm_undefined_si128(), \ + (__mmask8)-1)) + +/* FIXME: We may use these way in future. 
+#define _mm256_cvt_roundps_ph(A, I) \ + ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \ + (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_undefined_si128(), \ + (__mmask8)-1)) +#define _mm256_mask_cvt_roundps_ph(U, W, A, I) \ + ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \ + (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)(U), (__mmask8)(W))) +#define _mm256_maskz_cvt_roundps_ph(W, A, I) \ + ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask( \ + (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(W))) */ + +#define _mm256_cvtx_roundps_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \ + (__v8sf)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm256_mask_cvtx_roundps_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask((__v8sf)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtx_roundps_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask( \ + (__v8sf)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundps_epi64(A, R) \ + ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundps_epi64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundps_epi64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundps_epu32(A, R) \ + ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundps_epu32(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundps_epu32(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundps_epu64(A, R) \ + ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundps_epu64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundps_epu64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundepi64_pd(A, R) \ + ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \ + (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundepi64_pd(W, U, A, R) \ + ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \ + (__v4di)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepi64_pd(U, A, R) \ + ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \ + (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundepi64_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \ + (__v4di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm256_mask_cvt_roundepi64_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask((__v4di)(A), 
(__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepi64_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \ + (__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundepi64_ps(A, R) \ + ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \ + (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_cvt_roundepi64_ps(W, U, A, R) \ + ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \ + (__v4di)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepi64_ps(U, A, R) \ + ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask((__v4di)(__m256i)(A), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_cvtt_roundpd_epi32(A, R) \ + ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R) \ + ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundpd_epi32(U, A, R) \ + ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundpd_epi64(A, R) \ + ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundpd_epi64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundpd_epu32(A, R) \ + ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R) \ + ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundpd_epu32(U, A, R) \ + ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundpd_epu64(A, R) \ + ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundpd_epu64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \ + (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundph_epi32(A, R) \ + ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \ + (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm256_mask_cvtt_roundph_epi32(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundph_epi32(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \ + (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) + +#define 
_mm256_cvtt_roundph_epi64(A, R) \ + ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \ + (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm256_mask_cvtt_roundph_epi64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundph_epi64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \ + (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvtt_roundph_epu32(A, R) \ + ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \ + (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm256_mask_cvtt_roundph_epu32(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundph_epu32(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \ + (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvtt_roundph_epu64(A, R) \ + ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \ + (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm256_mask_cvtt_roundph_epu64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundph_epu64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \ + (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvtt_roundph_epu16(A, R) \ + ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \ + (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm256_mask_cvtt_roundph_epu16(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \ + (__v16hf)(A), (__v16hu)(W), (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundph_epu16(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \ + (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundph_epi16(A, R) \ + ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \ + (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm256_mask_cvtt_roundph_epi16(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundph_epi16(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \ + (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundps_epi32(A, R) \ + ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvtt_roundps_epi32(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundps_epi32(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundps_epi64(A, R) \ + ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvtt_roundps_epi64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), 
(__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundps_epi64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundps_epu32(A, R) \ + ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvtt_roundps_epu32(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundps_epu32(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvtt_roundps_epu64(A, R) \ + ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvtt_roundps_epu64(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvtt_roundps_epu64(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \ + (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundepu32_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \ + (__v8su)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm256_mask_cvt_roundepu32_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask((__v8su)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepu32_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \ + (__v8su)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundepu32_ps(A, R) \ + ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \ + (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundepu32_ps(W, U, A, R) \ + ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \ + (__v8su)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepu32_ps(U, A, R) \ + ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \ + (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundepu64_pd(A, R) \ + ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \ + (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_cvt_roundepu64_pd(W, U, A, R) \ + ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \ + (__v4du)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepu64_pd(U, A, R) \ + ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \ + (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_cvt_roundepu64_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \ + (__v4du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm256_mask_cvt_roundepu64_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask((__v4du)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepu64_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \ + (__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundepu64_ps(A, R) \ + 
((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \ + (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_cvt_roundepu64_ps(W, U, A, R) \ + ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \ + (__v4du)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepu64_ps(U, A, R) \ + ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask((__v4du)(__m256i)(A), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_cvt_roundepu16_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \ + (__v16hu)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm256_mask_cvt_roundepu16_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask((__v16hu)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepu16_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \ + (__v16hu)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +#define _mm256_cvt_roundepi16_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \ + (__v16hi)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm256_mask_cvt_roundepi16_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask((__v16hi)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_cvt_roundepi16_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \ + (__v16hi)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +#define _mm256_div_round_pd(A, B, R) \ + ((__m256d)__builtin_ia32_vdivpd256_round((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(R))) + +#define _mm256_mask_div_round_pd(W, U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_div_round_pd(U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_div_round_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vdivph256_round((__v16hf)(__m256h)(A), \ + (__v16hf)(__m256h)(B), (int)(R))) + +#define _mm256_mask_div_round_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \ + (__v16hf)(__m256h)(W))) + +#define _mm256_maskz_div_round_ph(U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \ + (__v16hf)_mm256_setzero_ph())) + +#define _mm256_div_round_ps(A, B, R) \ + ((__m256)__builtin_ia32_vdivps256_round((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(R))) + +#define _mm256_mask_div_round_ps(W, U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_div_round_ps(U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \ + (__v8sf)_mm256_setzero_ps())) + +#define _mm256_fcmadd_round_pch(A, B, C, R) \ + ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fcmadd_round_pch(A, U, B, C, R) \ + ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fcmadd_round_pch(A, B, C, U, 
R) \ + ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fcmadd_round_pch(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vfcmaddcph256_round_maskz( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_cmul_round_pch(A, B, R) \ + ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \ + (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_cmul_round_pch(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_cmul_round_pch(U, A, B, R) \ + ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \ + (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm256_fixupimm_round_pd(A, B, C, imm, R) \ + ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \ + (int)(imm), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ + ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \ + (int)(imm), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \ + ((__m256d)__builtin_ia32_vfixupimmpd256_round_maskz( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \ + (int)(imm), (__mmask8)(U), (int)(R))) + +#define _mm256_fixupimm_round_ps(A, B, C, imm, R) \ + ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \ + (int)(imm), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R) \ + ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \ + (int)(imm), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ + ((__m256)__builtin_ia32_vfixupimmps256_round_maskz( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \ + (int)(imm), (__mmask8)(U), (int)(R))) + +#define _mm256_fmadd_round_pd(A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmadd_round_pd(A, U, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fmadd_round_pd(A, B, C, U, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fmadd_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fmsub_round_pd(A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmsub_round_pd(A, U, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), 
-(__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fmsub_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fnmadd_round_pd(A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ + -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \ + -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fnmadd_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \ + -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fnmsub_round_pd(A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ + -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \ + -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fmadd_round_ph(A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)-1, (int)(R))) + +#define _mm256_mask_fmadd_round_ph(A, U, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_mask3_fmadd_round_ph(A, B, C, U, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_fmadd_round_ph(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_fmsub_round_ph(A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \ + (__mmask16)-1, (int)(R))) + +#define _mm256_mask_fmsub_round_ph(A, U, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_fmsub_round_ph(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_fnmadd_round_ph(A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \ + (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)-1, (int)(R))) + +#define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \ + -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_fnmadd_round_ph(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \ + -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_fnmsub_round_ph(A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \ + (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), 
-(__v16hf)(__m256h)(C), \ + (__mmask16)-1, (int)(R))) + +#define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \ + -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_fmadd_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmadd_round_ps(A, U, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fmadd_round_ps(A, B, C, U, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fmadd_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fmsub_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmsub_round_ps(A, U, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fmsub_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fnmadd_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask( \ + (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \ + -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fnmadd_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \ + -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fnmsub_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask( \ + (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \ + -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fmadd_round_pch(A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmadd_round_pch(A, U, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddcph256_round_mask( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fmadd_round_pch(A, B, C, U, R) \ + ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fmadd_round_pch(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddcph256_round_maskz( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fmaddsub_round_pd(A, B, 
C, R) \ + ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fmaddsub_round_pd(A, B, C, U, R) \ + ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask3( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fmaddsub_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fmsubadd_round_pd(A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fmsubadd_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fmaddsub_round_ph(A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)-1, (int)(R))) + +#define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_mask3_fmaddsub_round_ph(A, B, C, U, R) \ + ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask3( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_fmaddsub_round_ph(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_fmsubadd_round_ph(A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \ + (__mmask16)-1, (int)(R))) + +#define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_fmsubadd_round_ph(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_fmaddsub_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fmaddsub_round_ps(A, B, C, U, R) \ + ((__m256)__builtin_ia32_vfmaddsubps256_round_mask3( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + 
+#define _mm256_maskz_fmaddsub_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_fmsubadd_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_fmsubadd_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) +#define _mm256_mask3_fmsub_round_pd(A, B, C, U, R) \ + ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fmsubadd_round_pd(A, B, C, U, R) \ + ((__m256d)__builtin_ia32_vfmsubaddpd256_round_mask3( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask_fnmadd_round_pd(A, U, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ + (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask_fnmsub_round_pd(A, U, B, C, R) \ + ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ + (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fnmsub_round_pd(A, B, C, U, R) \ + ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \ + -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fmsub_round_ph(A, B, C, U, R) \ + ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_mask3_fmsubadd_round_ph(A, B, C, U, R) \ + ((__m256h)__builtin_ia32_vfmsubaddph256_round_mask3( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_mask_fnmadd_round_ph(A, U, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \ + (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_mask_fnmsub_round_ph(A, U, B, C, R) \ + ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \ + (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_mask3_fnmsub_round_ph(A, B, C, U, R) \ + ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \ + -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_mask3_fmsub_round_ps(A, B, C, U, R) \ + ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fmsubadd_round_ps(A, B, C, U, R) \ + ((__m256)__builtin_ia32_vfmsubaddps256_round_mask3( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask_fnmadd_round_ps(A, U, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask( \ + (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), 
(int)(R))) + +#define _mm256_mask_fnmsub_round_ps(A, U, B, C, R) \ + ((__m256)__builtin_ia32_vfmaddps256_round_mask( \ + (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask3_fnmsub_round_ps(A, B, C, U, R) \ + ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \ + -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mul_round_pch(A, B, R) \ + ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \ + (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_mul_round_pch(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_mul_round_pch(U, A, B, R) \ + ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \ + (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \ + (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm256_getexp_round_pd(A, R) \ + ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_getexp_round_pd(W, U, A, R) \ + ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_getexp_round_pd(U, A, R) \ + ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_getexp_round_ph(A, R) \ + ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, \ + (int)(R))) + +#define _mm256_mask_getexp_round_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_getexp_round_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), \ + (int)(R))) + +#define _mm256_getexp_round_ps(A, R) \ + ((__m256)__builtin_ia32_vgetexpps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, \ + (int)(R))) + +#define _mm256_mask_getexp_round_ps(W, U, A, R) \ + ((__m256)__builtin_ia32_vgetexpps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_getexp_round_ps(U, A, R) \ + ((__m256)__builtin_ia32_vgetexpps256_round_mask((__v8sf)(__m256)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_getmant_round_pd(A, B, C, R) \ + ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \ + (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \ + (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_getmant_round_pd(W, U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \ + (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_getmant_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \ + (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R))) + +#define _mm256_getmant_round_ph(A, B, C, R) \ + ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ + (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, 
(int)(R))) + +#define _mm256_mask_getmant_round_ph(W, U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_getmant_round_ph(U, A, B, C, R) \ + ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +#define _mm256_getmant_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vgetmantps256_round_mask( \ + (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \ + (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_getmant_round_ps(W, U, A, B, C, R) \ + ((__m256)__builtin_ia32_vgetmantps256_round_mask( \ + (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_getmant_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vgetmantps256_round_mask( \ + (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R))) + +#define _mm256_max_round_pd(A, B, R) \ + ((__m256d)__builtin_ia32_vmaxpd256_round((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(R))) + +#define _mm256_mask_max_round_pd(W, U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_max_round_pd(U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_max_round_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vmaxph256_round((__v16hf)(__m256h)(A), \ + (__v16hf)(__m256h)(B), (int)(R))) + +#define _mm256_mask_max_round_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \ + (__v16hf)(__m256h)(W))) + +#define _mm256_maskz_max_round_ph(U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \ + (__v16hf)_mm256_setzero_ph())) + +#define _mm256_max_round_ps(A, B, R) \ + ((__m256)__builtin_ia32_vmaxps256_round((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(R))) + +#define _mm256_mask_max_round_ps(W, U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_max_round_ps(U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \ + (__v8sf)_mm256_setzero_ps())) + +#define _mm256_min_round_pd(A, B, R) \ + ((__m256d)__builtin_ia32_vminpd256_round((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(R))) + +#define _mm256_mask_min_round_pd(W, U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_min_round_pd(U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_min_round_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vminph256_round((__v16hf)(__m256h)(A), \ + (__v16hf)(__m256h)(B), (int)(R))) + +#define _mm256_mask_min_round_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \ + (__v16hf)(__m256h)(W))) + +#define _mm256_maskz_min_round_ph(U, A, B, R) \ + 
((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \ + (__v16hf)_mm256_setzero_ph())) + +#define _mm256_min_round_ps(A, B, R) \ + ((__m256)__builtin_ia32_vminps256_round((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(R))) + +#define _mm256_mask_min_round_ps(W, U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_min_round_ps(U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \ + (__v8sf)_mm256_setzero_ps())) + +#define _mm256_mul_round_pd(A, B, R) \ + ((__m256d)__builtin_ia32_vmulpd256_round((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(R))) + +#define _mm256_mask_mul_round_pd(W, U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_mul_round_pd(U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_mul_round_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vmulph256_round((__v16hf)(__m256h)(A), \ + (__v16hf)(__m256h)(B), (int)(R))) + +#define _mm256_mask_mul_round_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \ + (__v16hf)(__m256h)(W))) + +#define _mm256_maskz_mul_round_ph(U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \ + (__v16hf)_mm256_setzero_ph())) + +#define _mm256_mul_round_ps(A, B, R) \ + ((__m256)__builtin_ia32_vmulps256_round((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(R))) + +#define _mm256_mask_mul_round_ps(W, U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_mul_round_ps(U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \ + (__v8sf)_mm256_setzero_ps())) + +#define _mm256_range_round_pd(A, B, C, R) \ + ((__m256d)__builtin_ia32_vrangepd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_range_round_pd(W, U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vrangepd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_range_round_pd(U, A, B, C, R) \ + ((__m256d)__builtin_ia32_vrangepd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R))) + +#define _mm256_range_round_ps(A, B, C, R) \ + ((__m256)__builtin_ia32_vrangeps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_range_round_ps(W, U, A, B, C, R) \ + ((__m256)__builtin_ia32_vrangeps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_range_round_ps(U, A, B, C, R) \ + ((__m256)__builtin_ia32_vrangeps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R))) + +#define _mm256_reduce_round_pd(A, B, R) \ + 
((__m256d)__builtin_ia32_vreducepd256_round_mask( \ + (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_reduce_round_pd(W, U, A, B, R) \ + ((__m256d)__builtin_ia32_vreducepd256_round_mask( \ + (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_maskz_reduce_round_pd(U, A, B, R) \ + ((__m256d)__builtin_ia32_vreducepd256_round_mask( \ + (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_mask_reduce_round_ph(W, U, A, imm, R) \ + ((__m256h)__builtin_ia32_vreduceph256_round_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_reduce_round_ph(U, A, imm, R) \ + ((__m256h)__builtin_ia32_vreduceph256_round_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_reduce_round_ph(A, imm, R) \ + ((__m256h)__builtin_ia32_vreduceph256_round_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)-1, (int)(R))) + +#define _mm256_reduce_round_ps(A, B, R) \ + ((__m256)__builtin_ia32_vreduceps256_round_mask( \ + (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_reduce_round_ps(W, U, A, B, R) \ + ((__m256)__builtin_ia32_vreduceps256_round_mask( \ + (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm256_maskz_reduce_round_ps(U, A, B, R) \ + ((__m256)__builtin_ia32_vreduceps256_round_mask( \ + (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_roundscale_round_pd(A, imm, R) \ + ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \ + (__v4df)(__m256d)(A), (int)(imm), (__v4df)_mm256_undefined_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_roundscale_round_pd(A, B, C, imm, R) \ + ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \ + (__v4df)(__m256d)(C), (int)(imm), (__v4df)(__m256d)(A), (__mmask8)(B), \ + (int)(R))) + +#define _mm256_maskz_roundscale_round_pd(A, B, imm, R) \ + ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \ + (__v4df)(__m256d)(B), (int)(imm), (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(A), (int)(R))) + +#define _mm256_roundscale_round_ph(A, imm, R) \ + ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)-1, (int)(R))) + +#define _mm256_mask_roundscale_round_ph(A, B, C, imm, R) \ + ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \ + (__v16hf)(__m256h)(C), (int)(imm), (__v16hf)(__m256h)(A), \ + (__mmask16)(B), (int)(R))) + +#define _mm256_maskz_roundscale_round_ph(A, B, imm, R) \ + ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \ + (__v16hf)(__m256h)(B), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)(A), (int)(R))) + +#define _mm256_roundscale_round_ps(A, imm, R) \ + ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \ + (__v8sf)(__m256)(A), (int)(imm), (__v8sf)_mm256_undefined_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_roundscale_round_ps(A, B, C, imm, R) \ + ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \ + (__v8sf)(__m256)(C), (int)(imm), (__v8sf)(__m256)(A), (__mmask8)(B), \ + (int)(R))) + +#define _mm256_maskz_roundscale_round_ps(A, B, imm, R) \ + ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \ + (__v8sf)(__m256)(B), (int)(imm), 
(__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(A), (int)(R))) + +#define _mm256_scalef_round_pd(A, B, R) \ + ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), \ + (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R))) + +#define _mm256_mask_scalef_round_pd(W, U, A, B, R) \ + ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_scalef_round_pd(U, A, B, R) \ + ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_scalef_round_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vscalefph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \ + (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R))) + +#define _mm256_mask_scalef_round_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_vscalefph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm256_maskz_scalef_round_ph(U, A, B, R) \ + ((__m256h)__builtin_ia32_vscalefph256_round_mask( \ + (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +#define _mm256_scalef_round_ps(A, B, R) \ + ((__m256)__builtin_ia32_vscalefps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_undefined_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm256_mask_scalef_round_ps(W, U, A, B, R) \ + ((__m256)__builtin_ia32_vscalefps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_maskz_scalef_round_ps(U, A, B, R) \ + ((__m256)__builtin_ia32_vscalefps256_round_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm256_sqrt_round_pd(A, R) \ + ((__m256d)__builtin_ia32_vsqrtpd256_round((__v4df)(__m256d)(A), (int)(R))) + +#define _mm256_mask_sqrt_round_pd(W, U, A, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_sqrt_round_pd(U, A, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_sqrt_round_ph(A, R) \ + ((__m256h)__builtin_ia32_vsqrtph256_round((__v16hf)(__m256h)(A), (int)(R))) + +#define _mm256_mask_sqrt_round_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \ + (__v16hf)(__m256h)(W))) + +#define _mm256_maskz_sqrt_round_ph(U, A, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \ + (__v16hf)_mm256_setzero_ph())) + +#define _mm256_sqrt_round_ps(A, R) \ + ((__m256)__builtin_ia32_vsqrtps256_round((__v8sf)(__m256)(A), (int)(R))) + +#define _mm256_mask_sqrt_round_ps(W, U, A, R) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_sqrt_round_ps((A), (R)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_sqrt_round_ps(U, A, R) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_sqrt_round_ps((A), (R)), \ + (__v8sf)_mm256_setzero_ps())) + +#define _mm256_sub_round_pd(A, B, R) \ + ((__m256d)__builtin_ia32_vsubpd256_round((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(R))) + +#define 
_mm256_mask_sub_round_pd(W, U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_sub_round_pd(U, A, B, R) \ + ((__m256d)__builtin_ia32_selectpd_256( \ + (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_sub_round_ph(A, B, R) \ + ((__m256h)__builtin_ia32_vsubph256_round((__v16hf)(__m256h)(A), \ + (__v16hf)(__m256h)(B), (int)(R))) + +#define _mm256_mask_sub_round_ph(W, U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \ + (__v16hf)(__m256h)(W))) + +#define _mm256_maskz_sub_round_ph(U, A, B, R) \ + ((__m256h)__builtin_ia32_selectph_256( \ + (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \ + (__v16hf)_mm256_setzero_ph())) + +#define _mm256_sub_round_ps(A, B, R) \ + ((__m256)__builtin_ia32_vsubps256_round((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(R))) + +#define _mm256_mask_sub_round_ps(W, U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_sub_round_ps(U, A, B, R) \ + ((__m256)__builtin_ia32_selectps_256( \ + (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \ + (__v8sf)_mm256_setzero_ps())) + +#undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128 + +#endif /* __AVX10_2NIINTRIN_H */ +#endif /* __SSE2__ */ diff --git a/lib/include/avx10_2satcvtdsintrin.h b/lib/include/avx10_2satcvtdsintrin.h new file mode 100644 index 0000000000..5902843631 --- /dev/null +++ b/lib/include/avx10_2satcvtdsintrin.h @@ -0,0 +1,496 @@ +/*===----------- avx10_2satcvtdsintrin.h - AVX512SATCVTDS intrinsics --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx10_2satcvtdsintrin.h> directly; include <immintrin.h> instead." +#endif // __IMMINTRIN_H + +#ifndef __AVX10_2SATCVTDSINTRIN_H +#define __AVX10_2SATCVTDSINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(256))) + +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \ + __min_vector_width__(128))) + +#define _mm_cvtts_roundsd_i32(__A, __R) \ + ((int)__builtin_ia32_vcvttsd2sis32((__v2df)(__m128)(__A), (const int)(__R))) + +#define _mm_cvtts_roundsd_si32(__A, __R) \ + ((int)__builtin_ia32_vcvttsd2sis32((__v2df)(__m128d)(__A), (const int)(__R))) + +#define _mm_cvtts_roundsd_u32(__A, __R) \ + ((unsigned int)__builtin_ia32_vcvttsd2usis32((__v2df)(__m128d)(__A), \ + (const int)(__R))) + +#define _mm_cvtts_roundss_i32(__A, __R) \ + ((int)__builtin_ia32_vcvttss2sis32((__v4sf)(__m128)(__A), (const int)(__R))) + +#define _mm_cvtts_roundss_si32(__A, __R) \ + ((int)__builtin_ia32_vcvttss2sis32((__v4sf)(__m128)(__A), (const int)(__R))) + +#define _mm_cvtts_roundss_u32(__A, __R) \ + ((unsigned int)__builtin_ia32_vcvttss2usis32((__v4sf)(__m128)(__A), \ + (const int)(__R))) + +#ifdef __x86_64__ +#define _mm_cvtts_roundss_u64(__A, __R) \ + ((unsigned long long)__builtin_ia32_vcvttss2usis64((__v4sf)(__m128)(__A), \ + (const int)(__R))) + +#define _mm_cvtts_roundsd_u64(__A, __R) \ + ((unsigned long long)__builtin_ia32_vcvttsd2usis64((__v2df)(__m128d)(__A), \ + (const int)(__R))) + +#define _mm_cvtts_roundss_i64(__A, __R) \ + ((long long)__builtin_ia32_vcvttss2sis64((__v4sf)(__m128)(__A), \ + (const int)(__R))) + +#define _mm_cvtts_roundss_si64(__A, __R) \ + ((long long)__builtin_ia32_vcvttss2sis64((__v4sf)(__m128)(__A), \ + (const int)(__R))) + +#define _mm_cvtts_roundsd_si64(__A, __R) \ + ((long long)__builtin_ia32_vcvttsd2sis64((__v2df)(__m128d)(__A), \ + (const int)(__R))) + +#define _mm_cvtts_roundsd_i64(__A, __R) \ + ((long long)__builtin_ia32_vcvttsd2sis64((__v2df)(__m128d)(__A), \ + (const int)(__R))) +#endif /* __x86_64__ */ + +// 128 Bit : Double -> int +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epi32(__m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask( + (__v2df)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1))); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttspd_epi32(__m128i __W, __mmask8 __U, __m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask((__v2df)__A, (__v4si)__W, + __U)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttspd_epi32(__mmask16 __U, __m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask( + (__v2df)__A, (__v4si)(__m128i)_mm_setzero_si128(), __U)); +} + +// 256 Bit : Double -> int +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvttspd_epi32(__m256d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( + (__v4df)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttspd_epi32(__m128i __W, __mmask8 __U, __m256d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( + (__v4df)__A, (__v4si)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttspd_epi32(__mmask8 __U, __m256d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( + (__v4df)__A, (__v4si)_mm_setzero_si128(), __U, _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm256_cvtts_roundpd_epi32(__A, __R) \ + ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \ + (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_undefined_si128(), \ + 
(__mmask8) - 1, (int)(__R))) + +#define _mm256_mask_cvtts_roundpd_epi32(__W, __U, __A, __R) \ + ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \ + (__v4df)(__m256d)__A, (__v4si)(__m128i)__W, (__mmask8)__U, (int)(__R))) + +#define _mm256_maskz_cvtts_roundpd_epi32(__U, __A, __R) \ + ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \ + (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)__U, (int)(__R))) + +// 128 Bit : Double -> uint +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epu32(__m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask( + (__v2df)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1))); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttspd_epu32(__m128i __W, __mmask8 __U, __m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask( + (__v2df)__A, (__v4si)(__m128i)__W, (__mmask8)__U)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttspd_epu32(__mmask8 __U, __m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask( + (__v2df)__A, (__v4si)(__m128i)_mm_setzero_si128(), __U)); +} + +// 256 Bit : Double -> uint +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvttspd_epu32(__m256d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( + (__v4df)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttspd_epu32(__m128i __W, __mmask8 __U, __m256d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( + (__v4df)__A, (__v4si)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttspd_epu32(__mmask8 __U, __m256d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( + (__v4df)__A, (__v4si)_mm_setzero_si128(), __U, _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm256_cvtts_roundpd_epu32(__A, __R) \ + ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \ + (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_undefined_si128(), \ + (__mmask8) - 1, (int)(__R))) + +#define _mm256_mask_cvtts_roundpd_epu32(__W, __U, __A, __R) \ + ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \ + (__v4df)(__m256d)__A, (__v4si)(__m128i)__W, (__mmask8)__U, (int)(__R))) + +#define _mm256_maskz_cvtts_roundpd_epu32(__U, __A, __R) \ + ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \ + (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)__U, (int)(__R))) + +// 128 Bit : Double -> long +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epi64(__m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask( + (__v2df)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttspd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask((__v2df)__A, (__v2di)__W, + (__mmask8)__U)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttspd_epi64(__mmask8 __U, __m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U)); +} + +// 256 Bit : Double -> long +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttspd_epi64(__m256d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( + (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttspd_epi64(__m256i __W, 
__mmask8 __U, __m256d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( + (__v4df)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttspd_epi64(__mmask8 __U, __m256d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm256_cvtts_roundpd_epi64(__A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( \ + (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \ + (int)__R)) + +#define _mm256_mask_cvtts_roundpd_epi64(__W, __U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask((__v4df)__A, (__v4di)__W, \ + (__mmask8)__U, (int)__R)) + +#define _mm256_maskz_cvtts_roundpd_epi64(__U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( \ + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, (int)__R)) + +// 128 Bit : Double -> ulong +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epu64(__m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask( + (__v2df)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttspd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask((__v2df)__A, (__v2di)__W, + (__mmask8)__U)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttspd_epu64(__mmask8 __U, __m128d __A) { + return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U)); +} + +// 256 Bit : Double -> ulong + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttspd_epu64(__m256d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( + (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttspd_epu64(__m256i __W, __mmask8 __U, __m256d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( + (__v4df)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttspd_epu64(__mmask8 __U, __m256d __A) { + return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm256_cvtts_roundpd_epu64(__A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \ + (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \ + (int)__R)) + +#define _mm256_mask_cvtts_roundpd_epu64(__W, __U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \ + (__v4df)__A, (__v4di)__W, (__mmask8)__U, (int)__R)) + +#define _mm256_maskz_cvtts_roundpd_epu64(__U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \ + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, (int)__R)) + +// 128 Bit : float -> int +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epi32(__m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2dqs128_mask( + (__v4sf)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1))); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttsps_epi32(__m128i __W, __mmask8 __U, __m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2dqs128_mask((__v4sf)__A, (__v4si)__W, + (__mmask8)__U)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttsps_epi32(__mmask8 __U, __m128 __A) { + return 
((__m128i)__builtin_ia32_vcvttps2dqs128_mask( + (__v4sf)__A, (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)__U)); +} + +// 256 Bit : float -> int +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttsps_epi32(__m256 __A) { + return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( + (__v8sf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttsps_epi32(__m256i __W, __mmask8 __U, __m256 __A) { + return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( + (__v8sf)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttsps_epi32(__mmask8 __U, __m256 __A) { + return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( + (__v8sf)__A, (__v8si)_mm256_setzero_si256(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm256_cvtts_roundps_epi32(__A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \ + (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_undefined_si256(), \ + (__mmask8) - 1, (int)(__R))) + +#define _mm256_mask_cvtts_roundps_epi32(__W, __U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \ + (__v8sf)(__m256)__A, (__v8si)(__m256i)__W, (__mmask8)__U, (int)(__R))) + +#define _mm256_maskz_cvtts_roundps_epi32(__U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \ + (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)__U, (int)(__R))) + +// 128 Bit : float -> uint +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epu32(__m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask( + (__v4sf)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1))); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttsps_epu32(__m128i __W, __mmask8 __U, __m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask((__v4sf)__A, (__v4si)__W, + (__mmask8)__U)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttsps_epu32(__mmask8 __U, __m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask( + (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U)); +} + +// 256 Bit : float -> uint + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttsps_epu32(__m256 __A) { + return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( + (__v8sf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttsps_epu32(__m256i __W, __mmask8 __U, __m256 __A) { + return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( + (__v8sf)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttsps_epu32(__mmask8 __U, __m256 __A) { + return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( + (__v8sf)__A, (__v8si)_mm256_setzero_si256(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm256_cvtts_roundps_epu32(__A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \ + (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_undefined_si256(), \ + (__mmask8) - 1, (int)(__R))) + +#define _mm256_mask_cvtts_roundps_epu32(__W, __U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \ + (__v8sf)(__m256)__A, (__v8si)(__m256i)__W, (__mmask8)__U, (int)(__R))) + +#define _mm256_maskz_cvtts_roundps_epu32(__U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \ + (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_setzero_si256(), \ + 
(__mmask8)__U, (int)(__R))) + +// 128 bit : float -> long +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epi64(__m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask( + (__v4sf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttsps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask( + (__v4sf)__A, (__v2di)(__m128i)__W, (__mmask8)__U)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttsps_epi64(__mmask8 __U, __m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U)); +} +// 256 bit : float -> long + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttsps_epi64(__m128 __A) { + return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( + (__v4sf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttsps_epi64(__m256i __W, __mmask8 __U, __m128 __A) { + return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( + (__v4sf)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttsps_epi64(__mmask8 __U, __m128 __A) { + return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define _mm256_cvtts_roundps_epi64(__A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \ + (__v4sf)(__m128)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \ + (int)__R)) + +#define _mm256_mask_cvtts_roundps_epi64(__W, __U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \ + (__v4sf)(__m128)__A, (__v4di)__W, (__mmask8)__U, (int)__R)) + +#define _mm256_maskz_cvtts_roundps_epi64(__U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \ + (__v4sf)(__m128)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, \ + (int)__R)) + +// 128 bit : float -> ulong +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epu64(__m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask( + (__v4sf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttsps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask( + (__v4sf)__A, (__v2di)(__m128i)__W, (__mmask8)__U)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttsps_epu64(__mmask8 __U, __m128 __A) { + return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U)); +} +// 256 bit : float -> ulong + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttsps_epu64(__m128 __A) { + return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( + (__v4sf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttsps_epu64(__m256i __W, __mmask8 __U, __m128 __A) { + return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( + (__v4sf)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttsps_epu64(__mmask8 __U, __m128 __A) { + return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), __U, + _MM_FROUND_CUR_DIRECTION)); +} + +#define 
_mm256_cvtts_roundps_epu64(__A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \ + (__v4sf)(__m128)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \ + (int)__R)) + +#define _mm256_mask_cvtts_roundps_epu64(__W, __U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \ + (__v4sf)(__m128)__A, (__v4di)__W, (__mmask8)__U, (int)__R)) + +#define _mm256_maskz_cvtts_roundps_epu64(__U, __A, __R) \ + ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \ + (__v4sf)(__m128)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, \ + (int)__R)) + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 +#endif // __AVX10_2SATCVTDSINTRIN_H diff --git a/lib/include/avx10_2satcvtintrin.h b/lib/include/avx10_2satcvtintrin.h new file mode 100644 index 0000000000..d16c60e638 --- /dev/null +++ b/lib/include/avx10_2satcvtintrin.h @@ -0,0 +1,444 @@ +/*===----------- avx10_2satcvtintrin.h - AVX10_2SATCVT intrinsics -----------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx10_2satcvtintrin.h> directly; include <immintrin.h> instead." +#endif // __IMMINTRIN_H + +#ifndef __AVX10_2SATCVTINTRIN_H +#define __AVX10_2SATCVTINTRIN_H + +#define _mm_ipcvtbf16_epi8(A) \ + ((__m128i)__builtin_ia32_vcvtbf162ibs128((__v8bf)(__m128bh)(A))) + +#define _mm_mask_ipcvtbf16_epi8(W, U, A) \ + ((__m128i)__builtin_ia32_selectw_128( \ + (__mmask8)(U), (__v8hi)_mm_ipcvtbf16_epi8(A), (__v8hi)(__m128i)(W))) + +#define _mm_maskz_ipcvtbf16_epi8(U, A) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_ipcvtbf16_epi8(A), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_ipcvtbf16_epi8(A) \ + ((__m256i)__builtin_ia32_vcvtbf162ibs256((__v16bf)(__m256bh)(A))) + +#define _mm256_mask_ipcvtbf16_epi8(W, U, A) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_ipcvtbf16_epi8(A), \ + (__v16hi)(__m256i)(W))) + +#define _mm256_maskz_ipcvtbf16_epi8(U, A) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_ipcvtbf16_epi8(A), \ + (__v16hi)_mm256_setzero_si256())) + +#define _mm_ipcvtbf16_epu8(A) \ + ((__m128i)__builtin_ia32_vcvtbf162iubs128((__v8bf)(__m128bh)(A))) + +#define _mm_mask_ipcvtbf16_epu8(W, U, A) \ + ((__m128i)__builtin_ia32_selectw_128( \ + (__mmask8)(U), (__v8hi)_mm_ipcvtbf16_epu8(A), (__v8hi)(__m128i)(W))) + +#define _mm_maskz_ipcvtbf16_epu8(U, A) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_ipcvtbf16_epu8(A), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_ipcvtbf16_epu8(A) \ + ((__m256i)__builtin_ia32_vcvtbf162iubs256((__v16bf)(__m256bh)(A))) + +#define _mm256_mask_ipcvtbf16_epu8(W, U, A) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_ipcvtbf16_epu8(A), \ + (__v16hi)(__m256i)(W))) + +#define _mm256_maskz_ipcvtbf16_epu8(U, A) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_ipcvtbf16_epu8(A), \ + (__v16hi)_mm256_setzero_si256())) + +#define _mm_ipcvtph_epi8(A) \ + ((__m128i)__builtin_ia32_vcvtph2ibs128_mask( \ + (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_ipcvtph_epi8(W, U, A) \ + ((__m128i)__builtin_ia32_vcvtph2ibs128_mask((__v8hf)(__m128h)(A), \ + (__v8hu)(W), (__mmask8)(U))) + +#define _mm_maskz_ipcvtph_epi8(U, A) \ 
((__m128i)__builtin_ia32_vcvtph2ibs128_mask( \ + (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U))) + +#define _mm256_ipcvtph_epi8(A) \ + ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_mask_ipcvtph_epi8(W, U, A) \ + ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \ + (__v16hu)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_maskz_ipcvtph_epi8(U, A) \ + ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_ipcvt_roundph_epi8(A, R) \ + ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \ + (__v16hu)_mm256_setzero_si256(), \ + (__mmask16)-1, (const int)R)) + +#define _mm256_mask_ipcvt_roundph_epi8(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R)) + +#define _mm256_maskz_ipcvt_roundph_epi8(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \ + (__v16hu)_mm256_setzero_si256(), \ + (__mmask16)(U), (const int)R)) + +#define _mm_ipcvtph_epu8(A) \ + ((__m128i)__builtin_ia32_vcvtph2iubs128_mask( \ + (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_ipcvtph_epu8(W, U, A) \ + ((__m128i)__builtin_ia32_vcvtph2iubs128_mask((__v8hf)(__m128h)(A), \ + (__v8hu)(W), (__mmask8)(U))) + +#define _mm_maskz_ipcvtph_epu8(U, A) \ + ((__m128i)__builtin_ia32_vcvtph2iubs128_mask( \ + (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U))) + +#define _mm256_ipcvtph_epu8(A) \ + ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_mask_ipcvtph_epu8(W, U, A) \ + ((__m256i)__builtin_ia32_vcvtph2iubs256_mask((__v16hf)(__m256h)(A), \ + (__v16hu)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_maskz_ipcvtph_epu8(U, A) \ + ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_ipcvt_roundph_epu8(A, R) \ + ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \ + (const int)R)) + +#define _mm256_mask_ipcvt_roundph_epu8(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R)) + +#define _mm256_maskz_ipcvt_roundph_epu8(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \ + (const int)R)) + +#define _mm_ipcvtps_epi8(A) \ + ((__m128i)__builtin_ia32_vcvtps2ibs128_mask( \ + (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_ipcvtps_epi8(W, U, A) \ + ((__m128i)__builtin_ia32_vcvtps2ibs128_mask((__v4sf)(__m128)(A), \ + (__v4su)(W), (__mmask8)(U))) + +#define _mm_maskz_ipcvtps_epi8(U, A) \ + ((__m128i)__builtin_ia32_vcvtps2ibs128_mask( \ + (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U))) + +#define _mm256_ipcvtps_epi8(A) \ + ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_mask_ipcvtps_epi8(W, U, A) \ + 
((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \ + (__v8su)(W), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_maskz_ipcvtps_epi8(U, A) \ + ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_ipcvt_roundps_epi8(A, R) \ + ((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \ + (__v8su)_mm256_setzero_si256(), \ + (__mmask8)-1, (const int)R)) + +#define _mm256_mask_ipcvt_roundps_epi8(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R)) + +#define _mm256_maskz_ipcvt_roundps_epi8(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \ + (__v8su)_mm256_setzero_si256(), \ + (__mmask8)(U), (const int)R)) + +#define _mm_ipcvtps_epu8(A) \ + ((__m128i)__builtin_ia32_vcvtps2iubs128_mask( \ + (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_ipcvtps_epu8(W, U, A) \ + ((__m128i)__builtin_ia32_vcvtps2iubs128_mask((__v4sf)(__m128)(A), \ + (__v4su)(W), (__mmask8)(U))) + +#define _mm_maskz_ipcvtps_epu8(U, A) \ + ((__m128i)__builtin_ia32_vcvtps2iubs128_mask( \ + (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U))) + +#define _mm256_ipcvtps_epu8(A) \ + ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_mask_ipcvtps_epu8(W, U, A) \ + ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \ + (__v8su)(W), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_maskz_ipcvtps_epu8(U, A) \ + ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_ipcvt_roundps_epu8(A, R) \ + ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \ + (__v8su)_mm256_setzero_si256(), \ + (__mmask8)-1, (const int)R)) + +#define _mm256_mask_ipcvt_roundps_epu8(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R)) + +#define _mm256_maskz_ipcvt_roundps_epu8(U, A, R) \ + ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \ + (__v8su)_mm256_setzero_si256(), \ + (__mmask8)(U), (const int)R)) + +#define _mm_ipcvttbf16_epi8(A) \ + ((__m128i)__builtin_ia32_vcvttbf162ibs128((__v8bf)(__m128bh)(A))) + +#define _mm_mask_ipcvttbf16_epi8(W, U, A) \ + ((__m128i)__builtin_ia32_selectw_128( \ + (__mmask8)(U), (__v8hi)_mm_ipcvttbf16_epi8(A), (__v8hi)(__m128i)(W))) + +#define _mm_maskz_ipcvttbf16_epi8(U, A) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_ipcvttbf16_epi8(A), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_ipcvttbf16_epi8(A) \ + ((__m256i)__builtin_ia32_vcvttbf162ibs256((__v16bf)(__m256bh)(A))) + +#define _mm256_mask_ipcvttbf16_epi8(W, U, A) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_ipcvttbf16_epi8(A), \ + (__v16hi)(__m256i)(W))) + +#define _mm256_maskz_ipcvttbf16_epi8(U, A) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_ipcvttbf16_epi8(A), \ + (__v16hi)_mm256_setzero_si256())) + +#define _mm_ipcvttbf16_epu8(A) \ + ((__m128i)__builtin_ia32_vcvttbf162iubs128((__v8bf)(__m128bh)(A))) + +#define _mm_mask_ipcvttbf16_epu8(W, U, A) \ + ((__m128i)__builtin_ia32_selectw_128( \ + (__mmask8)(U), 
(__v8hi)_mm_ipcvttbf16_epu8(A), (__v8hi)(__m128i)(W))) + +#define _mm_maskz_ipcvttbf16_epu8(U, A) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_ipcvttbf16_epu8(A), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_ipcvttbf16_epu8(A) \ + ((__m256i)__builtin_ia32_vcvttbf162iubs256((__v16bf)(__m256bh)(A))) + +#define _mm256_mask_ipcvttbf16_epu8(W, U, A) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_ipcvttbf16_epu8(A), \ + (__v16hi)(__m256i)(W))) + +#define _mm256_maskz_ipcvttbf16_epu8(U, A) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_ipcvttbf16_epu8(A), \ + (__v16hi)_mm256_setzero_si256())) + +#define _mm_ipcvttph_epi8(A) \ + ((__m128i)__builtin_ia32_vcvttph2ibs128_mask( \ + (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_ipcvttph_epi8(W, U, A) \ + ((__m128i)__builtin_ia32_vcvttph2ibs128_mask((__v8hf)(__m128h)(A), \ + (__v8hu)(W), (__mmask8)(U))) + +#define _mm_maskz_ipcvttph_epi8(U, A) \ + ((__m128i)__builtin_ia32_vcvttph2ibs128_mask( \ + (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U))) + +#define _mm256_ipcvttph_epi8(A) \ + ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_mask_ipcvttph_epi8(W, U, A) \ + ((__m256i)__builtin_ia32_vcvttph2ibs256_mask((__v16hf)(__m256h)(A), \ + (__v16hu)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_maskz_ipcvttph_epi8(U, A) \ + ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_ipcvtt_roundph_epi8(A, R) \ + ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \ + (const int)R)) + +#define _mm256_mask_ipcvtt_roundph_epi8(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R)) + +#define _mm256_maskz_ipcvtt_roundph_epi8(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \ + (const int)R)) + +#define _mm_ipcvttph_epu8(A) \ + ((__m128i)__builtin_ia32_vcvttph2iubs128_mask( \ + (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_ipcvttph_epu8(W, U, A) \ + ((__m128i)__builtin_ia32_vcvttph2iubs128_mask((__v8hf)(__m128h)(A), \ + (__v8hu)(W), (__mmask8)(U))) + +#define _mm_maskz_ipcvttph_epu8(U, A) \ + ((__m128i)__builtin_ia32_vcvttph2iubs128_mask( \ + (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U))) + +#define _mm256_ipcvttph_epu8(A) \ + ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_mask_ipcvttph_epu8(W, U, A) \ + ((__m256i)__builtin_ia32_vcvttph2iubs256_mask((__v16hf)(__m256h)(A), \ + (__v16hu)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_maskz_ipcvttph_epu8(U, A) \ + ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_ipcvtt_roundph_epu8(A, R) \ + ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \ + (const int)R)) + +#define 
_mm256_mask_ipcvtt_roundph_epu8(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R)) + +#define _mm256_maskz_ipcvtt_roundph_epu8(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \ + (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \ + (const int)R)) + +#define _mm_ipcvttps_epi8(A) \ + ((__m128i)__builtin_ia32_vcvttps2ibs128_mask( \ + (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_ipcvttps_epi8(W, U, A) \ + ((__m128i)__builtin_ia32_vcvttps2ibs128_mask((__v4sf)(__m128)(A), \ + (__v4su)(W), (__mmask8)(U))) + +#define _mm_maskz_ipcvttps_epi8(U, A) \ + ((__m128i)__builtin_ia32_vcvttps2ibs128_mask( \ + (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U))) + +#define _mm256_ipcvttps_epi8(A) \ + ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_mask_ipcvttps_epi8(W, U, A) \ + ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \ + (__v8su)(W), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_maskz_ipcvttps_epi8(U, A) \ + ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_ipcvtt_roundps_epi8(A, R) \ + ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \ + (__v8su)_mm256_setzero_si256(), \ + (__mmask8)-1, (const int)R)) + +#define _mm256_mask_ipcvtt_roundps_epi8(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R)) + +#define _mm256_maskz_ipcvtt_roundps_epi8(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \ + (__v8su)_mm256_setzero_si256(), \ + (__mmask8)(U), (const int)R)) + +#define _mm_ipcvttps_epu8(A) \ + ((__m128i)__builtin_ia32_vcvttps2iubs128_mask( \ + (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_ipcvttps_epu8(W, U, A) \ + ((__m128i)__builtin_ia32_vcvttps2iubs128_mask((__v4sf)(__m128)(A), \ + (__v4su)(W), (__mmask8)(U))) + +#define _mm_maskz_ipcvttps_epu8(U, A) \ + ((__m128i)__builtin_ia32_vcvttps2iubs128_mask( \ + (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U))) + +#define _mm256_ipcvttps_epu8(A) \ + ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_mask_ipcvttps_epu8(W, U, A) \ + ((__m256i)__builtin_ia32_vcvttps2iubs256_mask((__v8sf)(__m256)(A), \ + (__v8su)(W), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_maskz_ipcvttps_epu8(U, A) \ + ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm256_ipcvtt_roundps_epu8(A, R) \ + ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \ + (const int)R)) + +#define _mm256_mask_ipcvtt_roundps_epu8(W, U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R)) + +#define _mm256_maskz_ipcvtt_roundps_epu8(U, A, R) \ + ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \ + (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \ + (const int)R)) +#endif // 
__AVX10_2SATCVTINTRIN_H diff --git a/lib/include/avx2intrin.h b/lib/include/avx2intrin.h index 096cae01b5..dc9fc07314 100644 --- a/lib/include/avx2intrin.h +++ b/lib/include/avx2intrin.h @@ -15,12 +15,21 @@ #define __AVX2INTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__EVEX512__) && !defined(__AVX10_1_512__) #define __DEFAULT_FN_ATTRS256 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx2,no-evex512"), __min_vector_width__(256))) #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx2,no-evex512"), __min_vector_width__(128))) +#else +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \ + __min_vector_width__(128))) +#endif /* SSE4 Multiple Packed Sums of Absolute Difference. */ /// Computes sixteen sum of absolute difference (SAD) operations on sets of diff --git a/lib/include/avx512bitalgintrin.h b/lib/include/avx512bitalgintrin.h index bad265ceb7..3c446b34e7 100644 --- a/lib/include/avx512bitalgintrin.h +++ b/lib/include/avx512bitalgintrin.h @@ -23,7 +23,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi16(__m512i __A) { - return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A); + return (__m512i)__builtin_elementwise_popcount((__v32hu)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -45,7 +45,7 @@ _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi8(__m512i __A) { - return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A); + return (__m512i)__builtin_elementwise_popcount((__v64qu)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/lib/include/avx512fintrin.h b/lib/include/avx512fintrin.h index 4f172c74b3..45e7eeb532 100644 --- a/lib/include/avx512fintrin.h +++ b/lib/include/avx512fintrin.h @@ -175,12 +175,21 @@ typedef enum __attribute__((__always_inline__, __nodebug__, \ __target__("avx512f,no-evex512"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + /* Create vectors with repeated elements */ -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_setzero_si512(void) -{ - return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_setzero_si512(void) { + return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0}; } #define _mm512_setzero_epi32 _mm512_setzero_si512 @@ -256,20 +265,16 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) (__v8di) _mm512_setzero_si512()); } - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_setzero_ps(void) -{ - return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_ps(void) { + return __extension__(__m512){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; } #define
_mm512_setzero _mm512_setzero_ps -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_setzero_pd(void) -{ - return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_setzero_pd(void) { + return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; } static __inline __m512 __DEFAULT_FN_ATTRS512 @@ -9775,5 +9780,8 @@ _mm512_cvtsi512_si32(__m512i __A) { #undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif /* __AVX512FINTRIN_H */ diff --git a/lib/include/avx512vlbitalgintrin.h b/lib/include/avx512vlbitalgintrin.h index 377e3a5ea5..1b01fe0b9d 100644 --- a/lib/include/avx512vlbitalgintrin.h +++ b/lib/include/avx512vlbitalgintrin.h @@ -27,7 +27,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi16(__m256i __A) { - return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A); + return (__m256i)__builtin_elementwise_popcount((__v16hu)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -49,7 +49,7 @@ _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi16(__m128i __A) { - return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A); + return (__m128i)__builtin_elementwise_popcount((__v8hu)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -71,7 +71,7 @@ _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi8(__m256i __A) { - return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A); + return (__m256i)__builtin_elementwise_popcount((__v32qu)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -93,7 +93,7 @@ _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi8(__m128i __A) { - return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A); + return (__m128i)__builtin_elementwise_popcount((__v16qu)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 diff --git a/lib/include/avx512vpopcntdqintrin.h b/lib/include/avx512vpopcntdqintrin.h index e73e7e4f71..e24c2c5e1b 100644 --- a/lib/include/avx512vpopcntdqintrin.h +++ b/lib/include/avx512vpopcntdqintrin.h @@ -21,8 +21,15 @@ __target__("avx512vpopcntdq,evex512"), \ __min_vector_width__(512))) -static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) { - return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A); +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_popcnt_epi64(__m512i __A) { + return (__m512i)__builtin_elementwise_popcount((__v8du)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -36,8 +43,9 @@ _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) { - return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_popcnt_epi32(__m512i __A) { + return (__m512i)__builtin_elementwise_popcount((__v16su)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/lib/include/avx512vpopcntdqvlintrin.h 
b/lib/include/avx512vpopcntdqvlintrin.h index b2df2e84d3..b6c819b0cb 100644 --- a/lib/include/avx512vpopcntdqvlintrin.h +++ b/lib/include/avx512vpopcntdqvlintrin.h @@ -25,9 +25,17 @@ __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ __min_vector_width__(256))) -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_popcnt_epi64(__m128i __A) { - return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A); + return (__m128i)__builtin_elementwise_popcount((__v2du)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -41,9 +49,9 @@ _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_popcnt_epi32(__m128i __A) { - return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A); + return (__m128i)__builtin_elementwise_popcount((__v4su)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 @@ -57,9 +65,9 @@ _mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_popcnt_epi64(__m256i __A) { - return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A); + return (__m256i)__builtin_elementwise_popcount((__v4du)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 @@ -73,9 +81,9 @@ _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_popcnt_epi32(__m256i __A) { - return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A); + return (__m256i)__builtin_elementwise_popcount((__v8su)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 diff --git a/lib/include/avxintrin.h b/lib/include/avxintrin.h index 4983f33113..8e497a9823 100644 --- a/lib/include/avxintrin.h +++ b/lib/include/avxintrin.h @@ -50,12 +50,29 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32))); #endif /* Define the default attributes for the functions in this file. 
*/ +#if defined(__EVEX512__) && !defined(__AVX10_1_512__) #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \ __min_vector_width__(256))) #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \ __min_vector_width__(128))) +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx"), \ + __min_vector_width__(128))) +#endif + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif /* Arithmetic */ /// Adds two 256-bit vectors of [4 x double]. @@ -3689,7 +3706,7 @@ _mm256_undefined_si256(void) /// A double-precision floating-point value used to initialize bits [63:0] /// of the result. /// \returns An initialized 256-bit floating-point vector of [4 x double]. -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set_pd(double __a, double __b, double __c, double __d) { return __extension__ (__m256d){ __d, __c, __b, __a }; @@ -3728,7 +3745,7 @@ _mm256_set_pd(double __a, double __b, double __c, double __d) /// A single-precision floating-point value used to initialize bits [31:0] /// of the result. /// \returns An initialized 256-bit floating-point vector of [8 x float].
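/// (Editorial usage sketch, not part of the upstream header; assumes an
/// AVX-enabled build, e.g. -mavx.)
/// \code
///   __m256 v = _mm256_set1_ps(2.5f); // all eight lanes hold 2.5f
/// \endcode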
-static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_set1_ps(float __w) { return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w); @@ -4322,10 +4339,8 @@ _mm256_set1_epi64x(long long __q) /// This intrinsic corresponds to the VXORPS instruction. /// /// \returns A 256-bit vector of [4 x double] with all elements set to zero. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_setzero_pd(void) -{ - return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 }; +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void) { + return __extension__(__m256d){0.0, 0.0, 0.0, 0.0}; } /// Constructs a 256-bit floating-point vector of [8 x float] with all @@ -4336,9 +4351,7 @@ _mm256_setzero_pd(void) /// This intrinsic corresponds to the VXORPS instruction. /// /// \returns A 256-bit vector of [8 x float] with all elements set to zero. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_setzero_ps(void) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void) { return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; } @@ -4349,9 +4362,8 @@ _mm256_setzero_ps(void) /// This intrinsic corresponds to the VXORPS instruction. /// /// \returns A 256-bit integer vector initialized to zero. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setzero_si256(void) -{ +static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_setzero_si256(void) { return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; } @@ -5121,6 +5133,8 @@ _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a) } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR #endif /* __AVXINTRIN_H */ diff --git a/lib/include/avxvnniint16intrin.h b/lib/include/avxvnniint16intrin.h index e4d342a8b4..805d249911 100644 --- a/lib/include/avxvnniint16intrin.h +++ b/lib/include/avxvnniint16intrin.h @@ -15,14 +15,6 @@ #ifndef __AVXVNNIINT16INTRIN_H #define __AVXVNNIINT16INTRIN_H -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ - __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \ - __min_vector_width__(256))) - /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. 
Sum these 2 results with the corresponding @@ -53,12 +45,9 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwsud_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -90,11 +79,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwsud_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -127,12 +114,9 @@ _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwsuds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -165,11 +149,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwsuds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate @@ -201,12 +183,9 @@ _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwusd_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate @@ -238,11 +217,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwusd_epi32(__W, __A, __B) \ + 
((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate @@ -275,12 +252,9 @@ _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwusds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate @@ -313,11 +287,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwusds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -349,12 +321,9 @@ _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwuud_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -386,11 +355,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpwuud_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -423,12 +390,9 @@ _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpwuuds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate @@ -461,13 +425,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// 
\endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 +#define _mm256_dpwuuds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) #endif // __AVXVNNIINT16INTRIN_H diff --git a/lib/include/avxvnniint8intrin.h b/lib/include/avxvnniint8intrin.h index b0b6cb853f..c211620c68 100644 --- a/lib/include/avxvnniint8intrin.h +++ b/lib/include/avxvnniint8intrin.h @@ -14,14 +14,6 @@ #ifndef __AVXVNNIINT8INTRIN_H #define __AVXVNNIINT8INTRIN_H -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ - __min_vector_width__(256))) -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \ - __min_vector_width__(128))) - /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -52,12 +44,9 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbssd_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate @@ -89,11 +78,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbssd_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate @@ -126,12 +113,9 @@ _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbssds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate @@ -164,11 +148,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbssds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), 
(__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -200,12 +182,9 @@ _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbsud_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -237,11 +216,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbsud_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -274,12 +251,9 @@ _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbsuds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -312,11 +286,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbsuds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -348,12 +320,9 @@ _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbuud_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -385,11 +354,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} +#define _mm256_dpbuud_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate @@ -422,14 +389,10 @@ _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, - __m128i __A, - __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A, - (__v4si)__B); -} +#define _mm_dpbuuds_epi32(__W, __A, __B) \ + ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \ + (__v4si)(__B))) -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding /// 32-bit integer in \a __W with signed saturation, and store the packed @@ -460,12 +423,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W, /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A, - (__v8si)__B); -} -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 +#define _mm256_dpbuuds_epi32(__W, __A, __B) \ + ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \ + (__v8si)(__B))) #endif // __AVXVNNIINT8INTRIN_H diff --git a/lib/include/bmi2intrin.h b/lib/include/bmi2intrin.h index f0a3343bef..bdb61b13fb 100644 --- a/lib/include/bmi2intrin.h +++ b/lib/include/bmi2intrin.h @@ -15,7 +15,13 @@ #define __BMI2INTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) +#endif /// Copies the unsigned 32-bit integer \a __X and zeroes the upper bits /// starting at bit number \a __Y. @@ -38,8 +44,7 @@ /// The lower 8 bits specify the bit number of the lowest bit to zero. /// \returns The partially zeroed 32-bit value. static __inline__ unsigned int __DEFAULT_FN_ATTRS -_bzhi_u32(unsigned int __X, unsigned int __Y) -{ +_bzhi_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_bzhi_si(__X, __Y); } @@ -68,8 +73,7 @@ _bzhi_u32(unsigned int __X, unsigned int __Y) /// The 32-bit mask specifying where to deposit source bits. /// \returns The 32-bit result. static __inline__ unsigned int __DEFAULT_FN_ATTRS -_pdep_u32(unsigned int __X, unsigned int __Y) -{ +_pdep_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_pdep_si(__X, __Y); } @@ -98,8 +102,7 @@ _pdep_u32(unsigned int __X, unsigned int __Y) /// The 32-bit mask specifying which source bits to extract. /// \returns The 32-bit result. 
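/// (Editorial usage sketch, not part of the upstream header; assumes a
/// BMI2-enabled build, e.g. -mbmi2.)
/// \code
///   // Gather the low nibble of each byte into the low 16 bits, then
///   // scatter them back into the nibble slots (the high nibbles become 0).
///   unsigned packed = _pext_u32(0xCAFEBABEu, 0x0F0F0F0Fu); // 0x0000AEAE
///   unsigned spread = _pdep_u32(packed, 0x0F0F0F0Fu);      // 0x0A0E0A0E
/// \endcode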
static __inline__ unsigned int __DEFAULT_FN_ATTRS -_pext_u32(unsigned int __X, unsigned int __Y) -{ +_pext_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_pext_si(__X, __Y); } @@ -124,8 +127,7 @@ _pext_u32(unsigned int __X, unsigned int __Y) /// A pointer to memory for storing the upper half of the product. /// \returns The lower half of the product. static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) -{ +_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { unsigned long long __res = (unsigned long long) __X * __Y; *__P = (unsigned int)(__res >> 32); return (unsigned int)__res; @@ -154,8 +156,7 @@ _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) /// The lower 8 bits specify the bit number of the lowest bit to zero. /// \returns The partially zeroed 64-bit value. static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_bzhi_u64(unsigned long long __X, unsigned long long __Y) -{ +_bzhi_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_bzhi_di(__X, __Y); } @@ -184,8 +185,7 @@ _bzhi_u64(unsigned long long __X, unsigned long long __Y) /// The 64-bit mask specifying where to deposit source bits. /// \returns The 64-bit result. static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_pdep_u64(unsigned long long __X, unsigned long long __Y) -{ +_pdep_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_pdep_di(__X, __Y); } @@ -214,8 +214,7 @@ _pdep_u64(unsigned long long __X, unsigned long long __Y) /// The 64-bit mask specifying which source bits to extract. /// \returns The 64-bit result. static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_pext_u64(unsigned long long __X, unsigned long long __Y) -{ +_pext_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_pext_di(__X, __Y); } @@ -241,8 +240,7 @@ _pext_u64(unsigned long long __X, unsigned long long __Y) /// \returns The lower half of the product. static __inline__ unsigned long long __DEFAULT_FN_ATTRS _mulx_u64 (unsigned long long __X, unsigned long long __Y, - unsigned long long *__P) -{ + unsigned long long *__P) { unsigned __int128 __res = (unsigned __int128) __X * __Y; *__P = (unsigned long long) (__res >> 64); return (unsigned long long) __res; diff --git a/lib/include/bmiintrin.h b/lib/include/bmiintrin.h index 78bffe68e2..59c5ece397 100644 --- a/lib/include/bmiintrin.h +++ b/lib/include/bmiintrin.h @@ -17,7 +17,12 @@ /* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT instruction behaves as BSF on non-BMI targets, there is code that expects to use it as a potentially faster version of BSF. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __RELAXED_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__)) constexpr +#else #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#endif /// Counts the number of trailing zero bits in the operand. /// @@ -31,8 +36,7 @@ /// bits in the operand. /// \see _tzcnt_u16 static __inline__ unsigned short __RELAXED_FN_ATTRS -__tzcnt_u16(unsigned short __X) -{ +__tzcnt_u16(unsigned short __X) { return __builtin_ia32_tzcnt_u16(__X); } @@ -65,8 +69,7 @@ __tzcnt_u16(unsigned short __X) /// bits in the operand. 
/// \see { _mm_tzcnt_32 _tzcnt_u32 } static __inline__ unsigned int __RELAXED_FN_ATTRS -__tzcnt_u32(unsigned int __X) -{ +__tzcnt_u32(unsigned int __X) { return __builtin_ia32_tzcnt_u32(__X); } @@ -82,8 +85,7 @@ __tzcnt_u32(unsigned int __X) /// the operand. /// \see { __tzcnt_u32 _tzcnt_u32 } static __inline__ int __RELAXED_FN_ATTRS -_mm_tzcnt_32(unsigned int __X) -{ +_mm_tzcnt_32(unsigned int __X) { return (int)__builtin_ia32_tzcnt_u32(__X); } @@ -118,8 +120,7 @@ _mm_tzcnt_32(unsigned int __X) /// bits in the operand. /// \see { _mm_tzcnt_64 _tzcnt_u64 } static __inline__ unsigned long long __RELAXED_FN_ATTRS -__tzcnt_u64(unsigned long long __X) -{ +__tzcnt_u64(unsigned long long __X) { return __builtin_ia32_tzcnt_u64(__X); } @@ -135,8 +136,7 @@ __tzcnt_u64(unsigned long long __X) /// the operand. /// \see { __tzcnt_u64 _tzcnt_u64 } static __inline__ long long __RELAXED_FN_ATTRS -_mm_tzcnt_64(unsigned long long __X) -{ +_mm_tzcnt_64(unsigned long long __X) { return (long long)__builtin_ia32_tzcnt_u64(__X); } @@ -164,7 +164,13 @@ _mm_tzcnt_64(unsigned long long __X) #if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) +#endif /// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. @@ -181,8 +187,7 @@ _mm_tzcnt_64(unsigned long long __X) /// operand with the one's complement of the first operand. /// \see _andn_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -__andn_u32(unsigned int __X, unsigned int __Y) -{ +__andn_u32(unsigned int __X, unsigned int __Y) { return ~__X & __Y; } @@ -224,8 +229,7 @@ __andn_u32(unsigned int __X, unsigned int __Y) /// extracted bits. /// \see _bextr_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -__bextr_u32(unsigned int __X, unsigned int __Y) -{ +__bextr_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_bextr_u32(__X, __Y); } @@ -249,9 +253,8 @@ __bextr_u32(unsigned int __X, unsigned int __Y) /// extracted bits. /// \see __bextr_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) -{ - return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) { + return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } /* Intel-specified, single-leading-underscore version of BEXTR2 */ @@ -289,8 +292,7 @@ _bextr2_u32(unsigned int __X, unsigned int __Y) { /// the source operand. /// \see _blsi_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsi_u32(unsigned int __X) -{ +__blsi_u32(unsigned int __X) { return __X & -__X; } @@ -325,8 +327,7 @@ __blsi_u32(unsigned int __X) /// \returns An unsigned integer containing the newly created mask. /// \see _blsmsk_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsmsk_u32(unsigned int __X) -{ +__blsmsk_u32(unsigned int __X) { return __X ^ (__X - 1); } @@ -361,8 +362,7 @@ __blsmsk_u32(unsigned int __X) /// operand. 
/// \see _blsr_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsr_u32(unsigned int __X) -{ +__blsr_u32(unsigned int __X) { return __X & (__X - 1); } @@ -401,8 +401,7 @@ __blsr_u32(unsigned int __X) /// operand with the one's complement of the first operand. /// \see _andn_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__andn_u64 (unsigned long long __X, unsigned long long __Y) -{ +__andn_u64 (unsigned long long __X, unsigned long long __Y) { return ~__X & __Y; } @@ -445,8 +444,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y) /// extracted bits. /// \see _bextr_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__bextr_u64(unsigned long long __X, unsigned long long __Y) -{ +__bextr_u64(unsigned long long __X, unsigned long long __Y) { return __builtin_ia32_bextr_u64(__X, __Y); } @@ -470,9 +468,8 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y) /// extracted bits. /// \see __bextr_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) -{ - return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) { + return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } /* Intel-specified, single-leading-underscore version of BEXTR2 */ @@ -510,8 +507,7 @@ _bextr2_u64(unsigned long long __X, unsigned long long __Y) { /// bits from the source operand. /// \see _blsi_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsi_u64(unsigned long long __X) -{ +__blsi_u64(unsigned long long __X) { return __X & -__X; } @@ -546,8 +542,7 @@ __blsi_u64(unsigned long long __X) /// \returns An unsigned 64-bit integer containing the newly created mask. /// \see _blsmsk_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsmsk_u64(unsigned long long __X) -{ +__blsmsk_u64(unsigned long long __X) { return __X ^ (__X - 1); } @@ -582,8 +577,7 @@ __blsmsk_u64(unsigned long long __X) /// source operand. 
/// \see _blsr_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsr_u64(unsigned long long __X) -{ +__blsr_u64(unsigned long long __X) { return __X & (__X - 1); } diff --git a/lib/include/cmpccxaddintrin.h b/lib/include/cmpccxaddintrin.h index 6957498996..0076c402f5 100644 --- a/lib/include/cmpccxaddintrin.h +++ b/lib/include/cmpccxaddintrin.h @@ -63,7 +63,7 @@ typedef enum { (int)(__D)))) #define _cmpccxadd_epi64(__A, __B, __C, __D) \ - ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B), \ + ((long long)(__builtin_ia32_cmpccxadd64((__A), (long long)(__B), \ (long long)(__C), (int)(__D)))) #endif // __x86_64__ diff --git a/lib/include/cpuid.h b/lib/include/cpuid.h index 82d995f1b9..2601aa5724 100644 --- a/lib/include/cpuid.h +++ b/lib/include/cpuid.h @@ -187,17 +187,18 @@ #define bit_ENQCMD 0x20000000 /* Features in %edx for leaf 7 sub-leaf 0 */ -#define bit_AVX5124VNNIW 0x00000004 -#define bit_AVX5124FMAPS 0x00000008 -#define bit_UINTR 0x00000020 -#define bit_SERIALIZE 0x00004000 -#define bit_TSXLDTRK 0x00010000 -#define bit_PCONFIG 0x00040000 -#define bit_IBT 0x00100000 -#define bit_AMXBF16 0x00400000 -#define bit_AVX512FP16 0x00800000 -#define bit_AMXTILE 0x01000000 -#define bit_AMXINT8 0x02000000 +#define bit_AVX5124VNNIW 0x00000004 +#define bit_AVX5124FMAPS 0x00000008 +#define bit_UINTR 0x00000020 +#define bit_AVX512VP2INTERSECT 0x00000100 +#define bit_SERIALIZE 0x00004000 +#define bit_TSXLDTRK 0x00010000 +#define bit_PCONFIG 0x00040000 +#define bit_IBT 0x00100000 +#define bit_AMXBF16 0x00400000 +#define bit_AVX512FP16 0x00800000 +#define bit_AMXTILE 0x01000000 +#define bit_AMXINT8 0x02000000 /* Features in %eax for leaf 7 sub-leaf 1 */ #define bit_SHA512 0x00000001 diff --git a/lib/include/emmintrin.h b/lib/include/emmintrin.h index 4dff642135..78e8a422db 100644 --- a/lib/include/emmintrin.h +++ b/lib/include/emmintrin.h @@ -49,12 +49,27 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); #endif /* Define the default attributes for the functions in this file. */ +#if defined(__EVEX512__) && !defined(__AVX10_1_512__) #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, \ __target__("sse2,no-evex512"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("mmx,sse2,no-evex512"), __min_vector_width__(64))) +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ + __min_vector_width__(128))) +#endif + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +#define __trunc64(x) \ + (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __anyext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, -1, -1) /// Adds lower double-precision values in both operands and returns the /// sum in the lower 64 bits of the result. The upper 64 bits of the result @@ -71,8 +86,8 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the /// sum of the lower 64 bits of both operands. The upper 64 bits are copied /// from the upper 64 bits of the first source operand. 
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a, + __m128d __b) { __a[0] += __b[0]; return __a; } @@ -89,8 +104,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, /// A 128-bit vector of [2 x double] containing one of the source operands. /// \returns A 128-bit vector of [2 x double] containing the sums of both /// operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a, + __m128d __b) { return (__m128d)((__v2df)__a + (__v2df)__b); } @@ -111,8 +126,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the /// difference of the lower 64 bits of both operands. The upper 64 bits are /// copied from the upper 64 bits of the first source operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a, + __m128d __b) { __a[0] -= __b[0]; return __a; } @@ -129,8 +144,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, /// A 128-bit vector of [2 x double] containing the subtrahend. /// \returns A 128-bit vector of [2 x double] containing the differences between /// both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a, + __m128d __b) { return (__m128d)((__v2df)__a - (__v2df)__b); } @@ -150,8 +165,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the /// product of the lower 64 bits of both operands. The upper 64 bits are /// copied from the upper 64 bits of the first source operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a, + __m128d __b) { __a[0] *= __b[0]; return __a; } @@ -168,8 +183,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, /// A 128-bit vector of [2 x double] containing one of the operands. /// \returns A 128-bit vector of [2 x double] containing the products of both /// operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a, + __m128d __b) { return (__m128d)((__v2df)__a * (__v2df)__b); } @@ -190,8 +205,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the /// quotient of the lower 64 bits of both operands. The upper 64 bits are /// copied from the upper 64 bits of the first source operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a, + __m128d __b) { __a[0] /= __b[0]; return __a; } @@ -209,8 +224,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, /// A 128-bit vector of [2 x double] containing the divisor. /// \returns A 128-bit vector of [2 x double] containing the quotients of both /// operands. 
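/// (Editorial usage sketch, not part of the upstream header; assumes SSE2.)
/// \code
///   __m128d q = _mm_div_pd(_mm_set_pd(8.0, 6.0), _mm_set_pd(2.0, 3.0));
///   // q == { 2.0, 4.0 }  (element 0 listed first)
/// \endcode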
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a, + __m128d __b) { return (__m128d)((__v2df)__a / (__v2df)__b); } @@ -358,8 +373,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, /// A 128-bit vector of [2 x double] containing one of the source operands. /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the /// values between both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a, + __m128d __b) { return (__m128d)((__v2du)__a & (__v2du)__b); } @@ -378,8 +393,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the /// values in the second operand and the one's complement of the first /// operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_andnot_pd(__m128d __a, __m128d __b) { return (__m128d)(~(__v2du)__a & (__v2du)__b); } @@ -395,8 +410,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, /// A 128-bit vector of [2 x double] containing one of the source operands. /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the /// values between both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a, + __m128d __b) { return (__m128d)((__v2du)__a | (__v2du)__b); } @@ -412,8 +427,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, /// A 128-bit vector of [2 x double] containing one of the source operands. /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the /// values between both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a, + __m128d __b) { return (__m128d)((__v2du)__a ^ (__v2du)__b); } @@ -1291,7 +1306,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) { /// floating-point elements are converted to double-precision values. The /// upper two elements are unused. /// \returns A 128-bit vector of [2 x double] containing the converted values. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtps_pd(__m128 __a) { return (__m128d) __builtin_convertvector( __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); } @@ -1312,7 +1328,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) { /// /// The upper two elements are unused. /// \returns A 128-bit vector of [2 x double] containing the converted values. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepi32_pd(__m128i __a) { return (__m128d) __builtin_convertvector( __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); } @@ -1398,8 +1415,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the /// converted value from the second parameter. 
The upper 64 bits are copied /// from the upper 64 bits of the first parameter. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, - int __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi32_sd(__m128d __a, int __b) { __a[0] = __b; return __a; } @@ -1423,8 +1440,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the /// converted value from the second parameter. The upper 64 bits are copied /// from the upper 64 bits of the first parameter. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, - __m128 __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtss_sd(__m128d __a, __m128 __b) { __a[0] = __b[0]; return __a; } @@ -1486,8 +1503,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) { /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) { - return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) { + return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a)); } /// Converts the two double-precision floating-point elements of a @@ -1505,8 +1522,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) { /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) { - return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) { + return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a)); } /// Converts the two signed 32-bit integer elements of a 64-bit vector of @@ -1520,8 +1537,9 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) { /// \param __a /// A 64-bit vector of [2 x i32]. /// \returns A 128-bit vector of [2 x double] containing the converted values. -static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) { - return __builtin_ia32_cvtpi2pd((__v2si)__a); +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtpi32_pd(__m64 __a) { + return (__m128d) __builtin_convertvector((__v2si)__a, __v2df); } /// Returns the low-order element of a 128-bit vector of [2 x double] as @@ -1535,7 +1553,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) { /// A 128-bit vector of [2 x double]. The lower 64 bits are returned. /// \returns A double-precision floating-point value copied from the lower 64 /// bits of \a __a. -static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) { +static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsd_f64(__m128d __a) { return __a[0]; } @@ -1770,7 +1789,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) { /// \returns An initialized 128-bit floating-point vector of [2 x double]. The /// lower 64 bits contain the value of the parameter. The upper 64 bits are /// set to zero. 
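/// (Editorial usage sketch, not part of the upstream header; assumes SSE2.)
/// \code
///   __m128d v = _mm_set_sd(3.0); // v == { 3.0, 0.0 }
/// \endcode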
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w) { return __extension__(__m128d){__w, 0.0}; } @@ -1786,7 +1805,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) { /// A double-precision floating-point value used to initialize each vector /// element of the result. /// \returns An initialized 128-bit floating-point vector of [2 x double]. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w) { return __extension__(__m128d){__w, __w}; } @@ -1802,7 +1821,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) { /// A double-precision floating-point value used to initialize each vector /// element of the result. /// \returns An initialized 128-bit floating-point vector of [2 x double]. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w) { return _mm_set1_pd(__w); } @@ -1820,8 +1839,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) { /// A double-precision floating-point value used to initialize the lower 64 /// bits of the result. /// \returns An initialized 128-bit floating-point vector of [2 x double]. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, - double __x) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w, + double __x) { return __extension__(__m128d){__x, __w}; } @@ -1840,8 +1859,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, /// A double-precision floating-point value used to initialize the upper 64 /// bits of the result. /// \returns An initialized 128-bit floating-point vector of [2 x double]. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, - double __x) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w, + double __x) { return __extension__(__m128d){__w, __x}; } @@ -1854,7 +1873,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, /// /// \returns An initialized 128-bit floating-point vector of [2 x double] with /// all elements set to zero. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void) { return __extension__(__m128d){0.0, 0.0}; } @@ -1873,8 +1892,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) { /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the /// lower 64 bits of the result. /// \returns A 128-bit vector of [2 x double] containing the moved values. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_move_sd(__m128d __a, __m128d __b) { __a[0] = __b[0]; return __a; } @@ -2091,8 +2110,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, /// A 128-bit vector of [4 x i32]. /// \returns A 128-bit vector of [4 x i32] containing the sums of both /// parameters. 
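/* Element-order sketch for the initializers above (set_order_demo is an
   illustrative helper): _mm_set_pd takes its arguments high-element-first
   while _mm_setr_pd takes them low-element-first, so both calls below build
   the same vector with element 0 equal to 1.0. */
#include <emmintrin.h>

static inline int set_order_demo(void) {
  __m128d a = _mm_set_pd(2.0, 1.0);  /* element 1 = 2.0, element 0 = 1.0 */
  __m128d b = _mm_setr_pd(1.0, 2.0); /* element 0 = 1.0, element 1 = 2.0 */
  return _mm_cvtsd_f64(a) == _mm_cvtsd_f64(b); /* 1: both low elements are 1.0 */
}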
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_add_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4su)__a + (__v4su)__b); } @@ -2108,9 +2127,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, /// \param __b /// A 64-bit integer. /// \returns A 64-bit integer containing the sum of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, - __m64 __b) { - return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) { + return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b)); } /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], @@ -2129,8 +2147,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, /// A 128-bit vector of [2 x i64]. /// \returns A 128-bit vector of [2 x i64] containing the sums of both /// parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_add_epi64(__m128i __a, __m128i __b) { return (__m128i)((__v2du)__a + (__v2du)__b); } @@ -2431,9 +2449,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, /// \param __b /// A 64-bit integer containing one of the source operands. /// \returns A 64-bit integer vector containing the product of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, - __m64 __b) { - return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } /// Multiplies 32-bit unsigned integer values contained in the lower @@ -2521,8 +2539,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, /// A 128-bit integer vector containing the subtrahends. /// \returns A 128-bit integer vector containing the differences of the values /// in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sub_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4su)__a - (__v4su)__b); } @@ -2539,9 +2557,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, /// A 64-bit integer vector containing the subtrahend. /// \returns A 64-bit integer vector containing the difference of the values in /// the operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, - __m64 __b) { - return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) { + return (__m64)((unsigned long long)__a - (unsigned long long)__b); } /// Subtracts the corresponding elements of two [2 x i64] vectors. @@ -2556,8 +2573,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, /// A 128-bit integer vector containing the subtrahends. /// \returns A 128-bit integer vector containing the differences of the values /// in the operands. 
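/* Usage sketch for the 64-bit scalar ops above (mul_low32 is an illustrative
   helper, not a header function): _mm_add_si64 and _mm_sub_si64 act on the
   full 64 bits, while _mm_mul_su32 multiplies just the low unsigned 32 bits
   of each operand into a 64-bit product, now routed through the 128-bit
   PMULUDQ form as shown above. */
#include <emmintrin.h>
#include <string.h>

static inline unsigned long long mul_low32(unsigned int a, unsigned int b) {
  __m64 pa = _mm_cvtsi32_si64((int)a); /* low 32 bits hold a */
  __m64 pb = _mm_cvtsi32_si64((int)b); /* low 32 bits hold b */
  __m64 p  = _mm_mul_su32(pa, pb);     /* (uint64_t)a * (uint64_t)b */
  unsigned long long out;
  memcpy(&out, &p, sizeof out);        /* read the 64-bit product back */
  return out;
}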
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sub_epi64(__m128i __a, __m128i __b) { return (__m128i)((__v2du)__a - (__v2du)__b); } @@ -3255,8 +3272,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the /// converted value of the second operand. The upper 64 bits are copied from /// the upper 64 bits of the first operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a, - long long __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi64_sd(__m128d __a, long long __b) { __a[0] = __b; return __a; } @@ -3310,7 +3327,8 @@ static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) { /// \param __a /// A 128-bit integer vector. /// \returns A 128-bit vector of [4 x float] containing the converted values. -static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) { +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtepi32_ps(__m128i __a) { return (__m128) __builtin_convertvector((__v4si)__a, __v4sf); } @@ -3494,8 +3512,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) { /// destination vector of [2 x i64]. /// \returns An initialized 128-bit vector of [2 x i64] containing the values /// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, - long long __q0) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set_epi64x(long long __q1, long long __q0) { return __extension__(__m128i)(__v2di){__q0, __q1}; } @@ -3515,9 +3533,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, /// destination vector of [2 x i64]. /// \returns An initialized 128-bit vector of [2 x i64] containing the values /// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, - __m64 __q0) { - return _mm_set_epi64x((long long)__q1, (long long)__q0); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set_epi64(__m64 __q1, __m64 __q0) { + return _mm_set_epi64x((long long)__q1[0], (long long)__q0[0]); } /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with @@ -3542,8 +3560,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, /// vector. /// \returns An initialized 128-bit vector of [4 x i32] containing the values /// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, - int __i1, int __i0) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3, + int __i2, + int __i1, + int __i0) { return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3}; } @@ -3581,7 +3601,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, /// vector. /// \returns An initialized 128-bit vector of [8 x i16] containing the values /// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) { return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3, @@ -3630,7 +3650,7 @@ _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, /// Initializes bits [7:0] of the destination vector. 
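/* Conversion sketch for _mm_cvtepi32_ps above (ints_to_floats is an
   illustrative helper): each signed 32-bit lane is converted to float; in
   this revision the body is a plain __builtin_convertvector, so no separate
   builtin is involved. */
#include <emmintrin.h>

static inline __m128 ints_to_floats(int i0, int i1, int i2, int i3) {
  __m128i v = _mm_set_epi32(i3, i2, i1, i0); /* element 0 = i0 ... element 3 = i3 */
  return _mm_cvtepi32_ps(v);                 /* {(float)i0, ..., (float)i3} */
}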
/// \returns An initialized 128-bit vector of [16 x i8] containing the values /// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { @@ -3652,7 +3672,8 @@ _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, /// vector. /// \returns An initialized 128-bit integer vector of [2 x i64] with both /// elements containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set1_epi64x(long long __q) { return _mm_set_epi64x(__q, __q); } @@ -3669,7 +3690,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) { /// vector. /// \returns An initialized 128-bit vector of [2 x i64] with all elements /// containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set1_epi64(__m64 __q) { return _mm_set_epi64(__q, __q); } @@ -3686,7 +3708,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) { /// vector. /// \returns An initialized 128-bit vector of [4 x i32] with all elements /// containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i) { return _mm_set_epi32(__i, __i, __i, __i); } @@ -3703,7 +3725,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) { /// vector. /// \returns An initialized 128-bit vector of [8 x i16] with all elements /// containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set1_epi16(short __w) { return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); } @@ -3720,7 +3743,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) { /// vector. /// \returns An initialized 128-bit vector of [16 x i8] with all elements /// containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b) { return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); } @@ -3739,8 +3762,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) { /// A 64-bit integral value used to initialize the upper 64 bits of the /// result. /// \returns An initialized 128-bit integer vector. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, - __m64 __q1) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_setr_epi64(__m64 __q0, __m64 __q1) { return _mm_set_epi64(__q1, __q0); } @@ -3761,9 +3784,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, /// \param __i3 /// A 32-bit integral value used to initialize bits [127:96] of the result. /// \returns An initialized 128-bit integer vector. 
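/* Splat sketch for the _mm_set1_* family above (mask_low_nibbles is an
   illustrative helper): broadcast one scalar into every lane, e.g. to build
   a per-byte mask for the bitwise integer ops. */
#include <emmintrin.h>

static inline __m128i mask_low_nibbles(__m128i v) {
  const __m128i nib = _mm_set1_epi8(0x0F); /* sixteen copies of 0x0F */
  return _mm_and_si128(v, nib);            /* keep the low nibble of each byte */
}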
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, - int __i2, - int __i3) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) { return _mm_set_epi32(__i3, __i2, __i1, __i0); } @@ -3792,7 +3814,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, /// \param __w7 /// A 16-bit integral value used to initialize bits [127:112] of the result. /// \returns An initialized 128-bit integer vector. -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) { return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); @@ -3839,7 +3861,7 @@ _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, /// \param __b15 /// An 8-bit integral value used to initialize bits [127:120] of the result. /// \returns An initialized 128-bit integer vector. -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) { @@ -3855,7 +3877,7 @@ _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, /// /// \returns An initialized 128-bit integer vector with all elements set to /// zero. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void) { return __extension__(__m128i)(__v2di){0LL, 0LL}; } @@ -4588,7 +4610,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, /// A 128-bit integer vector operand. The lower 64 bits are moved to the /// destination. /// \returns A 64-bit integer containing the lower 64 bits of the parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_movepi64_pi64(__m128i __a) { return (__m64)__a[0]; } @@ -4603,8 +4626,9 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) { /// A 64-bit value. /// \returns A 128-bit integer vector. The lower 64 bits contain the value from /// the operand. The upper 64 bits are assigned zeros. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) { - return __extension__(__m128i)(__v2di){(long long)__a, 0}; +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_movpi64_epi64(__m64 __a) { + return __builtin_shufflevector((__v1di)__a, _mm_setzero_si64(), 0, 1); } /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit @@ -4619,7 +4643,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) { /// destination. /// \returns A 128-bit integer vector. The lower 64 bits contain the value from /// the operand. The upper 64 bits are assigned zeros. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_move_epi64(__m128i __a) { return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); } @@ -4638,8 +4663,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { /// A 128-bit vector of [2 x double]. \n /// Bits [127:64] are written to bits [127:64] of the destination. 
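/* Sketch of the 64/128-bit moves above (widen_and_narrow is illustrative):
   _mm_movpi64_epi64 widens an __m64 into the low half of an __m128i with a
   zeroed upper half (now a shuffle against _mm_setzero_si64, per the change
   above), and _mm_movepi64_pi64 takes the low 64 bits back out. */
#include <emmintrin.h>

static inline __m64 widen_and_narrow(__m64 q) {
  __m128i wide = _mm_movpi64_epi64(q); /* { q, 0 } */
  return _mm_movepi64_pi64(wide);      /* recover q unchanged */
}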
/// \returns A 128-bit vector of [2 x double] containing the interleaved values. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpackhi_pd(__m128d __a, __m128d __b) { return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1); } @@ -4658,8 +4683,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, /// A 128-bit vector of [2 x double]. \n /// Bits [63:0] are written to bits [127:64] of the destination. /// \returns A 128-bit vector of [2 x double] containing the interleaved values. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, - __m128d __b) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpacklo_pd(__m128d __a, __m128d __b) { return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0); } @@ -4722,7 +4747,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) { /// A 128-bit floating-point vector of [2 x double]. /// \returns A 128-bit floating-point vector of [4 x float] containing the same /// bitwise pattern as the parameter. -static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) { +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_castpd_ps(__m128d __a) { return (__m128)__a; } @@ -4737,7 +4763,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) { /// A 128-bit floating-point vector of [2 x double]. /// \returns A 128-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_castpd_si128(__m128d __a) { return (__m128i)__a; } @@ -4752,7 +4779,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) { /// A 128-bit floating-point vector of [4 x float]. /// \returns A 128-bit floating-point vector of [2 x double] containing the same /// bitwise pattern as the parameter. -static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_castps_pd(__m128 __a) { return (__m128d)__a; } @@ -4767,7 +4795,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) { /// A 128-bit floating-point vector of [4 x float]. /// \returns A 128-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_castps_si128(__m128 __a) { return (__m128i)__a; } @@ -4782,7 +4811,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) { /// A 128-bit integer vector. /// \returns A 128-bit floating-point vector of [4 x float] containing the same /// bitwise pattern as the parameter. -static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) { +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_castsi128_ps(__m128i __a) { return (__m128)__a; } @@ -4797,7 +4827,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) { /// A 128-bit integer vector. /// \returns A 128-bit floating-point vector of [2 x double] containing the same /// bitwise pattern as the parameter. 
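/* Reinterpretation sketch for the _mm_cast* functions above
   (low_bits_of_low_double is illustrative): a cast only relabels the 128
   bits and emits no instruction, unlike the cvt conversions.  For example,
   pulling out the low 32 bits of the low double's bit pattern: */
#include <emmintrin.h>

static inline int low_bits_of_low_double(__m128d v) {
  __m128i bits = _mm_castpd_si128(v); /* same 128 bits, integer vector type */
  return _mm_cvtsi128_si32(bits);     /* low 32 bits, no value conversion */
}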
-static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) { +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_castsi128_pd(__m128i __a) { return (__m128d)__a; } @@ -4889,8 +4920,11 @@ void _mm_pause(void); #if defined(__cplusplus) } // extern "C" #endif + +#undef __anyext128 +#undef __trunc64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) diff --git a/lib/include/gfniintrin.h b/lib/include/gfniintrin.h index 73b04a824a..9a5743d4b6 100644 --- a/lib/include/gfniintrin.h +++ b/lib/include/gfniintrin.h @@ -14,6 +14,7 @@ #ifndef __GFNIINTRIN_H #define __GFNIINTRIN_H +#if defined(__EVEX512__) && !defined(__AVX10_1_512__) /* Default attributes for simple form (no masking). */ #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, \ @@ -25,6 +26,37 @@ __target__("avx,gfni,no-evex512"), \ __min_vector_width__(256))) +/* Default attributes for VLX masked forms. */ +#define __DEFAULT_FN_ATTRS_VL128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bw,avx512vl,gfni,no-evex512"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_VL256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bw,avx512vl,gfni,no-evex512"), \ + __min_vector_width__(256))) +#else +/* Default attributes for simple form (no masking). */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("gfni"), \ + __min_vector_width__(128))) + +/* Default attributes for YMM unmasked form. */ +#define __DEFAULT_FN_ATTRS_Y \ + __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), \ + __min_vector_width__(256))) + +/* Default attributes for VLX masked forms. */ +#define __DEFAULT_FN_ATTRS_VL128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bw,avx512vl,gfni"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_VL256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512bw,avx512vl,gfni"), \ + __min_vector_width__(256))) +#endif + /* Default attributes for ZMM unmasked forms. */ #define __DEFAULT_FN_ATTRS_Z \ __attribute__((__always_inline__, __nodebug__, \ @@ -36,16 +68,6 @@ __target__("avx512bw,evex512,gfni"), \ __min_vector_width__(512))) -/* Default attributes for VLX masked forms. */ -#define __DEFAULT_FN_ATTRS_VL128 \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512bw,avx512vl,gfni,no-evex512"), \ - __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_VL256 \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512bw,avx512vl,gfni,no-evex512"), \ - __min_vector_width__(256))) - #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), \ diff --git a/lib/include/hexagon_types.h b/lib/include/hexagon_types.h index 029727cc48..8e73fad4bc 100644 --- a/lib/include/hexagon_types.h +++ b/lib/include/hexagon_types.h @@ -1,7 +1,11 @@ -/******************************************************************************/ -/* (c) 2020 Qualcomm Innovation Center, Inc. All rights reserved. */ -/* */ -/******************************************************************************/ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #ifndef HEXAGON_TYPES_H #define HEXAGON_TYPES_H diff --git a/lib/include/hvx_hexagon_protos.h b/lib/include/hvx_hexagon_protos.h index 7e3679a38b..fd120a589f 100644 --- a/lib/include/hvx_hexagon_protos.h +++ b/lib/include/hvx_hexagon_protos.h @@ -5178,6 +5178,433 @@ #define Q6_Vuh_vmpy_VuhVuh_rs16(Vu,Vv) __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpyuhvs)(Vu,Vv) #endif /* __HEXAGON_ARCH___ >= 69 */ +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vdd32.sf=vadd(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vadd_VbfVbf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Wsf_vadd_VbfVbf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vadd_sf_bf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vd32.h=Vu32.hf + C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vh_equals_Vhf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vd32.hf=Vu32.h + C Intrinsic Prototype: HVX_Vector Q6_Vhf_equals_Vh(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vhf_equals_Vh(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_hf_h)(Vu) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vd32.sf=Vu32.w + C Intrinsic Prototype: HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vsf_equals_Vw(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_sf_w)(Vu) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vd32.w=Vu32.sf + C Intrinsic Prototype: HVX_Vector Q6_Vw_equals_Vsf(HVX_Vector Vu) + Instruction Type: CVI_VS + Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Vw_equals_Vsf(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_w_sf)(Vu) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vd32.bf=vcvt(Vu32.sf,Vv32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vbf_vcvt_VsfVsf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Vbf_vcvt_VsfVsf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_bf_sf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* 
========================================================================== + Assembly Syntax: Qd4=vcmp.gt(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gt_VbfVbf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_gt_VbfVbf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \ + ((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf)(Vu, Vv)), -1) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Qx4&=vcmp.gt(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtand_QVbfVbf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_gtand_QVbfVbf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \ + ((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_and)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Qx4|=vcmp.gt(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtor_QVbfVbf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_gtor_QVbfVbf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \ + ((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_or)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Qx4^=vcmp.gt(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtxacc_QVbfVbf(HVX_VectorPred + Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution + Slots: SLOT0123 + ========================================================================== */ + +#define Q6_Q_vcmp_gtxacc_QVbfVbf(Qx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \ + ((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_xor)( \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \ + Vv)), \ + -1) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vd32.bf=vmax(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_Vector Q6_Vbf_vmax_VbfVbf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX_LATE Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Vbf_vmax_VbfVbf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmax_bf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vd32.bf=vmin(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_Vector Q6_Vbf_vmin_VbfVbf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX_LATE Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Vbf_vmin_VbfVbf(Vu, Vv) 
\ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmin_bf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vdd32.sf=vmpy(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vmpy_VbfVbf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Wsf_vmpy_VbfVbf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_sf_bf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vxx32.sf+=vmpy(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vmpyacc_WsfVbfVbf(HVX_VectorPair + Vxx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution + Slots: SLOT23 + ========================================================================== */ + +#define Q6_Wsf_vmpyacc_WsfVbfVbf(Vxx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_sf_bf_acc)(Vxx, Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 73 +/* ========================================================================== + Assembly Syntax: Vdd32.sf=vsub(Vu32.bf,Vv32.bf) + C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vsub_VbfVbf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Wsf_vsub_VbfVbf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_bf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 73 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32=vgetqfext(Vu32.x,Rt32) + C Intrinsic Prototype: HVX_Vector Q6_V_vgetqfext_VR(HVX_Vector Vu, Word32 Rt) + Instruction Type: CVI_VX + Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_V_vgetqfext_VR(Vu, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_get_qfext)(Vu, Rt) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vx32|=vgetqfext(Vu32.x,Rt32) + C Intrinsic Prototype: HVX_Vector Q6_V_vgetqfextor_VVR(HVX_Vector Vx, + HVX_Vector Vu, Word32 Rt) Instruction Type: CVI_VX Execution Slots: + SLOT23 + ========================================================================== */ + +#define Q6_V_vgetqfextor_VVR(Vx, Vu, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_get_qfext_oracc)(Vx, Vu, Rt) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.x=vsetqfext(Vu32,Rt32) + C Intrinsic Prototype: HVX_Vector Q6_V_vsetqfext_VR(HVX_Vector Vu, Word32 Rt) + Instruction Type: CVI_VX + Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_V_vsetqfext_VR(Vu, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_set_qfext)(Vu, Rt) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.f8=vabs(Vu32.f8) + C Intrinsic Prototype: HVX_Vector Q6_V_vabs_V(HVX_Vector Vu) + Instruction Type: CVI_VX_LATE + Execution Slots: SLOT23 + 
========================================================================== */ + +#define Q6_V_vabs_V(Vu) __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_f8)(Vu) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vdd32.hf=vadd(Vu32.f8,Vv32.f8) + C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vadd_VV(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Whf_vadd_VV(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vadd_hf_f8)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.b=vcvt2(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vb_vcvt2_VhfVhf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Vb_vcvt2_VhfVhf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_b_hf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vdd32.hf=vcvt2(Vu32.b) + C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt2_Vb(HVX_Vector Vu) + Instruction Type: CVI_VX_DV + Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Whf_vcvt2_Vb(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_hf_b)(Vu) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vdd32.hf=vcvt2(Vu32.ub) + C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt2_Vub(HVX_Vector Vu) + Instruction Type: CVI_VX_DV + Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Whf_vcvt2_Vub(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_hf_ub)(Vu) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.ub=vcvt2(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vub_vcvt2_VhfVhf(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Vub_vcvt2_VhfVhf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_ub_hf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.f8=vcvt(Vu32.hf,Vv32.hf) + C Intrinsic Prototype: HVX_Vector Q6_V_vcvt_VhfVhf(HVX_Vector Vu, HVX_Vector + Vv) Instruction Type: CVI_VX Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_V_vcvt_VhfVhf(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_f8_hf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vdd32.hf=vcvt(Vu32.f8) + C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt_V(HVX_Vector Vu) + Instruction Type: CVI_VX_DV + Execution Slots: SLOT23 + 
========================================================================== */ + +#define Q6_Whf_vcvt_V(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_hf_f8)(Vu) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.f8=vfmax(Vu32.f8,Vv32.f8) + C Intrinsic Prototype: HVX_Vector Q6_V_vfmax_VV(HVX_Vector Vu, HVX_Vector Vv) + Instruction Type: CVI_VX_LATE + Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_V_vfmax_VV(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfmax_f8)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.f8=vfmin(Vu32.f8,Vv32.f8) + C Intrinsic Prototype: HVX_Vector Q6_V_vfmin_VV(HVX_Vector Vu, HVX_Vector Vv) + Instruction Type: CVI_VX_LATE + Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_V_vfmin_VV(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfmin_f8)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.f8=vfneg(Vu32.f8) + C Intrinsic Prototype: HVX_Vector Q6_V_vfneg_V(HVX_Vector Vu) + Instruction Type: CVI_VX_LATE + Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_V_vfneg_V(Vu) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfneg_f8)(Vu) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32=vmerge(Vu32.x,Vv32.w) + C Intrinsic Prototype: HVX_Vector Q6_V_vmerge_VVw(HVX_Vector Vu, HVX_Vector + Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123 + ========================================================================== */ + +#define Q6_V_vmerge_VVw(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmerge_qf)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vdd32.hf=vmpy(Vu32.f8,Vv32.f8) + C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vmpy_VV(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Whf_vmpy_VV(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_hf_f8)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vxx32.hf+=vmpy(Vu32.f8,Vv32.f8) + C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vmpyacc_WhfVV(HVX_VectorPair + Vxx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution + Slots: SLOT23 + ========================================================================== */ + +#define Q6_Whf_vmpyacc_WhfVV(Vxx, Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_hf_f8_acc)(Vxx, Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vmpy(Vu32.hf,Rt32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vmpy_VhfRhf(HVX_Vector Vu, Word32 + Rt) Instruction Type: CVI_VX_DV 
Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Vqf16_vmpy_VhfRhf(Vu, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_hf)(Vu, Rt) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.qf16=vmpy(Vu32.qf16,Rt32.hf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vmpy_Vqf16Rhf(HVX_Vector Vu, + Word32 Rt) Instruction Type: CVI_VX_DV Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Vqf16_vmpy_Vqf16Rhf(Vu, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_qf16)(Vu, Rt) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vd32.qf32=vmpy(Vu32.sf,Rt32.sf) + C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vmpy_VsfRsf(HVX_Vector Vu, Word32 + Rt) Instruction Type: CVI_VX_DV Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Vqf32_vmpy_VsfRsf(Vu, Rt) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_sf)(Vu, Rt) +#endif /* __HEXAGON_ARCH___ >= 79 */ + +#if __HVX_ARCH__ >= 79 +/* ========================================================================== + Assembly Syntax: Vdd32.hf=vsub(Vu32.f8,Vv32.f8) + C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vsub_VV(HVX_Vector Vu, + HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23 + ========================================================================== */ + +#define Q6_Whf_vsub_VV(Vu, Vv) \ + __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv) +#endif /* __HEXAGON_ARCH___ >= 79 */ + #endif /* __HVX__ */ #endif diff --git a/lib/include/immintrin.h b/lib/include/immintrin.h index cd6cf09b90..19c5987257 100644 --- a/lib/include/immintrin.h +++ b/lib/include/immintrin.h @@ -605,6 +605,20 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVRS__) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AVX10_2__) && defined(__MOVRS__)) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AVX10_2_512__) && defined(__MOVRS__)) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__) #include #endif @@ -620,9 +634,6 @@ _storebe_i64(void * __P, long long __D) { #if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__) #include #endif -#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__) -#include -#endif #if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) || \ defined(__WIDEKL__) @@ -634,10 +645,59 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__) #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP8__) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__)) +#include +#endif + +#if !defined(__SCE__) || 
__has_feature(modules) || defined(__AMX_AVX512__) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__)) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AMX_BF16__) && defined(__AMX_TRANSPOSE__)) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AMX_FP16__) && defined(__AMX_TRANSPOSE__)) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AMX_COMPLEX__) && defined(__AMX_TRANSPOSE__)) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include @@ -648,6 +708,30 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__) +#include +#include +#include +#include +#include +#include +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__) +#include +#include +#include +#include +#include +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AVX10_2_512__) && defined(__SM4__)) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__) #include #endif diff --git a/lib/include/intrin.h b/lib/include/intrin.h index 6308c865ca..376046aeea 100644 --- a/lib/include/intrin.h +++ b/lib/include/intrin.h @@ -94,8 +94,8 @@ void __outwordstring(unsigned short, unsigned short *, unsigned long); unsigned long __readcr0(void); unsigned long __readcr2(void); unsigned __LPTRINT_TYPE__ __readcr3(void); -unsigned long __readcr4(void); -unsigned long __readcr8(void); +unsigned __LPTRINT_TYPE__ __readcr4(void); +unsigned __int64 __readcr8(void); unsigned int __readdr(unsigned int); #ifdef __i386__ unsigned char __readfsbyte(unsigned long); @@ -124,8 +124,8 @@ void __vmx_vmptrst(unsigned __int64 *); void __wbinvd(void); void __writecr0(unsigned int); void __writecr3(unsigned __INTPTR_TYPE__); -void __writecr4(unsigned int); -void __writecr8(unsigned int); +void __writecr4(unsigned __INTPTR_TYPE__); +void __writecr8(unsigned __int64); void __writedr(unsigned int, unsigned int); void __writefsbyte(unsigned long, unsigned char); void __writefsdword(unsigned long, unsigned long); @@ -330,33 +330,33 @@ static __inline__ void __DEFAULT_FN_ATTRS __halt(void) { __asm__ volatile("hlt"); } -static inline unsigned char __inbyte(unsigned short port) { +static __inline__ unsigned char __inbyte(unsigned short port) { unsigned char ret; __asm__ __volatile__("inb %w1, %b0" : "=a"(ret) : "Nd"(port)); return ret; } -static inline unsigned short __inword(unsigned short port) { +static __inline__ unsigned short __inword(unsigned short port) { unsigned short ret; __asm__ __volatile__("inw %w1, %w0" : "=a"(ret) : "Nd"(port)); return ret; } -static inline unsigned long __indword(unsigned short port) { +static __inline__ unsigned long __indword(unsigned short port) { unsigned long ret; __asm__ __volatile__("inl %w1, %k0" : "=a"(ret) : "Nd"(port)); return ret; } -static inline void __outbyte(unsigned short port, unsigned char data) { +static __inline__ void __outbyte(unsigned short port, unsigned char data) { __asm__ __volatile__("outb %b0, %w1" : : "a"(data), "Nd"(port)); } -static inline void __outword(unsigned short port, unsigned short data) { +static __inline__ void __outword(unsigned short port, unsigned short data) { __asm__ 
__volatile__("outw %w0, %w1" : : "a"(data), "Nd"(port)); } -static inline void __outdword(unsigned short port, unsigned long data) { +static __inline__ void __outdword(unsigned short port, unsigned long data) { __asm__ __volatile__("outl %k0, %w1" : : "a"(data), "Nd"(port)); } #endif @@ -396,6 +396,16 @@ unsigned short __readx18word(unsigned long offset); unsigned long __readx18dword(unsigned long offset); unsigned __int64 __readx18qword(unsigned long offset); +void __addx18byte(unsigned long offset, unsigned char data); +void __addx18word(unsigned long offset, unsigned short data); +void __addx18dword(unsigned long offset, unsigned long data); +void __addx18qword(unsigned long offset, unsigned __int64 data); + +void __incx18byte(unsigned long offset); +void __incx18word(unsigned long offset); +void __incx18dword(unsigned long offset); +void __incx18qword(unsigned long offset); + double _CopyDoubleFromInt64(__int64); float _CopyFloatFromInt32(__int32); __int32 _CopyInt32FromFloat(float); diff --git a/lib/include/intrin0.h b/lib/include/intrin0.h index 866c889661..2bca9fc877 100644 --- a/lib/include/intrin0.h +++ b/lib/include/intrin0.h @@ -44,7 +44,7 @@ unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination, __int64 *_ComparandResult); #endif -#ifdef __x86_64__ && !defined(__arm64ec__) +#if defined(__x86_64__) && !defined(__arm64ec__) unsigned __int64 _umul128(unsigned __int64, unsigned __int64, unsigned __int64 *); unsigned __int64 __shiftleft128(unsigned __int64 _LowPart, @@ -207,6 +207,9 @@ long _InterlockedExchange_rel(long volatile *_Target, long _Value); __int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value); __int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value); __int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value); +void *_InterlockedExchangePointer_acq(void *volatile *_Target, void *_Value); +void *_InterlockedExchangePointer_nf(void *volatile *_Target, void *_Value); +void *_InterlockedExchangePointer_rel(void *volatile *_Target, void *_Value); /*----------------------------------------------------------------------------*\ |* Interlocked Compare Exchange @@ -237,6 +240,12 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination, __int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination, __int64 _Exchange, __int64 _Comparand); +void *_InterlockedCompareExchangePointer_acq(void *volatile *_Destination, + void *_Exchange, void *_Comparand); +void *_InterlockedCompareExchangePointer_nf(void *volatile *_Destination, + void *_Exchange, void *_Comparand); +void *_InterlockedCompareExchangePointer_rel(void *volatile *_Destination, + void *_Exchange, void *_Comparand); #endif #ifdef __cplusplus diff --git a/lib/include/larchintrin.h b/lib/include/larchintrin.h index f421829591..a1247d12e2 100644 --- a/lib/include/larchintrin.h +++ b/lib/include/larchintrin.h @@ -228,17 +228,31 @@ extern __inline void ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2))) #endif -#define __frecipe_s(/*float*/ _1) \ - (float)__builtin_loongarch_frecipe_s((float)_1) +#ifdef __loongarch_frecipe +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __frecipe_s(float _1) { + return __builtin_loongarch_frecipe_s(_1); +} -#define __frecipe_d(/*double*/ _1) \ - (double)__builtin_loongarch_frecipe_d((double)_1) +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __frecipe_d(double _1) { + return 
__builtin_loongarch_frecipe_d(_1); +} -#define __frsqrte_s(/*float*/ _1) \ - (float)__builtin_loongarch_frsqrte_s((float)_1) +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __frsqrte_s(float _1) { + return __builtin_loongarch_frsqrte_s(_1); +} -#define __frsqrte_d(/*double*/ _1) \ - (double)__builtin_loongarch_frsqrte_d((double)_1) +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __frsqrte_d(double _1) { + return __builtin_loongarch_frsqrte_d(_1); +} +#endif #ifdef __cplusplus } diff --git a/lib/include/lasxintrin.h b/lib/include/lasxintrin.h index dafc2a2f3e..85020d8282 100644 --- a/lib/include/lasxintrin.h +++ b/lib/include/lasxintrin.h @@ -1726,18 +1726,6 @@ extern __inline return (__m256d)__builtin_lasx_xvfrecip_d((v4f64)_1); } -extern __inline - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 - __lasx_xvfrecipe_s(__m256 _1) { - return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1); -} - -extern __inline - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d - __lasx_xvfrecipe_d(__m256d _1) { - return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1); -} - extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 __lasx_xvfrint_s(__m256 _1) { @@ -1762,18 +1750,6 @@ extern __inline return (__m256d)__builtin_lasx_xvfrsqrt_d((v4f64)_1); } -extern __inline - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 - __lasx_xvfrsqrte_s(__m256 _1) { - return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1); -} - -extern __inline - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d - __lasx_xvfrsqrte_d(__m256d _1) { - return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1); -} - extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 __lasx_xvflogb_s(__m256 _1) { @@ -2585,7 +2561,7 @@ extern __inline extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i __lasx_xvorn_v(__m256i _1, __m256i _2) { - return (__m256i)__builtin_lasx_xvorn_v((v32i8)_1, (v32i8)_2); + return (__m256i)__builtin_lasx_xvorn_v((v32u8)_1, (v32u8)_2); } #define __lasx_xvldi(/*i13*/ _1) ((__m256i)__builtin_lasx_xvldi((_1))) @@ -3866,6 +3842,32 @@ extern __inline return (__m256i)__builtin_lasx_xvfcmp_sun_s((v8f32)_1, (v8f32)_2); } +#if defined(__loongarch_frecipe) +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_xvfrecipe_s(__m256 _1) { + return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_xvfrecipe_d(__m256d _1) { + return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_xvfrsqrte_s(__m256 _1) { + return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_xvfrsqrte_d(__m256d _1) { + return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1); +} +#endif + #define __lasx_xvpickve_d_f(/*__m256d*/ _1, /*ui2*/ _2) \ ((__m256d)__builtin_lasx_xvpickve_d_f((v4f64)(_1), (_2))) diff --git a/lib/include/limits.h b/lib/include/limits.h index 56dffe5684..d08227fe4d 100644 --- a/lib/include/limits.h +++ b/lib/include/limits.h @@ -111,11 +111,14 @@ #define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL) #endif -/* 
LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension. It's too bad - that we don't have something like #pragma poison that could be used to - deprecate a macro - the code should just use LLONG_MAX and friends. +/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension. Android's + bionic also defines them. It's too bad that we don't have something like + #pragma poison that could be used to deprecate a macro - the code should just + use LLONG_MAX and friends. */ -#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__) +#if (defined(__GNU_LIBRARY__) ? defined(__USE_GNU) \ + : !defined(__STRICT_ANSI__)) || \ + defined(__BIONIC__) #undef LONG_LONG_MIN #undef LONG_LONG_MAX diff --git a/lib/include/llvm_libc_wrappers/ctype.h b/lib/include/llvm_libc_wrappers/ctype.h index 49c2af9347..960cf43302 100644 --- a/lib/include/llvm_libc_wrappers/ctype.h +++ b/lib/include/llvm_libc_wrappers/ctype.h @@ -51,6 +51,19 @@ #pragma push_macro("toascii") #pragma push_macro("tolower") #pragma push_macro("toupper") +#pragma push_macro("isalnum_l") +#pragma push_macro("isalpha_l") +#pragma push_macro("isascii_l") +#pragma push_macro("isblank_l") +#pragma push_macro("iscntrl_l") +#pragma push_macro("isdigit_l") +#pragma push_macro("isgraph_l") +#pragma push_macro("islower_l") +#pragma push_macro("isprint_l") +#pragma push_macro("ispunct_l") +#pragma push_macro("isspace_l") +#pragma push_macro("isupper_l") +#pragma push_macro("isxdigit_l") #undef isalnum #undef isalpha @@ -68,6 +81,18 @@ #undef toascii #undef tolower #undef toupper +#undef isalnum_l +#undef isalpha_l +#undef iscntrl_l +#undef isdigit_l +#undef islower_l +#undef isgraph_l +#undef isprint_l +#undef ispunct_l +#undef isspace_l +#undef isupper_l +#undef isblank_l +#undef isxdigit_l #pragma omp begin declare target @@ -93,6 +118,19 @@ #pragma pop_macro("toascii") #pragma pop_macro("tolower") #pragma pop_macro("toupper") +#pragma pop_macro("isalnum_l") +#pragma pop_macro("isalpha_l") +#pragma pop_macro("isascii_l") +#pragma pop_macro("isblank_l") +#pragma pop_macro("iscntrl_l") +#pragma pop_macro("isdigit_l") +#pragma pop_macro("isgraph_l") +#pragma pop_macro("islower_l") +#pragma pop_macro("isprint_l") +#pragma pop_macro("ispunct_l") +#pragma pop_macro("isspace_l") +#pragma pop_macro("isupper_l") +#pragma pop_macro("isxdigit_l") #endif #undef __LIBC_ATTRS diff --git a/lib/include/llvm_libc_wrappers/stdlib.h b/lib/include/llvm_libc_wrappers/stdlib.h index 7fce5a1a31..69afdf4a68 100644 --- a/lib/include/llvm_libc_wrappers/stdlib.h +++ b/lib/include/llvm_libc_wrappers/stdlib.h @@ -34,8 +34,16 @@ _Static_assert(__builtin_offsetof(div_t, quot) == 0, "ABI mismatch!"); _Static_assert(__builtin_offsetof(ldiv_t, quot) == 0, "ABI mismatch!"); _Static_assert(__builtin_offsetof(lldiv_t, quot) == 0, "ABI mismatch!"); +#if defined(__GLIBC__) && __cplusplus >= 201703L +#define at_quick_exit atexit +#endif + #include +#if defined(__GLIBC__) && __cplusplus >= 201703L +#undef at_quick_exit +#endif + #pragma omp end declare target #undef __LIBC_ATTRS diff --git a/lib/include/lsxintrin.h b/lib/include/lsxintrin.h index f347955ce6..a9b19223fc 100644 --- a/lib/include/lsxintrin.h +++ b/lib/include/lsxintrin.h @@ -1776,18 +1776,6 @@ extern __inline return (__m128d)__builtin_lsx_vfrecip_d((v2f64)_1); } -extern __inline - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 - __lsx_vfrecipe_s(__m128 _1) { - return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1); -} - -extern __inline - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) __m128d - __lsx_vfrecipe_d(__m128d _1) { - return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1); -} - extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 __lsx_vfrint_s(__m128 _1) { @@ -1812,18 +1800,6 @@ extern __inline return (__m128d)__builtin_lsx_vfrsqrt_d((v2f64)_1); } -extern __inline - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 - __lsx_vfrsqrte_s(__m128 _1) { - return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1); -} - -extern __inline - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d - __lsx_vfrsqrte_d(__m128d _1) { - return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1); -} - extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 __lsx_vflogb_s(__m128 _1) { @@ -3425,7 +3401,7 @@ extern __inline extern __inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i __lsx_vorn_v(__m128i _1, __m128i _2) { - return (__m128i)__builtin_lsx_vorn_v((v16i8)_1, (v16i8)_2); + return (__m128i)__builtin_lsx_vorn_v((v16u8)_1, (v16u8)_2); } #define __lsx_vldi(/*i13*/ _1) ((__m128i)__builtin_lsx_vldi((_1))) @@ -3738,6 +3714,32 @@ extern __inline return (__m128i)__builtin_lsx_vfcmp_sun_s((v4f32)_1, (v4f32)_2); } +#if defined(__loongarch_frecipe) +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 + __lsx_vfrecipe_s(__m128 _1) { + return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d + __lsx_vfrecipe_d(__m128d _1) { + return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 + __lsx_vfrsqrte_s(__m128 _1) { + return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d + __lsx_vfrsqrte_d(__m128d _1) { + return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1); +} +#endif + #define __lsx_vrepli_b(/*si10*/ _1) ((__m128i)__builtin_lsx_vrepli_b((_1))) #define __lsx_vrepli_d(/*si10*/ _1) ((__m128i)__builtin_lsx_vrepli_d((_1))) diff --git a/lib/include/lzcntintrin.h b/lib/include/lzcntintrin.h index f4ddce9d0e..27509021ec 100644 --- a/lib/include/lzcntintrin.h +++ b/lib/include/lzcntintrin.h @@ -15,7 +15,13 @@ #define __LZCNTINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) +#endif #ifndef _MSC_VER /// Counts the number of leading zero bits in the operand. @@ -43,8 +49,7 @@ /// bits in the operand. /// \see _lzcnt_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -__lzcnt32(unsigned int __X) -{ +__lzcnt32(unsigned int __X) { return __builtin_ia32_lzcnt_u32(__X); } @@ -60,8 +65,7 @@ __lzcnt32(unsigned int __X) /// bits in the operand. /// \see __lzcnt32 static __inline__ unsigned int __DEFAULT_FN_ATTRS -_lzcnt_u32(unsigned int __X) -{ +_lzcnt_u32(unsigned int __X) { return __builtin_ia32_lzcnt_u32(__X); } @@ -93,8 +97,7 @@ _lzcnt_u32(unsigned int __X) /// bits in the operand. 
/// \see __lzcnt64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_lzcnt_u64(unsigned long long __X) -{ +_lzcnt_u64(unsigned long long __X) { return __builtin_ia32_lzcnt_u64(__X); } #endif diff --git a/lib/include/mmintrin.h b/lib/include/mmintrin.h index 4e154e2d85..dc0fa5c523 100644 --- a/lib/include/mmintrin.h +++ b/lib/include/mmintrin.h @@ -21,10 +21,45 @@ typedef int __v2si __attribute__((__vector_size__(8))); typedef short __v4hi __attribute__((__vector_size__(8))); typedef char __v8qi __attribute__((__vector_size__(8))); +/* Unsigned types */ +typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); +typedef unsigned short __v4hu __attribute__((__vector_size__(8))); +typedef unsigned char __v8qu __attribute__((__vector_size__(8))); + +/* We need an explicitly signed variant for char. Note that this shouldn't + * appear in the interface though. */ +typedef signed char __v8qs __attribute__((__vector_size__(8))); + +/* SSE/SSE2 types */ +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \ - __min_vector_width__(64))) +#if defined(__EVEX512__) && !defined(__AVX10_1_512__) +#define __DEFAULT_FN_ATTRS_SSE2 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("sse2,no-evex512"), __min_vector_width__(128))) +#else +#define __DEFAULT_FN_ATTRS_SSE2 \ + __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ + __min_vector_width__(128))) +#endif + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr +#else +#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 +#endif + +#define __trunc64(x) \ + (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __anyext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, -1, -1) /// Clears the MMX state by setting the state of the x87 stack registers /// to empty. @@ -50,10 +85,10 @@ _mm_empty(void) { /// A 32-bit integer value. /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the /// parameter. The upper 32 bits are set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi32_si64(int __i) { - return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); + return __extension__ (__m64)(__v2si){__i, 0}; } /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit @@ -67,10 +102,10 @@ _mm_cvtsi32_si64(int __i) /// A 64-bit integer vector. /// \returns A 32-bit signed integer value containing the lower 32 bits of the /// parameter. -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_si32(__m64 __m) { - return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); + return ((__v2si)__m)[0]; } /// Casts a 64-bit signed integer value into a 64-bit integer vector. @@ -83,7 +118,7 @@ _mm_cvtsi64_si32(__m64 __m) /// A 64-bit signed integer. 
/// \returns A 64-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_m64(long long __i) { return (__m64)__i; @@ -99,7 +134,7 @@ _mm_cvtsi64_m64(long long __i) /// A 64-bit integer vector. /// \returns A 64-bit signed integer containing the same bitwise pattern as the /// parameter. -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 _mm_cvtm64_si64(__m64 __m) { return (long long)__m; @@ -124,10 +159,11 @@ _mm_cvtm64_si64(__m64 __m) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_packsswb128( + (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } /// Converts, with saturation, 32-bit signed integers from both 64-bit integer @@ -149,10 +185,11 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); + return __trunc64(__builtin_ia32_packssdw128( + (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){})); } /// Converts, with saturation, 16-bit signed integers from both 64-bit integer @@ -174,10 +211,11 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_packuswb128( + (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] @@ -201,10 +239,11 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2) /// Bits [63:56] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, + 4, 12, 5, 13, 6, 14, 7, 15); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -224,10 +263,11 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) /// Bits [63:48] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, + 2, 6, 3, 7); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -245,10 +285,10 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] @@ -272,10 +312,11 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) /// Bits [31:24] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, + 0, 8, 1, 9, 2, 10, 3, 11); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -295,10 +336,11 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) /// Bits [31:16] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, + 0, 4, 1, 5); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -316,10 +358,10 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2); } /// Adds each 8-bit integer element of the first 64-bit integer vector @@ -337,10 +379,10 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); } /// Adds each 16-bit integer element of the first 64-bit integer vector @@ -358,10 +400,10 @@ _mm_add_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both /// parameters. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); } /// Adds each 32-bit integer element of the first 64-bit integer vector @@ -379,10 +421,10 @@ _mm_add_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); } /// Adds, with saturation, each 8-bit signed integer element of the first @@ -403,10 +445,10 @@ _mm_add_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2); } /// Adds, with saturation, each 16-bit signed integer element of the first @@ -427,10 +469,10 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2); } /// Adds, with saturation, each 8-bit unsigned integer element of the first @@ -450,10 +492,10 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2); } /// Adds, with saturation, each 16-bit unsigned integer element of the first @@ -473,10 +515,10 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2); } /// Subtracts each 8-bit integer element of the second 64-bit integer @@ -494,10 +536,10 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the differences of /// both parameters. 
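/* Illustrative usage sketch, not part of the patch: the add intrinsics above are
 * now implemented with plain vector arithmetic and __builtin_elementwise_add_sat
 * on SSE2 vectors, so existing callers behave the same. The helper name
 * demo_mmx_adds below is hypothetical. */
#include <mmintrin.h>

static int demo_mmx_adds(void) {
  __m64 a = _mm_set1_pi8(100);
  __m64 b = _mm_set1_pi8(100);
  __m64 wrap = _mm_add_pi8(a, b);  /* wrapping add: each byte becomes 0xC8 (-56) */
  __m64 sat = _mm_adds_pi8(a, b);  /* signed saturating add: each byte clamps to 127 */
  return _mm_cvtsi64_si32(wrap) != _mm_cvtsi64_si32(sat); /* yields 1 */
}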
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); } /// Subtracts each 16-bit integer element of the second 64-bit integer @@ -515,10 +557,10 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); } /// Subtracts each 32-bit integer element of the second 64-bit integer @@ -536,10 +578,10 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32] containing the subtrahends. /// \returns A 64-bit integer vector of [2 x i32] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); } /// Subtracts, with saturation, each 8-bit signed integer element of the second @@ -560,10 +602,10 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2); } /// Subtracts, with saturation, each 16-bit signed integer element of the @@ -584,10 +626,10 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2); } /// Subtracts each 8-bit unsigned integer element of the second 64-bit @@ -608,10 +650,10 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2); } /// Subtracts each 16-bit unsigned integer element of the second 64-bit @@ -632,10 +674,10 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -659,10 +701,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of /// products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_madd_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -680,10 +723,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -701,10 +745,10 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mullo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); } /// Left-shifts each 16-bit signed integer element of the first @@ -724,10 +768,11 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2) /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Left-shifts each 16-bit signed integer element of a 64-bit integer @@ -746,10 +791,11 @@ _mm_sll_pi16(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m), + __count)); } /// Left-shifts each 32-bit signed integer element of the first @@ -769,10 +815,11 @@ _mm_slli_pi16(__m64 __m, int __count) /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); + return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Left-shifts each 32-bit signed integer element of a 64-bit integer @@ -791,10 +838,11 @@ _mm_sll_pi32(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m), + __count)); } /// Left-shifts the first 64-bit integer parameter by the number of bits @@ -811,10 +859,11 @@ _mm_slli_pi32(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllq((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m), + (__v2di)__anyext128(__count))); } /// Left-shifts the first parameter, which is a 64-bit integer, by the @@ -831,10 +880,11 @@ _mm_sll_si64(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m), + __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -855,10 +905,11 @@ _mm_slli_si64(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -878,10 +929,11 @@ _mm_sra_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m), + __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -902,10 +954,11 @@ _mm_srai_pi16(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -925,10 +978,11 @@ _mm_sra_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m), + __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -948,10 +1002,11 @@ _mm_srai_pi32(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -970,10 +1025,11 @@ _mm_srl_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m), + __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -993,10 +1049,11 @@ _mm_srli_pi16(__m64 __m, int __count) /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -1015,10 +1072,11 @@ _mm_srl_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m), + __count)); } /// Right-shifts the first 64-bit integer parameter by the number of bits @@ -1035,10 +1093,11 @@ _mm_srli_pi32(__m64 __m, int __count) /// \param __count /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the right-shifted value. 
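/* Illustrative usage sketch, not part of the patch: the shift intrinsics above now
 * widen the 64-bit operand to 128 bits with __anyext128, shift with the SSE2
 * builtin, and narrow back with __trunc64; the observable results are unchanged.
 * The helper name demo_mmx_shifts below is hypothetical. */
#include <mmintrin.h>

static int demo_mmx_shifts(void) {
  __m64 x = _mm_set1_pi16(-32768); /* each 16-bit lane holds 0x8000 */
  __m64 sa = _mm_srai_pi16(x, 4);  /* arithmetic shift: sign-fills to 0xF800 */
  __m64 sl = _mm_srli_pi16(x, 4);  /* logical shift: zero-fills to 0x0800 */
  return _mm_cvtsi64_si32(sa) != _mm_cvtsi64_si32(sl); /* yields 1 */
}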
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m), + (__v2di)__anyext128(__count))); } /// Right-shifts the first parameter, which is a 64-bit integer, by the @@ -1056,10 +1115,11 @@ _mm_srl_si64(__m64 __m, __m64 __count) /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m), + __count)); } /// Performs a bitwise AND of two 64-bit integer vectors. @@ -1074,10 +1134,10 @@ _mm_srli_si64(__m64 __m, int __count) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_and_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise NOT of the first 64-bit integer vector, and then @@ -1095,10 +1155,10 @@ _mm_and_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of the second /// parameter and the one's complement of the first parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_andnot_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2); + return (__m64)(~((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise OR of two 64-bit integer vectors. @@ -1113,10 +1173,10 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_or_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) | ((__v1du)__m2)); } /// Performs a bitwise exclusive OR of two 64-bit integer vectors. @@ -1131,10 +1191,10 @@ _mm_or_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_xor_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1153,10 +1213,10 @@ _mm_xor_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2)); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1175,10 +1235,10 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. 
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2)); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1197,10 +1257,10 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2si)__m1) == ((__v2si)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1219,10 +1279,12 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); + /* This function always performs a signed comparison, but __v8qi is a char + which may be signed or unsigned, so use __v8qs. */ + return (__m64)((__v8qs)__m1 > (__v8qs)__m2); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1241,10 +1303,10 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)((__v4hi)__m1 > (__v4hi)__m2); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1263,10 +1325,10 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); + return (__m64)((__v2si)__m1 > (__v2si)__m2); } /// Constructs a 64-bit integer vector initialized to zero. @@ -1276,10 +1338,9 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) /// This intrinsic corresponds to the PXOR instruction. /// /// \returns An initialized 64-bit integer vector with all elements set to zero. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_setzero_si64(void) -{ - return __extension__ (__m64){ 0LL }; +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_setzero_si64(void) { + return __extension__(__m64){0LL}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1297,10 +1358,9 @@ _mm_setzero_si64(void) /// A 32-bit integer value used to initialize the lower 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set_pi32(int __i1, int __i0) -{ - return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_set_pi32(int __i1, int __i0) { + return __extension__(__m64)(__v2si){__i0, __i1}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1320,10 +1380,9 @@ _mm_set_pi32(int __i1, int __i0) /// \param __s0 /// A 16-bit integer value used to initialize bits [15:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) -{ - return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) { + return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1351,12 +1410,11 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) /// \param __b0 /// An 8-bit integer value used to initialize bits [7:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, - char __b1, char __b0) -{ - return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, - __b4, __b5, __b6, __b7); + char __b1, char __b0) { + return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3, + __b4, __b5, __b6, __b7}; } /// Constructs a 64-bit integer vector of [2 x i32], with each of the @@ -1372,10 +1430,9 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, /// A 32-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [2 x i32]. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set1_pi32(int __i) -{ - return _mm_set_pi32(__i, __i); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_set1_pi32(int __i) { + return _mm_set_pi32(__i, __i); } /// Constructs a 64-bit integer vector of [4 x i16], with each of the @@ -1391,10 +1448,9 @@ _mm_set1_pi32(int __i) /// A 16-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [4 x i16]. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set1_pi16(short __w) -{ - return _mm_set_pi16(__w, __w, __w, __w); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_set1_pi16(short __w) { + return _mm_set_pi16(__w, __w, __w, __w); } /// Constructs a 64-bit integer vector of [8 x i8], with each of the @@ -1409,10 +1465,9 @@ _mm_set1_pi16(short __w) /// An 8-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [8 x i8]. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set1_pi8(char __b) -{ - return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_set1_pi8(char __b) { + return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); } /// Constructs a 64-bit integer vector, initialized in reverse order with @@ -1430,10 +1485,9 @@ _mm_set1_pi8(char __b) /// A 32-bit integer value used to initialize the upper 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. 
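/* Illustrative usage sketch, not part of the patch: the set/setr constructors above
 * now expand to vector literals instead of __builtin_ia32_vec_init_* calls; the two
 * forms differ only in argument order. The helper name demo_mmx_set is hypothetical. */
#include <mmintrin.h>

static int demo_mmx_set(void) {
  __m64 a = _mm_set_pi32(1, 2);  /* bits [63:32] = 1, bits [31:0] = 2 */
  __m64 b = _mm_setr_pi32(2, 1); /* same layout, arguments given in memory order */
  return _mm_cvtsi64_si32(a) == _mm_cvtsi64_si32(b); /* both low halves are 2 */
}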
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_setr_pi32(int __i0, int __i1) -{ - return _mm_set_pi32(__i1, __i0); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_setr_pi32(int __i0, int __i1) { + return _mm_set_pi32(__i1, __i0); } /// Constructs a 64-bit integer vector, initialized in reverse order with @@ -1453,10 +1507,9 @@ _mm_setr_pi32(int __i0, int __i1) /// \param __w3 /// A 16-bit integer value used to initialize bits [63:48] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) -{ - return _mm_set_pi16(__w3, __w2, __w1, __w0); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { + return _mm_set_pi16(__w3, __w2, __w1, __w0); } /// Constructs a 64-bit integer vector, initialized in reverse order with @@ -1484,14 +1537,15 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) /// \param __b7 /// An 8-bit integer value used to initialize bits [63:56] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, - char __b6, char __b7) -{ - return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); + char __b6, char __b7) { + return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); } -#undef __DEFAULT_FN_ATTRS +#undef __anyext128 +#undef __trunc64 +#undef __DEFAULT_FN_ATTRS_SSE2 /* Aliases for compatibility. */ #define _m_empty _mm_empty diff --git a/lib/include/module.modulemap b/lib/include/module.modulemap index 9ffc249c8d..dcaf09e8f2 100644 --- a/lib/include/module.modulemap +++ b/lib/include/module.modulemap @@ -66,6 +66,8 @@ module _Builtin_intrinsics [system] [extern_c] { textual header "__wmmintrin_aes.h" textual header "__wmmintrin_pclmul.h" + textual header "mm3dnow.h" + explicit module mm_malloc { requires !freestanding header "mm_malloc.h" @@ -122,10 +124,6 @@ module _Builtin_intrinsics [system] [extern_c] { header "popcntintrin.h" } - explicit module mm3dnow { - header "mm3dnow.h" - } - explicit module aes_pclmul { header "wmmintrin.h" export aes diff --git a/lib/include/movrs_avx10_2_512intrin.h b/lib/include/movrs_avx10_2_512intrin.h new file mode 100644 index 0000000000..5cd907a597 --- /dev/null +++ b/lib/include/movrs_avx10_2_512intrin.h @@ -0,0 +1,98 @@ +/*===----- movrs_avx10_2_512intrin.h - AVX10.2-512-MOVRS intrinsics --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __MOVRS_AVX10_2_512INTRIN_H +#define __MOVRS_AVX10_2_512INTRIN_H +#ifdef __x86_64__ + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("movrs, avx10.2-512"), __min_vector_width__(512))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadrs_epi8(void const *__A) { + return (__m512i)__builtin_ia32_vmovrsb512((const __v64qi *)(__A)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_loadrs_epi8(__m512i __W, __mmask64 __U, void const *__A) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_loadrs_epi8(__A), (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadrs_epi8(__mmask64 __U, void const *__A) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_loadrs_epi8(__A), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadrs_epi32(void const *__A) { + return (__m512i)__builtin_ia32_vmovrsd512((const __v16si *)(__A)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_loadrs_epi32(__m512i __W, __mmask16 __U, void const *__A) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_loadrs_epi32(__A), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadrs_epi32(__mmask16 __U, void const *__A) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_loadrs_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadrs_epi64(void const *__A) { + return (__m512i)__builtin_ia32_vmovrsq512((const __v8di *)(__A)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_loadrs_epi64(__m512i __W, __mmask8 __U, void const *__A) { + return (__m512i)__builtin_ia32_selectq_512( + (__mmask8)__U, (__v8di)_mm512_loadrs_epi64(__A), (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadrs_epi64(__mmask8 __U, void const *__A) { + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_loadrs_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadrs_epi16(void const *__A) { + return (__m512i)__builtin_ia32_vmovrsw512((const __v32hi *)(__A)); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_loadrs_epi16(__m512i __W, __mmask32 __U, void const *__A) { + return (__m512i)__builtin_ia32_selectw_512( + (__mmask32)__U, (__v32hi)_mm512_loadrs_epi16(__A), (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadrs_epi16(__mmask32 __U, void const *__A) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_loadrs_epi16(__A), + (__v32hi)_mm512_setzero_si512()); +} + +#undef __DEFAULT_FN_ATTRS512 + +#endif /* __x86_64__ */ +#endif /* __MOVRS_AVX10_2_512INTRIN_H */ diff --git a/lib/include/movrs_avx10_2intrin.h b/lib/include/movrs_avx10_2intrin.h new file mode 100644 index 0000000000..27b625b6b4 --- /dev/null +++ b/lib/include/movrs_avx10_2intrin.h @@ -0,0 +1,174 @@ +/*===--------- movrs_avx10_2intrin.h - AVX10.2-MOVRS intrinsics ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." 
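/* Illustrative usage sketch, not part of the patch: the masked loadrs forms above
 * follow the usual AVX-512 select pattern (load, then blend under the mask). This
 * assumes an x86-64 compiler with the movrs and avx10.2-512 target features
 * enabled; the helper name load_low_half_rs is hypothetical. */
#include <immintrin.h>

static __m512i load_low_half_rs(const void *p) {
  /* keep the eight low 32-bit elements selected by the mask, zero the rest */
  return _mm512_maskz_loadrs_epi32((__mmask16)0x00FF, p);
}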
+#endif + +#ifndef __MOVRS_AVX10_2INTRIN_H +#define __MOVRS_AVX10_2INTRIN_H +#ifdef __x86_64__ + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("movrs,avx10.2-256"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("movrs,avx10.2-256"), __min_vector_width__(256))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_loadrs_epi8(void const *__A) { + return (__m128i)__builtin_ia32_vmovrsb128((const __v16qi *)(__A)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_loadrs_epi8(__m128i __W, __mmask16 __U, void const *__A) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_loadrs_epi8(__A), (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_loadrs_epi8(__mmask16 __U, void const *__A) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_loadrs_epi8(__A), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadrs_epi8(void const *__A) { + return (__m256i)__builtin_ia32_vmovrsb256((const __v32qi *)(__A)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_loadrs_epi8(__m256i __W, __mmask32 __U, void const *__A) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask32)__U, (__v32qi)_mm256_loadrs_epi8(__A), (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadrs_epi8(__mmask32 __U, void const *__A) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_loadrs_epi8(__A), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_loadrs_epi32(void const *__A) { + return (__m128i)__builtin_ia32_vmovrsd128((const __v4si *)(__A)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_loadrs_epi32(__m128i __W, __mmask8 __U, void const *__A) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_loadrs_epi32(__A), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_loadrs_epi32(__mmask8 __U, void const *__A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_loadrs_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadrs_epi32(void const *__A) { + return (__m256i)__builtin_ia32_vmovrsd256((const __v8si *)(__A)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_loadrs_epi32(__m256i __W, __mmask8 __U, void const *__A) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_loadrs_epi32(__A), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadrs_epi32(__mmask8 __U, void const *__A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_loadrs_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_loadrs_epi64(void const *__A) { + return (__m128i)__builtin_ia32_vmovrsq128((const __v2di *)(__A)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_loadrs_epi64(__m128i __W, __mmask8 __U, void const *__A) { + return (__m128i)__builtin_ia32_selectq_128( + (__mmask8)__U, (__v2di)_mm_loadrs_epi64(__A), (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_loadrs_epi64(__mmask8 __U, void const *__A) { + return 
(__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_loadrs_epi64(__A), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadrs_epi64(void const *__A) { + return (__m256i)__builtin_ia32_vmovrsq256((const __v4di *)(__A)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_loadrs_epi64(__m256i __W, __mmask8 __U, void const *__A) { + return (__m256i)__builtin_ia32_selectq_256( + (__mmask8)__U, (__v4di)_mm256_loadrs_epi64(__A), (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadrs_epi64(__mmask8 __U, void const *__A) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_loadrs_epi64(__A), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_loadrs_epi16(void const *__A) { + return (__m128i)__builtin_ia32_vmovrsw128((const __v8hi *)(__A)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_loadrs_epi16(__m128i __W, __mmask8 __U, void const *__A) { + return (__m128i)__builtin_ia32_selectw_128( + (__mmask8)__U, (__v8hi)_mm_loadrs_epi16(__A), (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_loadrs_epi16(__mmask8 __U, void const *__A) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_loadrs_epi16(__A), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadrs_epi16(void const *__A) { + return (__m256i)__builtin_ia32_vmovrsw256((const __v16hi *)(__A)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_loadrs_epi16(__m256i __W, __mmask16 __U, void const *__A) { + return (__m256i)__builtin_ia32_selectw_256( + (__mmask16)__U, (__v16hi)_mm256_loadrs_epi16(__A), (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadrs_epi16(__mmask16 __U, void const *__A) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_loadrs_epi16(__A), + (__v16hi)_mm256_setzero_si256()); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif /* __x86_64__ */ +#endif /* __MOVRS_AVX10_2INTRIN_H */ diff --git a/lib/include/movrsintrin.h b/lib/include/movrsintrin.h new file mode 100644 index 0000000000..250f4004cd --- /dev/null +++ b/lib/include/movrsintrin.h @@ -0,0 +1,59 @@ +/*===---------------- movrsintrin.h - MOVRS intrinsics ----------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===----------------------------------------------------------------------===*/ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif // __IMMINTRIN_H + +#ifndef __MOVRSINTRIN_H +#define __MOVRSINTRIN_H + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("movrs"))) + +#ifdef __x86_64__ +static __inline__ char __DEFAULT_FN_ATTRS _movrs_i8(const void *__A) { + return (char)__builtin_ia32_movrsqi((const void *)__A); +} + +static __inline__ short __DEFAULT_FN_ATTRS _movrs_i16(const void *__A) { + return (short)__builtin_ia32_movrshi((const void *)__A); +} + +static __inline__ int __DEFAULT_FN_ATTRS _movrs_i32(const void *__A) { + return (int)__builtin_ia32_movrssi((const void *)__A); +} + +static __inline__ long long __DEFAULT_FN_ATTRS _movrs_i64(const void *__A) { + return (long long)__builtin_ia32_movrsdi((const void *)__A); +} +#endif // __x86_64__ + +// Loads a memory sequence containing the specified memory address into +/// the L3 data cache. Data will be shared (read/written) to by requesting +/// core and other cores. +/// +/// Note that the effect of this intrinsic is dependent on the processor +/// implementation. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PREFETCHRS instruction. +/// +/// \param __P +/// A pointer specifying the memory address to be prefetched. +static __inline__ void __DEFAULT_FN_ATTRS +_m_prefetchrs(volatile const void *__P) { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-qual" + __builtin_ia32_prefetchrs((const void *)__P); +#pragma clang diagnostic pop +} + +#undef __DEFAULT_FN_ATTRS +#endif // __MOVRSINTRIN_H \ No newline at end of file diff --git a/lib/include/openmp_wrappers/__clang_openmp_device_functions.h b/lib/include/openmp_wrappers/__clang_openmp_device_functions.h index d5b6846b03..3e354c63ef 100644 --- a/lib/include/openmp_wrappers/__clang_openmp_device_functions.h +++ b/lib/include/openmp_wrappers/__clang_openmp_device_functions.h @@ -10,17 +10,15 @@ #ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ #define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__ -#ifndef _OPENMP -#error "This file is for OpenMP compilation only." -#endif - #ifdef __cplusplus extern "C" { #endif +#ifdef __NVPTX__ #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) +#pragma push_macro("__CUDA__") #define __CUDA__ #define __OPENMP_NVPTX__ @@ -31,9 +29,10 @@ extern "C" { #include <__clang_cuda_device_functions.h> #undef __OPENMP_NVPTX__ -#undef __CUDA__ +#pragma pop_macro("__CUDA__") #pragma omp end declare variant +#endif #ifdef __AMDGCN__ #pragma omp begin declare variant match(device = {arch(amdgcn)}) diff --git a/lib/include/openmp_wrappers/complex_cmath.h b/lib/include/openmp_wrappers/complex_cmath.h index e3d9aebbbc..cee36bde3f 100644 --- a/lib/include/openmp_wrappers/complex_cmath.h +++ b/lib/include/openmp_wrappers/complex_cmath.h @@ -64,8 +64,13 @@ template __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) { } // conj - -template std::complex<_Tp> conj(const std::complex<_Tp> &__c) { +#ifdef _GLIBCXX20_CONSTEXPR +#define CXX20_CONSTEXPR_DEVICE __DEVICE__ +#else +#define CXX20_CONSTEXPR_DEVICE +#endif +template +CXX20_CONSTEXPR_DEVICE std::complex<_Tp> conj(const std::complex<_Tp> &__c) { return std::complex<_Tp>(__c.real(), -__c.imag()); } diff --git a/lib/include/pmmintrin.h b/lib/include/pmmintrin.h index 91cee1edda..cd605df7fb 100644 --- a/lib/include/pmmintrin.h +++ b/lib/include/pmmintrin.h @@ -17,9 +17,21 @@ #include /* Define the default attributes for the functions in this file. 
*/ +#if defined(__EVEX512__) && !defined(__AVX10_1_512__) #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, \ __target__("sse3,no-evex512"), __min_vector_width__(128))) +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("sse3"), \ + __min_vector_width__(128))) +#endif + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif /// Loads data from an unaligned memory location to elements in a 128-bit /// vector. @@ -122,7 +134,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b) /// destination. /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated /// values. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movehdup_ps(__m128 __a) { return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); @@ -143,7 +155,7 @@ _mm_movehdup_ps(__m128 __a) /// destination. /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated /// values. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_moveldup_ps(__m128 __a) { return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); @@ -244,7 +256,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b) /// [127:64] and [63:0] of the destination. /// \returns A 128-bit vector of [2 x double] containing the moved and /// duplicated values. -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movedup_pd(__m128d __a) { return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); @@ -297,5 +309,6 @@ _mm_mwait(unsigned __extensions, unsigned __hints) } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif /* __PMMINTRIN_H */ diff --git a/lib/include/popcntintrin.h b/lib/include/popcntintrin.h index 0aa94aecda..b276b4da4d 100644 --- a/lib/include/popcntintrin.h +++ b/lib/include/popcntintrin.h @@ -11,12 +11,13 @@ #define __POPCNTINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt"))) - #if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("popcnt"))) constexpr #else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("popcnt"))) #endif /// Counts the number of bits in the source operand having a value of 1. @@ -29,7 +30,7 @@ /// An unsigned 32-bit integer operand. /// \returns A 32-bit integer containing the number of bits with value 1 in the /// source operand. -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ int __DEFAULT_FN_ATTRS _mm_popcnt_u32(unsigned int __A) { return __builtin_popcount(__A); @@ -46,7 +47,7 @@ _mm_popcnt_u32(unsigned int __A) /// An unsigned 64-bit integer operand. /// \returns A 64-bit integer containing the number of bits with value 1 in the /// source operand. 
-static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ long long __DEFAULT_FN_ATTRS _mm_popcnt_u64(unsigned long long __A) { return __builtin_popcountll(__A); @@ -54,6 +55,5 @@ _mm_popcnt_u64(unsigned long long __A) #endif /* __x86_64__ */ #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif /* __POPCNTINTRIN_H */ diff --git a/lib/include/ptrauth.h b/lib/include/ptrauth.h index 154b599862..d489a67c53 100644 --- a/lib/include/ptrauth.h +++ b/lib/include/ptrauth.h @@ -42,6 +42,9 @@ typedef enum { The extra data is always 0. */ ptrauth_key_cxx_vtable_pointer = ptrauth_key_process_independent_data, + /* The key used to sign pointers in ELF .init_array/.fini_array. */ + ptrauth_key_init_fini_pointer = ptrauth_key_process_independent_code, + /* Other pointers signed under the ABI use private ABI rules. */ } ptrauth_key; @@ -253,6 +256,9 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; [[clang::ptrauth_vtable_pointer(key, address_discrimination, \ extra_discrimination)]] +/* The value is ptrauth_string_discriminator("init_fini") */ +#define __ptrauth_init_fini_discriminator 0xd9d4 + #else #define ptrauth_strip(__value, __key) \ diff --git a/lib/include/riscv_corev_alu.h b/lib/include/riscv_corev_alu.h new file mode 100644 index 0000000000..d2832ddf72 --- /dev/null +++ b/lib/include/riscv_corev_alu.h @@ -0,0 +1,128 @@ +/*===---- riscv_corev_alu.h - CORE-V ALU intrinsics ------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __RISCV_COREV_ALU_H +#define __RISCV_COREV_ALU_H + +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__riscv_xcvalu) + +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_abs(long a) { + return __builtin_abs(a); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_slet(long a, long b) { + return __builtin_riscv_cv_alu_slet(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS +__riscv_cv_alu_sletu(unsigned long a, unsigned long b) { + return __builtin_riscv_cv_alu_sletu(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_min(long a, long b) { + return __builtin_elementwise_min(a, b); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_minu(unsigned long a, unsigned long b) { + return __builtin_elementwise_min(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_max(long a, long b) { + return __builtin_elementwise_max(a, b); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_maxu(unsigned long a, unsigned long b) { + return __builtin_elementwise_max(a, b); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_exths(int16_t a) { + return __builtin_riscv_cv_alu_exths(a); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_exthz(uint16_t a) { + return __builtin_riscv_cv_alu_exthz(a); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_extbs(int8_t a) { + return __builtin_riscv_cv_alu_extbs(a); +} + +static __inline__ unsigned long __DEFAULT_FN_ATTRS +__riscv_cv_alu_extbz(uint8_t a) { + return __builtin_riscv_cv_alu_extbz(a); +} + +static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_clip(long a, + unsigned long b) { + return 
__builtin_riscv_cv_alu_clip(a, b);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_clipu(unsigned long a, unsigned long b) {
+  return __builtin_riscv_cv_alu_clipu(a, b);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addN(long a, long b,
+                                                              uint8_t shft) {
+  return __builtin_riscv_cv_alu_addN(a, b, shft);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_adduN(unsigned long a, unsigned long b, uint8_t shft) {
+  return __builtin_riscv_cv_alu_adduN(a, b, shft);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addRN(long a, long b,
+                                                               uint8_t shft) {
+  return __builtin_riscv_cv_alu_addRN(a, b, shft);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_adduRN(unsigned long a, unsigned long b, uint8_t shft) {
+  return __builtin_riscv_cv_alu_adduRN(a, b, shft);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subN(long a, long b,
+                                                              uint8_t shft) {
+  return __builtin_riscv_cv_alu_subN(a, b, shft);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_subuN(unsigned long a, unsigned long b, uint8_t shft) {
+  return __builtin_riscv_cv_alu_subuN(a, b, shft);
+}
+
+static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subRN(long a, long b,
+                                                               uint8_t shft) {
+  return __builtin_riscv_cv_alu_subRN(a, b, shft);
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__riscv_cv_alu_subuRN(unsigned long a, unsigned long b, uint8_t shft) {
+  return __builtin_riscv_cv_alu_subuRN(a, b, shft);
+}
+
+#endif // defined(__riscv_xcvalu)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // define __RISCV_COREV_ALU_H
diff --git a/lib/include/riscv_vector.h b/lib/include/riscv_vector.h
index c99ceb8021..0560e82a85 100644
--- a/lib/include/riscv_vector.h
+++ b/lib/include/riscv_vector.h
@@ -419,7 +419,6 @@ typedef __rvv_bfloat16m2x4_t vbfloat16m2x4_t;
 typedef __rvv_bfloat16m4_t vbfloat16m4_t;
 typedef __rvv_bfloat16m4x2_t vbfloat16m4x2_t;
 typedef __rvv_bfloat16m8_t vbfloat16m8_t;
-#define __riscv_v_intrinsic_overloading 1
 
 #ifdef __cplusplus
 }
diff --git a/lib/include/sm4evexintrin.h b/lib/include/sm4evexintrin.h
new file mode 100644
index 0000000000..f6ae0037ba
--- /dev/null
+++ b/lib/include/sm4evexintrin.h
@@ -0,0 +1,32 @@
+/*===--------------- sm4evexintrin.h - SM4 EVEX intrinsics -----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <sm4evexintrin.h> directly; include <immintrin.h> instead."
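The CORE-V ALU wrappers above mostly map one-to-one onto builtins, and a few (abs, min, max) are expressed directly with Clang's generic __builtin_abs / __builtin_elementwise_* forms, so their semantics can be read straight from the header body. A small usage sketch, not part of the patch, assuming a RISC-V target with the XCValu vendor extension enabled (so __riscv_xcvalu is defined); the helper names are illustrative:

    #include <riscv_corev_alu.h>

    /* Clamp x into [lo, hi] using the CORE-V min/max helpers. */
    static long clamp_range(long x, long lo, long hi) {
      return __riscv_cv_alu_max(lo, __riscv_cv_alu_min(x, hi));
    }

    /* Absolute difference of two signed values. */
    static long abs_diff(long a, long b) {
      return __riscv_cv_abs(a - b);
    }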
+#endif // __IMMINTRIN_H
+
+#ifndef __SM4EVEXINTRIN_H
+#define __SM4EVEXINTRIN_H
+
+#define __DEFAULT_FN_ATTRS512 \
+  __attribute__((__always_inline__, __nodebug__, \
+                 __target__("sm4,avx10.2-512"), __min_vector_width__(512)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sm4key4_epi32(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_vsm4key4512((__v16su)__A, (__v16su)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sm4rnds4_epi32(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_vsm4rnds4512((__v16su)__A, (__v16su)__B);
+}
+
+#undef __DEFAULT_FN_ATTRS512
+
+#endif // __SM4EVEXINTRIN_H
diff --git a/lib/include/smmintrin.h b/lib/include/smmintrin.h
index b3fec474e3..bc6fe4c801 100644
--- a/lib/include/smmintrin.h
+++ b/lib/include/smmintrin.h
@@ -17,9 +17,15 @@
 #include <tmmintrin.h>
 
 /* Define the default attributes for the functions in this file. */
+#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, \
                  __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
+#else
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
+                 __min_vector_width__(128)))
+#endif
 
 /* SSE4 Rounding macros. */
 #define _MM_FROUND_TO_NEAREST_INT 0x00
diff --git a/lib/include/stdalign.h b/lib/include/stdalign.h
index 56cdfa52d4..158508e65d 100644
--- a/lib/include/stdalign.h
+++ b/lib/include/stdalign.h
@@ -10,10 +10,6 @@
 #ifndef __STDALIGN_H
 #define __STDALIGN_H
 
-#if defined(__MVS__) && __has_include_next(<stdalign.h>)
-#include_next <stdalign.h>
-#else
-
 #if defined(__cplusplus) || \
     (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202311L)
 #ifndef __cplusplus
@@ -25,5 +21,4 @@
 #define __alignof_is_defined 1
 #endif /* __STDC_VERSION__ */
 
-#endif /* __MVS__ */
 #endif /* __STDALIGN_H */
diff --git a/lib/include/tbmintrin.h b/lib/include/tbmintrin.h
index f4e848a1c0..cf92d5a7b3 100644
--- a/lib/include/tbmintrin.h
+++ b/lib/include/tbmintrin.h
@@ -15,63 +15,60 @@
 #define __TBMINTRIN_H
 
 /* Define the default attributes for the functions in this file.
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) constexpr +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) +#endif #define __bextri_u32(a, b) \ ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \ (unsigned int)(b))) static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blcfill_u32(unsigned int __a) -{ +__blcfill_u32(unsigned int __a) { return __a & (__a + 1); } static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blci_u32(unsigned int __a) -{ +__blci_u32(unsigned int __a) { return __a | ~(__a + 1); } static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blcic_u32(unsigned int __a) -{ +__blcic_u32(unsigned int __a) { return ~__a & (__a + 1); } static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blcmsk_u32(unsigned int __a) -{ +__blcmsk_u32(unsigned int __a) { return __a ^ (__a + 1); } static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blcs_u32(unsigned int __a) -{ +__blcs_u32(unsigned int __a) { return __a | (__a + 1); } static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsfill_u32(unsigned int __a) -{ +__blsfill_u32(unsigned int __a) { return __a | (__a - 1); } static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsic_u32(unsigned int __a) -{ +__blsic_u32(unsigned int __a) { return ~__a | (__a - 1); } static __inline__ unsigned int __DEFAULT_FN_ATTRS -__t1mskc_u32(unsigned int __a) -{ +__t1mskc_u32(unsigned int __a) { return ~__a | (__a + 1); } static __inline__ unsigned int __DEFAULT_FN_ATTRS -__tzmsk_u32(unsigned int __a) -{ +__tzmsk_u32(unsigned int __a) { return ~__a & (__a - 1); } @@ -81,56 +78,47 @@ __tzmsk_u32(unsigned int __a) (unsigned long long)(b))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blcfill_u64(unsigned long long __a) -{ +__blcfill_u64(unsigned long long __a) { return __a & (__a + 1); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blci_u64(unsigned long long __a) -{ +__blci_u64(unsigned long long __a) { return __a | ~(__a + 1); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blcic_u64(unsigned long long __a) -{ +__blcic_u64(unsigned long long __a) { return ~__a & (__a + 1); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blcmsk_u64(unsigned long long __a) -{ +__blcmsk_u64(unsigned long long __a) { return __a ^ (__a + 1); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blcs_u64(unsigned long long __a) -{ +__blcs_u64(unsigned long long __a) { return __a | (__a + 1); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsfill_u64(unsigned long long __a) -{ +__blsfill_u64(unsigned long long __a) { return __a | (__a - 1); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsic_u64(unsigned long long __a) -{ +__blsic_u64(unsigned long long __a) { return ~__a | (__a - 1); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__t1mskc_u64(unsigned long long __a) -{ +__t1mskc_u64(unsigned long long __a) { return ~__a | (__a + 1); } static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__tzmsk_u64(unsigned long long __a) -{ +__tzmsk_u64(unsigned long long __a) { return ~__a & (__a - 1); } #endif diff --git a/lib/include/tmmintrin.h b/lib/include/tmmintrin.h index bf8327b692..371cc82e3d 100644 --- a/lib/include/tmmintrin.h +++ b/lib/include/tmmintrin.h @@ -17,13 +17,21 @@ #include /* Define the default attributes 
for the functions in this file. */ +#if defined(__EVEX512__) && !defined(__AVX10_1_512__) #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, \ - __target__("ssse3,no-evex512"), __min_vector_width__(64))) -#define __DEFAULT_FN_ATTRS_MMX \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("mmx,ssse3,no-evex512"), \ - __min_vector_width__(64))) + __target__("ssse3,no-evex512"), __min_vector_width__(128))) +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), \ + __min_vector_width__(128))) +#endif + +#define __trunc64(x) \ + (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __anyext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, -1, -1) /// Computes the absolute value of each of the packed 8-bit signed /// integers in the source operand and stores the 8-bit unsigned integer @@ -37,10 +45,10 @@ /// A 64-bit vector of [8 x i8]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi8(__m64 __a) { - return (__m64)__builtin_ia32_pabsb((__v8qi)__a); + return (__m64)__builtin_elementwise_abs((__v8qs)__a); } /// Computes the absolute value of each of the packed 8-bit signed @@ -73,10 +81,10 @@ _mm_abs_epi8(__m128i __a) /// A 64-bit vector of [4 x i16]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi16(__m64 __a) { - return (__m64)__builtin_ia32_pabsw((__v4hi)__a); + return (__m64)__builtin_elementwise_abs((__v4hi)__a); } /// Computes the absolute value of each of the packed 16-bit signed @@ -109,10 +117,10 @@ _mm_abs_epi16(__m128i __a) /// A 64-bit vector of [2 x i32]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi32(__m64 __a) { - return (__m64)__builtin_ia32_pabsd((__v2si)__a); + return (__m64)__builtin_elementwise_abs((__v2si)__a); } /// Computes the absolute value of each of the packed 32-bit signed @@ -177,7 +185,10 @@ _mm_abs_epi32(__m128i __a) /// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. #define _mm_alignr_pi8(a, b, n) \ - ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))) + ((__m64)__builtin_shufflevector( \ + __builtin_ia32_psrldqi128_byteshift( \ + __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ + (n)), __extension__ (__v2di){}, 0)) /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. @@ -242,10 +253,11 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_phaddw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -265,10 +277,11 @@ _mm_hadd_pi16(__m64 __a, __m64 __b) /// destination. 
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_phaddd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally adds, with saturation, the adjacent pairs of values contained @@ -317,10 +330,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadds_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_phaddsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -386,10 +400,11 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_phsubw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -409,10 +424,11 @@ _mm_hsub_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_phsubd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally subtracts, with saturation, the adjacent pairs of values @@ -461,10 +477,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsubs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_phsubsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -525,10 +542,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_maddubs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit @@ -565,10 +583,11 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b) /// A 64-bit vector of [4 x i16] containing one of the source operands. 
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhrs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Copies the 8-bit integers from a 128-bit integer vector to the @@ -614,12 +633,15 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b) /// 1: Clear the corresponding byte in the destination. \n /// 0: Copy the selected source byte to the corresponding byte in the /// destination. \n -/// Bits [3:0] select the source byte to be copied. +/// Bits [2:0] select the source byte to be copied. /// \returns A 64-bit integer vector containing the copied or cleared values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_shuffle_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pshufb128( + (__v16qi)__builtin_shufflevector( + (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1), + (__v16qi)__anyext128(__b))); } /// For each 8-bit integer in the first source operand, perform one of @@ -720,10 +742,11 @@ _mm_sign_epi32(__m128i __a, __m128i __b) /// A 64-bit integer vector containing control bytes corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// For each 16-bit integer in the first source operand, perform one of @@ -746,10 +769,11 @@ _mm_sign_pi8(__m64 __a, __m64 __b) /// A 64-bit integer vector containing control words corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// For each 32-bit integer in the first source operand, perform one of @@ -772,13 +796,15 @@ _mm_sign_pi16(__m64 __a, __m64 __b) /// A 64-bit integer vector containing two control doublewords corresponding /// to positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } +#undef __anyext128 +#undef __trunc64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX #endif /* __TMMINTRIN_H */ diff --git a/lib/include/vecintrin.h b/lib/include/vecintrin.h index 1f51e32c0d..a14c39f9f7 100644 --- a/lib/include/vecintrin.h +++ b/lib/include/vecintrin.h @@ -468,6 +468,27 @@ vec_perm(__vector __bool long long __a, __vector __bool long long __b, (__vector unsigned char)__a, (__vector unsigned char)__b, __c); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_perm(__vector signed __int128 __a, __vector signed __int128 __b, + __vector unsigned char __c) { + return (__vector signed __int128)__builtin_s390_vperm( + (__vector unsigned char)__a, (__vector unsigned char)__b, __c); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_perm(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned char __c) { + return (__vector unsigned __int128)__builtin_s390_vperm( + (__vector unsigned char)__a, (__vector unsigned char)__b, __c); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_perm(__vector __bool __int128 __a, __vector __bool __int128 __b, + __vector unsigned char __c) { + return (__vector __bool __int128)__builtin_s390_vperm( + (__vector unsigned char)__a, (__vector unsigned char)__b, __c); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_perm(__vector float __a, __vector float __b, @@ -514,9 +535,19 @@ vec_permi(__vector double __a, __vector double __b, int __c) (__vector unsigned long long)(Y), \ (((Z) & 2) << 1) | ((Z) & 1))) +/*-- vec_bperm --------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +static inline __ATTRS_ai __vector unsigned long long +vec_bperm(__vector unsigned __int128 __a, __vector unsigned char __b) { + return __builtin_s390_vbperm((__vector unsigned char)__a, __b); +} +#endif + /*-- vec_bperm_u128 ---------------------------------------------------------*/ #if __ARCH__ >= 12 +// This prototype is deprecated. 
static inline __ATTRS_ai __vector unsigned long long vec_bperm_u128(__vector unsigned char __a, __vector unsigned char __b) { return __builtin_s390_vbperm(__a, __b); @@ -558,6 +589,18 @@ vec_revb(__vector unsigned long long __vec) { return __builtin_s390_vlbrg(__vec); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_revb(__vector signed __int128 __vec) { + return (__vector signed __int128) + __builtin_s390_vlbrq((unsigned __int128)__vec); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_revb(__vector unsigned __int128 __vec) { + return (__vector unsigned __int128) + __builtin_s390_vlbrq((unsigned __int128)__vec); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_revb(__vector float __vec) { @@ -820,6 +863,46 @@ vec_sel(__vector unsigned long long __a, __vector unsigned long long __b, (~(__vector unsigned long long)__c & __a)); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_sel(__vector signed __int128 __a, __vector signed __int128 __b, + __vector unsigned __int128 __c) { + return (((__vector signed __int128)__c & __b) | + (~(__vector signed __int128)__c & __a)); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_sel(__vector signed __int128 __a, __vector signed __int128 __b, + __vector __bool __int128 __c) { + return (((__vector signed __int128)__c & __b) | + (~(__vector signed __int128)__c & __a)); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_sel(__vector __bool __int128 __a, __vector __bool __int128 __b, + __vector unsigned __int128 __c) { + return (((__vector __bool __int128)__c & __b) | + (~(__vector __bool __int128)__c & __a)); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_sel(__vector __bool __int128 __a, __vector __bool __int128 __b, + __vector __bool __int128 __c) { + return (__c & __b) | (~__c & __a); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sel(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__c & __b) | (~__c & __a); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sel(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector __bool __int128 __c) { + return (((__vector unsigned __int128)__c & __b) | + (~(__vector unsigned __int128)__c & __a)); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_sel(__vector float __a, __vector float __b, __vector unsigned int __c) { @@ -1078,6 +1161,22 @@ vec_xl(long __offset, const unsigned long long *__ptr) { return V; } +static inline __ATTRS_o_ai __vector signed __int128 +vec_xl(long __offset, const signed __int128 *__ptr) { + __vector signed __int128 V; + __builtin_memcpy(&V, ((const char *)__ptr + __offset), + sizeof(__vector signed __int128)); + return V; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_xl(long __offset, const unsigned __int128 *__ptr) { + __vector unsigned __int128 V; + __builtin_memcpy(&V, ((const char *)__ptr + __offset), + sizeof(__vector unsigned __int128)); + return V; +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_xl(long __offset, const float *__ptr) { @@ -1294,6 +1393,22 @@ vec_xst(__vector unsigned long long __vec, long __offset, sizeof(__vector unsigned long long)); } +static inline __ATTRS_o_ai void +vec_xst(__vector signed __int128 __vec, long __offset, + signed __int128 *__ptr) { + __vector signed __int128 V = __vec; + __builtin_memcpy(((char *)__ptr + __offset), &V, + sizeof(__vector signed __int128)); +} + +static inline 
__ATTRS_o_ai void +vec_xst(__vector unsigned __int128 __vec, long __offset, + unsigned __int128 *__ptr) { + __vector unsigned __int128 V = __vec; + __builtin_memcpy(((char *)__ptr + __offset), &V, + sizeof(__vector unsigned __int128)); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai void vec_xst(__vector float __vec, long __offset, float *__ptr) { @@ -1465,6 +1580,14 @@ extern __ATTRS_o __vector unsigned long long vec_load_bndry(const unsigned long long *__ptr, unsigned short __len) __constant_pow2_range(__len, 64, 4096); +extern __ATTRS_o __vector signed __int128 +vec_load_bndry(const signed __int128 *__ptr, unsigned short __len) + __constant_pow2_range(__len, 64, 4096); + +extern __ATTRS_o __vector unsigned __int128 +vec_load_bndry(const unsigned __int128 *__ptr, unsigned short __len) + __constant_pow2_range(__len, 64, 4096); + #if __ARCH__ >= 12 extern __ATTRS_o __vector float vec_load_bndry(const float *__ptr, unsigned short __len) @@ -1496,43 +1619,51 @@ vec_load_len(const unsigned char *__ptr, unsigned int __len) { return (__vector unsigned char)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed short vec_load_len(const signed short *__ptr, unsigned int __len) { return (__vector signed short)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned short vec_load_len(const unsigned short *__ptr, unsigned int __len) { return (__vector unsigned short)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed int vec_load_len(const signed int *__ptr, unsigned int __len) { return (__vector signed int)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned int vec_load_len(const unsigned int *__ptr, unsigned int __len) { return (__vector unsigned int)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed long long vec_load_len(const signed long long *__ptr, unsigned int __len) { return (__vector signed long long)__builtin_s390_vll(__len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned long long vec_load_len(const unsigned long long *__ptr, unsigned int __len) { return (__vector unsigned long long)__builtin_s390_vll(__len, __ptr); } #if __ARCH__ >= 12 +// This prototype is deprecated. static inline __ATTRS_o_ai __vector float vec_load_len(const float *__ptr, unsigned int __len) { return (__vector float)__builtin_s390_vll(__len, __ptr); } #endif +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector double vec_load_len(const double *__ptr, unsigned int __len) { return (__vector double)__builtin_s390_vll(__len, __ptr); @@ -1541,7 +1672,12 @@ vec_load_len(const double *__ptr, unsigned int __len) { /*-- vec_load_len_r ---------------------------------------------------------*/ #if __ARCH__ >= 12 -static inline __ATTRS_ai __vector unsigned char +static inline __ATTRS_o_ai __vector signed char +vec_load_len_r(const signed char *__ptr, unsigned int __len) { + return (__vector signed char)__builtin_s390_vlrlr(__len, __ptr); +} + +static inline __ATTRS_o_ai __vector unsigned char vec_load_len_r(const unsigned char *__ptr, unsigned int __len) { return (__vector unsigned char)__builtin_s390_vlrlr(__len, __ptr); } @@ -1561,36 +1697,42 @@ vec_store_len(__vector unsigned char __vec, unsigned char *__ptr, __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector signed short __vec, signed short *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector unsigned short __vec, unsigned short *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector signed int __vec, signed int *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector unsigned int __vec, unsigned int *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector signed long long __vec, signed long long *__ptr, unsigned int __len) { __builtin_s390_vstl((__vector signed char)__vec, __len, __ptr); } +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector unsigned long long __vec, unsigned long long *__ptr, unsigned int __len) { @@ -1598,6 +1740,7 @@ vec_store_len(__vector unsigned long long __vec, unsigned long long *__ptr, } #if __ARCH__ >= 12 +// This prototype is deprecated. static inline __ATTRS_o_ai void vec_store_len(__vector float __vec, float *__ptr, unsigned int __len) { @@ -1605,6 +1748,7 @@ vec_store_len(__vector float __vec, float *__ptr, } #endif +// This prototype is deprecated. 
static inline __ATTRS_o_ai void vec_store_len(__vector double __vec, double *__ptr, unsigned int __len) { @@ -1614,7 +1758,13 @@ vec_store_len(__vector double __vec, double *__ptr, /*-- vec_store_len_r --------------------------------------------------------*/ #if __ARCH__ >= 12 -static inline __ATTRS_ai void +static inline __ATTRS_o_ai void +vec_store_len_r(__vector signed char __vec, signed char *__ptr, + unsigned int __len) { + __builtin_s390_vstrlr(__vec, __len, __ptr); +} + +static inline __ATTRS_o_ai void vec_store_len_r(__vector unsigned char __vec, unsigned char *__ptr, unsigned int __len) { __builtin_s390_vstrlr((__vector signed char)__vec, __len, __ptr); @@ -1711,6 +1861,35 @@ vec_genmasks_64(unsigned char __first, unsigned char __last) return (__vector unsigned long long)__value; } +/*-- vec_gen_element_masks_* ------------------------------------------------*/ + +#if __ARCH__ >= 15 +static inline __ATTRS_ai __vector unsigned char +vec_gen_element_masks_8(__vector unsigned short __mask) { + return __builtin_s390_vgemb(__mask); +} + +static inline __ATTRS_ai __vector unsigned short +vec_gen_element_masks_16(__vector unsigned char __mask) { + return __builtin_s390_vgemh(__mask); +} + +static inline __ATTRS_ai __vector unsigned int +vec_gen_element_masks_32(__vector unsigned char __mask) { + return __builtin_s390_vgemf(__mask); +} + +static inline __ATTRS_ai __vector unsigned long long +vec_gen_element_masks_64(__vector unsigned char __mask) { + return __builtin_s390_vgemg(__mask); +} + +static inline __ATTRS_ai __vector unsigned __int128 +vec_gen_element_masks_128(__vector unsigned char __mask) { + return (__vector unsigned __int128)__builtin_s390_vgemq(__mask); +} +#endif + /*-- vec_splat --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -1894,6 +2073,16 @@ vec_splats(unsigned long long __scalar) { return (__vector unsigned long long)__scalar; } +static inline __ATTRS_o_ai __vector signed __int128 +vec_splats(signed __int128 __scalar) { + return (__vector signed __int128)__scalar; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_splats(unsigned __int128 __scalar) { + return (__vector unsigned __int128)__scalar; +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_splats(float __scalar) { @@ -2166,6 +2355,27 @@ vec_pack(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector unsigned int)(__ac[1], __ac[3], __bc[1], __bc[3]); } +static inline __ATTRS_o_ai __vector signed long long +vec_pack(__vector signed __int128 __a, __vector signed __int128 __b) { + __vector signed long long __ac = (__vector signed long long)__a; + __vector signed long long __bc = (__vector signed long long)__b; + return (__vector signed long long)(__ac[1], __bc[1]); +} + +static inline __ATTRS_o_ai __vector __bool long long +vec_pack(__vector __bool __int128 __a, __vector __bool __int128 __b) { + __vector __bool long long __ac = (__vector __bool long long)__a; + __vector __bool long long __bc = (__vector __bool long long)__b; + return (__vector __bool long long)(__ac[1], __bc[1]); +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_pack(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + __vector unsigned long long __ac = (__vector unsigned long long)__a; + __vector unsigned long long __bc = (__vector unsigned long long)__b; + return (__vector unsigned long long)(__ac[1], __bc[1]); +} + /*-- vec_packs 
--------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -2344,6 +2554,24 @@ vec_unpackh(__vector unsigned int __a) { return __builtin_s390_vuplhf(__a); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_unpackh(__vector signed long long __a) { + return (__vector signed __int128)__builtin_s390_vuphg(__a); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_unpackh(__vector __bool long long __a) { + return ((__vector __bool __int128) + __builtin_s390_vuphg((__vector signed long long)__a)); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_unpackh(__vector unsigned long long __a) { + return (__vector unsigned __int128)__builtin_s390_vuplhg(__a); +} +#endif + /*-- vec_unpackl ------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed short @@ -2394,6 +2622,24 @@ vec_unpackl(__vector unsigned int __a) { return __builtin_s390_vupllf(__a); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_unpackl(__vector signed long long __a) { + return (__vector signed __int128)__builtin_s390_vuplg(__a); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_unpackl(__vector __bool long long __a) { + return ((__vector __bool __int128) + __builtin_s390_vuplg((__vector signed long long)__a)); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_unpackl(__vector unsigned long long __a) { + return (__vector unsigned __int128)__builtin_s390_vupllg(__a); +} +#endif + /*-- vec_cmpeq --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector __bool char @@ -2456,6 +2702,21 @@ vec_cmpeq(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a == __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpeq(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return (__vector __bool __int128)(__a == __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpeq(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a == __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpeq(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a == __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmpeq(__vector float __a, __vector float __b) { @@ -2510,6 +2771,16 @@ vec_cmpge(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a >= __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpge(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a >= __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpge(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a >= __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmpge(__vector float __a, __vector float __b) { @@ -2564,6 +2835,16 @@ vec_cmpgt(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a > __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpgt(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a > __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmpgt(__vector 
unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a > __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmpgt(__vector float __a, __vector float __b) { @@ -2618,6 +2899,16 @@ vec_cmple(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a <= __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmple(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a <= __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmple(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a <= __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmple(__vector float __a, __vector float __b) { @@ -2672,6 +2963,16 @@ vec_cmplt(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector __bool long long)(__a < __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmplt(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector __bool __int128)(__a < __b); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_cmplt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector __bool __int128)(__a < __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector __bool int vec_cmplt(__vector float __a, __vector float __b) { @@ -2914,6 +3215,29 @@ vec_all_eq(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_eq(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 0; +} + +static inline __ATTRS_o_ai int +vec_all_eq(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 0; +} + +static inline __ATTRS_o_ai int +vec_all_eq(__vector __bool __int128 __a, __vector __bool __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_eq(__vector float __a, __vector float __b) { @@ -3161,6 +3485,29 @@ vec_all_ne(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_ne(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 3; +} + +static inline __ATTRS_o_ai int +vec_all_ne(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 3; +} + +static inline __ATTRS_o_ai int +vec_all_ne(__vector __bool __int128 __a, __vector __bool __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 3; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_ne(__vector float __a, __vector float __b) { @@ -3399,6 +3746,22 @@ vec_all_ge(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_ge(__vector signed __int128 __a, 
__vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__b, (signed __int128)__a, &__cc); + return __cc == 3; +} + +static inline __ATTRS_o_ai int +vec_all_ge(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__b, (unsigned __int128)__a, &__cc); + return __cc == 3; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_ge(__vector float __a, __vector float __b) { @@ -3637,6 +4000,22 @@ vec_all_gt(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_gt(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__a, (signed __int128)__b, &__cc); + return __cc == 0; +} + +static inline __ATTRS_o_ai int +vec_all_gt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_gt(__vector float __a, __vector float __b) { @@ -3875,6 +4254,22 @@ vec_all_le(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 3; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_le(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__a, (signed __int128)__b, &__cc); + return __cc == 3; +} + +static inline __ATTRS_o_ai int +vec_all_le(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc == 3; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_le(__vector float __a, __vector float __b) { @@ -4113,6 +4508,22 @@ vec_all_lt(__vector __bool long long __a, __vector __bool long long __b) { return __cc == 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_all_lt(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__b, (signed __int128)__a, &__cc); + return __cc == 0; +} + +static inline __ATTRS_o_ai int +vec_all_lt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__b, (unsigned __int128)__a, &__cc); + return __cc == 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_all_lt(__vector float __a, __vector float __b) { @@ -4467,6 +4878,29 @@ vec_any_eq(__vector __bool long long __a, __vector __bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_eq(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc <= 1; +} + +static inline __ATTRS_o_ai int +vec_any_eq(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc <= 1; +} + +static inline __ATTRS_o_ai int +vec_any_eq(__vector __bool __int128 __a, __vector __bool __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc <= 1; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_any_eq(__vector float __a, __vector float __b) { @@ -4713,6 +5147,29 @@ vec_any_ne(__vector __bool long long __a, 
__vector __bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_ne(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc != 0; +} + +static inline __ATTRS_o_ai int +vec_any_ne(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc != 0; +} + +static inline __ATTRS_o_ai int +vec_any_ne(__vector __bool __int128 __a, __vector __bool __int128 __b) { + int __cc; + __builtin_s390_vceqqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc != 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_any_ne(__vector float __a, __vector float __b) { @@ -4951,6 +5408,22 @@ vec_any_ge(__vector __bool long long __a, __vector __bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_ge(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__b, (signed __int128)__a, &__cc); + return __cc != 0; +} + +static inline __ATTRS_o_ai int +vec_any_ge(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__b, (unsigned __int128)__a, &__cc); + return __cc != 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_any_ge(__vector float __a, __vector float __b) { @@ -5189,6 +5662,22 @@ vec_any_gt(__vector __bool long long __a, __vector __bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_gt(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__a, (signed __int128)__b, &__cc); + return __cc <= 1; +} + +static inline __ATTRS_o_ai int +vec_any_gt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc <= 1; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_any_gt(__vector float __a, __vector float __b) { @@ -5427,6 +5916,22 @@ vec_any_le(__vector __bool long long __a, __vector __bool long long __b) { return __cc != 0; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_le(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__a, (signed __int128)__b, &__cc); + return __cc != 0; +} + +static inline __ATTRS_o_ai int +vec_any_le(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__a, (unsigned __int128)__b, &__cc); + return __cc != 0; +} +#endif + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_any_le(__vector float __a, __vector float __b) { @@ -5665,6 +6170,22 @@ vec_any_lt(__vector __bool long long __a, __vector __bool long long __b) { return __cc <= 1; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai int +vec_any_lt(__vector signed __int128 __a, __vector signed __int128 __b) { + int __cc; + __builtin_s390_vchqs((signed __int128)__b, (signed __int128)__a, &__cc); + return __cc <= 1; +} + +static inline __ATTRS_o_ai int +vec_any_lt(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + int __cc; + __builtin_s390_vchlqs((unsigned __int128)__b, (unsigned __int128)__a, &__cc); + return __cc <= 1; +} +#endif + #if __ARCH__ >= 12 
static inline __ATTRS_o_ai int vec_any_lt(__vector float __a, __vector float __b) { @@ -5789,6 +6310,385 @@ vec_any_numeric(__vector double __a) { return __cc != 0; } +/*-- vec_blend --------------------------------------------------------------*/ + +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed char +vec_blend(__vector signed char __a, __vector signed char __b, + __vector signed char __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed char)0)); +} + +static inline __ATTRS_o_ai __vector __bool char +vec_blend(__vector __bool char __a, __vector __bool char __b, + __vector signed char __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed char)0)); +} + +static inline __ATTRS_o_ai __vector unsigned char +vec_blend(__vector unsigned char __a, __vector unsigned char __b, + __vector signed char __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed char)0)); +} + +static inline __ATTRS_o_ai __vector signed short +vec_blend(__vector signed short __a, __vector signed short __b, + __vector signed short __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed short)0)); +} + +static inline __ATTRS_o_ai __vector __bool short +vec_blend(__vector __bool short __a, __vector __bool short __b, + __vector signed short __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed short)0)); +} + +static inline __ATTRS_o_ai __vector unsigned short +vec_blend(__vector unsigned short __a, __vector unsigned short __b, + __vector signed short __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed short)0)); +} + +static inline __ATTRS_o_ai __vector signed int +vec_blend(__vector signed int __a, __vector signed int __b, + __vector signed int __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed int)0)); +} + +static inline __ATTRS_o_ai __vector __bool int +vec_blend(__vector __bool int __a, __vector __bool int __b, + __vector signed int __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed int)0)); +} + +static inline __ATTRS_o_ai __vector unsigned int +vec_blend(__vector unsigned int __a, __vector unsigned int __b, + __vector signed int __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed int)0)); +} + +static inline __ATTRS_o_ai __vector signed long long +vec_blend(__vector signed long long __a, __vector signed long long __b, + __vector signed long long __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed long long)0)); +} + +static inline __ATTRS_o_ai __vector __bool long long +vec_blend(__vector __bool long long __a, __vector __bool long long __b, + __vector signed long long __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed long long)0)); +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_blend(__vector unsigned long long __a, __vector unsigned long long __b, + __vector signed long long __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed long long)0)); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_blend(__vector signed __int128 __a, __vector signed __int128 __b, + __vector signed __int128 __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed __int128)0)); +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_blend(__vector __bool __int128 __a, __vector __bool __int128 __b, + __vector signed __int128 __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed __int128)0)); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_blend(__vector unsigned __int128 __a, __vector unsigned 
__int128 __b, + __vector signed __int128 __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed __int128)0)); +} + +static inline __ATTRS_o_ai __vector float +vec_blend(__vector float __a, __vector float __b, + __vector signed int __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed int)0)); +} + +static inline __ATTRS_o_ai __vector double +vec_blend(__vector double __a, __vector double __b, + __vector signed long long __c) { + return vec_sel(__a, __b, vec_cmplt(__c, (__vector signed long long)0)); +} +#endif + +/*-- vec_and ---------------------------------------------------------------*/ + +static inline __ATTRS_o_ai __vector __bool char +vec_and(__vector __bool char __a, __vector __bool char __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector signed char +vec_and(__vector signed char __a, __vector signed char __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector unsigned char +vec_and(__vector unsigned char __a, __vector unsigned char __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector __bool short +vec_and(__vector __bool short __a, __vector __bool short __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector signed short +vec_and(__vector signed short __a, __vector signed short __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector unsigned short +vec_and(__vector unsigned short __a, __vector unsigned short __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector __bool int +vec_and(__vector __bool int __a, __vector __bool int __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector signed int +vec_and(__vector signed int __a, __vector signed int __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector unsigned int +vec_and(__vector unsigned int __a, __vector unsigned int __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector __bool long long +vec_and(__vector __bool long long __a, __vector __bool long long __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector signed long long +vec_and(__vector signed long long __a, __vector signed long long __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_and(__vector unsigned long long __a, __vector unsigned long long __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_and(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_and(__vector signed __int128 __a, __vector signed __int128 __b) { + return __a & __b; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_and(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return __a & __b; +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai __vector float +vec_and(__vector float __a, __vector float __b) { + return (__vector float)((__vector unsigned int)__a & + (__vector unsigned int)__b); +} +#endif + +static inline __ATTRS_o_ai __vector double +vec_and(__vector double __a, __vector double __b) { + return (__vector double)((__vector unsigned long long)__a & + (__vector unsigned long long)__b); +} + +/*-- vec_or ----------------------------------------------------------------*/ + +static inline __ATTRS_o_ai __vector __bool char +vec_or(__vector __bool char __a, __vector __bool char __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector signed char +vec_or(__vector signed char __a, __vector signed char __b) { + return __a | __b; +} 
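vec_blend, added for __ARCH__ >= 15, is defined above as vec_sel(__a, __b, vec_cmplt(__c, 0)): each lane takes its value from the second operand when the corresponding control lane is negative (sign bit set) and from the first operand otherwise. A short sketch, with an illustrative wrapper name:

#include <vecintrin.h>

/* Lanes where ctl < 0 come from b, all other lanes come from a. */
static __vector signed int select_by_sign(__vector signed int a,
                                          __vector signed int b,
                                          __vector signed int ctl) {
  return vec_blend(a, b, ctl);
}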
+ +static inline __ATTRS_o_ai __vector unsigned char +vec_or(__vector unsigned char __a, __vector unsigned char __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector __bool short +vec_or(__vector __bool short __a, __vector __bool short __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector signed short +vec_or(__vector signed short __a, __vector signed short __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector unsigned short +vec_or(__vector unsigned short __a, __vector unsigned short __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector __bool int +vec_or(__vector __bool int __a, __vector __bool int __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector signed int +vec_or(__vector signed int __a, __vector signed int __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector unsigned int +vec_or(__vector unsigned int __a, __vector unsigned int __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector __bool long long +vec_or(__vector __bool long long __a, __vector __bool long long __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector signed long long +vec_or(__vector signed long long __a, __vector signed long long __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_or(__vector unsigned long long __a, __vector unsigned long long __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_or(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_or(__vector signed __int128 __a, __vector signed __int128 __b) { + return __a | __b; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_or(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return __a | __b; +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai __vector float +vec_or(__vector float __a, __vector float __b) { + return (__vector float)((__vector unsigned int)__a | + (__vector unsigned int)__b); +} +#endif + +static inline __ATTRS_o_ai __vector double +vec_or(__vector double __a, __vector double __b) { + return (__vector double)((__vector unsigned long long)__a | + (__vector unsigned long long)__b); +} + +/*-- vec_xor ----------------------------------------------------------------*/ + +static inline __ATTRS_o_ai __vector __bool char +vec_xor(__vector __bool char __a, __vector __bool char __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector signed char +vec_xor(__vector signed char __a, __vector signed char __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector unsigned char +vec_xor(__vector unsigned char __a, __vector unsigned char __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector __bool short +vec_xor(__vector __bool short __a, __vector __bool short __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector signed short +vec_xor(__vector signed short __a, __vector signed short __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector unsigned short +vec_xor(__vector unsigned short __a, __vector unsigned short __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector __bool int +vec_xor(__vector __bool int __a, __vector __bool int __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector signed int +vec_xor(__vector signed int __a, __vector signed int __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector unsigned int +vec_xor(__vector 
unsigned int __a, __vector unsigned int __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector __bool long long +vec_xor(__vector __bool long long __a, __vector __bool long long __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector signed long long +vec_xor(__vector signed long long __a, __vector signed long long __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_xor(__vector unsigned long long __a, __vector unsigned long long __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector __bool __int128 +vec_xor(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_xor(__vector signed __int128 __a, __vector signed __int128 __b) { + return __a ^ __b; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_xor(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return __a ^ __b; +} + +#if __ARCH__ >= 12 +static inline __ATTRS_o_ai __vector float +vec_xor(__vector float __a, __vector float __b) { + return (__vector float)((__vector unsigned int)__a ^ + (__vector unsigned int)__b); +} +#endif + +static inline __ATTRS_o_ai __vector double +vec_xor(__vector double __a, __vector double __b) { + return (__vector double)((__vector unsigned long long)__a ^ + (__vector unsigned long long)__b); +} + /*-- vec_andc ---------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector __bool char @@ -5947,6 +6847,21 @@ vec_andc(__vector unsigned long long __a, __vector __bool long long __b) { return __a & ~__b; } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_andc(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return __a & ~__b; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_andc(__vector signed __int128 __a, __vector signed __int128 __b) { + return __a & ~__b; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_andc(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return __a & ~__b; +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_andc(__vector float __a, __vector float __b) { @@ -6133,6 +7048,21 @@ vec_nor(__vector unsigned long long __a, __vector __bool long long __b) { return ~(__a | __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_nor(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return ~(__a | __b); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_nor(__vector signed __int128 __a, __vector signed __int128 __b) { + return ~(__a | __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_nor(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return ~(__a | __b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_nor(__vector float __a, __vector float __b) { @@ -6224,6 +7154,21 @@ vec_orc(__vector unsigned long long __a, __vector unsigned long long __b) { return __a | ~__b; } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_orc(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_orc(__vector signed __int128 __a, __vector signed __int128 __b) { + return __a | ~__b; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_orc(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return __a | ~__b; +} + static inline __ATTRS_o_ai __vector float vec_orc(__vector float 
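The bitwise helpers (vec_and, vec_or, vec_xor, vec_andc, vec_nor) now also accept the one-element __int128 vector types, without an architecture gate, and simply lower to the ordinary GNU vector operators as shown above. A sketch, helper name illustrative:

#include <vecintrin.h>

/* a & ~b on 128-bit lanes, via the vec_andc overload added above. */
static __vector unsigned __int128 clear_bits128(__vector unsigned __int128 a,
                                                __vector unsigned __int128 b) {
  return vec_andc(a, b);
}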
__a, __vector float __b) { return (__vector float)((__vector unsigned int)__a | @@ -6300,6 +7245,21 @@ vec_nand(__vector unsigned long long __a, __vector unsigned long long __b) { return ~(__a & __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_nand(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_nand(__vector signed __int128 __a, __vector signed __int128 __b) { + return ~(__a & __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_nand(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return ~(__a & __b); +} + static inline __ATTRS_o_ai __vector float vec_nand(__vector float __a, __vector float __b) { return (__vector float)~((__vector unsigned int)__a & @@ -6376,6 +7336,21 @@ vec_eqv(__vector unsigned long long __a, __vector unsigned long long __b) { return ~(__a ^ __b); } +static inline __ATTRS_o_ai __vector __bool __int128 +vec_eqv(__vector __bool __int128 __a, __vector __bool __int128 __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_eqv(__vector signed __int128 __a, __vector signed __int128 __b) { + return ~(__a ^ __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_eqv(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return ~(__a ^ __b); +} + static inline __ATTRS_o_ai __vector float vec_eqv(__vector float __a, __vector float __b) { return (__vector float)~((__vector unsigned int)__a ^ @@ -6389,6 +7364,91 @@ vec_eqv(__vector double __a, __vector double __b) { } #endif +/*-- vec_evaluate -----------------------------------------------------------*/ + +#if __ARCH__ >= 15 +extern __ATTRS_o __vector signed char +vec_evaluate(__vector signed char __a, __vector signed char __b, + __vector signed char __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector unsigned char +vec_evaluate(__vector unsigned char __a, __vector unsigned char __b, + __vector unsigned char __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector __bool char +vec_evaluate(__vector __bool char __a, __vector __bool char __b, + __vector __bool char __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector signed short +vec_evaluate(__vector signed short __a, __vector signed short __b, + __vector signed short __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector unsigned short +vec_evaluate(__vector unsigned short __a, __vector unsigned short __b, + __vector unsigned short __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector __bool short +vec_evaluate(__vector __bool short __a, __vector __bool short __b, + __vector __bool short __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector signed int +vec_evaluate(__vector signed int __a, __vector signed int __b, + __vector signed int __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector unsigned int +vec_evaluate(__vector unsigned int __a, __vector unsigned int __b, + __vector unsigned int __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector __bool int +vec_evaluate(__vector __bool int __a, __vector __bool int __b, + __vector __bool int __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector signed long long +vec_evaluate(__vector signed long long __a, __vector signed long long __b, + __vector signed long long __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector unsigned long long 
+vec_evaluate(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned long long __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector __bool long long +vec_evaluate(__vector __bool long long __a, __vector __bool long long __b, + __vector __bool long long __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector signed __int128 +vec_evaluate(__vector signed __int128 __a, __vector signed __int128 __b, + __vector signed __int128 __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector unsigned __int128 +vec_evaluate(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c, unsigned char __d) + __constant(__d); + +extern __ATTRS_o __vector __bool __int128 +vec_evaluate(__vector __bool __int128 __a, __vector __bool __int128 __b, + __vector __bool __int128 __c, unsigned char __d) + __constant(__d); + +#define vec_evaluate(A, B, C, D) \ + ((__typeof__((vec_evaluate)((A), (B), (C), (D)))) \ + __builtin_s390_veval((__vector unsigned char)(A), \ + (__vector unsigned char)(B), \ + (__vector unsigned char)(C), (D))) +#endif + /*-- vec_cntlz --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector unsigned char @@ -6431,6 +7491,20 @@ vec_cntlz(__vector unsigned long long __a) { return __builtin_s390_vclzg(__a); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_cntlz(__vector signed __int128 __a) { + return (__vector unsigned __int128) + __builtin_s390_vclzq((unsigned __int128)__a); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_cntlz(__vector unsigned __int128 __a) { + return (__vector unsigned __int128) + __builtin_s390_vclzq((unsigned __int128)__a); +} +#endif + /*-- vec_cnttz --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector unsigned char @@ -6473,46 +7547,60 @@ vec_cnttz(__vector unsigned long long __a) { return __builtin_s390_vctzg(__a); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_cnttz(__vector signed __int128 __a) { + return (__vector unsigned __int128) + __builtin_s390_vctzq((unsigned __int128)__a); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_cnttz(__vector unsigned __int128 __a) { + return (__vector unsigned __int128) + __builtin_s390_vctzq((unsigned __int128)__a); +} +#endif + /*-- vec_popcnt -------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector unsigned char vec_popcnt(__vector signed char __a) { - return __builtin_s390_vpopctb((__vector unsigned char)__a); + return __builtin_elementwise_popcount((__vector unsigned char)__a); } static inline __ATTRS_o_ai __vector unsigned char vec_popcnt(__vector unsigned char __a) { - return __builtin_s390_vpopctb(__a); + return __builtin_elementwise_popcount(__a); } static inline __ATTRS_o_ai __vector unsigned short vec_popcnt(__vector signed short __a) { - return __builtin_s390_vpopcth((__vector unsigned short)__a); + return __builtin_elementwise_popcount((__vector unsigned short)__a); } static inline __ATTRS_o_ai __vector unsigned short vec_popcnt(__vector unsigned short __a) { - return __builtin_s390_vpopcth(__a); + return __builtin_elementwise_popcount(__a); } static inline __ATTRS_o_ai __vector unsigned int vec_popcnt(__vector signed int __a) { - return __builtin_s390_vpopctf((__vector unsigned int)__a); + return __builtin_elementwise_popcount((__vector unsigned int)__a); } static inline 
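Besides the vec_evaluate family, which maps an 8-bit truth-table constant onto __builtin_s390_veval, the __ARCH__ >= 15 block adds 128-bit leading and trailing zero counts (vclzq/vctzq), and vec_popcnt now lowers to the generic __builtin_elementwise_popcount instead of the per-width vpopct builtins. A sketch for the new count-leading-zeros overload; the helper name is illustrative:

#include <vecintrin.h>

/* Leading zero bits of the single 128-bit lane, returned as a
   one-element __int128 vector (requires __ARCH__ >= 15). */
static __vector unsigned __int128 lz128(__vector unsigned __int128 x) {
  return vec_cntlz(x);
}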
__ATTRS_o_ai __vector unsigned int vec_popcnt(__vector unsigned int __a) { - return __builtin_s390_vpopctf(__a); + return __builtin_elementwise_popcount(__a); } static inline __ATTRS_o_ai __vector unsigned long long vec_popcnt(__vector signed long long __a) { - return __builtin_s390_vpopctg((__vector unsigned long long)__a); + return __builtin_elementwise_popcount((__vector unsigned long long)__a); } static inline __ATTRS_o_ai __vector unsigned long long vec_popcnt(__vector unsigned long long __a) { - return __builtin_s390_vpopctg(__a); + return __builtin_elementwise_popcount(__a); } /*-- vec_rl -----------------------------------------------------------------*/ @@ -6904,8 +7992,21 @@ vec_sll(__vector unsigned long long __a, __vector unsigned int __b) { (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_sll(__vector signed __int128 __a, __vector unsigned char __b) { + return (__vector signed __int128)__builtin_s390_vsl( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sll(__vector unsigned __int128 __a, __vector unsigned char __b) { + return (__vector unsigned __int128)__builtin_s390_vsl( + (__vector unsigned char)__a, __b); +} + /*-- vec_slb ----------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed char vec_slb(__vector signed char __a, __vector signed char __b) { return (__vector signed char)__builtin_s390_vslb( @@ -6918,6 +8019,7 @@ vec_slb(__vector signed char __a, __vector unsigned char __b) { (__vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned char vec_slb(__vector unsigned char __a, __vector signed char __b) { return __builtin_s390_vslb(__a, (__vector unsigned char)__b); @@ -6928,110 +8030,187 @@ vec_slb(__vector unsigned char __a, __vector unsigned char __b) { return __builtin_s390_vslb(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed short vec_slb(__vector signed short __a, __vector signed short __b) { return (__vector signed short)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed short vec_slb(__vector signed short __a, __vector unsigned short __b) { return (__vector signed short)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed short +vec_slb(__vector signed short __a, __vector unsigned char __b) { + return (__vector signed short)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned short vec_slb(__vector unsigned short __a, __vector signed short __b) { return (__vector unsigned short)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned short vec_slb(__vector unsigned short __a, __vector unsigned short __b) { return (__vector unsigned short)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned short +vec_slb(__vector unsigned short __a, __vector unsigned char __b) { + return (__vector unsigned short)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector signed int vec_slb(__vector signed int __a, __vector signed int __b) { return (__vector signed int)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed int vec_slb(__vector signed int __a, __vector unsigned int __b) { return (__vector signed int)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed int +vec_slb(__vector signed int __a, __vector unsigned char __b) { + return (__vector signed int)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned int vec_slb(__vector unsigned int __a, __vector signed int __b) { return (__vector unsigned int)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned int vec_slb(__vector unsigned int __a, __vector unsigned int __b) { return (__vector unsigned int)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned int +vec_slb(__vector unsigned int __a, __vector unsigned char __b) { + return (__vector unsigned int)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed long long vec_slb(__vector signed long long __a, __vector signed long long __b) { return (__vector signed long long)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed long long vec_slb(__vector signed long long __a, __vector unsigned long long __b) { return (__vector signed long long)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed long long +vec_slb(__vector signed long long __a, __vector unsigned char __b) { + return (__vector signed long long)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned long long vec_slb(__vector unsigned long long __a, __vector signed long long __b) { return (__vector unsigned long long)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned long long vec_slb(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector unsigned long long)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned long long +vec_slb(__vector unsigned long long __a, __vector unsigned char __b) { + return (__vector unsigned long long)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_slb(__vector signed __int128 __a, __vector unsigned char __b) { + return (__vector signed __int128)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_slb(__vector unsigned __int128 __a, __vector unsigned char __b) { + return (__vector unsigned __int128)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + #if __ARCH__ >= 12 +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector float vec_slb(__vector float __a, __vector signed int __b) { return (__vector float)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector float vec_slb(__vector float __a, __vector unsigned int __b) { return (__vector float)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } + +static inline __ATTRS_o_ai __vector float +vec_slb(__vector float __a, __vector unsigned char __b) { + return (__vector float)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} #endif +// This prototype is deprecated. static inline __ATTRS_o_ai __vector double vec_slb(__vector double __a, __vector signed long long __b) { return (__vector double)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector double vec_slb(__vector double __a, __vector unsigned long long __b) { return (__vector double)__builtin_s390_vslb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector double +vec_slb(__vector double __a, __vector unsigned char __b) { + return (__vector double)__builtin_s390_vslb( + (__vector unsigned char)__a, __b); +} + /*-- vec_sld ----------------------------------------------------------------*/ extern __ATTRS_o __vector signed char vec_sld(__vector signed char __a, __vector signed char __b, int __c) __constant_range(__c, 0, 15); +// This prototype is deprecated. extern __ATTRS_o __vector __bool char vec_sld(__vector __bool char __a, __vector __bool char __b, int __c) __constant_range(__c, 0, 15); @@ -7044,6 +8223,7 @@ extern __ATTRS_o __vector signed short vec_sld(__vector signed short __a, __vector signed short __b, int __c) __constant_range(__c, 0, 15); +// This prototype is deprecated. extern __ATTRS_o __vector __bool short vec_sld(__vector __bool short __a, __vector __bool short __b, int __c) __constant_range(__c, 0, 15); @@ -7056,6 +8236,7 @@ extern __ATTRS_o __vector signed int vec_sld(__vector signed int __a, __vector signed int __b, int __c) __constant_range(__c, 0, 15); +// This prototype is deprecated. extern __ATTRS_o __vector __bool int vec_sld(__vector __bool int __a, __vector __bool int __b, int __c) __constant_range(__c, 0, 15); @@ -7068,6 +8249,7 @@ extern __ATTRS_o __vector signed long long vec_sld(__vector signed long long __a, __vector signed long long __b, int __c) __constant_range(__c, 0, 15); +// This prototype is deprecated. 
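vec_sll and vec_slb gain overloads that take the shift operand as a plain __vector unsigned char and that accept __int128 element vectors; the element-typed shift operands are now marked deprecated. A sketch of a full 128-bit left shift built on the usual idiom of splatting the bit count, assuming VSL consumes the low three bits of the count byte and VSLB the byte count above them (verify against the ISA if precision matters); names are illustrative:

#include <vecintrin.h>

/* Shift the 128-bit lane left by n bits (0..127). */
static __vector unsigned __int128 shl128(__vector unsigned __int128 x,
                                         unsigned n) {
  __vector unsigned char cnt = vec_splats((unsigned char)n);
  return vec_slb(vec_sll(x, cnt), cnt);   /* byte shift after bit shift */
}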
extern __ATTRS_o __vector __bool long long vec_sld(__vector __bool long long __a, __vector __bool long long __b, int __c) __constant_range(__c, 0, 15); @@ -7077,6 +8259,15 @@ vec_sld(__vector unsigned long long __a, __vector unsigned long long __b, int __c) __constant_range(__c, 0, 15); +extern __ATTRS_o __vector signed __int128 +vec_sld(__vector signed __int128 __a, __vector signed __int128 __b, int __c) + __constant_range(__c, 0, 15); + +extern __ATTRS_o __vector unsigned __int128 +vec_sld(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + int __c) + __constant_range(__c, 0, 15); + #if __ARCH__ >= 12 extern __ATTRS_o __vector float vec_sld(__vector float __a, __vector float __b, int __c) @@ -7126,6 +8317,15 @@ vec_sldw(__vector unsigned long long __a, __vector unsigned long long __b, int __c) __constant_range(__c, 0, 3); +extern __ATTRS_o __vector signed __int128 +vec_sldw(__vector signed __int128 __a, __vector signed __int128 __b, int __c) + __constant_range(__c, 0, 3); + +extern __ATTRS_o __vector unsigned __int128 +vec_sldw(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + int __c) + __constant_range(__c, 0, 3); + // This prototype is deprecated. extern __ATTRS_o __vector double vec_sldw(__vector double __a, __vector double __b, int __c) @@ -7172,6 +8372,15 @@ vec_sldb(__vector unsigned long long __a, __vector unsigned long long __b, int __c) __constant_range(__c, 0, 7); +extern __ATTRS_o __vector signed __int128 +vec_sldb(__vector signed __int128 __a, __vector signed __int128 __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o __vector unsigned __int128 +vec_sldb(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + int __c) + __constant_range(__c, 0, 7); + extern __ATTRS_o __vector float vec_sldb(__vector float __a, __vector float __b, int __c) __constant_range(__c, 0, 7); @@ -7429,8 +8638,21 @@ vec_sral(__vector unsigned long long __a, __vector unsigned int __b) { (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_sral(__vector signed __int128 __a, __vector unsigned char __b) { + return (__vector signed __int128)__builtin_s390_vsra( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sral(__vector unsigned __int128 __a, __vector unsigned char __b) { + return (__vector unsigned __int128)__builtin_s390_vsra( + (__vector unsigned char)__a, __b); +} + /*-- vec_srab ---------------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed char vec_srab(__vector signed char __a, __vector signed char __b) { return (__vector signed char)__builtin_s390_vsrab( @@ -7443,6 +8665,7 @@ vec_srab(__vector signed char __a, __vector unsigned char __b) { (__vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned char vec_srab(__vector unsigned char __a, __vector signed char __b) { return __builtin_s390_vsrab(__a, (__vector unsigned char)__b); @@ -7453,104 +8676,180 @@ vec_srab(__vector unsigned char __a, __vector unsigned char __b) { return __builtin_s390_vsrab(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed short vec_srab(__vector signed short __a, __vector signed short __b) { return (__vector signed short)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector signed short vec_srab(__vector signed short __a, __vector unsigned short __b) { return (__vector signed short)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed short +vec_srab(__vector signed short __a, __vector unsigned char __b) { + return (__vector signed short)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned short vec_srab(__vector unsigned short __a, __vector signed short __b) { return (__vector unsigned short)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned short vec_srab(__vector unsigned short __a, __vector unsigned short __b) { return (__vector unsigned short)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned short +vec_srab(__vector unsigned short __a, __vector unsigned char __b) { + return (__vector unsigned short)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed int vec_srab(__vector signed int __a, __vector signed int __b) { return (__vector signed int)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed int vec_srab(__vector signed int __a, __vector unsigned int __b) { return (__vector signed int)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed int +vec_srab(__vector signed int __a, __vector unsigned char __b) { + return (__vector signed int)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned int vec_srab(__vector unsigned int __a, __vector signed int __b) { return (__vector unsigned int)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned int vec_srab(__vector unsigned int __a, __vector unsigned int __b) { return (__vector unsigned int)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned int +vec_srab(__vector unsigned int __a, __vector unsigned char __b) { + return (__vector unsigned int)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed long long vec_srab(__vector signed long long __a, __vector signed long long __b) { return (__vector signed long long)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed long long vec_srab(__vector signed long long __a, __vector unsigned long long __b) { return (__vector signed long long)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed long long +vec_srab(__vector signed long long __a, __vector unsigned char __b) { + return (__vector signed long long)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector unsigned long long vec_srab(__vector unsigned long long __a, __vector signed long long __b) { return (__vector unsigned long long)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned long long vec_srab(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector unsigned long long)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned long long +vec_srab(__vector unsigned long long __a, __vector unsigned char __b) { + return (__vector unsigned long long)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_srab(__vector signed __int128 __a, __vector unsigned char __b) { + return (__vector signed __int128)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_srab(__vector unsigned __int128 __a, __vector unsigned char __b) { + return (__vector unsigned __int128)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + #if __ARCH__ >= 12 +// This prototype is deprecated. static inline __ATTRS_o_ai __vector float vec_srab(__vector float __a, __vector signed int __b) { return (__vector float)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector float vec_srab(__vector float __a, __vector unsigned int __b) { return (__vector float)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } + +static inline __ATTRS_o_ai __vector float +vec_srab(__vector float __a, __vector unsigned char __b) { + return (__vector float)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} #endif +// This prototype is deprecated. static inline __ATTRS_o_ai __vector double vec_srab(__vector double __a, __vector signed long long __b) { return (__vector double)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector double vec_srab(__vector double __a, __vector unsigned long long __b) { return (__vector double)__builtin_s390_vsrab( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector double +vec_srab(__vector double __a, __vector unsigned char __b) { + return (__vector double)__builtin_s390_vsrab( + (__vector unsigned char)__a, __b); +} + /*-- vec_srl ----------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -7794,8 +9093,21 @@ vec_srl(__vector unsigned long long __a, __vector unsigned int __b) { (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_srl(__vector signed __int128 __a, __vector unsigned char __b) { + return (__vector signed __int128)__builtin_s390_vsrl( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_srl(__vector unsigned __int128 __a, __vector unsigned char __b) { + return (__vector unsigned __int128)__builtin_s390_vsrl( + (__vector unsigned char)__a, __b); +} + /*-- vec_srb ----------------------------------------------------------------*/ +// This prototype is deprecated. 
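The arithmetic right shifts follow the same pattern: vec_sral and vec_srab accept __int128 vectors and a __vector unsigned char count, with the element-typed count operands deprecated. A sketch mirroring the left-shift idiom above, under the same assumption about how VSRA/VSRAB split the bit and byte counts:

#include <vecintrin.h>

/* Arithmetic shift right of a signed 128-bit lane by n bits (0..127). */
static __vector signed __int128 sar128(__vector signed __int128 x,
                                       unsigned n) {
  __vector unsigned char cnt = vec_splats((unsigned char)n);
  return vec_srab(vec_sral(x, cnt), cnt);
}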
static inline __ATTRS_o_ai __vector signed char vec_srb(__vector signed char __a, __vector signed char __b) { return (__vector signed char)__builtin_s390_vsrlb( @@ -7808,6 +9120,7 @@ vec_srb(__vector signed char __a, __vector unsigned char __b) { (__vector unsigned char)__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned char vec_srb(__vector unsigned char __a, __vector signed char __b) { return __builtin_s390_vsrlb(__a, (__vector unsigned char)__b); @@ -7818,104 +9131,180 @@ vec_srb(__vector unsigned char __a, __vector unsigned char __b) { return __builtin_s390_vsrlb(__a, __b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed short vec_srb(__vector signed short __a, __vector signed short __b) { return (__vector signed short)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed short vec_srb(__vector signed short __a, __vector unsigned short __b) { return (__vector signed short)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed short +vec_srb(__vector signed short __a, __vector unsigned char __b) { + return (__vector signed short)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned short vec_srb(__vector unsigned short __a, __vector signed short __b) { return (__vector unsigned short)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned short vec_srb(__vector unsigned short __a, __vector unsigned short __b) { return (__vector unsigned short)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned short +vec_srb(__vector unsigned short __a, __vector unsigned char __b) { + return (__vector unsigned short)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed int vec_srb(__vector signed int __a, __vector signed int __b) { return (__vector signed int)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed int vec_srb(__vector signed int __a, __vector unsigned int __b) { return (__vector signed int)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed int +vec_srb(__vector signed int __a, __vector unsigned char __b) { + return (__vector signed int)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned int vec_srb(__vector unsigned int __a, __vector signed int __b) { return (__vector unsigned int)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector unsigned int vec_srb(__vector unsigned int __a, __vector unsigned int __b) { return (__vector unsigned int)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned int +vec_srb(__vector unsigned int __a, __vector unsigned char __b) { + return (__vector unsigned int)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed long long vec_srb(__vector signed long long __a, __vector signed long long __b) { return (__vector signed long long)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector signed long long vec_srb(__vector signed long long __a, __vector unsigned long long __b) { return (__vector signed long long)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector signed long long +vec_srb(__vector signed long long __a, __vector unsigned char __b) { + return (__vector signed long long)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned long long vec_srb(__vector unsigned long long __a, __vector signed long long __b) { return (__vector unsigned long long)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned long long vec_srb(__vector unsigned long long __a, __vector unsigned long long __b) { return (__vector unsigned long long)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector unsigned long long +vec_srb(__vector unsigned long long __a, __vector unsigned char __b) { + return (__vector unsigned long long)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_srb(__vector signed __int128 __a, __vector unsigned char __b) { + return (__vector signed __int128)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_srb(__vector unsigned __int128 __a, __vector unsigned char __b) { + return (__vector unsigned __int128)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + #if __ARCH__ >= 12 +// This prototype is deprecated. static inline __ATTRS_o_ai __vector float vec_srb(__vector float __a, __vector signed int __b) { return (__vector float)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. static inline __ATTRS_o_ai __vector float vec_srb(__vector float __a, __vector unsigned int __b) { return (__vector float)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } + +static inline __ATTRS_o_ai __vector float +vec_srb(__vector float __a, __vector unsigned char __b) { + return (__vector float)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} #endif +// This prototype is deprecated. static inline __ATTRS_o_ai __vector double vec_srb(__vector double __a, __vector signed long long __b) { return (__vector double)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +// This prototype is deprecated. 
static inline __ATTRS_o_ai __vector double vec_srb(__vector double __a, __vector unsigned long long __b) { return (__vector double)__builtin_s390_vsrlb( (__vector unsigned char)__a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai __vector double +vec_srb(__vector double __a, __vector unsigned char __b) { + return (__vector double)__builtin_s390_vsrlb( + (__vector unsigned char)__a, __b); +} + /*-- vec_srdb ---------------------------------------------------------------*/ #if __ARCH__ >= 13 @@ -7953,6 +9342,15 @@ vec_srdb(__vector unsigned long long __a, __vector unsigned long long __b, int __c) __constant_range(__c, 0, 7); +extern __ATTRS_o __vector signed __int128 +vec_srdb(__vector signed __int128 __a, __vector signed __int128 __b, int __c) + __constant_range(__c, 0, 7); + +extern __ATTRS_o __vector unsigned __int128 +vec_srdb(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + int __c) + __constant_range(__c, 0, 7); + extern __ATTRS_o __vector float vec_srdb(__vector float __a, __vector float __b, int __c) __constant_range(__c, 0, 7); @@ -7989,6 +9387,11 @@ vec_abs(__vector signed long long __a) { return vec_sel(__a, -__a, vec_cmplt(__a, (__vector signed long long)0)); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_abs(__vector signed __int128 __a) { + return vec_sel(__a, -__a, vec_cmplt(__a, (__vector signed __int128)0)); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_abs(__vector float __a) { @@ -8169,6 +9572,16 @@ vec_max(__vector __bool long long __a, __vector unsigned long long __b) { return vec_sel(__b, __ac, vec_cmpgt(__ac, __b)); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_max(__vector signed __int128 __a, __vector signed __int128 __b) { + return vec_sel(__b, __a, vec_cmpgt(__a, __b)); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_max(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return vec_sel(__b, __a, vec_cmpgt(__a, __b)); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_max(__vector float __a, __vector float __b) { @@ -8339,6 +9752,16 @@ vec_min(__vector __bool long long __a, __vector unsigned long long __b) { return vec_sel(__ac, __b, vec_cmpgt(__ac, __b)); } +static inline __ATTRS_o_ai __vector signed __int128 +vec_min(__vector signed __int128 __a, __vector signed __int128 __b) { + return vec_sel(__a, __b, vec_cmpgt(__a, __b)); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_min(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return vec_sel(__a, __b, vec_cmpgt(__a, __b)); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai __vector float vec_min(__vector float __a, __vector float __b) { @@ -8357,9 +9780,11 @@ vec_min(__vector double __a, __vector double __b) { /*-- vec_add_u128 -----------------------------------------------------------*/ +// This prototype is deprecated. 
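vec_abs, vec_max and vec_min now cover the signed and unsigned __int128 vector types, implemented above in terms of vec_sel with vec_cmpgt/vec_cmplt. A small sketch with an illustrative helper:

#include <vecintrin.h>

/* Clamp the 128-bit lane of x into [lo, hi]. */
static __vector signed __int128 clamp128(__vector signed __int128 x,
                                         __vector signed __int128 lo,
                                         __vector signed __int128 hi) {
  return vec_min(vec_max(x, lo), hi);
}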
static inline __ATTRS_ai __vector unsigned char vec_add_u128(__vector unsigned char __a, __vector unsigned char __b) { - return (__vector unsigned char)((__int128)__a + (__int128)__b); + return (__vector unsigned char)(__vector unsigned __int128) + ((__int128)__a + (__int128)__b); } /*-- vec_addc ---------------------------------------------------------------*/ @@ -8384,30 +9809,59 @@ vec_addc(__vector unsigned long long __a, __vector unsigned long long __b) { return __builtin_s390_vaccg(__a, __b); } -/*-- vec_addc_u128 ----------------------------------------------------------*/ - -static inline __ATTRS_ai __vector unsigned char -vec_addc_u128(__vector unsigned char __a, __vector unsigned char __b) { - return (__vector unsigned char) +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_addc(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector unsigned __int128) __builtin_s390_vaccq((unsigned __int128)__a, (unsigned __int128)__b); } -/*-- vec_adde_u128 ----------------------------------------------------------*/ +/*-- vec_addc_u128 ----------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai __vector unsigned char -vec_adde_u128(__vector unsigned char __a, __vector unsigned char __b, - __vector unsigned char __c) { - return (__vector unsigned char) +vec_addc_u128(__vector unsigned char __a, __vector unsigned char __b) { + return (__vector unsigned char)(__vector unsigned __int128) + __builtin_s390_vaccq((unsigned __int128)__a, (unsigned __int128)__b); +} + +/*-- vec_adde ---------------------------------------------------------------*/ + +static inline __ATTRS_ai __vector unsigned __int128 +vec_adde(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) __builtin_s390_vacq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); } +/*-- vec_adde_u128 ----------------------------------------------------------*/ + +// This prototype is deprecated. +static inline __ATTRS_ai __vector unsigned char +vec_adde_u128(__vector unsigned char __a, __vector unsigned char __b, + __vector unsigned char __c) { + return (__vector unsigned char)(__vector unsigned __int128) + __builtin_s390_vacq((unsigned __int128)__a, (unsigned __int128)__b, + (unsigned __int128)__c); +} + +/*-- vec_addec --------------------------------------------------------------*/ + +static inline __ATTRS_ai __vector unsigned __int128 +vec_addec(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vacccq((unsigned __int128)__a, (unsigned __int128)__b, + (unsigned __int128)__c); +} + /*-- vec_addec_u128 ---------------------------------------------------------*/ +// This prototype is deprecated. 
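The carry-propagating adds gain typed overloads on __vector unsigned __int128 (vec_addc, vec_adde, vec_addec), while the old *_u128 byte-vector forms are kept but marked deprecated. A sketch of a 256-bit addition built from the new overloads, assuming the GNU vector + operator applies to the one-element __int128 vectors as it does elsewhere in this header; the helper name is illustrative:

#include <vecintrin.h>

typedef __vector unsigned __int128 u128v;

/* Add two 256-bit values given as (hi, lo) pairs of 128-bit lanes. */
static void add256(u128v a_hi, u128v a_lo, u128v b_hi, u128v b_lo,
                   u128v *r_hi, u128v *r_lo) {
  u128v carry = vec_addc(a_lo, b_lo);   /* carry-out of the low halves, 0 or 1 */
  *r_lo = a_lo + b_lo;                  /* low 128 bits of the sum */
  *r_hi = vec_adde(a_hi, b_hi, carry);  /* high halves plus the carry-in */
}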
static inline __ATTRS_ai __vector unsigned char vec_addec_u128(__vector unsigned char __a, __vector unsigned char __b, __vector unsigned char __c) { - return (__vector unsigned char) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vacccq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); } @@ -8434,6 +9888,14 @@ vec_avg(__vector signed long long __a, __vector signed long long __b) { return __builtin_s390_vavgg(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_avg(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector signed __int128) + __builtin_s390_vavgq((signed __int128)__a, (signed __int128)__b); +} +#endif + static inline __ATTRS_o_ai __vector unsigned char vec_avg(__vector unsigned char __a, __vector unsigned char __b) { return __builtin_s390_vavglb(__a, __b); @@ -8454,6 +9916,14 @@ vec_avg(__vector unsigned long long __a, __vector unsigned long long __b) { return __builtin_s390_vavglg(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_avg(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector unsigned __int128) + __builtin_s390_vavglq((unsigned __int128)__a, (unsigned __int128)__b); +} +#endif + /*-- vec_checksum -----------------------------------------------------------*/ static inline __ATTRS_ai __vector unsigned int @@ -8478,12 +9948,19 @@ vec_gfmsum(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vgfmf(__a, __b); } +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_gfmsum(__vector unsigned long long __a, __vector unsigned long long __b) { + return (__vector unsigned __int128)__builtin_s390_vgfmg(__a, __b); +} + /*-- vec_gfmsum_128 ---------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_o_ai __vector unsigned char vec_gfmsum_128(__vector unsigned long long __a, __vector unsigned long long __b) { - return (__vector unsigned char)__builtin_s390_vgfmg(__a, __b); + return (__vector unsigned char)(__vector unsigned __int128) + __builtin_s390_vgfmg(__a, __b); } /*-- vec_gfmsum_accum -------------------------------------------------------*/ @@ -8506,13 +9983,21 @@ vec_gfmsum_accum(__vector unsigned int __a, __vector unsigned int __b, return __builtin_s390_vgfmaf(__a, __b, __c); } +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_gfmsum_accum(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vgfmag(__a, __b, (unsigned __int128)__c); +} + /*-- vec_gfmsum_accum_128 ---------------------------------------------------*/ +// This prototype is deprecated. 
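vec_gfmsum and vec_gfmsum_accum gain doubleword overloads whose 128-bit result comes back as a typed __vector unsigned __int128 rather than the deprecated *_128 byte-vector forms. A sketch, names illustrative:

#include <vecintrin.h>

/* Galois-field (carry-less) multiply-sum over the doubleword lanes,
   accumulated into a 128-bit lane. */
static __vector unsigned __int128 gf_acc(__vector unsigned long long a,
                                         __vector unsigned long long b,
                                         __vector unsigned __int128 acc) {
  return vec_gfmsum_accum(a, b, acc);
}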
static inline __ATTRS_o_ai __vector unsigned char vec_gfmsum_accum_128(__vector unsigned long long __a, __vector unsigned long long __b, __vector unsigned char __c) { - return (__vector unsigned char) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vgfmag(__a, __b, (unsigned __int128)__c); } @@ -8590,6 +10075,56 @@ vec_mladd(__vector unsigned int __a, __vector unsigned int __b, return __a * __b + __c; } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed long long +vec_mladd(__vector signed long long __a, __vector signed long long __b, + __vector signed long long __c) { + return __a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed long long +vec_mladd(__vector unsigned long long __a, __vector signed long long __b, + __vector signed long long __c) { + return (__vector signed long long)__a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed long long +vec_mladd(__vector signed long long __a, __vector unsigned long long __b, + __vector unsigned long long __c) { + return __a * (__vector signed long long)__b + (__vector signed long long)__c; +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_mladd(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned long long __c) { + return __a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mladd(__vector signed __int128 __a, __vector signed __int128 __b, + __vector signed __int128 __c) { + return __a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mladd(__vector unsigned __int128 __a, __vector signed __int128 __b, + __vector signed __int128 __c) { + return (__vector signed __int128)__a * __b + __c; +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mladd(__vector signed __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return __a * (__vector signed __int128)__b + (__vector signed __int128)__c; +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mladd(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return __a * __b + __c; +} +#endif + /*-- vec_mhadd --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -8628,6 +10163,34 @@ vec_mhadd(__vector unsigned int __a, __vector unsigned int __b, return __builtin_s390_vmalhf(__a, __b, __c); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed long long +vec_mhadd(__vector signed long long __a, __vector signed long long __b, + __vector signed long long __c) { + return __builtin_s390_vmahg(__a, __b, __c); +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_mhadd(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned long long __c) { + return __builtin_s390_vmalhg(__a, __b, __c); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mhadd(__vector signed __int128 __a, __vector signed __int128 __b, + __vector signed __int128 __c) { + return (__vector signed __int128) + __builtin_s390_vmahq((signed __int128)__a, (signed __int128)__b, (signed __int128)__c); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mhadd(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vmalhq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); +} +#endif + /*-- vec_meadd 
--------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed short @@ -8666,6 +10229,22 @@ vec_meadd(__vector unsigned int __a, __vector unsigned int __b, return __builtin_s390_vmalef(__a, __b, __c); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_meadd(__vector signed long long __a, __vector signed long long __b, + __vector signed __int128 __c) { + return (__vector signed __int128) + __builtin_s390_vmaeg(__a, __b, (signed __int128)__c); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_meadd(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vmaleg(__a, __b, (unsigned __int128)__c); +} +#endif + /*-- vec_moadd --------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed short @@ -8704,6 +10283,22 @@ vec_moadd(__vector unsigned int __a, __vector unsigned int __b, return __builtin_s390_vmalof(__a, __b, __c); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_moadd(__vector signed long long __a, __vector signed long long __b, + __vector signed __int128 __c) { + return (__vector signed __int128) + __builtin_s390_vmaog(__a, __b, (signed __int128)__c); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_moadd(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vmalog(__a, __b, (unsigned __int128)__c); +} +#endif + /*-- vec_mulh ---------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed char @@ -8736,6 +10331,30 @@ vec_mulh(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vmlhf(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed long long +vec_mulh(__vector signed long long __a, __vector signed long long __b) { + return __builtin_s390_vmhg(__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned long long +vec_mulh(__vector unsigned long long __a, __vector unsigned long long __b) { + return __builtin_s390_vmlhg(__a, __b); +} + +static inline __ATTRS_o_ai __vector signed __int128 +vec_mulh(__vector signed __int128 __a, __vector signed __int128 __b) { + return (__vector signed __int128) + __builtin_s390_vmhq((signed __int128)__a, (signed __int128)__b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mulh(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector unsigned __int128) + __builtin_s390_vmlhq((unsigned __int128)__a, (unsigned __int128)__b); +} +#endif + /*-- vec_mule ---------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector signed short @@ -8768,6 +10387,18 @@ vec_mule(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vmlef(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_mule(__vector signed long long __a, __vector signed long long __b) { + return (__vector signed __int128)__builtin_s390_vmeg(__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mule(__vector unsigned long long __a, __vector unsigned long long __b) { + return (__vector unsigned __int128)__builtin_s390_vmleg(__a, __b); +} +#endif + /*-- vec_mulo ---------------------------------------------------------------*/ static inline __ATTRS_o_ai __vector 
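For __ARCH__ >= 15 the multiply family extends to doubleword and quadword elements: vec_mulh returns the high half of each 64-bit (or 128-bit) product, and vec_mule/vec_mulo widen the even/odd doubleword lanes to 128-bit results. A sketch, helper names illustrative:

#include <vecintrin.h>

#if defined(__ARCH__) && __ARCH__ >= 15
/* Widening 64x64 -> 128-bit products of the even doubleword lanes. */
static __vector signed __int128 mul_even64(__vector signed long long a,
                                           __vector signed long long b) {
  return vec_mule(a, b);
}

/* High 64 bits of each lane-wise 64x64 product. */
static __vector signed long long mul_high64(__vector signed long long a,
                                            __vector signed long long b) {
  return vec_mulh(a, b);
}
#endif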
signed short @@ -8800,9 +10431,35 @@ vec_mulo(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vmlof(__a, __b); } +#if __ARCH__ >= 15 +static inline __ATTRS_o_ai __vector signed __int128 +vec_mulo(__vector signed long long __a, __vector signed long long __b) { + return (__vector signed __int128)__builtin_s390_vmog(__a, __b); +} + +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_mulo(__vector unsigned long long __a, __vector unsigned long long __b) { + return (__vector unsigned __int128)__builtin_s390_vmlog(__a, __b); +} +#endif + +/*-- vec_msum ---------------------------------------------------------------*/ + +#if __ARCH__ >= 12 +extern __ATTRS_o __vector unsigned __int128 +vec_msum(__vector unsigned long long __a, __vector unsigned long long __b, + __vector unsigned __int128 __c, int __d) + __constant_range(__d, 0, 15); + +#define vec_msum(X, Y, Z, W) \ + ((__typeof__((vec_msum)((X), (Y), (Z), (W)))) \ + __builtin_s390_vmslg((X), (Y), (unsigned __int128)(Z), (W))) +#endif + /*-- vec_msum_u128 ----------------------------------------------------------*/ #if __ARCH__ >= 12 +// This prototype is deprecated. extern __ATTRS_o __vector unsigned char vec_msum_u128(__vector unsigned long long __a, __vector unsigned long long __b, __vector unsigned char __c, int __d) @@ -8810,14 +10467,17 @@ vec_msum_u128(__vector unsigned long long __a, __vector unsigned long long __b, #define vec_msum_u128(X, Y, Z, W) \ ((__typeof__((vec_msum_u128)((X), (Y), (Z), (W)))) \ + (__vector unsigned __int128) \ __builtin_s390_vmslg((X), (Y), (unsigned __int128)(Z), (W))) #endif /*-- vec_sub_u128 -----------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai __vector unsigned char vec_sub_u128(__vector unsigned char __a, __vector unsigned char __b) { - return (__vector unsigned char)((__int128)__a - (__int128)__b); + return (__vector unsigned char)(__vector unsigned __int128) + ((__int128)__a - (__int128)__b); } /*-- vec_subc ---------------------------------------------------------------*/ @@ -8842,30 +10502,59 @@ vec_subc(__vector unsigned long long __a, __vector unsigned long long __b) { return __builtin_s390_vscbig(__a, __b); } -/*-- vec_subc_u128 ----------------------------------------------------------*/ - -static inline __ATTRS_ai __vector unsigned char -vec_subc_u128(__vector unsigned char __a, __vector unsigned char __b) { - return (__vector unsigned char) +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_subc(__vector unsigned __int128 __a, __vector unsigned __int128 __b) { + return (__vector unsigned __int128) __builtin_s390_vscbiq((unsigned __int128)__a, (unsigned __int128)__b); } -/*-- vec_sube_u128 ----------------------------------------------------------*/ +/*-- vec_subc_u128 ----------------------------------------------------------*/ +// This prototype is deprecated. 
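With arch15 (#if __ARCH__ >= 15), the widening multiplies above gain genuine 128-bit result types instead of returning byte vectors. A minimal usage sketch, assuming a target built with -march=arch15 and the vector language extension (-mzvector); the function and variable names are illustrative only:

#include <vecintrin.h>

/* Multiply the even- and odd-indexed doublewords of two unsigned vectors;
   each product is a one-element vector of unsigned __int128. */
static __vector unsigned __int128
mul_even_plus_odd(__vector unsigned long long a,
                  __vector unsigned long long b) {
  __vector unsigned __int128 even = vec_mule(a, b);
  __vector unsigned __int128 odd  = vec_mulo(a, b);
  return even + odd;  /* element-wise 128-bit addition */
}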
static inline __ATTRS_ai __vector unsigned char -vec_sube_u128(__vector unsigned char __a, __vector unsigned char __b, - __vector unsigned char __c) { - return (__vector unsigned char) +vec_subc_u128(__vector unsigned char __a, __vector unsigned char __b) { + return (__vector unsigned char)(__vector unsigned __int128) + __builtin_s390_vscbiq((unsigned __int128)__a, (unsigned __int128)__b); +} + +/*-- vec_sube ---------------------------------------------------------------*/ + +static inline __ATTRS_ai __vector unsigned __int128 +vec_sube(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) __builtin_s390_vsbiq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); } +/*-- vec_sube_u128 ----------------------------------------------------------*/ + +// This prototype is deprecated. +static inline __ATTRS_ai __vector unsigned char +vec_sube_u128(__vector unsigned char __a, __vector unsigned char __b, + __vector unsigned char __c) { + return (__vector unsigned char)(__vector unsigned __int128) + __builtin_s390_vsbiq((unsigned __int128)__a, (unsigned __int128)__b, + (unsigned __int128)__c); +} + +/*-- vec_subec --------------------------------------------------------------*/ + +static inline __ATTRS_ai __vector unsigned __int128 +vec_subec(__vector unsigned __int128 __a, __vector unsigned __int128 __b, + __vector unsigned __int128 __c) { + return (__vector unsigned __int128) + __builtin_s390_vsbcbiq((unsigned __int128)__a, (unsigned __int128)__b, + (unsigned __int128)__c); +} + /*-- vec_subec_u128 ---------------------------------------------------------*/ +// This prototype is deprecated. static inline __ATTRS_ai __vector unsigned char vec_subec_u128(__vector unsigned char __a, __vector unsigned char __b, __vector unsigned char __c) { - return (__vector unsigned char) + return (__vector unsigned char)(__vector unsigned __int128) __builtin_s390_vsbcbiq((unsigned __int128)__a, (unsigned __int128)__b, (unsigned __int128)__c); } @@ -8882,16 +10571,32 @@ vec_sum2(__vector unsigned int __a, __vector unsigned int __b) { return __builtin_s390_vsumgf(__a, __b); } -/*-- vec_sum_u128 -----------------------------------------------------------*/ +/*-- vec_sum ----------------------------------------------------------------*/ -static inline __ATTRS_o_ai __vector unsigned char -vec_sum_u128(__vector unsigned int __a, __vector unsigned int __b) { - return (__vector unsigned char)__builtin_s390_vsumqf(__a, __b); +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sum(__vector unsigned int __a, __vector unsigned int __b) { + return (__vector unsigned __int128)__builtin_s390_vsumqf(__a, __b); } +static inline __ATTRS_o_ai __vector unsigned __int128 +vec_sum(__vector unsigned long long __a, __vector unsigned long long __b) { + return (__vector unsigned __int128)__builtin_s390_vsumqg(__a, __b); +} + +/*-- vec_sum_u128 -----------------------------------------------------------*/ + +// This prototype is deprecated. +static inline __ATTRS_o_ai __vector unsigned char +vec_sum_u128(__vector unsigned int __a, __vector unsigned int __b) { + return (__vector unsigned char)(__vector unsigned __int128) + __builtin_s390_vsumqf(__a, __b); +} + +// This prototype is deprecated. 
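The deprecated *_u128 forms above keep the historical unsigned-char vector interface, while the new vec_subc/vec_sube/vec_subec overloads operate directly on __vector unsigned __int128. A rough sketch of a 256-bit subtraction chained through the new overloads, assuming the borrow convention matches the older *_u128 variants; the struct and names below are invented for illustration:

#include <vecintrin.h>

typedef struct { __vector unsigned __int128 lo, hi; } u256;

static u256 sub_u256(u256 a, u256 b) {
  u256 r;
  __vector unsigned __int128 borrow = vec_subc(a.lo, b.lo); /* borrow indication */
  r.lo = a.lo - b.lo;                    /* low limb, element-wise subtract */
  r.hi = vec_sube(a.hi, b.hi, borrow);   /* high limb, subtract with borrow in */
  return r;
}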
static inline __ATTRS_o_ai __vector unsigned char vec_sum_u128(__vector unsigned long long __a, __vector unsigned long long __b) { - return (__vector unsigned char)__builtin_s390_vsumqg(__a, __b); + return (__vector unsigned char)(__vector unsigned __int128) + __builtin_s390_vsumqg(__a, __b); } /*-- vec_sum4 ---------------------------------------------------------------*/ @@ -8956,6 +10661,19 @@ vec_test_mask(__vector unsigned long long __a, (__vector unsigned char)__b); } +static inline __ATTRS_o_ai int +vec_test_mask(__vector signed __int128 __a, __vector unsigned __int128 __b) { + return __builtin_s390_vtm((__vector unsigned char)__a, + (__vector unsigned char)__b); +} + +static inline __ATTRS_o_ai int +vec_test_mask(__vector unsigned __int128 __a, + __vector unsigned __int128 __b) { + return __builtin_s390_vtm((__vector unsigned char)__a, + (__vector unsigned char)__b); +} + #if __ARCH__ >= 12 static inline __ATTRS_o_ai int vec_test_mask(__vector float __a, __vector unsigned int __b) { diff --git a/lib/include/wasm_simd128.h b/lib/include/wasm_simd128.h index 2327bec525..08e39bf1a7 100644 --- a/lib/include/wasm_simd128.h +++ b/lib/include/wasm_simd128.h @@ -33,6 +33,7 @@ typedef unsigned long long __u64x2 __attribute__((__vector_size__(16), __aligned__(16))); typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16))); typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16))); +typedef __fp16 __f16x8 __attribute__((__vector_size__(16), __aligned__(16))); typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8))); typedef unsigned char __u8x8 @@ -956,7 +957,7 @@ static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) { } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_popcnt(v128_t __a) { - return (v128_t)__builtin_wasm_popcnt_i8x16((__i8x16)__a); + return (v128_t)__builtin_elementwise_popcount((__i8x16)__a); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a, @@ -981,12 +982,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add_sat(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_sat_s_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_elementwise_add_sat((__i8x16)__a, (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_add_sat(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_sat_u_i8x16((__u8x16)__a, (__u8x16)__b); + return (v128_t)__builtin_elementwise_add_sat((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a, @@ -996,32 +997,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub_sat(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_sat_s_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_elementwise_sub_sat((__i8x16)__a, (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_sub_sat(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_sat_u_i8x16((__u8x16)__a, (__u8x16)__b); + return (v128_t)__builtin_elementwise_sub_sat((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_s_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_elementwise_min((__i8x16)__a, (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_min(v128_t __a, v128_t __b) { - 
return (v128_t)__builtin_wasm_min_u_i8x16((__u8x16)__a, (__u8x16)__b); + return (v128_t)__builtin_elementwise_min((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_s_i8x16((__i8x16)__a, (__i8x16)__b); + return (v128_t)__builtin_elementwise_max((__i8x16)__a, (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i8x16((__u8x16)__a, (__u8x16)__b); + return (v128_t)__builtin_elementwise_max((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_avgr(v128_t __a, @@ -1067,12 +1068,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add_sat(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_sat_s_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_elementwise_add_sat((__i16x8)__a, (__i16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_add_sat(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_sat_u_i16x8((__u16x8)__a, (__u16x8)__b); + return (v128_t)__builtin_elementwise_add_sat((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a, @@ -1082,12 +1083,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub_sat(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_sat_s_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_elementwise_sub_sat((__i16x8)__a, (__i16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_sub_sat(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_sat_u_i16x8((__u16x8)__a, (__u16x8)__b); + return (v128_t)__builtin_elementwise_sub_sat((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a, @@ -1097,22 +1098,22 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_s_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_elementwise_min((__i16x8)__a, (__i16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_u_i16x8((__u16x8)__a, (__u16x8)__b); + return (v128_t)__builtin_elementwise_min((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_s_i16x8((__i16x8)__a, (__i16x8)__b); + return (v128_t)__builtin_elementwise_max((__i16x8)__a, (__i16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i16x8((__u16x8)__a, (__u16x8)__b); + return (v128_t)__builtin_elementwise_max((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_avgr(v128_t __a, @@ -1168,22 +1169,22 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_mul(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_s_i32x4((__i32x4)__a, (__i32x4)__b); + return (v128_t)__builtin_elementwise_min((__i32x4)__a, (__i32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_min(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_min_u_i32x4((__u32x4)__a, 
(__u32x4)__b); + return (v128_t)__builtin_elementwise_min((__u32x4)__a, (__u32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_s_i32x4((__i32x4)__a, (__i32x4)__b); + return (v128_t)__builtin_elementwise_max((__i32x4)__a, (__i32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_max(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_max_u_i32x4((__u32x4)__a, (__u32x4)__b); + return (v128_t)__builtin_elementwise_max((__u32x4)__a, (__u32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_dot_i16x8(v128_t __a, @@ -1878,6 +1879,151 @@ wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t __a, v128_t __b, v128_t __c) { (__i8x16)__a, (__i8x16)__b, (__i32x4)__c); } +// FP16 intrinsics +#define __FP16_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("fp16"), \ + __min_vector_width__(128))) + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_splat(float __a) { + return (v128_t)__builtin_wasm_splat_f16x8(__a); +} + +#ifdef __wasm_fp16__ +// TODO Replace the following macros with regular C functions and use normal +// target-independent vector code like the other replace/extract instructions. + +#define wasm_f16x8_extract_lane(__a, __i) \ + (__builtin_wasm_extract_lane_f16x8((__f16x8)(__a), __i)) + +#define wasm_f16x8_replace_lane(__a, __i, __b) \ + ((v128_t)__builtin_wasm_replace_lane_f16x8((__f16x8)(__a), __i, __b)) + +#endif + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_abs(v128_t __a) { + return (v128_t)__builtin_wasm_abs_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_neg(v128_t __a) { + return (v128_t)(-(__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sqrt(v128_t __a) { + return (v128_t)__builtin_wasm_sqrt_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ceil(v128_t __a) { + return (v128_t)__builtin_wasm_ceil_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_floor(v128_t __a) { + return (v128_t)__builtin_wasm_floor_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_trunc(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_nearest(v128_t __a) { + return (v128_t)__builtin_wasm_nearest_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_eq(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a == (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ne(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a != (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_lt(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a < (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_gt(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a > (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_le(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a <= (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ge(v128_t __a, v128_t __b) { + return (v128_t)((__f16x8)__a >= (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_add(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a + (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sub(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a - (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS 
wasm_f16x8_mul(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a * (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_div(v128_t __a, + v128_t __b) { + return (v128_t)((__f16x8)__a / (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_min(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_min_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_max(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_max_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmin(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_pmin_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmax(v128_t __a, + v128_t __b) { + return (v128_t)__builtin_wasm_pmax_f16x8((__f16x8)__a, (__f16x8)__b); +} + +static __inline__ v128_t __FP16_FN_ATTRS +wasm_i16x8_trunc_sat_f16x8(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_saturate_s_i16x8_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS +wasm_u16x8_trunc_sat_f16x8(v128_t __a) { + return (v128_t)__builtin_wasm_trunc_saturate_u_i16x8_f16x8((__f16x8)__a); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_i16x8(v128_t __a) { + return (v128_t) __builtin_convertvector((__i16x8)__a, __f16x8); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) { + return (v128_t) __builtin_convertvector((__u16x8)__a, __f16x8); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_madd(v128_t __a, + v128_t __b, + v128_t __c) { + return (v128_t)__builtin_wasm_relaxed_madd_f16x8((__f16x8)__a, (__f16x8)__b, + (__f16x8)__c); +} + +static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_nmadd(v128_t __a, + v128_t __b, + v128_t __c) { + return (v128_t)__builtin_wasm_relaxed_nmadd_f16x8((__f16x8)__a, (__f16x8)__b, + (__f16x8)__c); +} + // Deprecated intrinsics static __inline__ v128_t __DEPRECATED_FN_ATTRS("wasm_i8x16_swizzle") diff --git a/lib/include/xmmintrin.h b/lib/include/xmmintrin.h index 6fb27297af..20e66d1901 100644 --- a/lib/include/xmmintrin.h +++ b/lib/include/xmmintrin.h @@ -32,12 +32,41 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16))); #endif /* Define the default attributes for the functions in this file. 
*/ +#if defined(__EVEX512__) && !defined(__AVX10_1_512__) #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \ __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX \ +#define __DEFAULT_FN_ATTRS_SSE2 \ __attribute__((__always_inline__, __nodebug__, \ - __target__("mmx,sse,no-evex512"), __min_vector_width__(64))) + __target__("sse2,no-evex512"), __min_vector_width__(128))) +#else +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("sse"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_SSE2 \ + __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ + __min_vector_width__(128))) +#endif + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 +#endif + +#define __trunc64(x) \ + (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __zext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, 2, 3) +#define __anyext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, -1, -1) +#define __zeroupper64(x) \ + (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0, \ + 1, 4, 5) /// Adds the 32-bit float values in the low-order bits of the operands. /// @@ -54,9 +83,8 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16))); /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum /// of the lower 32 bits of both operands. The upper 96 bits are copied from /// the upper 96 bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_add_ss(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_add_ss(__m128 __a, __m128 __b) { __a[0] += __b[0]; return __a; } @@ -74,9 +102,8 @@ _mm_add_ss(__m128 __a, __m128 __b) /// A 128-bit vector of [4 x float] containing one of the source operands. /// \returns A 128-bit vector of [4 x float] containing the sums of both /// operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_add_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_add_ps(__m128 __a, __m128 __b) { return (__m128)((__v4sf)__a + (__v4sf)__b); } @@ -96,9 +123,8 @@ _mm_add_ps(__m128 __a, __m128 __b) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// difference of the lower 32 bits of both operands. The upper 96 bits are /// copied from the upper 96 bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_sub_ss(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sub_ss(__m128 __a, __m128 __b) { __a[0] -= __b[0]; return __a; } @@ -117,9 +143,8 @@ _mm_sub_ss(__m128 __a, __m128 __b) /// A 128-bit vector of [4 x float] containing the subtrahend. /// \returns A 128-bit vector of [4 x float] containing the differences between /// both operands. 
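Because __DEFAULT_FN_ATTRS_CONSTEXPR adds constexpr under C++11 and later, simple arithmetic wrappers such as _mm_add_ps can now appear in constant expressions when this header is consumed from C++. A minimal sketch, assuming Clang 20 in C++ mode; the variable names are illustrative:

#include <xmmintrin.h>

#if defined(__cplusplus) && __cplusplus >= 201103L
constexpr __m128 kOnes = {1.0f, 1.0f, 1.0f, 1.0f};
// Evaluated at compile time; no SSE code is needed to build kTwos.
constexpr __m128 kTwos = _mm_add_ps(kOnes, kOnes);
#endif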
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_sub_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sub_ps(__m128 __a, __m128 __b) { return (__m128)((__v4sf)__a - (__v4sf)__b); } @@ -139,9 +164,8 @@ _mm_sub_ps(__m128 __a, __m128 __b) /// \returns A 128-bit vector of [4 x float] containing the product of the lower /// 32 bits of both operands. The upper 96 bits are copied from the upper 96 /// bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mul_ss(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mul_ss(__m128 __a, __m128 __b) { __a[0] *= __b[0]; return __a; } @@ -159,9 +183,8 @@ _mm_mul_ss(__m128 __a, __m128 __b) /// A 128-bit vector of [4 x float] containing one of the source operands. /// \returns A 128-bit vector of [4 x float] containing the products of both /// operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mul_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mul_ps(__m128 __a, __m128 __b) { return (__m128)((__v4sf)__a * (__v4sf)__b); } @@ -181,9 +204,8 @@ _mm_mul_ps(__m128 __a, __m128 __b) /// \returns A 128-bit vector of [4 x float] containing the quotients of the /// lower 32 bits of both operands. The upper 96 bits are copied from the /// upper 96 bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_div_ss(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_div_ss(__m128 __a, __m128 __b) { __a[0] /= __b[0]; return __a; } @@ -200,9 +222,8 @@ _mm_div_ss(__m128 __a, __m128 __b) /// A 128-bit vector of [4 x float] containing the divisor. /// \returns A 128-bit vector of [4 x float] containing the quotients of both /// operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_div_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_div_ps(__m128 __a, __m128 __b) { return (__m128)((__v4sf)__a / (__v4sf)__b); } @@ -416,9 +437,8 @@ _mm_max_ps(__m128 __a, __m128 __b) /// A 128-bit vector containing one of the source operands. /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the /// values between both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_and_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_and_ps(__m128 __a, __m128 __b) { return (__m128)((__v4su)__a & (__v4su)__b); } @@ -438,9 +458,8 @@ _mm_and_ps(__m128 __a, __m128 __b) /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the /// one's complement of the first operand and the values in the second /// operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_andnot_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_andnot_ps(__m128 __a, __m128 __b) { return (__m128)(~(__v4su)__a & (__v4su)__b); } @@ -456,9 +475,8 @@ _mm_andnot_ps(__m128 __a, __m128 __b) /// A 128-bit vector of [4 x float] containing one of the source operands. /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the /// values between both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_or_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_or_ps(__m128 __a, __m128 __b) { return (__m128)((__v4su)__a | (__v4su)__b); } @@ -475,9 +493,8 @@ _mm_or_ps(__m128 __a, __m128 __b) /// A 128-bit vector of [4 x float] containing one of the source operands. 
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR /// of the values between both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_xor_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_xor_ps(__m128 __a, __m128 __b) { return (__m128)((__v4su)__a ^ (__v4su)__b); } @@ -1448,10 +1465,10 @@ _mm_cvtss_si64(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a) { - return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); + return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a))); } /// Converts two low-order float values in a 128-bit vector of @@ -1468,7 +1485,7 @@ _mm_cvtps_pi32(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a) { return _mm_cvtps_pi32(__a); @@ -1558,10 +1575,10 @@ _mm_cvttss_si64(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a) { - return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); + return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a))); } /// Converts the lower (first) two elements of a 128-bit vector of [4 x float] @@ -1579,7 +1596,7 @@ _mm_cvttps_pi32(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a) { return _mm_cvttps_pi32(__a); @@ -1601,9 +1618,8 @@ _mm_cvtt_ps2pi(__m128 __a) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsi32_ss(__m128 __a, int __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a, + int __b) { __a[0] = __b; return __a; } @@ -1624,9 +1640,8 @@ _mm_cvtsi32_ss(__m128 __a, int __b) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvt_si2ss(__m128 __a, int __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a, + int __b) { return _mm_cvtsi32_ss(__a, __b); } @@ -1648,9 +1663,8 @@ _mm_cvt_si2ss(__m128 __a, int __b) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. 
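The _mm_cvtps_pi32 / _mm_cvttps_pi32 family above is now expressed with the 128-bit cvtps2dq / cvttps2dq builtins plus the __trunc64 / __zeroupper64 helpers, so the __m64 result is produced without touching the legacy MMX unit; the callable interface is unchanged. A small sketch, with invented names:

#include <xmmintrin.h>

/* Convert the low two floats of v to 32-bit integers, either with the
   current rounding mode or with truncation. */
static __m64 low_two_to_i32(__m128 v, int truncate) {
  return truncate ? _mm_cvttps_pi32(v) : _mm_cvtps_pi32(v);
}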
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsi64_ss(__m128 __a, long long __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi64_ss(__m128 __a, long long __b) { __a[0] = __b; return __a; } @@ -1674,10 +1688,13 @@ _mm_cvtsi64_ss(__m128 __a, long long __b) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value of the second operand. The upper 64 bits are copied from /// the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32_ps(__m128 __a, __m64 __b) { - return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); + return (__m128)__builtin_shufflevector( + (__v4sf)__a, + __builtin_convertvector((__v4si)__zext128(__b), __v4sf), + 4, 5, 2, 3); } /// Converts two elements of a 64-bit vector of [2 x i32] into two @@ -1697,7 +1714,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value from the second operand. The upper 64 bits are copied /// from the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_pi2ps(__m128 __a, __m64 __b) { return _mm_cvtpi32_ps(__a, __b); @@ -1714,9 +1731,8 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b) /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the extraction. /// \returns A 32-bit float containing the extracted value. -static __inline__ float __DEFAULT_FN_ATTRS -_mm_cvtss_f32(__m128 __a) -{ +static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtss_f32(__m128 __a) { return __a[0]; } @@ -1907,9 +1923,8 @@ _mm_undefined_ps(void) /// \returns An initialized 128-bit floating-point vector of [4 x float]. The /// lower 32 bits contain the value provided in the source operand. The /// upper 96 bits are set to zero. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_set_ss(float __w) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set_ss(float __w) { return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f }; } @@ -1925,9 +1940,8 @@ _mm_set_ss(float __w) /// A single-precision floating-point value used to initialize each vector /// element of the result. /// \returns An initialized 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_set1_ps(float __w) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set1_ps(float __w) { return __extension__ (__m128){ __w, __w, __w, __w }; } @@ -1944,9 +1958,8 @@ _mm_set1_ps(float __w) /// A single-precision floating-point value used to initialize each vector /// element of the result. /// \returns An initialized 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_set_ps1(float __w) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set_ps1(float __w) { return _mm_set1_ps(__w); } @@ -1971,9 +1984,8 @@ _mm_set_ps1(float __w) /// A single-precision floating-point value used to initialize bits [31:0] /// of the result. /// \returns An initialized 128-bit floating-point vector of [4 x float]. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_set_ps(float __z, float __y, float __x, float __w) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_set_ps(float __z, float __y, float __x, float __w) { return __extension__ (__m128){ __w, __x, __y, __z }; } @@ -1999,9 +2011,8 @@ _mm_set_ps(float __z, float __y, float __x, float __w) /// A single-precision floating-point value used to initialize bits [127:96] /// of the result. /// \returns An initialized 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_setr_ps(float __z, float __y, float __x, float __w) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_setr_ps(float __z, float __y, float __x, float __w) { return __extension__ (__m128){ __z, __y, __x, __w }; } @@ -2014,9 +2025,8 @@ _mm_setr_ps(float __z, float __y, float __x, float __w) /// /// \returns An initialized 128-bit floating-point vector of [4 x float] with /// all elements set to zero. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_setzero_ps(void) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_setzero_ps(void) { return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; } @@ -2231,10 +2241,10 @@ _mm_storer_ps(float *__p, __m128 __a) /// A pointer to an aligned memory location used to store the register value. /// \param __a /// A 64-bit integer containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS_MMX +static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(void *__p, __m64 __a) { - __builtin_ia32_movntq((__m64 *)__p, __a); + __builtin_nontemporal_store(__a, (__m64 *)__p); } /// Moves packed float values from a 128-bit vector of [4 x float] to a @@ -2296,7 +2306,7 @@ void _mm_sfence(void); /// 3: Bits [63:48] are copied to the destination. /// \returns A 16-bit integer containing the extracted 16 bits of packed data. #define _mm_extract_pi16(a, n) \ - ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)) + ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)) /// Copies data from the 64-bit vector of [4 x i16] to the destination, /// and inserts the lower 16-bits of an integer operand at the 16-bit offset @@ -2342,10 +2352,10 @@ void _mm_sfence(void); /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); + return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b); } /// Compares each of the corresponding packed 8-bit unsigned integer @@ -2361,10 +2371,10 @@ _mm_max_pi16(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); + return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b); } /// Compares each of the corresponding packed 16-bit integer values of @@ -2380,10 +2390,10 @@ _mm_max_pu8(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. 
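The packed __m64 comparisons above now lower through the target-independent __builtin_elementwise_max (and, just below, __builtin_elementwise_min) rather than the MMX builtins, with the public signatures unchanged. A short usage sketch with invented names:

#include <xmmintrin.h>

/* Lift every signed 16-bit lane of v to at least the matching lane of floor16. */
static __m64 clamp_low_pi16(__m64 v, __m64 floor16) {
  return _mm_max_pi16(v, floor16);
}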
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); + return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b); } /// Compares each of the corresponding packed 8-bit unsigned integer @@ -2399,10 +2409,10 @@ _mm_min_pi16(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); + return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b); } /// Takes the most significant bit from each 8-bit element in a 64-bit @@ -2417,10 +2427,10 @@ _mm_min_pu8(__m64 __a, __m64 __b) /// A 64-bit integer vector containing the values with bits to be extracted. /// \returns The most significant bit from each 8-bit element in \a __a, /// written to bits [7:0]. -static __inline__ int __DEFAULT_FN_ATTRS_MMX +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_movemask_pi8(__m64 __a) { - return __builtin_ia32_pmovmskb((__v8qi)__a); + return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a)); } /// Multiplies packed 16-bit unsigned integer values and writes the @@ -2436,10 +2446,11 @@ _mm_movemask_pi8(__m64 __a) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pu16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the @@ -2476,8 +2487,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// _MM_SHUFFLE(b6, b4, b2, b0) can create an 8-bit mask of the form /// [b6, b4, b2, b0]. /// \returns A 64-bit integer vector containing the shuffled values. -#define _mm_shuffle_pi16(a, n) \ - ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))) +#define _mm_shuffle_pi16(a, n) \ + ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \ + (n) & 0x3, ((n) >> 2) & 0x3, \ + ((n) >> 4) & 0x3, ((n) >> 6) & 0x3)) /// Conditionally copies the values from each 8-bit element in the first /// 64-bit integer vector operand to the specified memory location, as @@ -2502,10 +2515,25 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// A pointer to a 64-bit memory location that will receive the conditionally /// copied integer values. The address of the memory location does not have /// to be aligned. -static __inline__ void __DEFAULT_FN_ATTRS_MMX +static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) { - __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); + // This is complex, because we need to support the case where __p is pointing + // within the last 15 to 8 bytes of a page. In that case, using a 128-bit + // write might cause a trap where a 64-bit maskmovq would not. (Memory + // locations not selected by the mask bits might still cause traps.) 
+ __m128i __d128 = __anyext128(__d); + __m128i __n128 = __zext128(__n); + if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 && + ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) { + // If there's a risk of spurious trap due to a 128-bit write, back up the + // pointer by 8 bytes and shift values in registers to match. + __p -= 8; + __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8); + __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8); + } + + __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p); } /// Computes the rounded averages of the packed unsigned 8-bit integer @@ -2521,10 +2549,11 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Computes the rounded averages of the packed unsigned 16-bit integer @@ -2540,10 +2569,11 @@ _mm_avg_pu8(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Subtracts the corresponding 8-bit unsigned integer values of the two @@ -2562,10 +2592,11 @@ _mm_avg_pu16(__m64 __a, __m64 __b) /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the /// sets of absolute differences between both operands. The upper bits are /// cleared. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a), + (__v16qi)__zext128(__b))); } #if defined(__cplusplus) @@ -2741,9 +2772,8 @@ void _mm_setcsr(unsigned int __i); /// Bits [95:64] are written to bits [63:32] of the destination. \n /// Bits [127:96] are written to bits [127:96] of the destination. /// \returns A 128-bit vector of [4 x float] containing the interleaved values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_unpackhi_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpackhi_ps(__m128 __a, __m128 __b) { return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); } @@ -2763,9 +2793,8 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b) /// Bits [31:0] are written to bits [63:32] of the destination. \n /// Bits [63:32] are written to bits [127:96] of the destination. /// \returns A 128-bit vector of [4 x float] containing the interleaved values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_unpacklo_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_unpacklo_ps(__m128 __a, __m128 __b) { return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); } @@ -2785,9 +2814,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b) /// A 128-bit floating-point vector of [4 x float]. 
The lower 32 bits are /// written to the lower 32 bits of the result. /// \returns A 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_move_ss(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_move_ss(__m128 __a, __m128 __b) { __a[0] = __b[0]; return __a; } @@ -2807,9 +2835,8 @@ _mm_move_ss(__m128 __a, __m128 __b) /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are /// written to the lower 64 bits of the result. /// \returns A 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_movehl_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_movehl_ps(__m128 __a, __m128 __b) { return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); } @@ -2828,9 +2855,8 @@ _mm_movehl_ps(__m128 __a, __m128 __b) /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are /// written to the upper 64 bits of the result. /// \returns A 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_movelh_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_movelh_ps(__m128 __a, __m128 __b) { return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); } @@ -2846,22 +2872,10 @@ _mm_movelh_ps(__m128 __a, __m128 __b) /// from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi16_ps(__m64 __a) { - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi16(__b, __a); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; + return __builtin_convertvector((__v4hi)__a, __v4sf); } /// Converts a 64-bit vector of 16-bit unsigned integer values into a @@ -2876,21 +2890,10 @@ _mm_cvtpi16_ps(__m64 __a) /// destination are copied from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu16_ps(__m64 __a) { - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; + return __builtin_convertvector((__v4hu)__a, __v4sf); } /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] @@ -2905,16 +2908,12 @@ _mm_cvtpu16_ps(__m64 __a) /// from the corresponding lower 4 elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. 
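The __m64-to-float conversions above collapse the old unpack/convert sequences into a single __builtin_convertvector, again without changing the public interface. A brief sketch, names illustrative:

#include <xmmintrin.h>

/* Widen four signed 16-bit samples to single precision. */
static __m128 widen_samples(__m64 four_i16) {
  return _mm_cvtpi16_ps(four_i16);
}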
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi8_ps(__m64 __a) { - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi8(__b, __a); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); + return __builtin_convertvector( + __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){}, + 0, 1, 2, 3), __v4sf); } /// Converts the lower four unsigned 8-bit integer values from a 64-bit @@ -2930,15 +2929,12 @@ _mm_cvtpi8_ps(__m64 __a) /// operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu8_ps(__m64 __a) { - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); + return __builtin_convertvector( + __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){}, + 0, 1, 2, 3), __v4sf); } /// Converts the two 32-bit signed integer values from each 64-bit vector @@ -2957,16 +2953,12 @@ _mm_cvtpu8_ps(__m64 __a) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// copied and converted values from the first operand. The upper 64 bits /// contain the copied and converted values from the second operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) { - __m128 __c; - - __c = _mm_setzero_ps(); - __c = _mm_cvtpi32_ps(__c, __b); - __c = _mm_movelh_ps(__c, __c); - - return _mm_cvtpi32_ps(__c, __a); + return __builtin_convertvector( + __builtin_shufflevector((__v2si)__a, (__v2si)__b, + 0, 1, 2, 3), __v4sf); } /// Converts each single-precision floating-point element of a 128-bit @@ -2986,16 +2978,11 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) /// A 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a) { - __m64 __b, __c; - - __b = _mm_cvtps_pi32(__a); - __a = _mm_movehl_ps(__a, __a); - __c = _mm_cvtps_pi32(__a); - - return _mm_packs_pi32(__b, __c); + return __trunc64(__builtin_ia32_packssdw128( + (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps())); } /// Converts each single-precision floating-point element of a 128-bit @@ -3016,7 +3003,7 @@ _mm_cvtps_pi16(__m128 __a) /// 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the /// converted values and the uppper 32 bits are set to zero. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a) { __m64 __b, __c; @@ -3196,8 +3183,14 @@ do { \ #define _m_psadbw _mm_sad_pu8 #define _m_ _mm_ +#undef __trunc64 +#undef __zext128 +#undef __anyext128 +#undef __zeroupper64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX +#undef __DEFAULT_FN_ATTRS_CONSTEXPR +#undef __DEFAULT_FN_ATTRS_SSE2 +#undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR /* Ugly hack for backwards-compatibility (compatible with gcc) */ #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)