Mirror of https://github.com/ziglang/zig.git, synced 2025-12-06 06:13:07 +00:00

zig cc: Update intrinsic headers to Clang 20.

commit ce754724b3
parent 0181cfe8ad

lib/include/adcintrin.h (vendored, 5 changes)
@@ -15,7 +15,12 @@
 #endif
 
 /* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__)) constexpr
+#else
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#endif
 
 /* Use C++ inline semantics in C++, GNU inline for C mode. */
 #if defined(__cplusplus)
lib/include/adxintrin.h (vendored, 5 changes)

@@ -15,8 +15,13 @@
 #define __ADXINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("adx"))) constexpr
+#else
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+#endif
 
 /* Use C++ inline semantics in C++, GNU inline for C mode. */
 #if defined(__cplusplus)
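The net effect of these two hunks is that the ADC/ADX carry intrinsics become usable in constant expressions when compiling as C++11 or later. A minimal sketch of what that enables, assuming Clang 20's constant evaluation of these builtins on x86_64; the function name add_mod32 is illustrative and not part of the headers:

  // Hypothetical demo, not part of the diff: the carry intrinsics can now
  // fold at compile time because __DEFAULT_FN_ATTRS carries `constexpr`.
  #include <immintrin.h>

  constexpr unsigned add_mod32(unsigned a, unsigned b) {
    unsigned sum = 0;
    // _addcarry_u32 returns the carry-out; carry-in is 0 here.
    unsigned char carry = _addcarry_u32(0, a, b, &sum);
    return sum + carry; // fold the carry back in, just for the demo
  }

  // Evaluated entirely at compile time (C++14 or later for the local).
  static_assert(add_mod32(0xFFFFFFFFu, 2u) == 2u,
                "0xFFFFFFFF + 2 wraps to 1, plus carry-out 1");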
lib/include/altivec.h (vendored, 16 changes)

@@ -2502,37 +2502,37 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
 
 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_popcnt(vector signed char __a) {
-  return (vector unsigned char)__builtin_altivec_vpopcntb(
+  return (vector unsigned char)__builtin_elementwise_popcount(
       (vector unsigned char)__a);
 }
 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_popcnt(vector unsigned char __a) {
-  return __builtin_altivec_vpopcntb(__a);
+  return __builtin_elementwise_popcount(__a);
 }
 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_popcnt(vector signed short __a) {
-  return (vector unsigned short)__builtin_altivec_vpopcnth(
+  return (vector unsigned short)__builtin_elementwise_popcount(
       (vector unsigned short)__a);
 }
 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_popcnt(vector unsigned short __a) {
-  return __builtin_altivec_vpopcnth(__a);
+  return __builtin_elementwise_popcount(__a);
 }
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_popcnt(vector signed int __a) {
-  return __builtin_altivec_vpopcntw((vector unsigned int)__a);
+  return __builtin_elementwise_popcount((vector unsigned int)__a);
 }
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_popcnt(vector unsigned int __a) {
-  return __builtin_altivec_vpopcntw(__a);
+  return __builtin_elementwise_popcount(__a);
 }
 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_popcnt(vector signed long long __a) {
-  return __builtin_altivec_vpopcntd((vector unsigned long long)__a);
+  return __builtin_elementwise_popcount((vector unsigned long long)__a);
 }
 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_popcnt(vector unsigned long long __a) {
-  return __builtin_altivec_vpopcntd(__a);
+  return __builtin_elementwise_popcount(__a);
 }
 
 #define vec_vclz vec_cntlz
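These hunks change only how vec_popcnt is implemented, not what callers see: the PowerPC-specific __builtin_altivec_vpopcnt* builtins are swapped for the target-independent __builtin_elementwise_popcount, which should lower to the same vpopcnt instructions. A minimal caller sketch, assuming a PowerPC target with AltiVec enabled; the function name is illustrative:

  // demo caller, unchanged by this commit
  #include <altivec.h>

  vector unsigned int bit_counts(vector unsigned int v) {
    return vec_popcnt(v); // still lowers to vpopcntw on POWER8 and later
  }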
lib/include/amxavx512intrin.h (vendored, new file, 382 lines)

@@ -0,0 +1,382 @@
/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_AVX512INTRIN_H
#define __AMX_AVX512INTRIN_H
#if defined(__x86_64__) && defined(__SSE2__)

#define __DEFAULT_FN_ATTRS_AVX512 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-avx512,avx10.2-512")))

/// Moves a row from a tile register to a zmm destination register, converting
/// the int32 source elements to fp32. The row of the tile is selected by a
/// 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to bf16. It places the resulting bf16 elements
/// in the high 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+0] := 0
///     dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowps2bf16h(tsrc, row) \
  __builtin_ia32_tcvtrowps2bf16h(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to bf16. It places the resulting bf16 elements
/// in the low 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+1] := 0
///     dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowps2bf16l(tsrc, row) \
  __builtin_ia32_tcvtrowps2bf16l(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to fp16. It places the resulting fp16 elements
/// in the high 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+0] := 0
///     dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to fp16. It places the resulting fp16 elements
/// in the low 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+1] := 0
///     dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)

/// Move one row of tile data to v16f32 data.
/// The row of the tile is selected by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512 _tile_movrow(__tile a, unsigned b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
///
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16f32 data. Size is 64 Bytes.
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL>>3
/// row_index := b&0xffff
/// row_chunk := ((b>>16)&0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes-1)
///   IF (row_chunk + i >= a.colsb)
///     dst.byte[i] := 0
///   ELSE
///     dst.byte[i] := a.row[row_index].byte[row_chunk+i]
///   FI
/// ENDFOR
/// \endcode
#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.

static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
                              _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
                              _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
}

/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
/// MXCSR.RC=RNE. Embedded rounding is not supported.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16f32 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
/// elements to bf16 at high 16-bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2BF16H </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32bf16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
/// elements to bf16 at low 16-bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2BF16L </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32bf16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
/// elements to fp16 at high 16-bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32fp16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
/// elements to fp16 at low 16-bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32fp16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move one row of tile data to v16f32 data.
/// The row of the tile is selected by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16i32 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
  return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
}

#endif // __x86_64__ && __SSE2__
#endif // __AMX_AVX512INTRIN_H
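This new header exposes two layers: raw _tile_* macros that take immediate tile numbers, and __tile_* wrappers that carry shape in a __tile1024i value. A hedged sketch of the macro layer, assuming AMX-AVX512 hardware and a tile configuration already loaded; tile number 0 is illustrative, and per the operation pseudocode the result lanes are fp32:

  #include <immintrin.h>

  // Read row `r` of int32 accumulator tile 0 as 16 packed fp32 lanes.
  // The upper 16 bits of the row argument select a 64-byte chunk; 0 here.
  __m512 accum_row_as_fp32(unsigned r) {
    return _tile_cvtrowd2ps(0, r);
  }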
lib/include/amxbf16transposeintrin.h (vendored, new file, 94 lines)

@@ -0,0 +1,94 @@
/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_BF16TRANSPOSEINTRIN_H
#define __AMX_BF16TRANSPOSEINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-bf16,amx-transpose")))

/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
/// tiles \a a and \a b, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in \a dst, and store the
/// 32-bit result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
///                      FP32(b.row[k].bf16[2*n+0])
///       tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
///                      FP32(b.row[k].bf16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b))

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
}

/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
/// tiles src0 and src1, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in "dst", and store the
/// 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
                                        __tile1024i src1) {
  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                       src0.tile, src1.tile);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __x86_64__ */
#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
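The leading T in TTDPBF16PS means the first source is consumed transposed, i.e. the operation is dst += transpose(a) * b over bf16 pairs. A hedged usage sketch with the C wrappers; tile shapes travel inside __tile1024i and must match a previously loaded tile configuration:

  #include <immintrin.h>

  // C (fp32, M x N) += transpose(A) * B, with A held as K x M bf16 pairs.
  // Shapes are illustrative; a real kernel programs the palette first.
  void bf16t_matmul_step(__tile1024i *c, __tile1024i a, __tile1024i b) {
    __tile_tdpbf16ps(c, a, b); // accumulates into c->tile
  }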
lib/include/amxcomplextransposeintrin.h (vendored, new file, 303 lines)

@@ -0,0 +1,303 @@
/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
#define __AMX_COMPLEXTRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-complex,amx-transpose")))

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The imaginary part of the \a a element
/// is multiplied with the real part of the corresponding \a b element, and
/// the real part of the \a a element is multiplied with the imaginary part
/// of the corresponding \a b elements. The two accumulated results are
/// added, and then accumulated into the corresponding row and column of
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tcmmimfp16ps(dst, a, b) \
  __builtin_ia32_ttcmmimfp16ps((dst), (a), (b))

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The real part of the \a a element is
/// multiplied with the real part of the corresponding \a b element, and the
/// negated imaginary part of the \a a element is multiplied with the
/// imaginary part of the corresponding \a b elements. The two accumulated
/// results are added, and then accumulated into the corresponding row and
/// column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTCMMRLFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tcmmrlfp16ps(dst, a, b) \
  __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b))

/// Perform matrix conjugate transpose and multiplication of two tiles
/// containing complex elements and accumulate the results into a packed
/// single precision tile. Each dword element in input tiles \a a and \a b
/// is interpreted as a complex number with FP16 real part and FP16 imaginary
/// part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The negated imaginary part of the \a a
/// element is multiplied with the real part of the corresponding \a b
/// element, and the real part of the \a a element is multiplied with the
/// imaginary part of the corresponding \a b elements. The two accumulated
/// results are added, and then accumulated into the corresponding row and
/// column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_conjtcmmimfp16ps(dst, a, b) \
  __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b))

/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
/// and write the result to \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_conjtfp16(__tile dst, __tile a);
/// \endcode
///
/// \code{.operation}
/// FOR i := 0 TO dst.rows - 1
///   FOR j := 0 TO (dst.colsb / 4) - 1
///     tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
///     tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
///   ENDFOR
///   write_row_and_zero(dst, i, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The source tile. Max size is 1024 Bytes.
#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a))

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) {
  return __builtin_ia32_tconjtfp16_internal(m, n, src);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
                                __tile1024i src1) {
  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
                                          dst->tile, src0.tile, src1.tile);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// This function calculates the real part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
                                __tile1024i src1) {
  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
                                          dst->tile, src0.tile, src1.tile);
}

/// Perform matrix conjugate transpose and multiplication of two tiles
/// containing complex elements and accumulate the results into a packed
/// single precision tile. Each dword element in input tiles src0 and src1
/// is interpreted as a complex number with FP16 real part and FP16 imaginary
/// part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
                                    __tile1024i src1) {
  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
                                              dst->tile, src0.tile, src1.tile);
}

/// Perform conjugate transpose of an FP16-pair of complex elements from src
/// and write the result to dst.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
}

#undef __DEFAULT_FN_ATTRS

#endif // __x86_64__
#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
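Because the hardware splits a complex product into separate real-part and imaginary-part instructions, a full complex accumulation uses two destination tiles. A hedged sketch of the raw macro layer; the tile numbers are illustrative immediates and a matching tile configuration is assumed:

  #include <immintrin.h>

  // Accumulate transpose(A) * B over FP16 complex pairs, real parts into
  // tile 0 and imaginary parts into tile 1 (A in tile 2, B in tile 3).
  void complex_matmul_step(void) {
    _tile_tcmmrlfp16ps(0, 2, 3); // tile0 += Re(transpose(A) * B)
    _tile_tcmmimfp16ps(1, 2, 3); // tile1 += Im(transpose(A) * B)
  }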
lib/include/amxfp16intrin.h (vendored, 35 changes)

@@ -15,6 +15,10 @@
 #define __AMX_FP16INTRIN_H
 #ifdef __x86_64__
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
+
 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
 /// and \a b, accumulating the intermediate single-precision (32-bit)
 /// floating-point elements with elements in \a dst, and store the 32-bit
@@ -54,5 +58,36 @@
 #define _tile_dpfp16ps(dst, a, b) \
   __builtin_ia32_tdpfp16ps(dst, a, b)
 
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
+                                       __tile1024i src1) {
+  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                      src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __x86_64__ */
 #endif /* __AMX_FP16INTRIN_H */
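This diff moves the FP16 dot-product helpers out of amxintrin.h and into their own header (the matching removal appears in the amxintrin.h hunks below). The caller-visible API is unchanged; a hedged sketch with illustrative shapes:

  #include <immintrin.h>

  // C (fp32, M x N) += A (fp16 pairs, M x K) * B (fp16 pairs, K x N).
  // Shapes travel in __tile1024i and must match the loaded tile config.
  void fp16_dot_step(__tile1024i *c, __tile1024i a, __tile1024i b) {
    __tile_dpfp16ps(c, a, b);
  }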
lib/include/amxfp16transposeintrin.h (vendored, new file, 94 lines)

@@ -0,0 +1,94 @@
/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxfp16transposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_FP16TRANSPOSEINTRIN_H
#define __AMX_FP16TRANSPOSEINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-fp16,amx-transpose")))

/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
/// tiles \a a and \a b, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in \a dst, and store the
/// 32-bit result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b))

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2);
}

/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
/// tiles src0 and src1, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in "dst", and store the
/// 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0,
                                        __tile1024i src1) {
  dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                       src0.tile, src1.tile);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __x86_64__ */
#endif /* __AMX_FP16TRANSPOSEINTRIN_H */
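This header mirrors amxbf16transposeintrin.h with fp16 inputs: _tile_tdpfp16ps consumes its first source transposed, so a kernel can keep A column-major in its tile and skip an explicit transpose pass. A hedged sketch:

  #include <immintrin.h>

  // With AMX-TRANSPOSE, A may stay column-major in its tile:
  // C += transpose(A_colmajor) * B is C += A * B in row-major terms.
  void fp16t_step(__tile1024i *c, __tile1024i a_colmajor, __tile1024i b) {
    __tile_tdpfp16ps(c, a_colmajor, b);
  }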
lib/include/amxfp8intrin.h (vendored, new file, 230 lines)

@@ -0,0 +1,230 @@
/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXFP8INTRIN_H
#define __AMXFP8INTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_FP8 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8")))

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPBF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                     src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of a BF8 value \a src1 by an HF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dpbhf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPBHF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dpbhf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dpbhf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                      src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dphbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdphbf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of an HF8 value \a src1 by a BF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dphbf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPHBF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.

__DEFAULT_FN_ATTRS_FP8 static void
__tile_dphbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dphbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                      src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dphf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdphf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of an HF8 value \a src1 by an HF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dphf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPHF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dphf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dphf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                     src1.tile, src2.tile);
}

#define _tile_dpbf8ps(dst, src1, src2) \
  __builtin_ia32_tdpbf8ps((dst), (src1), (src2))
#define _tile_dpbhf8ps(dst, src1, src2) \
  __builtin_ia32_tdpbhf8ps((dst), (src1), (src2))
#define _tile_dphbf8ps(dst, src1, src2) \
  __builtin_ia32_tdphbf8ps((dst), (src1), (src2))
#define _tile_dphf8ps(dst, src1, src2) \
  __builtin_ia32_tdphf8ps((dst), (src1), (src2))

#undef __DEFAULT_FN_ATTRS_FP8

#endif /* __x86_64__ */
#endif /* __AMXFP8INTRIN_H */
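All four FP8 variants share one shape contract and differ only in how the src1/src2 bytes are decoded; under Intel's naming, which this header appears to follow, BF8 is the E5M2 format and HF8 the E4M3 format (treat that mapping as an assumption). A hedged sketch mixing the two formats:

  #include <immintrin.h>

  // C (fp32) += A (BF8) * B (HF8); the "bh" infix encodes the format of
  // src1 then src2. Shapes are carried by the loaded tile configuration.
  void fp8_mixed_step(__tile1024i *c, __tile1024i a_bf8, __tile1024i b_hf8) {
    __tile_dpbhf8ps(c, a_bf8, b_hf8);
  }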
lib/include/amxintrin.h (vendored, 40 changes)

@@ -22,8 +22,6 @@
   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
 #define __DEFAULT_FN_ATTRS_BF16 \
   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
-#define __DEFAULT_FN_ATTRS_FP16 \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
 
 /// Load tile configuration from a 64-byte memory location specified by
 /// "mem_addr". The tile configuration includes the tile type palette, the
@@ -232,9 +230,11 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// bytes. Since there is no 2D type in llvm IR, we use vector type to
 /// represent 2D tile and the fixed size is maximum amx tile register size.
 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+typedef int _tile1024i_1024a
+    __attribute__((__vector_size__(1024), __aligned__(1024)));
 
 /// This is an internal intrinsic. C/C++ users should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
                      __SIZE_TYPE__ stride) {
   return __builtin_ia32_tileloadd64_internal(m, n, base,
@@ -242,7 +242,7 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
 }
 
 /// This is an internal intrinsic. C/C++ users should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
 _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
                        __SIZE_TYPE__ stride) {
   return __builtin_ia32_tileloaddt164_internal(m, n, base,
@@ -278,7 +278,7 @@ _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
 }
 
 /// This is an internal intrinsic. C/C++ users should avoid calling it directly.
-static __inline__ void __DEFAULT_FN_ATTRS_INT8
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
                       __SIZE_TYPE__ stride, _tile1024i tile) {
   return __builtin_ia32_tilestored64_internal(m, n, base,
@@ -292,13 +292,6 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 }
 
-/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
-_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
 /// This struct packs the shape and tile data together for the user. We suggest
 /// initializing the struct as early as possible, because the compiler depends
 /// on the shape information to do the configuration. The constant value is preferred
@@ -493,32 +486,9 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                                        src0.tile, src1.tile);
 }
 
-/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
-/// src1, accumulating the intermediate single-precision (32-bit) floating-point
-/// elements with elements in "dst", and store the 32-bit result back to tile
-/// "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_FP16
-static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
-                                       __tile1024i src1) {
-  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
-                                      src0.tile, src1.tile);
-}
-
 #undef __DEFAULT_FN_ATTRS_TILE
 #undef __DEFAULT_FN_ATTRS_INT8
 #undef __DEFAULT_FN_ATTRS_BF16
-#undef __DEFAULT_FN_ATTRS_FP16
 
 #endif /* __x86_64__ */
 #endif /* __AMXINTRIN_H */
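Two independent changes here: the FP16 helpers move to amxfp16transposeintrin.h's sibling header (see the amxfp16intrin.h hunks above), and the plain load/store internals are retargeted from __DEFAULT_FN_ATTRS_INT8 to __DEFAULT_FN_ATTRS_TILE, so they no longer drag in amx-int8 for kernels that only need amx-tile. For context, a hedged end-to-end sketch of the C API this header builds up; the config bytes follow the documented tileconfig layout (byte 0 palette, bytes 16+2*i colsb, bytes 48+i rows), and an OS that permits AMX state (e.g. Linux arch_prctl XTILEDATA) is assumed:

  #include <immintrin.h>
  #include <string.h>

  // 16x16 int32 accumulator tile = A (16x64 int8) * B (16x64 int8).
  // All three tiles use 64-byte rows; strides below match that.
  static void int8_matmul_16x16(const void *a, const void *b, void *c) {
    unsigned char cfg[64];
    memset(cfg, 0, sizeof(cfg));
    cfg[0] = 1;                  // palette 1
    cfg[16] = 64; cfg[17] = 0;   // tile 0: 64 bytes per row
    cfg[18] = 64; cfg[19] = 0;   // tile 1
    cfg[20] = 64; cfg[21] = 0;   // tile 2
    cfg[48] = 16;                // tile 0: 16 rows
    cfg[49] = 16;                // tile 1
    cfg[50] = 16;                // tile 2
    _tile_loadconfig(cfg);
    _tile_zero(0);               // C accumulator
    _tile_loadd(1, a, 64);       // A
    _tile_loadd(2, b, 64);       // B
    _tile_dpbssd(0, 1, 2);       // C += A(int8) * B(int8), signed x signed
    _tile_stored(0, c, 64);
    _tile_release();
  }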
48
lib/include/amxmovrsintrin.h
vendored
Normal file
48
lib/include/amxmovrsintrin.h
vendored
Normal file
@ -0,0 +1,48 @@
/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-------------------------------------------------------------------=== */

#ifndef __IMMINTRIN_H
#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXMOVRSINTRIN_H
#define __AMXMOVRSINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_MOVRS \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))

#define _tile_loaddrs(dst, base, stride) \
  __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \
                               (__SIZE_TYPE__)(stride))
#define _tile_stream_loaddrs(dst, base, stride) \
  __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \
                                 (__SIZE_TYPE__)(stride))
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
                       __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloaddrs64_internal(m, n, base,
                                               (__SIZE_TYPE__)(stride));
}
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
                         __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloaddrst164_internal(m, n, base,
                                                 (__SIZE_TYPE__)(stride));
}
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
}
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
    __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
}
#undef __DEFAULT_FN_ATTRS_MOVRS
#endif /* __x86_64__ */
#endif /* __AMXMOVRSINTRIN_H */
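A minimal usage sketch of the MOVRS loads above, assuming an amx-movrs target and a tile configuration already derived from the shape fields; the function name is illustrative only:

#include <immintrin.h>

void copy_tile_movrs(void *dst, const void *src, __SIZE_TYPE__ stride) {
  __tile1024i t = {16, 64};        /* 16 rows x 64 bytes per row */
  __tile_loaddrs(&t, src, stride); /* load with the read-shared hint */
  __tile_stored(dst, stride, t);   /* plain AMX store from amxintrin.h */
}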
200
lib/include/amxmovrstransposeintrin.h
vendored
Normal file
@ -0,0 +1,200 @@
/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
#define __AMX_MOVRS_TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-transpose,amx-movrs")))

#define _tile_2rpntlvwz0rs(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
#define _tile_2rpntlvwz0rst1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
#define _tile_2rpntlvwz1rs(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
#define _tile_2rpntlvwz1rst1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  // Use __tile1024i_1024a* to escape the alignment check in
  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
  __builtin_ia32_t2rpntlvwz0rs_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz0rst1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1rs_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1rst1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
                                  const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                                &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
                                  const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                                &dst1->tile, base, stride);
}

#undef __DEFAULT_FN_ATTRS
#endif /* __x86_64__ */
#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
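A minimal usage sketch of the tile-pair loads above, assuming an amx-transpose/amx-movrs target with tile shapes already configured; the function name is illustrative only:

#include <immintrin.h>

void load_tile_pair_movrs(const void *base, __SIZE_TYPE__ stride) {
  __tile1024i dst0 = {16, 64}; /* first tile of the pair */
  __tile1024i dst1 = {16, 64}; /* second tile of the pair */
  __tile_2rpntlvwz0rs(&dst0, &dst1, base, stride);
  /* dst0/dst1 now hold the VNNI-converted pair, loaded read-shared */
}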
108
lib/include/amxtf32intrin.h
vendored
Normal file
@ -0,0 +1,108 @@
/*===------------- amxtf32intrin.h - AMX_TF32 intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxtf32intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_TF32INTRIN_H
#define __AMX_TF32INTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TF32 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tf32")))

/// Do Matrix Multiplication of \a a and \a b, and then do Matrix Plus
/// with \a srcdst.
/// All the calculation is based on float32, but with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_mmultf32ps(constexpr int srcdst, constexpr int a, \
///                       constexpr int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
///
/// \param srcdst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
///    dword[12:0] := 0
///    dword[31:13] := x[31:13]
///    return dword
/// }
///
/// DEFINE silence_snan_fp32(x[31:0]) {
///    IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
///       x.fraction[22] := 1
///    return x
/// }
///
/// elements_a := a.colsb / 4
/// elements_dest := srcdst.colsb / 4
///
/// FOR m = 0 TO (srcdst.rows-1)
///    tmp[511:0] := 0
///    FOR k = 0 TO (elements_a-1)
///       FOR n = 0 TO (elements_dest-1)
///          af := silence_snan_fp32(a.row[m].fp32[k])
///          bf := silence_snan_fp32(b.row[k].fp32[n])
///          tmp.fp32[n] += zero_lower_mantissa_bits_fp32(af)
///                         * zero_lower_mantissa_bits_fp32(bf)
///       ENDFOR
///    ENDFOR
///
///    FOR n = 0 TO (elements_dest-1)
///       tmp.fp32[n] += srcdst.row[m].fp32[n]
///    ENDFOR
///    write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
///
/// ENDFOR
///
/// zero_upper_rows(srcdst, srcdst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_mmultf32ps(srcdst, a, b) \
  __builtin_ia32_tmmultf32ps((srcdst), (a), (b))

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32
_tile_mmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
                          _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tmmultf32ps_internal(m, n, k, dst, src1, src2);
}

/// Do Matrix Multiplication of src0 and src1, and then do Matrix Plus with dst.
/// All the calculation is based on float32, but with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TF32
static void __tile_mmultf32ps(__tile1024i *dst, __tile1024i src0,
                              __tile1024i src1) {
  dst->tile = _tile_mmultf32ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                        src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_TF32INTRIN_H
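The zero_lower_mantissa_bits_fp32() step above is just a mask on the IEEE-754 bit pattern; a host-side sketch for illustration, not part of the header:

#include <stdint.h>
#include <string.h>

static float zero_lower_mantissa_bits_fp32(float x) {
  uint32_t bits;
  memcpy(&bits, &x, sizeof(bits)); /* type-pun via memcpy to avoid UB */
  bits &= ~(uint32_t)0x1FFF;       /* clear mantissa bits [12:0] -> TF32 */
  memcpy(&x, &bits, sizeof(bits));
  return x;
}
/* e.g. 0x3F800001 (the float just above 1.0f) maps back to 0x3F800000 (1.0f) */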
105
lib/include/amxtf32transposeintrin.h
vendored
Normal file
@ -0,0 +1,105 @@
/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_TF32TRANSPOSEINTRIN_H
#define __AMX_TF32TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-tf32,amx-transpose")))

/// \code
/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \
///                        constexpr int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
///
/// \param srcdst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
///    dword[12:0] := 0
///    dword[31:13] := x[31:13]
///    return dword
/// }
///
/// DEFINE silence_snan_fp32(x[31:0]) {
///    IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
///       x.fraction[22] := 1
///    return x
/// }
///
/// elements_dest := srcdst.colsb/4
///
/// FOR m := 0 TO (srcdst.rows-1)
///    tmp[511:0] := 0
///    FOR k := 0 TO (a.rows-1)
///       FOR n := 0 TO (elements_dest-1)
///          a1e := silence_snan_fp32(a.row[k].fp32[m])
///          a2e := silence_snan_fp32(b.row[k].fp32[n])
///          s1e := zero_lower_mantissa_bits_fp32(a1e)
///          s2e := zero_lower_mantissa_bits_fp32(a2e)
///          tmp.fp32[n] += s1e * s2e
///       ENDFOR
///    ENDFOR
///
///    FOR n := 0 TO (elements_dest-1)
///       tmp.fp32[n] += srcdst.row[m].fp32[n]
///    ENDFOR
///    write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
///
/// ENDFOR
///
/// zero_upper_rows(srcdst, srcdst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_tmmultf32ps(srcdst, a, b) \
  __builtin_ia32_ttmmultf32ps((srcdst), (a), (b))

// dst = m x n (srcdest), src1 = k x m, src2 = k x n
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE
_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2);
}

/// Compute the transpose and do Matrix Multiplication of src0 and src1, and
/// then do Matrix Plus with dst. All the calculation is based on float32, but
/// with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TF32_TRANSPOSE
static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0,
                               __tile1024i src1) {
  dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col,
                                         dst->tile, src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_TF32TRANSPOSEINTRIN_H
248
lib/include/amxtransposeintrin.h
vendored
Normal file
@ -0,0 +1,248 @@
/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_TRANSPOSEINTRIN_H
#define __AMX_TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TRANSPOSE \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))

#define _tile_2rpntlvwz0(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
#define _tile_2rpntlvwz0t1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
#define _tile_2rpntlvwz1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
#define _tile_2rpntlvwz1t1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)

/// Transpose 32-bit elements from \a src and write the result to \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_transposed(__tile dst, __tile src);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
///
/// FOR i := 0 TO (dst.rows-1)
///    tmp[511:0] := 0
///    FOR j := 0 TO (dst.colsb/4-1)
///       tmp.dword[j] := src.row[j].dword[i]
///    ENDFOR
///    dst.row[i] := tmp
/// ENDFOR
///
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  // Use __tile1024i_1024a* to escape the alignment check in
  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
                                      (_tile1024i_1024a *)dst1, base,
                                      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz0t1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
                                      (_tile1024i_1024a *)dst1, base,
                                      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1t1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
  return __builtin_ia32_ttransposed_internal(m, n, src);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
                              const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                            &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
                              const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                            &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Transpose 32-bit elements from src and write the result to dst.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
  dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
}

#endif /* __x86_64__ */
#endif /* __AMX_TRANSPOSEINTRIN_H */
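A minimal usage sketch of __tile_transposed() above, assuming an amx-transpose target with the tile configuration already loaded; the function name is illustrative only:

#include <immintrin.h>

void transpose_tile(const void *src_mem, __SIZE_TYPE__ stride) {
  __tile1024i src = {16, 64}; /* 16 rows x 64 bytes (16 dwords) per row */
  __tile1024i dst = {16, 64};
  __tile_loadd(&src, src_mem, stride);
  __tile_transposed(&dst, src); /* dst.row[i].dword[j] = src.row[j].dword[i] */
}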
39
lib/include/arm_acle.h
vendored
@ -264,28 +264,28 @@ __rbitl(unsigned long __t) {
}

/* 8.3 16-bit multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
@ -304,46 +304,46 @@ __smulwt(int32_t __a, int32_t __b) {
#endif

/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif

/* 8.4.3 Accumulating multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
@ -621,8 +621,6 @@ __rintnf(float __a) {
#endif

/* 8.8 CRC32 intrinsics */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
@ -662,7 +660,6 @@ static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif

/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
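For illustration (not part of the diff), the saturating intrinsics above clamp instead of wrapping; a minimal sketch assuming a 32-bit Arm target built with DSP support:

#include <arm_acle.h>
#include <stdint.h>

int32_t saturating_sum(int32_t a, int32_t b) {
  return __qadd(a, b); /* e.g. __qadd(INT32_MAX, 1) == INT32_MAX */
}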
5415
lib/include/arm_neon.h
vendored
File diff suppressed because it is too large
416
lib/include/arm_sme.h
vendored
@ -35,12 +35,6 @@ __ai bool __arm_has_sme(void) __arm_streaming_compatible {
  return x0 & (1ULL << 63);
}

__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible {
  uint64_t x0, x1;
  __builtin_arm_get_sme_state(&x0, &x1);
  return x0 & 1;
}

void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;
void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;
void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;
@ -48,6 +42,8 @@ void *__arm_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible;

__ai __attribute__((target("sme"))) void svundef_za(void) __arm_streaming_compatible __arm_out("za") { }

__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme___arm_in_streaming_mode)))
bool __arm_in_streaming_mode(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m)))
void svaddha_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m)))
@ -604,6 +600,94 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_
void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m)))
void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
void svadd_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
void svadd_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
void svsub_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
void svsub_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
void svadd_za16_vg1x2(uint32_t, svfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
void svadd_za16_vg1x4(uint32_t, svfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
void svsub_za16_vg1x2(uint32_t, svfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
void svsub_za16_vg1x4(uint32_t, svfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
void svadd_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
void svadd_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
void svmla_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
void svmla_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
void svmla_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
void svmla_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
void svmla_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
void svmla_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
void svmls_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
void svmls_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
void svmls_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
void svmls_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
void svmls_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
void svmls_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
void svmopa_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
void svmops_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
void svsub_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
void svsub_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
void svadd_za16_vg1x2(uint32_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
void svadd_za16_vg1x4(uint32_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
void svmla_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
void svmla_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
void svmls_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
void svmls_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
void svmops_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
void svsub_za16_vg1x2(uint32_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
void svsub_za16_vg1x4(uint32_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x2)))
void svmla_single_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x4)))
@ -660,22 +744,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_f16_m))
void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_f16_m)))
void svmops_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
void svadd_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
void svadd_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
void svsub_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
void svsub_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
void svadd_za16_vg1x2(uint32_t, svfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
void svadd_za16_vg1x4(uint32_t, svfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
void svsub_za16_vg1x2(uint32_t, svfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
void svsub_za16_vg1x4(uint32_t, svfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m)))
void svmopa_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m)))
@ -684,6 +752,138 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m))
void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m)))
void svmops_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x2_fpm)))
void svdot_single_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x4_fpm)))
void svdot_single_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x2_fpm)))
void svdot_lane_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x4_fpm)))
void svdot_lane_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x2_fpm)))
void svdot_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x4_fpm)))
void svdot_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x1_fpm)))
void svmla_single_za16_mf8_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x2_fpm)))
void svmla_single_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x4_fpm)))
void svmla_single_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x1_fpm)))
void svmla_lane_za16_mf8_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x2_fpm)))
void svmla_lane_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x4_fpm)))
void svmla_lane_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x2_fpm)))
void svmla_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x4_fpm)))
void svmla_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_mf8_m_fpm)))
void svmopa_za16_mf8_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za16_mf8_vg1x2_fpm)))
void svvdot_lane_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x2_fpm)))
void svdot_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x4_fpm)))
void svdot_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x2_fpm)))
void svdot_lane_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x4_fpm)))
void svdot_lane_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x2_fpm)))
void svdot_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x4_fpm)))
void svdot_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x1_fpm)))
void svmla_za16_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x2_fpm)))
void svmla_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x4_fpm)))
void svmla_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x1_fpm)))
void svmla_lane_za16_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x2_fpm)))
void svmla_lane_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x4_fpm)))
void svmla_lane_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x2_fpm)))
void svmla_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x4_fpm)))
void svmla_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_mf8_m_fpm)))
void svmopa_za16_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za16_mf8_vg1x2_fpm)))
void svvdot_lane_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x2_fpm)))
void svdot_single_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x4_fpm)))
void svdot_single_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x2_fpm)))
void svdot_lane_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x4_fpm)))
void svdot_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x2_fpm)))
void svdot_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x4_fpm)))
void svdot_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x1_fpm)))
void svmla_single_za32_mf8_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x2_fpm)))
void svmla_single_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x4_fpm)))
void svmla_single_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x1_fpm)))
void svmla_lane_za32_mf8_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x2_fpm)))
void svmla_lane_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x4_fpm)))
void svmla_lane_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x2_fpm)))
void svmla_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x4_fpm)))
void svmla_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_mf8_m_fpm)))
void svmopa_za32_mf8_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdotb_lane_za32_mf8_vg1x4_fpm)))
void svvdotb_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdott_lane_za32_mf8_vg1x4_fpm)))
void svvdott_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x2_fpm)))
void svdot_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x4_fpm)))
void svdot_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x2_fpm)))
void svdot_lane_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x4_fpm)))
void svdot_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x2_fpm)))
void svdot_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x4_fpm)))
void svdot_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x1_fpm)))
void svmla_za32_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x2_fpm)))
|
||||
void svmla_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x4_fpm)))
|
||||
void svmla_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x1_fpm)))
|
||||
void svmla_lane_za32_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x2_fpm)))
|
||||
void svmla_lane_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x4_fpm)))
|
||||
void svmla_lane_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x2_fpm)))
|
||||
void svmla_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x4_fpm)))
|
||||
void svmla_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_mf8_m_fpm)))
|
||||
void svmopa_za32_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdotb_lane_za32_mf8_vg1x4_fpm)))
|
||||
void svvdotb_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdott_lane_za32_mf8_vg1x4_fpm)))
|
||||
void svvdott_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m)))
|
||||
void svaddha_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m)))
|
||||
@ -732,6 +932,106 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m
|
||||
void svusmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m)))
|
||||
void svusmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_zt_u8_x4)))
|
||||
svuint8x4_t svluti4_zt_u8_x4(uint64_t, svuint8x2_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_zt_s8_x4)))
|
||||
svint8x4_t svluti4_zt_s8_x4(uint64_t, svuint8x2_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u8)))
|
||||
void svwrite_lane_zt_u8(uint64_t, svuint8_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u32)))
|
||||
void svwrite_lane_zt_u32(uint64_t, svuint32_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u64)))
|
||||
void svwrite_lane_zt_u64(uint64_t, svuint64_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u16)))
|
||||
void svwrite_lane_zt_u16(uint64_t, svuint16_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_bf16)))
|
||||
void svwrite_lane_zt_bf16(uint64_t, svbfloat16_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s8)))
|
||||
void svwrite_lane_zt_s8(uint64_t, svint8_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f64)))
|
||||
void svwrite_lane_zt_f64(uint64_t, svfloat64_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f32)))
|
||||
void svwrite_lane_zt_f32(uint64_t, svfloat32_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f16)))
|
||||
void svwrite_lane_zt_f16(uint64_t, svfloat16_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s32)))
|
||||
void svwrite_lane_zt_s32(uint64_t, svint32_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s64)))
|
||||
void svwrite_lane_zt_s64(uint64_t, svint64_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s16)))
|
||||
void svwrite_lane_zt_s16(uint64_t, svint16_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u8)))
|
||||
void svwrite_zt_u8(uint64_t, svuint8_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u32)))
|
||||
void svwrite_zt_u32(uint64_t, svuint32_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u64)))
|
||||
void svwrite_zt_u64(uint64_t, svuint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u16)))
|
||||
void svwrite_zt_u16(uint64_t, svuint16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_bf16)))
|
||||
void svwrite_zt_bf16(uint64_t, svbfloat16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s8)))
|
||||
void svwrite_zt_s8(uint64_t, svint8_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f64)))
|
||||
void svwrite_zt_f64(uint64_t, svfloat64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f32)))
|
||||
void svwrite_zt_f32(uint64_t, svfloat32_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f16)))
|
||||
void svwrite_zt_f16(uint64_t, svfloat16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s32)))
|
||||
void svwrite_zt_s32(uint64_t, svint32_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s64)))
|
||||
void svwrite_zt_s64(uint64_t, svint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s16)))
|
||||
void svwrite_zt_s16(uint64_t, svint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u8)))
|
||||
void svwrite_lane_zt(uint64_t, svuint8_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u32)))
|
||||
void svwrite_lane_zt(uint64_t, svuint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u64)))
|
||||
void svwrite_lane_zt(uint64_t, svuint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u16)))
|
||||
void svwrite_lane_zt(uint64_t, svuint16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_bf16)))
|
||||
void svwrite_lane_zt(uint64_t, svbfloat16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s8)))
|
||||
void svwrite_lane_zt(uint64_t, svint8_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f64)))
|
||||
void svwrite_lane_zt(uint64_t, svfloat64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f32)))
|
||||
void svwrite_lane_zt(uint64_t, svfloat32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f16)))
|
||||
void svwrite_lane_zt(uint64_t, svfloat16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s32)))
|
||||
void svwrite_lane_zt(uint64_t, svint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s64)))
|
||||
void svwrite_lane_zt(uint64_t, svint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s16)))
|
||||
void svwrite_lane_zt(uint64_t, svint16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u8)))
|
||||
void svwrite_zt(uint64_t, svuint8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u32)))
|
||||
void svwrite_zt(uint64_t, svuint32_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u64)))
|
||||
void svwrite_zt(uint64_t, svuint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u16)))
|
||||
void svwrite_zt(uint64_t, svuint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_bf16)))
|
||||
void svwrite_zt(uint64_t, svbfloat16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s8)))
|
||||
void svwrite_zt(uint64_t, svint8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f64)))
|
||||
void svwrite_zt(uint64_t, svfloat64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f32)))
|
||||
void svwrite_zt(uint64_t, svfloat32_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f16)))
|
||||
void svwrite_zt(uint64_t, svfloat16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s32)))
|
||||
void svwrite_zt(uint64_t, svint32_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s64)))
|
||||
void svwrite_zt(uint64_t, svint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s16)))
|
||||
void svwrite_zt(uint64_t, svint16_t);
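All of the `svwrite_zt`/`svwrite_lane_zt` overloads above alias the same ZT0 builtins; only the element type differs. A rough usage sketch, assuming an SME2.1-capable toolchain and the ACLE `__arm_inout("zt0")` state attribute (an assumption on my part, not shown in this diff; the first argument is the ZT register number, which must be the constant 0):

#include <arm_sme.h>

/* Hypothetical helper: write `bytes` into ZT0, then overwrite one indexed
 * portion of it with `patch` via the lane variant. */
void zt0_seed(svuint8_t bytes, svuint8_t patch)
    __arm_streaming __arm_inout("zt0") {
  svwrite_zt(0, bytes);         /* overload resolves to svwrite_zt_u8 */
  svwrite_lane_zt(0, patch, 1); /* resolves to svwrite_lane_zt_u8 */
}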
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_u32_vg1x2)))
void svadd_write_single_za32_u32_vg1x2(uint32_t, svuint32x2_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_s32_vg1x2)))
@ -2138,78 +2438,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_u8_vg1x
void svwrite_za8_vg1x4(uint32_t, svuint8x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_s8_vg1x4)))
void svwrite_za8_vg1x4(uint32_t, svint8x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
void svadd_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
void svadd_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
void svmla_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
void svmla_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
void svmla_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
void svmla_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
void svmla_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
void svmla_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
void svmls_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
void svmls_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
void svmls_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
void svmls_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
void svmls_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
void svmls_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
void svmopa_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
void svmops_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
void svsub_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
void svsub_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
void svadd_za16_vg1x2(uint32_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
void svadd_za16_vg1x4(uint32_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
void svmla_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
void svmla_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
void svmls_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
void svmls_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
void svmops_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
void svsub_za16_vg1x2(uint32_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
void svsub_za16_vg1x4(uint32_t, svbfloat16x4_t);
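As with the FP8 group, every `_bf16_`-mangled function gains an `__aio` overload without the type suffix, so the same spelling works across element types. A sketch of the overload in use, assuming a compiler with the SME B16B16 extension and the usual ZA keyword attributes (both assumptions on my part):

#include <arm_sme.h>

/* Hypothetical: multiply-accumulate two bf16 vector pairs into ZA. The
 * unsuffixed overload dispatches to svmla_za16_bf16_vg1x2 by argument type. */
void bf16_mla_rows(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm)
    __arm_streaming __arm_inout("za") {
  svmla_za16_vg1x2(slice, zn, zm);
}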
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x2)))
void svadd_za64_f64_vg1x2(uint32_t, svfloat64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x4)))
1189
lib/include/arm_sve.h
vendored
File diff suppressed because it is too large
87
lib/include/arm_vector_types.h
vendored
@ -17,9 +17,62 @@
typedef float float32_t;
typedef __fp16 float16_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __mfp8 mfloat8_t;
typedef double float64_t;
#endif

typedef uint64_t fpm_t;

enum __ARM_FPM_FORMAT { __ARM_FPM_E5M2, __ARM_FPM_E4M3 };

enum __ARM_FPM_OVERFLOW { __ARM_FPM_INFNAN, __ARM_FPM_SATURATE };

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_fpm_init(void) {
  return 0;
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_src1_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~7ull) | (fpm_t)__format;
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_src2_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~0x38ull) | ((fpm_t)__format << 3u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_dst_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~0x1c0ull) | ((fpm_t)__format << 6u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_overflow_mul(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) {
  return (__fpm & ~0x4000ull) | ((fpm_t)__behaviour << 14u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_overflow_cvt(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) {
  return (__fpm & ~0x8000ull) | ((fpm_t)__behaviour << 15u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_lscale(fpm_t __fpm, uint64_t __scale) {
  return (__fpm & ~0x7f0000ull) | (__scale << 16u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_nscale(fpm_t __fpm, int64_t __scale) {
  return (__fpm & ~0xff000000ull) | (((fpm_t)__scale & 0xffu) << 24u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_lscale2(fpm_t __fpm, uint64_t __scale) {
  return (uint32_t)__fpm | (__scale << 32u);
}
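These helpers build the 64-bit FPMR image field by field: each one clears its field with a mask, then ORs in the new value at the field's shift. A caller might compose a mode word like this (a minimal sketch; the helpers and enum values are exactly the ones defined above, and including arm_neon.h to reach them is an assumption, since it pulls in arm_vector_types.h):

#include <arm_neon.h>

/* Build an fpm_t: E4M3 inputs, E5M2 results, saturating conversions. */
static inline fpm_t make_fp8_mode(void) {
  fpm_t fpm = __arm_fpm_init();                             /* all fields 0 */
  fpm = __arm_set_fpm_src1_format(fpm, __ARM_FPM_E4M3);     /* bits 0-2 */
  fpm = __arm_set_fpm_src2_format(fpm, __ARM_FPM_E4M3);     /* bits 3-5 */
  fpm = __arm_set_fpm_dst_format(fpm, __ARM_FPM_E5M2);      /* bits 6-8 */
  fpm = __arm_set_fpm_overflow_cvt(fpm, __ARM_FPM_SATURATE); /* bit 15 */
  return fpm;
}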
typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
@ -36,6 +89,10 @@ typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t;
typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t;
#endif
typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
@ -109,6 +166,16 @@ typedef struct uint64x2x2_t {
  uint64x2_t val[2];
} uint64x2x2_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x2_t {
  mfloat8x8_t val[2];
} mfloat8x8x2_t;

typedef struct mfloat8x16x2_t {
  mfloat8x16_t val[2];
} mfloat8x16x2_t;

#endif
typedef struct float16x4x2_t {
  float16x4_t val[2];
} float16x4x2_t;
@ -199,6 +266,16 @@ typedef struct uint64x2x3_t {
  uint64x2_t val[3];
} uint64x2x3_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x3_t {
  mfloat8x8_t val[3];
} mfloat8x8x3_t;

typedef struct mfloat8x16x3_t {
  mfloat8x16_t val[3];
} mfloat8x16x3_t;

#endif
typedef struct float16x4x3_t {
  float16x4_t val[3];
} float16x4x3_t;
@ -289,6 +366,16 @@ typedef struct uint64x2x4_t {
  uint64x2_t val[4];
} uint64x2x4_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x4_t {
  mfloat8x8_t val[4];
} mfloat8x8x4_t;

typedef struct mfloat8x16x4_t {
  mfloat8x16_t val[4];
} mfloat8x16x4_t;

#endif
typedef struct float16x4x4_t {
  float16x4_t val[4];
} float16x4x4_t;
561
lib/include/avx10_2_512bf16intrin.h
vendored
Normal file
@ -0,0 +1,561 @@
/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2_512BF16INTRIN_H
#define __AVX10_2_512BF16INTRIN_H

/* An unaligned 512-bit bf16 vector type for the unaligned load/store forms. */
typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
  return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
  return (__m512bh)__builtin_ia32_undef512();
}

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) {
  return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
}

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh(
    __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
    __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
    __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
    __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
    __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
    __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
  return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
                             bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
                             bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
                             bf8, bf7, bf6, bf5, bf4, bf3, bf2, bf1};
}

#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \
                        bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19, \
                        bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28, \
                        bf29, bf30, bf31, bf32) \
  _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26), \
                 (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19), \
                 (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12), \
                 (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), \
                 (bf3), (bf2), (bf1))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castbf16_ps(__m512bh __a) {
  return (__m512)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castbf16_pd(__m512bh __a) {
  return (__m512d)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castbf16_si512(__m512bh __a) {
  return (__m512i)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castpd_pbh(__m512d __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castsi512_pbh(__m512i __a) {
  return (__m512bh)__a;
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS512
_mm512_castbf16512_pbh128(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_castbf16512_pbh256(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_zextbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(
      __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_zextbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) {
  return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF),
                                    (__m512i)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_load_pbh(void const *__p) {
  return *(const __m512bh *)__p;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_loadu_pbh(void const *__p) {
  struct __loadu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pbh *)__p)->__v;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
                                                              __m512bh __A) {
  *(__m512bh *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
                                                               __m512bh __A) {
  struct __storeu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pbh *)__P)->__v = __A;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
                                                (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_add_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A + (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_add_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_add_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}
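With the load/store helpers and the masked add above, the typical pattern is load, arithmetic, masked select, store. A small sketch of user code (illustrative; per the guard at the top of the file, immintrin.h is the only supported entry point):

#include <immintrin.h>

/* out[i] += a[i] for lanes selected by m; other lanes keep their value.
 * Unaligned pointers are fine: the *_loadu/_storeu forms go through the
 * packed __m512bh_u type defined above. */
void bf16_masked_acc(__bf16 *out, const __bf16 *a, __mmask32 m) {
  __m512bh acc = _mm512_loadu_pbh(out);
  __m512bh va = _mm512_loadu_pbh(a);
  _mm512_storeu_pbh(out, _mm512_mask_add_pbh(acc, m, acc, va));
}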

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sub_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A - (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sub_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mul_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A * (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_mul_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_div_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A / (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_div_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_div_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_max_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)__builtin_ia32_vmaxbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_min_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)__builtin_ia32_vminbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

#define _mm512_cmp_pbh_mask(__A, __B, __P) \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A), \
                                              (__v32bf)(__m512bh)(__B), \
                                              (int)(__P), (__mmask32)-1))

#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P) \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A), \
                                              (__v32bf)(__m512bh)(__B), \
                                              (int)(__P), (__mmask32)(__U)))
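The compare macros return a plain __mmask32, so they compose directly with the select/blend intrinsics defined earlier. For example, an elementwise minimum can be built from compare plus blend (purely illustrative; _mm512_min_pbh above does this in one instruction, and _CMP_LT_OQ is the standard AVX predicate constant):

#include <immintrin.h>

/* x < y ? x : y, per lane. _mm512_mask_blend_pbh takes its third operand
 * where the mask bit is set and its second where it is clear. */
static inline __m512bh bf16_min_via_cmp(__m512bh x, __m512bh y) {
  __mmask32 lt = _mm512_cmp_pbh_mask(x, y, _CMP_LT_OQ);
  return _mm512_mask_blend_pbh(lt, y, x);
}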

#define _mm512_mask_fpclass_pbh_mask(__U, __A, imm) \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U)))

#define _mm512_fpclass_pbh_mask(__A, imm) \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)-1))

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_scalef_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(),
      (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(),
      (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                   (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_getexp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_rsqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                     (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

#define _mm512_reduce_pbh(__A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(), \
      (__mmask32)-1))

#define _mm512_mask_reduce_pbh(__W, __U, __A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \
      (__mmask32)(__U)))

#define _mm512_maskz_reduce_pbh(__U, __A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32)(__U)))

#define _mm512_roundscale_pbh(__A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32)-1))

#define _mm512_mask_roundscale_pbh(__W, __U, __A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \
      (__mmask32)(__U)))

#define _mm512_maskz_roundscale_pbh(__U, __A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32)(__U)))

#define _mm512_getmant_pbh(__A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1))

#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)(__m512bh)(__W), (__mmask32)(__U)))

#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
                                                (__v32bf)_mm512_sqrt_pbh(__A),
                                                (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B,
                                                 (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B,
                                                 -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B,
                                                 (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B,
                                                 -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsub_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}
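All four FMA flavours reuse the same vfmaddnepbh512 builtin, differing only in which operands are negated, and the mask/mask3/maskz wrappers select against the first accumulator operand, the third, or zero. A quick sketch of the zero-masking form (illustrative user code):

#include <immintrin.h>

/* r[i] = a[i] * b[i] + c[i] where m selects the lane, else 0.0 in bf16. */
static inline __m512bh bf16_fma_z(__mmask32 m, __m512bh a, __m512bh b,
                                  __m512bh c) {
  return _mm512_maskz_fmadd_pbh(m, a, b, c);
}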

#undef __DEFAULT_FN_ATTRS512

#endif
#endif
320
lib/include/avx10_2_512convertintrin.h
vendored
Normal file
@ -0,0 +1,320 @@
/*===--------- avx10_2_512convertintrin.h - AVX10_2_512CONVERT -------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512convertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifdef __SSE2__

#ifndef __AVX10_2_512CONVERTINTRIN_H
#define __AVX10_2_512CONVERTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtx2ps_ph(__m512 __A,
                                                                  __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)(-1),
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtx2ps_ph(__m512h __W, __mmask32 __U, __m512 __A, __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)__W, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtx2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
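_mm512_cvtx2ps_ph narrows two 16-element single-precision vectors into one 32-element half-precision vector using the current rounding mode; the _round macros below take an explicit rounding argument instead. A minimal sketch (the lane ordering of the two inputs in the result is left to the builtin):

#include <immintrin.h>

/* Pack 32 floats (two __m512) into one __m512h of FP16 values. */
static inline __m512h pack_ps_to_ph(__m512 a, __m512 b) {
  return _mm512_cvtx2ps_ph(a, b);
}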

#define _mm512_cvtx_round2ps_ph(A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \
      (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), (const int)(R)))

#define _mm512_mask_cvtx_round2ps_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask((__v16sf)(A), (__v16sf)(B), \
                                               (__v32hf)(W), (__mmask32)(U), \
                                               (const int)(R)))

#define _mm512_maskz_cvtx_round2ps_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \
      (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (const int)(R)))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiasph_bf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_bf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiasph_bf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiassph_bf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_bf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiassph_bf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiasph_hf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_hf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiasph_hf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiassph_hf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_hf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiassph_hf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_bf8(__m512h __A,
                                                                  __m512h __B) {
  return (__m512i)__builtin_ia32_vcvt2ph2bf8_512((__v32hf)(__A),
                                                 (__v32hf)(__B));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvt2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvt2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B),
      (__v64qi)(__m512i)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvts2ph_bf8(__m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_vcvt2ph2bf8s_512((__v32hf)(__A),
                                                  (__v32hf)(__B));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvts2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvts2ph_bf8(__A, __B), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvts2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvts2ph_bf8(__A, __B),
      (__v64qi)(__m512i)_mm512_setzero_si512());
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_hf8(__m512h __A,
|
||||
__m512h __B) {
|
||||
return (__m512i)__builtin_ia32_vcvt2ph2hf8_512((__v32hf)(__A),
|
||||
(__v32hf)(__B));
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvt2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_selectb_512(
|
||||
(__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B), (__v64qi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvt2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_selectb_512(
|
||||
(__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B),
|
||||
(__v64qi)(__m512i)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_cvts2ph_hf8(__m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_vcvt2ph2hf8s_512((__v32hf)(__A),
|
||||
(__v32hf)(__B));
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvts2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_selectb_512(
|
||||
(__mmask64)__U, (__v64qi)_mm512_cvts2ph_hf8(__A, __B), (__v64qi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_selectb_512(
|
||||
(__mmask64)__U, (__v64qi)_mm512_cvts2ph_hf8(__A, __B),
|
||||
(__v64qi)(__m512i)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8(__m256i __A) {
|
||||
return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
|
||||
(__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m512h __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvthf8(__m512h __W, __mmask32 __U, __m256i __A) {
|
||||
return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
|
||||
(__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512h __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvthf8(__mmask32 __U, __m256i __A) {
|
||||
return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
|
||||
(__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_bf8(__m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtph_bf8(__m256i __W, __mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtph_bf8(__mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsph_bf8(__m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtsph_bf8(__m256i __W, __mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtsph_bf8(__mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_hf8(__m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtph_hf8(__m256i __W, __mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtph_hf8(__mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsph_hf8(__m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtsph_hf8(__m256i __W, __mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtsph_hf8(__mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtbf8_ph(__m256i __A) {
|
||||
return _mm512_castsi512_ph(_mm512_slli_epi16(_mm512_cvtepi8_epi16(__A), 8));
|
||||
}
|
||||
|
||||
static __inline __m512h __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtbf8_ph(__m512h __S, __mmask32 __U, __m256i __A) {
|
||||
return _mm512_castsi512_ph(
|
||||
_mm512_mask_slli_epi16((__m512i)__S, __U, _mm512_cvtepi8_epi16(__A), 8));
|
||||
}
|
||||
|
||||
static __inline __m512h __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtbf8_ph(__mmask32 __U, __m256i __A) {
|
||||
return _mm512_castsi512_ph(
|
||||
_mm512_slli_epi16(_mm512_maskz_cvtepi8_epi16(__U, __A), 8));
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS512
|
||||
|
||||
#endif // __AVX10_2_512CONVERTINTRIN_H
|
||||
#endif // __SSE2__
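
For orientation, a minimal usage sketch of the FP8 conversion intrinsics
above (editorial illustration, not part of the vendored header; assumes a
toolchain and CPU with AVX10.2-512 enabled, e.g. via -mavx10.2-512):

/* Convert 32 fp16 lanes to bf8, then a masked variant that keeps the
 * previously converted bytes wherever the mask bit is clear. */
#include <immintrin.h>
#include <stdint.h>

static void cvt_demo(uint8_t out[32]) {
  __m512h h = _mm512_set1_ph((_Float16)1.5f); /* 32 x fp16 */
  __m256i bf8 = _mm512_cvtph_bf8(h);          /* 32 x bf8 bytes */
  /* Even lanes are re-converted; odd lanes keep the bytes in bf8. */
  __m256i mixed = _mm512_mask_cvtph_bf8(bf8, 0x55555555u, h);
  _mm256_storeu_si256((__m256i *)out, mixed);
}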
127
lib/include/avx10_2_512minmaxintrin.h
vendored
Normal file
@ -0,0 +1,127 @@
/*===---- avx10_2_512minmaxintrin.h - AVX10_2_512MINMAX intrinsics ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512minmaxintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2_512MINMAXINTRIN_H
#define __AVX10_2_512MINMAXINTRIN_H

#define _mm512_minmax_pbh(A, B, C) \
  ((__m512bh)__builtin_ia32_vminmaxbf16512((__v32bf)(__m512bh)(A), \
      (__v32bf)(__m512bh)(B), (int)(C)))

#define _mm512_mask_minmax_pbh(W, U, A, B, C) \
  ((__m512bh)__builtin_ia32_selectpbf_512( \
      (__mmask32)(U), \
      (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \
          (__v32bf)(__m512bh)(B), (int)(C)), \
      (__v32bf)(__m512bh)(W)))

#define _mm512_maskz_minmax_pbh(U, A, B, C) \
  ((__m512bh)__builtin_ia32_selectpbf_512( \
      (__mmask32)(U), \
      (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \
          (__v32bf)(__m512bh)(B), (int)(C)), \
      (__v32bf)__builtin_bit_cast(__m512bh, _mm512_setzero_ps())))

#define _mm512_minmax_pd(A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_minmax_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm512_minmax_ph(A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)(__m512h)(W), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_minmax_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_minmax_ps(A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_minmax_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
#endif // __AVX10_2_512MINMAXINTRIN_H
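
A quick illustration of the minmax family above (editorial sketch, not part
of the vendored header): the third argument is the immediate minmax control
byte defined by the AVX10.2 specification; the 0 below is only a placeholder
value, not a recommendation.

#include <immintrin.h>

static __m512 minmax_demo(__m512 a, __m512 b) {
  /* Unmasked form; the mask/maskz and _round variants follow the usual
   * W/U and rounding-operand patterns used throughout these headers. */
  return _mm512_minmax_ps(a, b, 0);
}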
314
lib/include/avx10_2_512niintrin.h
vendored
Normal file
@ -0,0 +1,314 @@
/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2_512NIINTRIN_H
#define __AVX10_2_512NIINTRIN_H

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

/* VNNI FP16 */
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_dpph_ps(__m512 __W, __m512h __A, __m512h __B) {
  return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A,
      (__v32hf)__B);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_dpph_ps(__m512 __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_dpph_ps(__mmask16 __U, __m512 __W, __m512h __A, __m512h __B) {
  return (__m512)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B),
      (__v16sf)_mm512_setzero_ps());
}

/* VMPSADBW */
#define _mm512_mpsadbw_epu8(A, B, imm) \
  ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
      (__v64qi)(__m512i)(B), (int)(imm)))

#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectw_512( \
      (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectw_512( \
      (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
      (__v32hi)_mm512_setzero_si512()))

/* VNNI INT8 */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbssd_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbssd_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbssds_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbssds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbssds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbsud_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbsud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbsuds_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbsuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbsuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbuud_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbuud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbuuds_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbuuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

/* VNNI INT16 */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwsuds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwusds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwuuds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

#undef __DEFAULT_FN_ATTRS

#endif /* __AVX10_2_512NIINTRIN_H */
#endif /* __SSE2__ */
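
A short usage sketch for the VNNI INT8 intrinsics above (editorial
illustration, not part of the vendored header; assumes AVX10.2-512
support): each dpbssd step multiplies four signed bytes per 32-bit lane
from the two sources and accumulates the products into the destination.

#include <immintrin.h>

static __m512i dot_demo(void) {
  __m512i a = _mm512_set1_epi8(3);
  __m512i b = _mm512_set1_epi8(-2);
  /* Every 32-bit lane becomes 0 + 4 * (3 * -2) = -24. */
  return _mm512_dpbssd_epi32(_mm512_setzero_si512(), a, b);
}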
303
lib/include/avx10_2_512satcvtdsintrin.h
vendored
Normal file
@ -0,0 +1,303 @@
/*===----- avx10_2_512satcvtdsintrin.h - AVX10_2_512SATCVTDS intrinsics ----===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512satcvtdsintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX10_2_512SATCVTDSINTRIN_H
#define __AVX10_2_512SATCVTDSINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

// 512 bit : Double -> Int
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvttspd_epi32(__m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvttspd_epi32(__m256i __W, __mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttspd_epi32(__mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epi32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_undefined_si256(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epi32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)(__m256i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epi32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_setzero_si256(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> uInt
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvttspd_epu32(__m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvttspd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttspd_epu32(__mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epu32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_undefined_si256(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epu32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)(__m256i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epu32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_setzero_si256(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> Long

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttspd_epi64(__m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttspd_epi64(__m512i __W, __mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttspd_epi64(__mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epi64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_undefined_epi32(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epi64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epi64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> ULong

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttspd_epu64(__m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttspd_epu64(__m512i __W, __mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttspd_epu64(__mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epu64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_undefined_epi32(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epu64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epu64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit: Float -> int
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epi32(__m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttsps_epi32(__m512i __W, __mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)(__W), __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttsps_epi32(__mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epi32(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epi32(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)(__m512i)(__W), (__mmask16)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epi32(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(__U), (const int)(__R)))

// 512 bit: Float -> uint
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epu32(__m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttsps_epu32(__m512i __W, __mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)(__W), __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttsps_epu32(__mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epu32(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epu32(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)(__m512i)(__W), (__mmask16)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epu32(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(__U), (const int)(__R)))

// 512 bit : float -> long
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epi64(__m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttsps_epi64(__m512i __W, __mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttsps_epi64(__mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epi64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
      (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epi64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epi64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : float -> ulong
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epu64(__m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttsps_epu64(__m512i __W, __mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttsps_epu64(__mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epu64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
      (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epu64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epu64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

#undef __DEFAULT_FN_ATTRS
#endif // __AVX10_2_512SATCVTDSINTRIN_H
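
A minimal sketch of the saturating truncation intrinsics above (editorial
illustration, not part of the vendored header; assumes AVX10.2-512
support): unlike the classic cvttpd path, the "s" forms clamp out-of-range
inputs to the integer limits instead of producing the integer-indefinite
pattern.

#include <immintrin.h>

static __m256i satcvt_demo(void) {
  __m512d big = _mm512_set1_pd(1e300); /* far above INT32_MAX */
  return _mm512_cvttspd_epi32(big);    /* lanes clamp to INT32_MAX */
}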
301
lib/include/avx10_2_512satcvtintrin.h
vendored
Normal file
@ -0,0 +1,301 @@
/*===------ avx10_2_512satcvtintrin.h - AVX10_2_512SATCVT intrinsics -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512satcvtintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2_512SATCVTINTRIN_H
#define __AVX10_2_512SATCVTINTRIN_H

#define _mm512_ipcvtbf16_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtbf162ibs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvtbf16_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvtbf16_epi8(A), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvtbf16_epi8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvtbf16_epi8(A), \
      (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvtbf16_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtbf162iubs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvtbf16_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvtbf16_epu8(A), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvtbf16_epu8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvtbf16_epu8(A), \
      (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvttbf16_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttbf162ibs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvttbf16_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvttbf16_epi8(A), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvttbf16_epi8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvttbf16_epi8(A), \
      (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvttbf16_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttbf162iubs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvttbf16_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvttbf16_epu8(A), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvttbf16_epu8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvttbf16_epu8(A), \
      (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvtph_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtph_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)(W), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtph_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvt_roundph_epi8(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)_mm512_setzero_si512(), \
      (__mmask32)-1, (const int)R))

#define _mm512_mask_ipcvt_roundph_epi8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), (const int)R))

#define _mm512_maskz_ipcvt_roundph_epi8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)_mm512_setzero_si512(), \
      (__mmask32)(U), (const int)R))

#define _mm512_ipcvtph_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtph_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)(W), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtph_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvt_roundph_epu8(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      (const int)R))

#define _mm512_mask_ipcvt_roundph_epu8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), (const int)R))

#define _mm512_maskz_ipcvt_roundph_epu8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      (const int)R))

#define _mm512_ipcvtps_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtps_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
      (__v16su)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtps_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvt_roundps_epi8(A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
      (__v16su)_mm512_setzero_si512(), \
      (__mmask16)-1, (const int)R))

#define _mm512_mask_ipcvt_roundps_epi8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), (const int)R))

#define _mm512_maskz_ipcvt_roundps_epi8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
      (__v16su)_mm512_setzero_si512(), \
      (__mmask16)(U), (const int)R))

#define _mm512_ipcvtps_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtps_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask((__v16sf)(__m512)(A), \
      (__v16su)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtps_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvt_roundps_epu8(A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      (const int)R))

#define _mm512_mask_ipcvt_roundps_epu8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), (const int)R))

#define _mm512_maskz_ipcvt_roundps_epu8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      (const int)R))

#define _mm512_ipcvttph_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvttph_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)(W), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvttph_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtt_roundph_epi8(A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      S))

#define _mm512_mask_ipcvtt_roundph_epi8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), S))

#define _mm512_maskz_ipcvtt_roundph_epi8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      S))

#define _mm512_ipcvttph_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvttph_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)(W), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvttph_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtt_roundph_epu8(A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      S))

#define _mm512_mask_ipcvtt_roundph_epu8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), S))

#define _mm512_maskz_ipcvtt_roundph_epu8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      S))

#define _mm512_ipcvttps_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvttps_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask((__v16sf)(__m512)(A), \
      (__v16su)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvttps_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtt_roundps_epi8(A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      S))

#define _mm512_mask_ipcvtt_roundps_epi8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), S))

#define _mm512_maskz_ipcvtt_roundps_epi8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      S))

#define _mm512_ipcvttps_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvttps_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask((__v16sf)(__m512)(A), \
      (__v16su)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvttps_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtt_roundps_epu8(A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      S))

#define _mm512_mask_ipcvtt_roundps_epu8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), S))

#define _mm512_maskz_ipcvtt_roundps_epu8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      S))

#endif // __AVX10_2_512SATCVTINTRIN_H
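
A small sketch of the ipcvt family above (editorial illustration, not part
of the vendored header; assumes AVX10.2-512 support): each fp16 lane is
converted to a saturated 8-bit integer, delivered in the corresponding
16-bit lane of the result.

#include <immintrin.h>

static __m512i ipcvt_demo(void) {
  __m512h h = _mm512_set1_ph((_Float16)300.0f); /* above INT8_MAX */
  return _mm512_ipcvtph_epi8(h); /* expected to saturate to 127 */
}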
1085
lib/include/avx10_2bf16intrin.h
vendored
Normal file
File diff suppressed because it is too large
590
lib/include/avx10_2convertintrin.h
vendored
Normal file
@ -0,0 +1,590 @@
/*===--------------- avx10_2convertintrin.h - AVX10_2CONVERT ---------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error \
|
||||
"Never use <avx10_2convertintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif // __IMMINTRIN_H
|
||||
|
||||
#ifdef __SSE2__
|
||||
|
||||
#ifndef __AVX10_2CONVERTINTRIN_H
|
||||
#define __AVX10_2CONVERTINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS128 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
|
||||
__min_vector_width__(128)))
|
||||
#define __DEFAULT_FN_ATTRS256 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
|
||||
__min_vector_width__(256)))
|
||||
|
||||
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtx2ps_ph(__m128 __A,
|
||||
__m128 __B) {
|
||||
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
|
||||
(__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)(-1));
|
||||
}
|
||||
|
||||
static __inline__ __m128h __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
|
||||
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
|
||||
(__v4sf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128h __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
|
||||
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
|
||||
(__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
|
||||
__m256 __B) {
|
||||
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
|
||||
(__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)(-1),
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
static __inline__ __m256h __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
|
||||
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
|
||||
(__v8sf)__A, (__v8sf)__B, (__v16hf)__W, (__mmask16)__U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
static __inline__ __m256h __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
|
||||
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
|
||||
(__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
#define _mm256_cvtx_round2ps_ph(A, B, R) \
|
||||
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
|
||||
(__v8sf)(A), (__v8sf)(B), (__v16hf)_mm256_undefined_ph(), \
|
||||
(__mmask16)(-1), (const int)(R)))
|
||||
|
||||
#define _mm256_mask_cvtx_round2ps_ph(W, U, A, B, R) \
|
||||
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
|
||||
(__v8sf)(A), (__v8sf)(B), (__v16hf)(W), (__mmask16)(U), (const int)(R)))
|
||||
|
||||
#define _mm256_maskz_cvtx_round2ps_ph(U, A, B, R) \
|
||||
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
|
||||
(__v8sf)(A), (__v8sf)(B), (__v16hf)(_mm256_setzero_ph()), \
|
||||
(__mmask16)(U), (const int)(R)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtbiasph_bf8(__m128i __A,
                                                                  __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiasph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiasph_bf8(__m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
      (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_bf8(
    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbiasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtbiassph_bf8(__m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiassph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiassph_bf8(__mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiassph_bf8(__m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
      (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_bf8(
    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbiassph_bf8(__mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtbiasph_hf8(__m128i __A,
                                                                  __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiasph_hf8(__m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
      (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_hf8(
    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbiasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtbiassph_hf8(__m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiassph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiassph_hf8(__mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiassph_hf8(__m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
      (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_hf8(
    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbiassph_hf8(__mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_bf8(__m128h __A,
                                                               __m128h __B) {
  return (__m128i)__builtin_ia32_vcvt2ph2bf8_128((__v8hf)(__A), (__v8hf)(__B));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvt2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvt2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B),
      (__v16qi)(__m128i)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_bf8(__m256h __A,
                                                                  __m256h __B) {
  return (__m256i)__builtin_ia32_vcvt2ph2bf8_256((__v16hf)(__A),
                                                 (__v16hf)(__B));
}

/* Note: the 256-bit masked forms below select over 32 byte lanes, so the
   write mask is passed through as __mmask32; the (__mmask16) casts in the
   original would silently drop the upper 16 mask bits. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvt2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvt2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B),
      (__v32qi)(__m256i)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts2ph_bf8(__m128h __A,
                                                                __m128h __B) {
  return (__m128i)__builtin_ia32_vcvt2ph2bf8s_128((__v8hf)(__A), (__v8hf)(__B));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvts2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvts2ph_bf8(__A, __B), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvts2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvts2ph_bf8(__A, __B),
      (__v16qi)(__m128i)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvts2ph_bf8(__m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_vcvt2ph2bf8s_256((__v16hf)(__A),
                                                  (__v16hf)(__B));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvts2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvts2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B),
      (__v32qi)(__m256i)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_hf8(__m128h __A,
                                                               __m128h __B) {
  return (__m128i)__builtin_ia32_vcvt2ph2hf8_128((__v8hf)(__A), (__v8hf)(__B));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvt2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvt2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B),
      (__v16qi)(__m128i)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_hf8(__m256h __A,
                                                                  __m256h __B) {
  return (__m256i)__builtin_ia32_vcvt2ph2hf8_256((__v16hf)(__A),
                                                 (__v16hf)(__B));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvt2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvt2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B),
      (__v32qi)(__m256i)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts2ph_hf8(__m128h __A,
                                                                __m128h __B) {
  return (__m128i)__builtin_ia32_vcvt2ph2hf8s_128((__v8hf)(__A), (__v8hf)(__B));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvts2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvts2ph_hf8(__A, __B), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvts2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvts2ph_hf8(__A, __B),
      (__v16qi)(__m128i)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvts2ph_hf8(__m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_vcvt2ph2hf8s_256((__v16hf)(__A),
                                                  (__v16hf)(__B));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvts2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B),
      (__v32qi)(__m256i)_mm256_setzero_si256());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
      (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvthf8(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128i __A) {
  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
      (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvthf8(__mmask8 __U,
                                                                 __m128i __A) {
  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
      (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8(__m128i __A) {
  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
      (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvthf8(__m256h __W, __mmask16 __U, __m128i __A) {
  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
      (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvthf8(__mmask16 __U, __m128i __A) {
  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
      (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_bf8(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_bf8(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_bf8(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtph_bf8(__m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_bf8(__m128i __W, __mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_bf8(__mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsph_bf8(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtsph_bf8(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsph_bf8(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsph_bf8(__m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtsph_bf8(__m128i __W, __mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtsph_bf8(__mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_hf8(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_hf8(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_hf8(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtph_hf8(__m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_hf8(__m128i __W, __mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_hf8(__mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsph_hf8(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtsph_hf8(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsph_hf8(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsph_hf8(__m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtsph_hf8(__m128i __W, __mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtsph_hf8(__mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtbf8_ph(__m128i __A) {
  return _mm_castsi128_ph(_mm_slli_epi16(_mm_cvtepi8_epi16(__A), 8));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtbf8_ph(__m128h __S, __mmask8 __U, __m128i __A) {
  return _mm_castsi128_ph(
      _mm_mask_slli_epi16((__m128i)__S, __U, _mm_cvtepi8_epi16(__A), 8));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbf8_ph(__mmask8 __U, __m128i __A) {
  return _mm_castsi128_ph(_mm_slli_epi16(_mm_maskz_cvtepi8_epi16(__U, __A), 8));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtbf8_ph(__m128i __A) {
  return _mm256_castsi256_ph(_mm256_slli_epi16(_mm256_cvtepi8_epi16(__A), 8));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtbf8_ph(__m256h __S, __mmask16 __U, __m128i __A) {
  return _mm256_castsi256_ph(
      _mm256_mask_slli_epi16((__m256i)__S, __U, _mm256_cvtepi8_epi16(__A), 8));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbf8_ph(__mmask16 __U, __m128i __A) {
  return _mm256_castsi256_ph(
      _mm256_slli_epi16(_mm256_maskz_cvtepi8_epi16(__U, __A), 8));
}
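
/* Note: no dedicated instruction is needed for the bf8 -> ph conversions
   above. A bf8 (E5M2) value has the same sign/exponent/mantissa layout as the
   high byte of an IEEE binary16 value, so widening each byte to 16 bits and
   shifting it into the high byte reconstructs the exact half-precision
   number; the extension bits produced by _mm_cvtepi8_epi16 are shifted out. */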

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVX10_2CONVERTINTRIN_H
#endif // __SSE2__
66
lib/include/avx10_2copyintrin.h
vendored
Normal file
@ -0,0 +1,66 @@

/*===---- avx10_2copyintrin.h - AVX10.2 Copy intrinsics -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2copyintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2COPYINTRIN_H
#define __AVX10_2COPYINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(128)))

/// Constructs a 128-bit integer vector, setting the lower 32 bits to the
/// lower 32 bits of the parameter \a __A; the upper bits are zeroed.
///
/// \code{.operation}
/// result[31:0] := __A[31:0]
/// result[MAX:32] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD </c> instruction.
///
/// \param __A
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector. The lower 32 bits are copied from the
///    parameter \a __A; the upper bits are zeroed.
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi32(__m128i __A) {
  return (__m128i)__builtin_shufflevector(
      (__v4si)__A, (__v4si)_mm_setzero_si128(), 0, 4, 4, 4);
}

/// Constructs a 128-bit integer vector, setting the lower 16 bits to the
/// lower 16 bits of the parameter \a __A; the upper bits are zeroed.
///
/// \code{.operation}
/// result[15:0] := __A[15:0]
/// result[MAX:16] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> VMOVW </c> instruction.
///
/// \param __A
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector. The lower 16 bits are copied from the
///    parameter \a __A; the upper bits are zeroed.
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi16(__m128i __A) {
  return (__m128i)__builtin_shufflevector(
      (__v8hi)__A, (__v8hi)_mm_setzero_si128(), 0, 8, 8, 8, 8, 8, 8, 8);
}
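
/* Usage sketch (illustrative): both helpers mirror the existing SSE2
   _mm_move_epi64, just at narrower element widths:

     __m128i v    = _mm_set1_epi32(-1);
     __m128i lo32 = _mm_move_epi32(v); // only bits [31:0] survive
     __m128i lo16 = _mm_move_epi16(v); // only bits [15:0] survive
*/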

#undef __DEFAULT_FN_ATTRS128

#endif // __AVX10_2COPYINTRIN_H
277
lib/include/avx10_2minmaxintrin.h
vendored
Normal file
@ -0,0 +1,277 @@

/*===-------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics -------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2minmaxintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2MINMAXINTRIN_H
#define __AVX10_2MINMAXINTRIN_H

#define _mm_minmax_pbh(A, B, C) \
  ((__m128bh)__builtin_ia32_vminmaxbf16128((__m128bh)(__v8bf)(A), \
                                           (__m128bh)(__v8bf)(B), (int)(C)))

#define _mm_mask_minmax_pbh(W, U, A, B, C) \
  ((__m128bh)__builtin_ia32_selectpbf_128( \
      (__mmask8)(U), \
      (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \
                             (int)(C)), \
      (__v8bf)(W)))

#define _mm_maskz_minmax_pbh(U, A, B, C) \
  ((__m128bh)__builtin_ia32_selectpbf_128( \
      (__mmask8)(U), \
      (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \
                             (int)(C)), \
      (__v8bf)__builtin_bit_cast(__m128bh, _mm_setzero_ps())))

#define _mm256_minmax_pbh(A, B, C) \
  ((__m256bh)__builtin_ia32_vminmaxbf16256((__m256bh)(__v16bf)(A), \
                                           (__m256bh)(__v16bf)(B), (int)(C)))

#define _mm256_mask_minmax_pbh(W, U, A, B, C) \
  ((__m256bh)__builtin_ia32_selectpbf_256( \
      (__mmask16)(U), \
      (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \
                                 (__m256bh)(__v16bf)(B), (int)(C)), \
      (__v16bf)(W)))

#define _mm256_maskz_minmax_pbh(U, A, B, C) \
  ((__m256bh)__builtin_ia32_selectpbf_256( \
      (__mmask16)(U), \
      (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \
                                 (__m256bh)(__v16bf)(B), (int)(C)), \
      (__v16bf)__builtin_bit_cast(__m256bh, _mm256_setzero_ps())))

#define _mm_minmax_pd(A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1))

#define _mm_mask_minmax_pd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U)))

#define _mm_maskz_minmax_pd(U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U)))

#define _mm256_minmax_pd(A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)-1, _MM_FROUND_NO_EXC))

#define _mm256_mask_minmax_pd(W, U, A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_maskz_minmax_pd(U, A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_minmax_round_pd(W, U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_minmax_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_ph(A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1))

#define _mm_mask_minmax_ph(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_minmax_ph(U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U)))

#define _mm256_minmax_ph(A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, _MM_FROUND_NO_EXC))

#define _mm256_mask_minmax_ph(W, U, A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)(__m256h)(W), (__mmask16)(U), _MM_FROUND_NO_EXC))

#define _mm256_maskz_minmax_ph(U, A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm256_mask_minmax_round_ph(W, U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_minmax_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm_minmax_ps(A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1))

#define _mm_mask_minmax_ps(W, U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
      (__mmask8)(U)))

#define _mm_maskz_minmax_ps(U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))

#define _mm256_minmax_ps(A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, _MM_FROUND_NO_EXC))

#define _mm256_mask_minmax_ps(W, U, A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
      (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_maskz_minmax_ps(U, A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_minmax_round_ps(W, U, A, B, C, R) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_minmax_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_sd(A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_sd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_sd(U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_sd(W, U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_sh(A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_undefined_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_ss(A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_undefined_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_ss(W, U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_ss(U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_ss(W, U, A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
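
/* Usage sketch (illustrative): the C immediate selects which IEEE 754-2019
   minimum/maximum operation is performed, as encoded by the AVX10.2 MINMAX
   specification. Assuming immediate 0 selects minimum and 1 selects maximum
   in their basic forms, with a and b as __m128 values defined elsewhere:

     __m128 mins = _mm_minmax_ps(a, b, 0);
     __m128 maxs = _mm_minmax_ps(a, b, 1);
*/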
#endif // __AVX10_2MINMAXINTRIN_H
2075
lib/include/avx10_2niintrin.h
vendored
Normal file
File diff suppressed because it is too large
496
lib/include/avx10_2satcvtdsintrin.h
vendored
Normal file
@ -0,0 +1,496 @@

/*===----------- avx10_2satcvtdsintrin.h - AVX512SATCVTDS intrinsics --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2satcvtdsintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2SATCVTDSINTRIN_H
#define __AVX10_2SATCVTDSINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(256)))

#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(128)))

#define _mm_cvtts_roundsd_i32(__A, __R) \
  ((int)__builtin_ia32_vcvttsd2sis32((__v2df)(__m128d)(__A), (const int)(__R)))

#define _mm_cvtts_roundsd_si32(__A, __R) \
  ((int)__builtin_ia32_vcvttsd2sis32((__v2df)(__m128d)(__A), (const int)(__R)))

#define _mm_cvtts_roundsd_u32(__A, __R) \
  ((unsigned int)__builtin_ia32_vcvttsd2usis32((__v2df)(__m128d)(__A), \
                                               (const int)(__R)))

#define _mm_cvtts_roundss_i32(__A, __R) \
  ((int)__builtin_ia32_vcvttss2sis32((__v4sf)(__m128)(__A), (const int)(__R)))

#define _mm_cvtts_roundss_si32(__A, __R) \
  ((int)__builtin_ia32_vcvttss2sis32((__v4sf)(__m128)(__A), (const int)(__R)))

#define _mm_cvtts_roundss_u32(__A, __R) \
  ((unsigned int)__builtin_ia32_vcvttss2usis32((__v4sf)(__m128)(__A), \
                                               (const int)(__R)))

#ifdef __x86_64__
#define _mm_cvtts_roundss_u64(__A, __R) \
  ((unsigned long long)__builtin_ia32_vcvttss2usis64((__v4sf)(__m128)(__A), \
                                                     (const int)(__R)))

#define _mm_cvtts_roundsd_u64(__A, __R) \
  ((unsigned long long)__builtin_ia32_vcvttsd2usis64((__v2df)(__m128d)(__A), \
                                                     (const int)(__R)))

#define _mm_cvtts_roundss_i64(__A, __R) \
  ((long long)__builtin_ia32_vcvttss2sis64((__v4sf)(__m128)(__A), \
                                           (const int)(__R)))

#define _mm_cvtts_roundss_si64(__A, __R) \
  ((long long)__builtin_ia32_vcvttss2sis64((__v4sf)(__m128)(__A), \
                                           (const int)(__R)))

#define _mm_cvtts_roundsd_si64(__A, __R) \
  ((long long)__builtin_ia32_vcvttsd2sis64((__v2df)(__m128d)(__A), \
                                           (const int)(__R)))

#define _mm_cvtts_roundsd_i64(__A, __R) \
  ((long long)__builtin_ia32_vcvttsd2sis64((__v2df)(__m128d)(__A), \
                                           (const int)(__R)))
#endif /* __x86_64__ */
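
/* Usage sketch (illustrative): the scalar forms convert element 0 only, with
   an SAE immediate controlling exception suppression, e.g.

     int i = _mm_cvtts_roundsd_i32(_mm_set_sd(3.7), _MM_FROUND_NO_EXC);
     // i == 3: truncation toward zero, saturating instead of returning the
     // x86 integer indefinite on out-of-range inputs
*/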

// 128 Bit : Double -> int
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epi32(__m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1)));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttspd_epi32(__m128i __W, __mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask((__v2df)__A, (__v4si)__W,
                                                      __U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttspd_epi32(__mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)_mm_setzero_si128(), __U));
}
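
/* Note (illustrative): unlike _mm_cvttpd_epi32, which returns the x86
   "integer indefinite" value 0x80000000 for out-of-range inputs, the
   saturating forms clamp to the destination range, e.g.

     __m128i r = _mm_cvttspd_epi32(_mm_set1_pd(1e300));
     // both converted lanes saturate to INT32_MAX
*/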

// 256 Bit : Double -> int
static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvttspd_epi32(__m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask(
      (__v4df)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttspd_epi32(__m128i __W, __mmask8 __U, __m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask(
      (__v4df)__A, (__v4si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttspd_epi32(__mmask8 __U, __m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask(
      (__v4df)__A, (__v4si)_mm_setzero_si128(), __U, _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundpd_epi32(__A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_undefined_si128(), \
      (__mmask8) - 1, (int)(__R)))

#define _mm256_mask_cvtts_roundpd_epi32(__W, __U, __A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)__W, (__mmask8)__U, (int)(__R)))

#define _mm256_maskz_cvtts_roundpd_epi32(__U, __A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_setzero_si128(), \
      (__mmask8)__U, (int)(__R)))

// 128 Bit : Double -> uint
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epu32(__m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1)));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttspd_epu32(__m128i __W, __mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)__W, (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttspd_epu32(__mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)_mm_setzero_si128(), __U));
}

// 256 Bit : Double -> uint
static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvttspd_epu32(__m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask(
      (__v4df)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttspd_epu32(__m128i __W, __mmask8 __U, __m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask(
      (__v4df)__A, (__v4si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttspd_epu32(__mmask8 __U, __m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask(
      (__v4df)__A, (__v4si)_mm_setzero_si128(), __U, _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundpd_epu32(__A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_undefined_si128(), \
      (__mmask8) - 1, (int)(__R)))

#define _mm256_mask_cvtts_roundpd_epu32(__W, __U, __A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)__W, (__mmask8)__U, (int)(__R)))

#define _mm256_maskz_cvtts_roundpd_epu32(__U, __A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_setzero_si128(), \
      (__mmask8)__U, (int)(__R)))

// 128 Bit : Double -> long
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epi64(__m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask(
      (__v2df)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttspd_epi64(__m128i __W, __mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask((__v2df)__A, (__v2di)__W,
                                                      (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttspd_epi64(__mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U));
}

// 256 Bit : Double -> long
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttspd_epi64(__m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask(
      (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttspd_epi64(__m256i __W, __mmask8 __U, __m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask(
      (__v4df)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttspd_epi64(__mmask8 __U, __m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundpd_epi64(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( \
      (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \
      (int)__R))

#define _mm256_mask_cvtts_roundpd_epi64(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask((__v4df)__A, (__v4di)__W, \
                                                     (__mmask8)__U, (int)__R))

#define _mm256_maskz_cvtts_roundpd_epi64(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( \
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, (int)__R))

// 128 Bit : Double -> ulong
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epu64(__m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask(
      (__v2df)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttspd_epu64(__m128i __W, __mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask((__v2df)__A, (__v2di)__W,
                                                       (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttspd_epu64(__mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U));
}

// 256 Bit : Double -> ulong

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttspd_epu64(__m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask(
      (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttspd_epu64(__m256i __W, __mmask8 __U, __m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask(
      (__v4df)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttspd_epu64(__mmask8 __U, __m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundpd_epu64(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \
      (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \
      (int)__R))

#define _mm256_mask_cvtts_roundpd_epu64(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \
      (__v4df)__A, (__v4di)__W, (__mmask8)__U, (int)__R))

#define _mm256_maskz_cvtts_roundpd_epu64(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, (int)__R))

// 128 Bit : float -> int
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epi32(__m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2dqs128_mask(
      (__v4sf)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1)));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttsps_epi32(__m128i __W, __mmask8 __U, __m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2dqs128_mask((__v4sf)__A, (__v4si)__W,
                                                      (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttsps_epi32(__mmask8 __U, __m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2dqs128_mask(
      (__v4sf)__A, (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)__U));
}

// 256 Bit : float -> int
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttsps_epi32(__m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask(
      (__v8sf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttsps_epi32(__m256i __W, __mmask8 __U, __m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask(
      (__v8sf)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttsps_epi32(__mmask8 __U, __m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask(
      (__v8sf)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundps_epi32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_undefined_si256(), \
      (__mmask8) - 1, (int)(__R)))

#define _mm256_mask_cvtts_roundps_epi32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)__W, (__mmask8)__U, (int)(__R)))

#define _mm256_maskz_cvtts_roundps_epi32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_setzero_si256(), \
      (__mmask8)__U, (int)(__R)))

// 128 Bit : float -> uint
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epu32(__m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask(
      (__v4sf)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1)));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttsps_epu32(__m128i __W, __mmask8 __U, __m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask((__v4sf)__A, (__v4si)__W,
                                                       (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttsps_epu32(__mmask8 __U, __m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask(
      (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U));
}

// 256 Bit : float -> uint

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttsps_epu32(__m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask(
      (__v8sf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttsps_epu32(__m256i __W, __mmask8 __U, __m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask(
      (__v8sf)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttsps_epu32(__mmask8 __U, __m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask(
      (__v8sf)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundps_epu32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_undefined_si256(), \
      (__mmask8) - 1, (int)(__R)))

#define _mm256_mask_cvtts_roundps_epu32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)__W, (__mmask8)__U, (int)(__R)))

#define _mm256_maskz_cvtts_roundps_epu32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_setzero_si256(), \
      (__mmask8)__U, (int)(__R)))
|
||||
|
||||
// 128 bit : float -> long
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epi64(__m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask(
|
||||
(__v4sf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_cvttsps_epi64(__m128i __W, __mmask8 __U, __m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask(
|
||||
(__v4sf)__A, (__v2di)(__m128i)__W, (__mmask8)__U));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_cvttsps_epi64(__mmask8 __U, __m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask(
|
||||
(__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U));
|
||||
}
|
||||
// 256 bit : float -> long
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_cvttsps_epi64(__m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1,
|
||||
_MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_cvttsps_epi64(__m256i __W, __mmask8 __U, __m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_cvttsps_epi64(__mmask8 __U, __m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)_mm256_setzero_si256(), __U,
|
||||
_MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
#define _mm256_cvtts_roundps_epi64(__A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \
|
||||
(int)__R))
|
||||
|
||||
#define _mm256_mask_cvtts_roundps_epi64(__W, __U, __A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)__W, (__mmask8)__U, (int)__R))
|
||||
|
||||
#define _mm256_maskz_cvtts_roundps_epi64(__U, __A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, \
|
||||
(int)__R))
|
||||
|
||||
// 128 bit : float -> ulong
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epu64(__m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask(
|
||||
(__v4sf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_cvttsps_epu64(__m128i __W, __mmask8 __U, __m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask(
|
||||
(__v4sf)__A, (__v2di)(__m128i)__W, (__mmask8)__U));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_cvttsps_epu64(__mmask8 __U, __m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask(
|
||||
(__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U));
|
||||
}
|
||||
// 256 bit : float -> ulong
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_cvttsps_epu64(__m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1,
|
||||
_MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_cvttsps_epu64(__m256i __W, __mmask8 __U, __m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_cvttsps_epu64(__mmask8 __U, __m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)_mm256_setzero_si256(), __U,
|
||||
_MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
#define _mm256_cvtts_roundps_epu64(__A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \
|
||||
(int)__R))
|
||||
|
||||
#define _mm256_mask_cvtts_roundps_epu64(__W, __U, __A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)__W, (__mmask8)__U, (int)__R))
|
||||
|
||||
#define _mm256_maskz_cvtts_roundps_epu64(__U, __A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, \
|
||||
(int)__R))
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS128
|
||||
#undef __DEFAULT_FN_ATTRS256
|
||||
#endif // __AVX10_2SATCVTDSINTRIN_H
|
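For context, a minimal usage sketch of the saturating truncating converts above (illustrative only, not part of the header; the helper names are invented and an AVX10.2-capable compiler and CPU are assumed):

    #include <immintrin.h>

    /* The "s" suffix means out-of-range inputs saturate instead of
       producing the integer indefinite value. */
    static inline __m256i floats_to_u32_sat(__m256 v) {
      return _mm256_cvttsps_epu32(v);
    }

    /* Merge-masked form: lanes whose mask bit is clear keep the value
       already in w. */
    static inline __m256i floats_to_u32_sat_masked(__m256i w, __mmask8 m,
                                                   __m256 v) {
      return _mm256_mask_cvttsps_epu32(w, m, v);
    }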
444
lib/include/avx10_2satcvtintrin.h
vendored
Normal file
@ -0,0 +1,444 @@
/*===----------- avx10_2satcvtintrin.h - AVX10_2SATCVT intrinsics ----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2satcvtintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2SATCVTINTRIN_H
#define __AVX10_2SATCVTINTRIN_H

#define _mm_ipcvtbf16_epi8(A) \
  ((__m128i)__builtin_ia32_vcvtbf162ibs128((__v8bf)(__m128bh)(A)))

#define _mm_mask_ipcvtbf16_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_ipcvtbf16_epi8(A), (__v8hi)(__m128i)(W)))

#define _mm_maskz_ipcvtbf16_epi8(U, A) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_ipcvtbf16_epi8(A), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_ipcvtbf16_epi8(A) \
  ((__m256i)__builtin_ia32_vcvtbf162ibs256((__v16bf)(__m256bh)(A)))

#define _mm256_mask_ipcvtbf16_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvtbf16_epi8(A), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_ipcvtbf16_epi8(U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvtbf16_epi8(A), \
      (__v16hi)_mm256_setzero_si256()))

#define _mm_ipcvtbf16_epu8(A) \
  ((__m128i)__builtin_ia32_vcvtbf162iubs128((__v8bf)(__m128bh)(A)))

#define _mm_mask_ipcvtbf16_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_ipcvtbf16_epu8(A), (__v8hi)(__m128i)(W)))

#define _mm_maskz_ipcvtbf16_epu8(U, A) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_ipcvtbf16_epu8(A), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_ipcvtbf16_epu8(A) \
  ((__m256i)__builtin_ia32_vcvtbf162iubs256((__v16bf)(__m256bh)(A)))

#define _mm256_mask_ipcvtbf16_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvtbf16_epu8(A), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_ipcvtbf16_epu8(U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvtbf16_epu8(A), \
      (__v16hi)_mm256_setzero_si256()))

#define _mm_ipcvtph_epi8(A) \
  ((__m128i)__builtin_ia32_vcvtph2ibs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvtph_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvtph2ibs128_mask((__v8hf)(__m128h)(A), \
      (__v8hu)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvtph_epi8(U, A) \
  ((__m128i)__builtin_ia32_vcvtph2ibs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvtph_epi8(A) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvtph_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvtph_epi8(U, A) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvt_roundph_epi8(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)_mm256_setzero_si256(), \
      (__mmask16)-1, (const int)R))

#define _mm256_mask_ipcvt_roundph_epi8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R))

#define _mm256_maskz_ipcvt_roundph_epi8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)_mm256_setzero_si256(), \
      (__mmask16)(U), (const int)R))

#define _mm_ipcvtph_epu8(A) \
  ((__m128i)__builtin_ia32_vcvtph2iubs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvtph_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvtph2iubs128_mask((__v8hf)(__m128h)(A), \
      (__v8hu)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvtph_epu8(U, A) \
  ((__m128i)__builtin_ia32_vcvtph2iubs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvtph_epu8(A) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvtph_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvtph_epu8(U, A) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvt_roundph_epu8(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      (const int)R))

#define _mm256_mask_ipcvt_roundph_epu8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R))

#define _mm256_maskz_ipcvt_roundph_epu8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (const int)R))

#define _mm_ipcvtps_epi8(A) \
  ((__m128i)__builtin_ia32_vcvtps2ibs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvtps_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvtps2ibs128_mask((__v4sf)(__m128)(A), \
      (__v4su)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvtps_epi8(U, A) \
  ((__m128i)__builtin_ia32_vcvtps2ibs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvtps_epi8(A) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvtps_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvtps_epi8(U, A) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvt_roundps_epi8(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)-1, (const int)R))

#define _mm256_mask_ipcvt_roundps_epi8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R))

#define _mm256_maskz_ipcvt_roundps_epi8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(U), (const int)R))

#define _mm_ipcvtps_epu8(A) \
  ((__m128i)__builtin_ia32_vcvtps2iubs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvtps_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvtps2iubs128_mask((__v4sf)(__m128)(A), \
      (__v4su)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvtps_epu8(U, A) \
  ((__m128i)__builtin_ia32_vcvtps2iubs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvtps_epu8(A) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvtps_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \
      (__v8su)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvtps_epu8(U, A) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvt_roundps_epu8(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)-1, (const int)R))

#define _mm256_mask_ipcvt_roundps_epu8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R))

#define _mm256_maskz_ipcvt_roundps_epu8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(U), (const int)R))

#define _mm_ipcvttbf16_epi8(A) \
  ((__m128i)__builtin_ia32_vcvttbf162ibs128((__v8bf)(__m128bh)(A)))

#define _mm_mask_ipcvttbf16_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_ipcvttbf16_epi8(A), (__v8hi)(__m128i)(W)))

#define _mm_maskz_ipcvttbf16_epi8(U, A) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_ipcvttbf16_epi8(A), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_ipcvttbf16_epi8(A) \
  ((__m256i)__builtin_ia32_vcvttbf162ibs256((__v16bf)(__m256bh)(A)))

#define _mm256_mask_ipcvttbf16_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvttbf16_epi8(A), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_ipcvttbf16_epi8(U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvttbf16_epi8(A), \
      (__v16hi)_mm256_setzero_si256()))

#define _mm_ipcvttbf16_epu8(A) \
  ((__m128i)__builtin_ia32_vcvttbf162iubs128((__v8bf)(__m128bh)(A)))

#define _mm_mask_ipcvttbf16_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_ipcvttbf16_epu8(A), (__v8hi)(__m128i)(W)))

#define _mm_maskz_ipcvttbf16_epu8(U, A) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_ipcvttbf16_epu8(A), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_ipcvttbf16_epu8(A) \
  ((__m256i)__builtin_ia32_vcvttbf162iubs256((__v16bf)(__m256bh)(A)))

#define _mm256_mask_ipcvttbf16_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvttbf16_epu8(A), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_ipcvttbf16_epu8(U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvttbf16_epu8(A), \
      (__v16hi)_mm256_setzero_si256()))

#define _mm_ipcvttph_epi8(A) \
  ((__m128i)__builtin_ia32_vcvttph2ibs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvttph_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvttph2ibs128_mask((__v8hf)(__m128h)(A), \
      (__v8hu)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvttph_epi8(U, A) \
  ((__m128i)__builtin_ia32_vcvttph2ibs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvttph_epi8(A) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvttph_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvttph_epi8(U, A) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvtt_roundph_epi8(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      (const int)R))

#define _mm256_mask_ipcvtt_roundph_epi8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R))

#define _mm256_maskz_ipcvtt_roundph_epi8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (const int)R))

#define _mm_ipcvttph_epu8(A) \
  ((__m128i)__builtin_ia32_vcvttph2iubs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvttph_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvttph2iubs128_mask((__v8hf)(__m128h)(A), \
      (__v8hu)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvttph_epu8(U, A) \
  ((__m128i)__builtin_ia32_vcvttph2iubs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvttph_epu8(A) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvttph_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvttph_epu8(U, A) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvtt_roundph_epu8(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      (const int)R))

#define _mm256_mask_ipcvtt_roundph_epu8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R))

#define _mm256_maskz_ipcvtt_roundph_epu8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (const int)R))

#define _mm_ipcvttps_epi8(A) \
  ((__m128i)__builtin_ia32_vcvttps2ibs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvttps_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvttps2ibs128_mask((__v4sf)(__m128)(A), \
      (__v4su)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvttps_epi8(U, A) \
  ((__m128i)__builtin_ia32_vcvttps2ibs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvttps_epi8(A) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvttps_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvttps_epi8(U, A) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvtt_roundps_epi8(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)-1, (const int)R))

#define _mm256_mask_ipcvtt_roundps_epi8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R))

#define _mm256_maskz_ipcvtt_roundps_epi8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(U), (const int)R))

#define _mm_ipcvttps_epu8(A) \
  ((__m128i)__builtin_ia32_vcvttps2iubs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvttps_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvttps2iubs128_mask((__v4sf)(__m128)(A), \
      (__v4su)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvttps_epu8(U, A) \
  ((__m128i)__builtin_ia32_vcvttps2iubs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvttps_epu8(A) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvttps_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask((__v8sf)(__m256)(A), \
      (__v8su)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvttps_epu8(U, A) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvtt_roundps_epu8(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      (const int)R))

#define _mm256_mask_ipcvtt_roundps_epu8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R))

#define _mm256_maskz_ipcvtt_roundps_epu8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
      (const int)R))
#endif // __AVX10_2SATCVTINTRIN_H
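In the same spirit, a hedged sketch of the new ipcvt conversions declared above (helper names invented; AVX10.2 support assumed):

    #include <immintrin.h>

    /* Per-element FP16 -> signed 8-bit conversion with saturation. */
    static inline __m128i half_to_i8_sat(__m128h v) {
      return _mm_ipcvtph_epi8(v);
    }

    /* Zero-masking variant: lanes with a clear mask bit become zero. */
    static inline __m128i half_to_i8_sat_z(__mmask8 m, __m128h v) {
      return _mm_maskz_ipcvtph_epi8(m, v);
    }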
9
lib/include/avx2intrin.h
vendored
@ -15,12 +15,21 @@
#define __AVX2INTRIN_H

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
                 __min_vector_width__(128)))
#endif

/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
4
lib/include/avx512bitalgintrin.h
vendored
@ -23,7 +23,7 @@
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
  return (__m512i)__builtin_elementwise_popcount((__v32hu)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
@ -45,7 +45,7 @@ _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
  return (__m512i)__builtin_elementwise_popcount((__v64qu)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
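The builtin swap above changes no public API; a short sketch of the 512-bit popcounts (helper names invented; assumes AVX512BITALG, e.g. building with -mavx512bitalg):

    #include <immintrin.h>

    /* Population count of each 16-bit element. */
    static inline __m512i popcnt_words(__m512i v) {
      return _mm512_popcnt_epi16(v);
    }

    /* Population count of each 8-bit element. */
    static inline __m512i popcnt_bytes(__m512i v) {
      return _mm512_popcnt_epi8(v);
    }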
36
lib/include/avx512fintrin.h
vendored
@ -175,12 +175,21 @@ typedef enum
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512f,no-evex512")))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

/* Create vectors with repeated elements */

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_setzero_si512(void)
{
  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_si512(void) {
  return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0};
}

#define _mm512_setzero_epi32 _mm512_setzero_si512
@ -256,20 +265,16 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
      (__v8di) _mm512_setzero_si512());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
  return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                                 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_ps(void) {
  return __extension__(__m512){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                               0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
}

#define _mm512_setzero _mm512_setzero_ps

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_setzero_pd(void)
{
  return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_pd(void) {
  return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m512 __DEFAULT_FN_ATTRS512
@ -9775,5 +9780,8 @@ _mm512_cvtsi512_si32(__m512i __A) {
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __AVX512FINTRIN_H */
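Note: the pre-edit text mapped the `#else` fallbacks to the wrong base macros (`_CONSTEXPR` to the 128-bit attributes and `128_CONSTEXPR` to the plain ones); the corrected mapping above matches the C++ branch. Because the initializers are now constexpr under C++11 and later, they can participate in constant initialization; a small sketch (assumes these Clang 20 headers and -mavx512f; C mode is unaffected):

    #include <immintrin.h>

    #if defined(__cplusplus) && (__cplusplus >= 201103L)
    /* Folded at compile time; the initializer needs no runtime zeroing. */
    constexpr __m512d kZeroPd = _mm512_setzero_pd();
    constexpr __m512 kZeroPs = _mm512_setzero_ps();
    #endif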
8
lib/include/avx512vlbitalgintrin.h
vendored
@ -27,7 +27,7 @@
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi16(__m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
  return (__m256i)__builtin_elementwise_popcount((__v16hu)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
@ -49,7 +49,7 @@ _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi16(__m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
  return (__m128i)__builtin_elementwise_popcount((__v8hu)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
@ -71,7 +71,7 @@ _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi8(__m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
  return (__m256i)__builtin_elementwise_popcount((__v32qu)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
@ -93,7 +93,7 @@ _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi8(__m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
  return (__m128i)__builtin_elementwise_popcount((__v16qu)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
16
lib/include/avx512vpopcntdqintrin.h
vendored
@ -21,8 +21,15 @@
__target__("avx512vpopcntdq,evex512"), \
|
||||
__min_vector_width__(512)))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
|
||||
return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
|
||||
#if defined(__cplusplus) && (__cplusplus >= 201103L)
|
||||
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
|
||||
#else
|
||||
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
|
||||
#endif
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm512_popcnt_epi64(__m512i __A) {
|
||||
return (__m512i)__builtin_elementwise_popcount((__v8du)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
@ -36,8 +43,9 @@ _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
|
||||
return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
|
||||
return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm512_popcnt_epi32(__m512i __A) {
|
||||
return (__m512i)__builtin_elementwise_popcount((__v16su)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
|
||||
24
lib/include/avx512vpopcntdqvlintrin.h
vendored
@ -25,9 +25,17 @@
__target__("avx512vpopcntdq,avx512vl,no-evex512"), \
|
||||
__min_vector_width__(256)))
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
#if defined(__cplusplus) && (__cplusplus >= 201103L)
|
||||
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
|
||||
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
|
||||
#else
|
||||
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
|
||||
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
|
||||
#endif
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
|
||||
_mm_popcnt_epi64(__m128i __A) {
|
||||
return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
|
||||
return (__m128i)__builtin_elementwise_popcount((__v2du)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
@ -41,9 +49,9 @@ _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
|
||||
return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
|
||||
_mm_popcnt_epi32(__m128i __A) {
|
||||
return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
|
||||
return (__m128i)__builtin_elementwise_popcount((__v4su)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
@ -57,9 +65,9 @@ _mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
|
||||
return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_popcnt_epi64(__m256i __A) {
|
||||
return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
|
||||
return (__m256i)__builtin_elementwise_popcount((__v4du)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
@ -73,9 +81,9 @@ _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
|
||||
return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_popcnt_epi32(__m256i __A) {
|
||||
return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
|
||||
return (__m256i)__builtin_elementwise_popcount((__v8su)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
|
||||
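The 128- and 256-bit popcounts get the same generic builtin plus an optional constexpr qualifier; call sites are unchanged (helper name invented; assumes -mavx512vpopcntdq -mavx512vl):

    #include <immintrin.h>

    /* Per-element popcount of four 32-bit lanes. */
    static inline __m128i popcnt_dwords(__m128i v) {
      return _mm_popcnt_epi32(v);
    }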
46
lib/include/avxintrin.h
vendored
@ -50,12 +50,29 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
#endif

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

/* Arithmetic */
/// Adds two 256-bit vectors of [4 x double].
@ -3689,7 +3706,7 @@ _mm256_undefined_si256(void)
/// A double-precision floating-point value used to initialize bits [63:0]
/// of the result.
/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
  return __extension__ (__m256d){ __d, __c, __b, __a };
@ -3728,7 +3745,7 @@ _mm256_set_pd(double __a, double __b, double __c, double __d)
/// A single-precision floating-point value used to initialize bits [31:0]
/// of the result.
/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set_ps(float __a, float __b, float __c, float __d,
              float __e, float __f, float __g, float __h)
{
@ -3955,7 +3972,7 @@ _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
/// A double-precision floating-point value used to initialize bits [255:192]
/// of the result.
/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_setr_pd(double __a, double __b, double __c, double __d)
{
  return _mm256_set_pd(__d, __c, __b, __a);
@ -3995,7 +4012,7 @@ _mm256_setr_pd(double __a, double __b, double __c, double __d)
/// A single-precision floating-point value used to initialize bits [255:224]
/// of the result.
/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_setr_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
{
@ -4212,7 +4229,7 @@ _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
/// A double-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set1_pd(double __w)
{
  return _mm256_set_pd(__w, __w, __w, __w);
@ -4231,7 +4248,7 @@ _mm256_set1_pd(double __w)
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set1_ps(float __w)
{
  return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
@ -4322,10 +4339,8 @@ _mm256_set1_epi64x(long long __q)
/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void) {
  return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
}

/// Constructs a 256-bit floating-point vector of [8 x float] with all
@ -4336,9 +4351,7 @@ _mm256_setzero_pd(void)
/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)
{
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void) {
  return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

@ -4349,9 +4362,8 @@ _mm256_setzero_ps(void)
/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \returns A 256-bit integer vector initialized to zero.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setzero_si256(void)
{
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_setzero_si256(void) {
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

@ -5121,6 +5133,8 @@ _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
}

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR

#endif /* __AVXINTRIN_H */
113
lib/include/avxvnniint16intrin.h
vendored
@ -15,14 +15,6 @@
#ifndef __AVXVNNIINT16INTRIN_H
#define __AVXVNNIINT16INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
                 __min_vector_width__(256)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@ -53,12 +45,9 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}
#define _mm_dpwsud_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -90,11 +79,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}
#define _mm256_dpwsud_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -127,12 +114,9 @@ _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}
#define _mm_dpwsuds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -165,11 +149,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}
#define _mm256_dpwsuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@ -201,12 +183,9 @@ _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}
#define _mm_dpwusd_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@ -238,11 +217,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}
#define _mm256_dpwusd_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@ -274,12 +251,9 @@ _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}
#define _mm_dpwusds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@ -313,11 +287,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}
#define _mm256_dpwusds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -349,12 +321,9 @@ _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}
#define _mm_dpwuud_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -386,11 +355,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}
#define _mm256_dpwuud_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -423,12 +390,9 @@ _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}
#define _mm_dpwuuds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -461,13 +425,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#define _mm256_dpwuuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

#endif // __AVXVNNIINT16INTRIN_H
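Replacing the static inline wrappers with macros, as above, keeps call sites source-compatible; an illustrative call (helper name invented; assumes -mavxvnniint16):

    #include <immintrin.h>

    /* Multiply signed 16-bit elements of a by unsigned 16-bit elements
       of b, summing each adjacent pair into a 32-bit lane of acc. */
    static inline __m128i dpw_accumulate(__m128i acc, __m128i a, __m128i b) {
      return _mm_dpwsud_epi32(acc, a, b);
    }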
113
lib/include/avxvnniint8intrin.h
vendored
@ -14,14 +14,6 @@
#ifndef __AVXVNNIINT8INTRIN_H
|
||||
#define __AVXVNNIINT8INTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS256 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
|
||||
__min_vector_width__(256)))
|
||||
#define __DEFAULT_FN_ATTRS128 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
|
||||
__min_vector_width__(128)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
@ -52,12 +44,9 @@
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbssd_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -89,11 +78,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbssd_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -126,12 +113,9 @@ _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbssds_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -164,11 +148,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbssds_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -200,12 +182,9 @@ _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbsud_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -237,11 +216,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbsud_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -274,12 +251,9 @@ _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbsuds_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -312,11 +286,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbsuds_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -348,12 +320,9 @@ _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbuud_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -385,11 +354,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbuud_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -422,14 +389,10 @@ _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbuuds_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
@ -460,12 +423,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
///    ENDFOR
///    dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#define _mm256_dpbuuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \
                                        (__v8si)(__B)))

#endif // __AVXVNNIINT8INTRIN_H

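For reference, the saturating byte dot-product these vpdpb*ds intrinsics expose can be modeled one 32-bit lane at a time in scalar C. This is a sketch of the semantics described in the doc comments above, not code from the header; the helper name is hypothetical.

#include <stdint.h>

/* Hypothetical scalar model of one lane of _mm_dpbsuds_epi32: signed bytes
   from a, unsigned bytes from b, signed-saturating accumulation into w. */
static int32_t dpbsuds_lane(int32_t w, uint32_t a, uint32_t b) {
  int64_t sum = w;
  for (int i = 0; i < 4; ++i) {
    int8_t sa = (int8_t)(a >> (8 * i));   /* signed byte of a   */
    uint8_t ub = (uint8_t)(b >> (8 * i)); /* unsigned byte of b */
    sum += (int64_t)sa * ub;              /* widened 16-bit intermediate */
  }
  if (sum > INT32_MAX) return INT32_MAX;  /* signed saturation */
  if (sum < INT32_MIN) return INT32_MIN;
  return (int32_t)sum;
}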
32
lib/include/bmi2intrin.h
vendored
@ -15,7 +15,13 @@
#define __BMI2INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
#endif

/// Copies the unsigned 32-bit integer \a __X and zeroes the upper bits
///    starting at bit number \a __Y.
@ -38,8 +44,7 @@
///    The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 32-bit value.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bzhi_u32(unsigned int __X, unsigned int __Y)
{
_bzhi_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_bzhi_si(__X, __Y);
}

@ -68,8 +73,7 @@ _bzhi_u32(unsigned int __X, unsigned int __Y)
///    The 32-bit mask specifying where to deposit source bits.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pdep_u32(unsigned int __X, unsigned int __Y)
{
_pdep_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_pdep_si(__X, __Y);
}

@ -98,8 +102,7 @@ _pdep_u32(unsigned int __X, unsigned int __Y)
///    The 32-bit mask specifying which source bits to extract.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pext_u32(unsigned int __X, unsigned int __Y)
{
_pext_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_pext_si(__X, __Y);
}

@ -124,8 +127,7 @@ _pext_u32(unsigned int __X, unsigned int __Y)
///    A pointer to memory for storing the upper half of the product.
/// \returns The lower half of the product.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
{
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int)(__res >> 32);
  return (unsigned int)__res;
@ -154,8 +156,7 @@ _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
///    The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 64-bit value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bzhi_u64(unsigned long long __X, unsigned long long __Y)
{
_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_bzhi_di(__X, __Y);
}

@ -184,8 +185,7 @@ _bzhi_u64(unsigned long long __X, unsigned long long __Y)
///    The 64-bit mask specifying where to deposit source bits.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pdep_u64(unsigned long long __X, unsigned long long __Y)
{
_pdep_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_pdep_di(__X, __Y);
}

@ -214,8 +214,7 @@ _pdep_u64(unsigned long long __X, unsigned long long __Y)
///    The 64-bit mask specifying which source bits to extract.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pext_u64(unsigned long long __X, unsigned long long __Y)
{
_pext_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_pext_di(__X, __Y);
}

@ -241,8 +240,7 @@ _pext_u64(unsigned long long __X, unsigned long long __Y)
/// \returns The lower half of the product.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
           unsigned long long *__P)
{
          unsigned long long *__P) {
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
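A minimal usage sketch for the widening multiply above: _mulx_u64 returns the low half of the 128-bit product and stores the high half through the pointer. Assumes a BMI2 target (compile with -mbmi2); the wrapper name is illustrative.

#include <immintrin.h>

unsigned long long mul128_lo(unsigned long long x, unsigned long long y,
                             unsigned long long *hi) {
  /* Low 64 bits returned, high 64 bits written through *hi. */
  return _mulx_u64(x, y, hi);
}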
68
lib/include/bmiintrin.h
vendored
@ -17,7 +17,12 @@
/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
   instruction behaves as BSF on non-BMI targets, there is code that expects
   to use it as a potentially faster version of BSF. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __RELAXED_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__)) constexpr
#else
#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#endif

/// Counts the number of trailing zero bits in the operand.
///
@ -31,8 +36,7 @@
///    bits in the operand.
/// \see _tzcnt_u16
static __inline__ unsigned short __RELAXED_FN_ATTRS
__tzcnt_u16(unsigned short __X)
{
__tzcnt_u16(unsigned short __X) {
  return __builtin_ia32_tzcnt_u16(__X);
}

@ -65,8 +69,7 @@ __tzcnt_u16(unsigned short __X)
///    bits in the operand.
/// \see { _mm_tzcnt_32 _tzcnt_u32 }
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
__tzcnt_u32(unsigned int __X) {
  return __builtin_ia32_tzcnt_u32(__X);
}

@ -82,8 +85,7 @@ __tzcnt_u32(unsigned int __X)
///    the operand.
/// \see { __tzcnt_u32 _tzcnt_u32 }
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
_mm_tzcnt_32(unsigned int __X) {
  return (int)__builtin_ia32_tzcnt_u32(__X);
}

@ -118,8 +120,7 @@ _mm_tzcnt_32(unsigned int __X)
///    bits in the operand.
/// \see { _mm_tzcnt_64 _tzcnt_u64 }
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
__tzcnt_u64(unsigned long long __X) {
  return __builtin_ia32_tzcnt_u64(__X);
}

@ -135,8 +136,7 @@ __tzcnt_u64(unsigned long long __X)
///    the operand.
/// \see { __tzcnt_u64 _tzcnt_u64 }
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
_mm_tzcnt_64(unsigned long long __X) {
  return (long long)__builtin_ia32_tzcnt_u64(__X);
}

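As the comment at the top of this file notes, the tzcnt intrinsics are usable even without BMI because TZCNT executes as BSF on older processors. One behavioral detail worth showing: on a zero input the count equals the operand width. A small sketch:

#include <immintrin.h>

unsigned int lowest_set_bit_index(unsigned int m) {
  /* Returns 32 when m == 0, otherwise the index of the lowest set bit. */
  return __tzcnt_u32(m);
}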
@ -164,7 +164,13 @@ _mm_tzcnt_64(unsigned long long __X)
#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
#endif

/// Performs a bitwise AND of the second operand with the one's
///    complement of the first operand.
@ -181,8 +187,7 @@ _mm_tzcnt_64(unsigned long long __X)
///    operand with the one's complement of the first operand.
/// \see _andn_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__andn_u32(unsigned int __X, unsigned int __Y)
{
__andn_u32(unsigned int __X, unsigned int __Y) {
  return ~__X & __Y;
}

@ -224,8 +229,7 @@ __andn_u32(unsigned int __X, unsigned int __Y)
///    extracted bits.
/// \see _bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__bextr_u32(unsigned int __X, unsigned int __Y)
{
__bextr_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_bextr_u32(__X, __Y);
}

@ -249,9 +253,8 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
///    extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
{
  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) {
  return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}

/* Intel-specified, single-leading-underscore version of BEXTR2 */
@ -289,8 +292,7 @@ _bextr2_u32(unsigned int __X, unsigned int __Y) {
///    the source operand.
/// \see _blsi_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsi_u32(unsigned int __X)
{
__blsi_u32(unsigned int __X) {
  return __X & -__X;
}

@ -325,8 +327,7 @@ __blsi_u32(unsigned int __X)
/// \returns An unsigned integer containing the newly created mask.
/// \see _blsmsk_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsmsk_u32(unsigned int __X)
{
__blsmsk_u32(unsigned int __X) {
  return __X ^ (__X - 1);
}

@ -361,8 +362,7 @@ __blsmsk_u32(unsigned int __X)
///    operand.
/// \see _blsr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsr_u32(unsigned int __X)
{
__blsr_u32(unsigned int __X) {
  return __X & (__X - 1);
}

@ -401,8 +401,7 @@ __blsr_u32(unsigned int __X)
///    operand with the one's complement of the first operand.
/// \see _andn_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__andn_u64 (unsigned long long __X, unsigned long long __Y)
{
__andn_u64 (unsigned long long __X, unsigned long long __Y) {
  return ~__X & __Y;
}

@ -445,8 +444,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
///    extracted bits.
/// \see _bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__bextr_u64(unsigned long long __X, unsigned long long __Y)
{
__bextr_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_bextr_u64(__X, __Y);
}

@ -470,9 +468,8 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
///    extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
{
  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) {
  return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}

/* Intel-specified, single-leading-underscore version of BEXTR2 */
@ -510,8 +507,7 @@ _bextr2_u64(unsigned long long __X, unsigned long long __Y) {
///    bits from the source operand.
/// \see _blsi_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsi_u64(unsigned long long __X)
{
__blsi_u64(unsigned long long __X) {
  return __X & -__X;
}

@ -546,8 +542,7 @@ __blsi_u64(unsigned long long __X)
/// \returns An unsigned 64-bit integer containing the newly created mask.
/// \see _blsmsk_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsmsk_u64(unsigned long long __X)
{
__blsmsk_u64(unsigned long long __X) {
  return __X ^ (__X - 1);
}

@ -582,8 +577,7 @@ __blsmsk_u64(unsigned long long __X)
///    source operand.
/// \see _blsr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsr_u64(unsigned long long __X)
{
__blsr_u64(unsigned long long __X) {
  return __X & (__X - 1);
}

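The BLSI/BLSMSK/BLSR wrappers above are thin veneers over classic two's-complement identities, so their relationships can be checked in plain C with no target flags. A sketch restating the identities the intrinsics compile down to:

#include <assert.h>

void blsi_identities(unsigned int x) {
  unsigned int lowest = x & -x;     /* BLSI: isolate lowest set bit        */
  unsigned int mask = x ^ (x - 1);  /* BLSMSK: mask up through that bit    */
  unsigned int rest = x & (x - 1);  /* BLSR: clear lowest set bit          */
  assert((lowest | rest) == x);     /* holds for every x, including 0     */
  assert((mask & x) == lowest);     /* the mask covers exactly that bit   */
}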
2
lib/include/cmpccxaddintrin.h
vendored
@ -63,7 +63,7 @@ typedef enum {
                                          (int)(__D))))

#define _cmpccxadd_epi64(__A, __B, __C, __D) \
  ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B), \
  ((long long)(__builtin_ia32_cmpccxadd64((__A), (long long)(__B), \
                                          (long long)(__C), (int)(__D))))

#endif // __x86_64__

23
lib/include/cpuid.h
vendored
@ -187,17 +187,18 @@
#define bit_ENQCMD 0x20000000

/* Features in %edx for leaf 7 sub-leaf 0 */
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
#define bit_UINTR 0x00000020
#define bit_SERIALIZE 0x00004000
#define bit_TSXLDTRK 0x00010000
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
#define bit_AMXBF16 0x00400000
#define bit_AVX512FP16 0x00800000
#define bit_AMXTILE 0x01000000
#define bit_AMXINT8 0x02000000
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
#define bit_UINTR 0x00000020
#define bit_AVX512VP2INTERSECT 0x00000100
#define bit_SERIALIZE 0x00004000
#define bit_TSXLDTRK 0x00010000
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
#define bit_AMXBF16 0x00400000
#define bit_AVX512FP16 0x00800000
#define bit_AMXTILE 0x01000000
#define bit_AMXINT8 0x02000000

/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_SHA512 0x00000001

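The newly added bit_AVX512VP2INTERSECT can be tested with the __get_cpuid_count helper defined elsewhere in this header. A sketch, assuming an x86 target where <cpuid.h> is available:

#include <cpuid.h>

int has_vp2intersect(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return 0;                                 /* leaf 7 not supported */
  return (edx & bit_AVX512VP2INTERSECT) != 0; /* leaf 7.0, %edx bit 8 */
}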
248
lib/include/emmintrin.h
vendored
@ -49,12 +49,27 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
                 __min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif

#define __trunc64(x) \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __anyext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, -1, -1)

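The __trunc64 and __anyext128 helpers added above let the MMX-flavored intrinsics in this file be lowered through 128-bit builtins: __trunc64 keeps element 0 of a [2 x i64] vector as an __m64, and __anyext128 places a [2 x i32] value in the low half of an __m128i, leaving the upper lanes undefined (the -1 shuffle indices) for instructions that ignore them. An illustrative restatement of __trunc64, using the header's internal __v2di typedef; the function name is hypothetical:

static __m64 low_half_as_m64(__m128i v) {
  /* Keep only element 0 of the [2 x i64] vector, as __trunc64 does. */
  return (__m64)__builtin_shufflevector((__v2di)v, (__v2di){}, 0);
}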
/// Adds lower double-precision values in both operands and returns the
///    sum in the lower 64 bits of the result. The upper 64 bits of the result
@ -71,8 +86,8 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
///    from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a,
                                                                  __m128d __b) {
  __a[0] += __b[0];
  return __a;
}
@ -89,8 +104,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the sums of both
///    operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2df)__a + (__v2df)__b);
}

@ -111,8 +126,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    difference of the lower 64 bits of both operands. The upper 64 bits are
///    copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a,
                                                                  __m128d __b) {
  __a[0] -= __b[0];
  return __a;
}
@ -129,8 +144,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
///    A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the differences between
///    both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2df)__a - (__v2df)__b);
}

@ -150,8 +165,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    product of the lower 64 bits of both operands. The upper 64 bits are
///    copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a,
                                                                  __m128d __b) {
  __a[0] *= __b[0];
  return __a;
}
@ -168,8 +183,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the operands.
/// \returns A 128-bit vector of [2 x double] containing the products of both
///    operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2df)__a * (__v2df)__b);
}

@ -190,8 +205,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    quotient of the lower 64 bits of both operands. The upper 64 bits are
///    copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a,
                                                                  __m128d __b) {
  __a[0] /= __b[0];
  return __a;
}
@ -209,8 +224,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
///    A 128-bit vector of [2 x double] containing the divisor.
/// \returns A 128-bit vector of [2 x double] containing the quotients of both
///    operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2df)__a / (__v2df)__b);
}

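Because the arithmetic above is now expressed as element-wise vector operations rather than opaque builtins, the compiler can constant-fold it. A minimal demonstration; with optimization this should reduce to the literal vector {3.0, 7.0}:

#include <emmintrin.h>

static __m128d add_demo(void) {
  __m128d a = _mm_set_pd(5.0, 1.0); /* lanes {1.0, 5.0}: last arg is element 0 */
  __m128d b = _mm_set_pd(2.0, 2.0);
  return _mm_add_pd(a, b);          /* {3.0, 7.0} */
}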
@ -358,8 +373,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
///    values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2du)__a & (__v2du)__b);
}

@ -378,8 +393,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
///    values in the second operand and the one's complement of the first
///    operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
                                                           __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_andnot_pd(__m128d __a, __m128d __b) {
  return (__m128d)(~(__v2du)__a & (__v2du)__b);
}

@ -395,8 +410,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
///    values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
                                                       __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a,
                                                                 __m128d __b) {
  return (__m128d)((__v2du)__a | (__v2du)__b);
}

@ -412,8 +427,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
///    values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2du)__a ^ (__v2du)__b);
}

@ -1291,7 +1306,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
///    floating-point elements are converted to double-precision values. The
///    upper two elements are unused.
/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtps_pd(__m128 __a) {
  return (__m128d) __builtin_convertvector(
      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
}
@ -1312,7 +1328,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
///
/// The upper two elements are unused.
/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtepi32_pd(__m128i __a) {
  return (__m128d) __builtin_convertvector(
      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
}
@ -1398,8 +1415,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
///    converted value from the second parameter. The upper 64 bits are copied
///    from the upper 64 bits of the first parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
                                                            int __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsi32_sd(__m128d __a, int __b) {
  __a[0] = __b;
  return __a;
}
@ -1423,8 +1440,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
///    converted value from the second parameter. The upper 64 bits are copied
///    from the upper 64 bits of the first parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
                                                          __m128 __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtss_sd(__m128d __a, __m128 __b) {
  __a[0] = __b[0];
  return __a;
}
@ -1486,8 +1503,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
/// \param __a
///    A 128-bit vector of [2 x double].
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
  return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
}

/// Converts the two double-precision floating-point elements of a
@ -1505,8 +1522,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
/// \param __a
///    A 128-bit vector of [2 x double].
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
  return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
}

/// Converts the two signed 32-bit integer elements of a 64-bit vector of
@ -1520,8 +1537,9 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
/// \param __a
///    A 64-bit vector of [2 x i32].
/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
  return __builtin_ia32_cvtpi2pd((__v2si)__a);
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtpi32_pd(__m64 __a) {
  return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
}

/// Returns the low-order element of a 128-bit vector of [2 x double] as
@ -1535,7 +1553,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
/// \returns A double-precision floating-point value copied from the lower 64
///    bits of \a __a.
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsd_f64(__m128d __a) {
  return __a[0];
}

@ -1770,7 +1789,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
///    lower 64 bits contain the value of the parameter. The upper 64 bits are
///    set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w) {
  return __extension__(__m128d){__w, 0.0};
}

@ -1786,7 +1805,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
///    A double-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w) {
  return __extension__(__m128d){__w, __w};
}

@ -1802,7 +1821,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
///    A double-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w) {
  return _mm_set1_pd(__w);
}

@ -1820,8 +1839,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
///    A double-precision floating-point value used to initialize the lower 64
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
                                                        double __x) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w,
                                                                  double __x) {
  return __extension__(__m128d){__x, __w};
}

@ -1840,8 +1859,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
///    A double-precision floating-point value used to initialize the upper 64
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
                                                         double __x) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w,
                                                                   double __x) {
  return __extension__(__m128d){__w, __x};
}

@ -1854,7 +1873,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
///
/// \returns An initialized 128-bit floating-point vector of [2 x double] with
///    all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void) {
  return __extension__(__m128d){0.0, 0.0};
}

@ -1873,8 +1892,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
///    lower 64 bits of the result.
/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
                                                         __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_sd(__m128d __a, __m128d __b) {
  __a[0] = __b[0];
  return __a;
}
@ -2091,8 +2110,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
///    A 128-bit vector of [4 x i32].
/// \returns A 128-bit vector of [4 x i32] containing the sums of both
///    parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
                                                           __m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_epi32(__m128i __a, __m128i __b) {
  return (__m128i)((__v4su)__a + (__v4su)__b);
}

@ -2108,9 +2127,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
/// \param __b
///    A 64-bit integer.
/// \returns A 64-bit integer containing the sum of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
                                                            __m64 __b) {
  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
}

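The rewritten _mm_add_si64 above replaces the MMX paddq builtin with a plain 64-bit addition on the __m64 bit pattern, which is equivalent because the operation is a single wraparound add. A sketch of the wraparound behavior:

#include <emmintrin.h>

static __m64 add_si64_demo(void) {
  __m64 a = (__m64)0x7fffffffffffffffLL; /* INT64_MAX bit pattern */
  __m64 b = (__m64)1LL;
  return _mm_add_si64(a, b);             /* wraps to 0x8000000000000000 */
}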
/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
@ -2129,8 +2147,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
///    A 128-bit vector of [2 x i64].
/// \returns A 128-bit vector of [2 x i64] containing the sums of both
///    parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
                                                           __m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_epi64(__m128i __a, __m128i __b) {
  return (__m128i)((__v2du)__a + (__v2du)__b);
}

@ -2431,9 +2449,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
/// \param __b
///    A 64-bit integer containing one of the source operands.
/// \returns A 64-bit integer vector containing the product of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
                                                            __m64 __b) {
  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
                                             (__v4si)__anyext128(__b)));
}

/// Multiplies 32-bit unsigned integer values contained in the lower
@ -2521,8 +2539,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
///    A 128-bit integer vector containing the subtrahends.
/// \returns A 128-bit integer vector containing the differences of the values
///    in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
                                                           __m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_epi32(__m128i __a, __m128i __b) {
  return (__m128i)((__v4su)__a - (__v4su)__b);
}

@ -2539,9 +2557,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
///    A 64-bit integer vector containing the subtrahend.
/// \returns A 64-bit integer vector containing the difference of the values in
///    the operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
                                                            __m64 __b) {
  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
}

/// Subtracts the corresponding elements of two [2 x i64] vectors.
@ -2556,8 +2573,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
///    A 128-bit integer vector containing the subtrahends.
/// \returns A 128-bit integer vector containing the differences of the values
///    in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
                                                           __m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_epi64(__m128i __a, __m128i __b) {
  return (__m128i)((__v2du)__a - (__v2du)__b);
}

@ -3255,8 +3272,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
                                                            long long __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsi64_sd(__m128d __a, long long __b) {
  __a[0] = __b;
  return __a;
}
@ -3310,7 +3327,8 @@ static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
/// \param __a
///    A 128-bit integer vector.
/// \returns A 128-bit vector of [4 x float] containing the converted values.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtepi32_ps(__m128i __a) {
  return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
}

@ -3494,8 +3512,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
///    destination vector of [2 x i64].
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
                                                            long long __q0) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi64x(long long __q1, long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

@ -3515,9 +3533,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
///    destination vector of [2 x i64].
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
                                                           __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi64(__m64 __q1, __m64 __q0) {
  return _mm_set_epi64x((long long)__q1[0], (long long)__q0[0]);
}

/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
@ -3542,8 +3560,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
///    vector.
/// \returns An initialized 128-bit vector of [4 x i32] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
                                                           int __i1, int __i0) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3,
                                                                     int __i2,
                                                                     int __i1,
                                                                     int __i0) {
  return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
}

@ -3581,7 +3601,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
///    vector.
/// \returns An initialized 128-bit vector of [8 x i16] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
              short __w2, short __w1, short __w0) {
  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
@ -3630,7 +3650,7 @@ _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
///    Initializes bits [7:0] of the destination vector.
/// \returns An initialized 128-bit vector of [16 x i8] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
             char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
             char __b4, char __b3, char __b2, char __b1, char __b0) {
@ -3652,7 +3672,8 @@ _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
///    vector.
/// \returns An initialized 128-bit integer vector of [2 x i64] with both
///    elements containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_epi64x(long long __q) {
  return _mm_set_epi64x(__q, __q);
}

@ -3669,7 +3690,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
///    vector.
/// \returns An initialized 128-bit vector of [2 x i64] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_epi64(__m64 __q) {
  return _mm_set_epi64(__q, __q);
}

@ -3686,7 +3708,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
///    vector.
/// \returns An initialized 128-bit vector of [4 x i32] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i) {
  return _mm_set_epi32(__i, __i, __i, __i);
}

@ -3703,7 +3725,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
///    vector.
/// \returns An initialized 128-bit vector of [8 x i16] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_epi16(short __w) {
  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
}

@ -3720,7 +3743,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
///    vector.
/// \returns An initialized 128-bit vector of [16 x i8] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b) {
  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
                      __b, __b, __b, __b, __b);
}
@ -3739,8 +3762,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
///    A 64-bit integral value used to initialize the upper 64 bits of the
///    result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
                                                            __m64 __q1) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

@ -3761,9 +3784,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
/// \param __i3
///    A 32-bit integral value used to initialize bits [127:96] of the result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
                                                            int __i2,
                                                            int __i3) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) {
  return _mm_set_epi32(__i3, __i2, __i1, __i0);
}

@ -3792,7 +3814,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
/// \param __w7
///    A 16-bit integral value used to initialize bits [127:112] of the result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
               short __w5, short __w6, short __w7) {
  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
@ -3839,7 +3861,7 @@ _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
/// \param __b15
///    An 8-bit integral value used to initialize bits [127:120] of the result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7, char __b8, char __b9, char __b10,
              char __b11, char __b12, char __b13, char __b14, char __b15) {
@ -3855,7 +3877,7 @@ _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
///
/// \returns An initialized 128-bit integer vector with all elements set to
///    zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void) {
  return __extension__(__m128i)(__v2di){0LL, 0LL};
}

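One detail the set/setr pairs above make easy to miss: _mm_set_epi32 takes its arguments from the highest element down, while _mm_setr_epi32 takes them from element 0 up, so the two calls below build identical vectors. A minimal sketch:

#include <emmintrin.h>

static void set_order_demo(void) {
  __m128i a = _mm_set_epi32(3, 2, 1, 0);  /* element 0 = 0, element 3 = 3 */
  __m128i b = _mm_setr_epi32(0, 1, 2, 3); /* same vector, reversed order  */
  (void)a;
  (void)b;
}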
@ -4588,7 +4610,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
///    A 128-bit integer vector operand. The lower 64 bits are moved to the
///    destination.
/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movepi64_pi64(__m128i __a) {
  return (__m64)__a[0];
}

@ -4603,8 +4626,9 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
///    A 64-bit value.
/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
///    the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
  return __extension__(__m128i)(__v2di){(long long)__a, 0};
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movpi64_epi64(__m64 __a) {
  return __builtin_shufflevector((__v1di)__a, _mm_setzero_si64(), 0, 1);
}

/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
@ -4619,7 +4643,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
///    destination.
/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
///    the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_epi64(__m128i __a) {
  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
}

@ -4638,8 +4663,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
///    A 128-bit vector of [2 x double]. \n
///    Bits [127:64] are written to bits [127:64] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
                                                             __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpackhi_pd(__m128d __a, __m128d __b) {
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
}

@ -4658,8 +4683,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
///    A 128-bit vector of [2 x double]. \n
///    Bits [63:0] are written to bits [127:64] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
                                                             __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpacklo_pd(__m128d __a, __m128d __b) {
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
}

@ -4722,7 +4747,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
///    A 128-bit floating-point vector of [2 x double].
/// \returns A 128-bit floating-point vector of [4 x float] containing the same
///    bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castpd_ps(__m128d __a) {
  return (__m128)__a;
}

@ -4737,7 +4763,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
///    A 128-bit floating-point vector of [2 x double].
/// \returns A 128-bit integer vector containing the same bitwise pattern as the
///    parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castpd_si128(__m128d __a) {
  return (__m128i)__a;
}

@ -4752,7 +4779,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 128-bit floating-point vector of [2 x double] containing the same
///    bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castps_pd(__m128 __a) {
  return (__m128d)__a;
}

@ -4767,7 +4795,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 128-bit integer vector containing the same bitwise pattern as the
///    parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castps_si128(__m128 __a) {
  return (__m128i)__a;
}

@ -4782,7 +4811,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
///    A 128-bit integer vector.
/// \returns A 128-bit floating-point vector of [4 x float] containing the same
///    bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castsi128_ps(__m128i __a) {
  return (__m128)__a;
}

@ -4797,7 +4827,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
///    A 128-bit integer vector.
/// \returns A 128-bit floating-point vector of [2 x double] containing the same
///    bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castsi128_pd(__m128i __a) {
  return (__m128d)__a;
}

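The _mm_cast* family above only reinterprets bits; no conversion instruction is generated, so a round trip is an identity. A sketch:

#include <emmintrin.h>

static __m128d cast_roundtrip(__m128d v) {
  __m128i bits = _mm_castpd_si128(v); /* reinterpret, no value change */
  return _mm_castsi128_pd(bits);      /* identical bit pattern back   */
}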
@ -4889,8 +4920,11 @@ void _mm_pause(void);
#if defined(__cplusplus)
} // extern "C"
#endif

#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

42
lib/include/gfniintrin.h
vendored
@ -14,6 +14,7 @@
#ifndef __GFNIINTRIN_H
#define __GFNIINTRIN_H

#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
@ -25,6 +26,37 @@
                 __target__("avx,gfni,no-evex512"), \
                 __min_vector_width__(256)))

/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"), \
                 __min_vector_width__(256)))
#else
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("gfni"), \
                 __min_vector_width__(128)))

/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), \
                 __min_vector_width__(256)))

/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni"), \
                 __min_vector_width__(256)))
#endif

/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z \
  __attribute__((__always_inline__, __nodebug__, \
@ -36,16 +68,6 @@
                 __target__("avx512bw,evex512,gfni"), \
                 __min_vector_width__(512)))

/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"), \
                 __min_vector_width__(256)))

#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
                                                   (__v16qi)(__m128i)(B), \

12
lib/include/hexagon_types.h
vendored
@ -1,7 +1,11 @@
/******************************************************************************/
/*   (c) 2020 Qualcomm Innovation Center, Inc. All rights reserved.           */
/*                                                                            */
/******************************************************************************/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef HEXAGON_TYPES_H
#define HEXAGON_TYPES_H

427
lib/include/hvx_hexagon_protos.h
vendored
@ -5178,6 +5178,433 @@
#define Q6_Vuh_vmpy_VuhVuh_rs16(Vu,Vv) __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpyuhvs)(Vu,Vv)
#endif /* __HEXAGON_ARCH___ >= 69 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vdd32.sf=vadd(Vu32.bf,Vv32.bf)
   C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vadd_VbfVbf(HVX_Vector Vu,
   HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
   ========================================================================== */

#define Q6_Wsf_vadd_VbfVbf(Vu, Vv) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vadd_sf_bf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vd32.h=Vu32.hf
   C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf(HVX_Vector Vu)
   Instruction Type:      CVI_VS
   Execution Slots:       SLOT0123
   ========================================================================== */

#define Q6_Vh_equals_Vhf(Vu) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf)(Vu)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vd32.hf=Vu32.h
   C Intrinsic Prototype: HVX_Vector Q6_Vhf_equals_Vh(HVX_Vector Vu)
   Instruction Type:      CVI_VS
   Execution Slots:       SLOT0123
   ========================================================================== */

#define Q6_Vhf_equals_Vh(Vu) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_hf_h)(Vu)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vd32.sf=Vu32.w
   C Intrinsic Prototype: HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector Vu)
   Instruction Type:      CVI_VS
   Execution Slots:       SLOT0123
   ========================================================================== */

#define Q6_Vsf_equals_Vw(Vu) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_sf_w)(Vu)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vd32.w=Vu32.sf
   C Intrinsic Prototype: HVX_Vector Q6_Vw_equals_Vsf(HVX_Vector Vu)
   Instruction Type:      CVI_VS
   Execution Slots:       SLOT0123
   ========================================================================== */

#define Q6_Vw_equals_Vsf(Vu) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_w_sf)(Vu)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vd32.bf=vcvt(Vu32.sf,Vv32.sf)
|
||||
C Intrinsic Prototype: HVX_Vector Q6_Vbf_vcvt_VsfVsf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Vbf_vcvt_VsfVsf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_bf_sf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Qd4=vcmp.gt(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gt_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Q_vcmp_gt_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \
|
||||
((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf)(Vu, Vv)), -1)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Qx4&=vcmp.gt(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtand_QVbfVbf(HVX_VectorPred
|
||||
Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution
|
||||
Slots: SLOT0123
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Q_vcmp_gtand_QVbfVbf(Qx, Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \
|
||||
((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_and)( \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \
|
||||
Vv)), \
|
||||
-1)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Qx4|=vcmp.gt(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtor_QVbfVbf(HVX_VectorPred
|
||||
Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution
|
||||
Slots: SLOT0123
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Q_vcmp_gtor_QVbfVbf(Qx, Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \
|
||||
((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_or)( \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \
|
||||
Vv)), \
|
||||
-1)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Qx4^=vcmp.gt(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtxacc_QVbfVbf(HVX_VectorPred
|
||||
Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution
|
||||
Slots: SLOT0123
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Q_vcmp_gtxacc_QVbfVbf(Qx, Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \
|
||||
((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_xor)( \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \
|
||||
Vv)), \
|
||||
-1)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vd32.bf=vmax(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_Vector Q6_Vbf_vmax_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX_LATE Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Vbf_vmax_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmax_bf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vd32.bf=vmin(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_Vector Q6_Vbf_vmin_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX_LATE Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Vbf_vmin_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmin_bf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vdd32.sf=vmpy(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vmpy_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Wsf_vmpy_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_sf_bf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vxx32.sf+=vmpy(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vmpyacc_WsfVbfVbf(HVX_VectorPair
|
||||
Vxx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution
|
||||
Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Wsf_vmpyacc_WsfVbfVbf(Vxx, Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_sf_bf_acc)(Vxx, Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vdd32.sf=vsub(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vsub_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Wsf_vsub_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_bf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
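
[Editor's sketch, not part of the diff: the new v73 bf16 intrinsics above compose into a widening multiply-accumulate as below. Assumes a Hexagon toolchain invoked with -mhvx, targeting HVX v73 or later.]
#include <hexagon_types.h>
#include <hvx_hexagon_protos.h>
/* acc += a * b, widening each bf16 product to single precision. */
static HVX_VectorPair bf16_mac(HVX_VectorPair acc, HVX_Vector a, HVX_Vector b) {
  return Q6_Wsf_vmpyacc_WsfVbfVbf(acc, a, b);
}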
#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32=vgetqfext(Vu32.x,Rt32)
C Intrinsic Prototype: HVX_Vector Q6_V_vgetqfext_VR(HVX_Vector Vu, Word32 Rt)
Instruction Type: CVI_VX
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vgetqfext_VR(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_get_qfext)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vx32|=vgetqfext(Vu32.x,Rt32)
C Intrinsic Prototype: HVX_Vector Q6_V_vgetqfextor_VVR(HVX_Vector Vx,
HVX_Vector Vu, Word32 Rt) Instruction Type: CVI_VX Execution Slots:
SLOT23
========================================================================== */

#define Q6_V_vgetqfextor_VVR(Vx, Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_get_qfext_oracc)(Vx, Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.x=vsetqfext(Vu32,Rt32)
C Intrinsic Prototype: HVX_Vector Q6_V_vsetqfext_VR(HVX_Vector Vu, Word32 Rt)
Instruction Type: CVI_VX
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vsetqfext_VR(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_set_qfext)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vabs(Vu32.f8)
C Intrinsic Prototype: HVX_Vector Q6_V_vabs_V(HVX_Vector Vu)
Instruction Type: CVI_VX_LATE
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vabs_V(Vu) __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_f8)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vadd(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vadd_VV(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vadd_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vadd_hf_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.b=vcvt2(Vu32.hf,Vv32.hf)
C Intrinsic Prototype: HVX_Vector Q6_Vb_vcvt2_VhfVhf(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23
========================================================================== */

#define Q6_Vb_vcvt2_VhfVhf(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_b_hf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vcvt2(Vu32.b)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt2_Vb(HVX_Vector Vu)
Instruction Type: CVI_VX_DV
Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vcvt2_Vb(Vu) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_hf_b)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vcvt2(Vu32.ub)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt2_Vub(HVX_Vector Vu)
Instruction Type: CVI_VX_DV
Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vcvt2_Vub(Vu) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_hf_ub)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.ub=vcvt2(Vu32.hf,Vv32.hf)
C Intrinsic Prototype: HVX_Vector Q6_Vub_vcvt2_VhfVhf(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23
========================================================================== */

#define Q6_Vub_vcvt2_VhfVhf(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_ub_hf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vcvt(Vu32.hf,Vv32.hf)
C Intrinsic Prototype: HVX_Vector Q6_V_vcvt_VhfVhf(HVX_Vector Vu, HVX_Vector
Vv) Instruction Type: CVI_VX Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vcvt_VhfVhf(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_f8_hf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vcvt(Vu32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt_V(HVX_Vector Vu)
Instruction Type: CVI_VX_DV
Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vcvt_V(Vu) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_hf_f8)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vfmax(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_Vector Q6_V_vfmax_VV(HVX_Vector Vu, HVX_Vector Vv)
Instruction Type: CVI_VX_LATE
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vfmax_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfmax_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vfmin(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_Vector Q6_V_vfmin_VV(HVX_Vector Vu, HVX_Vector Vv)
Instruction Type: CVI_VX_LATE
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vfmin_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfmin_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vfneg(Vu32.f8)
C Intrinsic Prototype: HVX_Vector Q6_V_vfneg_V(HVX_Vector Vu)
Instruction Type: CVI_VX_LATE
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vfneg_V(Vu) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfneg_f8)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32=vmerge(Vu32.x,Vv32.w)
C Intrinsic Prototype: HVX_Vector Q6_V_vmerge_VVw(HVX_Vector Vu, HVX_Vector
Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123
========================================================================== */

#define Q6_V_vmerge_VVw(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmerge_qf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vmpy(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vmpy_VV(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vmpy_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_hf_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vxx32.hf+=vmpy(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vmpyacc_WhfVV(HVX_VectorPair
Vxx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution
Slots: SLOT23
========================================================================== */

#define Q6_Whf_vmpyacc_WhfVV(Vxx, Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_hf_f8_acc)(Vxx, Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.qf16=vmpy(Vu32.hf,Rt32.hf)
C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vmpy_VhfRhf(HVX_Vector Vu, Word32
Rt) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Vqf16_vmpy_VhfRhf(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_hf)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.qf16=vmpy(Vu32.qf16,Rt32.hf)
C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vmpy_Vqf16Rhf(HVX_Vector Vu,
Word32 Rt) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Vqf16_vmpy_Vqf16Rhf(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_qf16)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.qf32=vmpy(Vu32.sf,Rt32.sf)
C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vmpy_VsfRsf(HVX_Vector Vu, Word32
Rt) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Vqf32_vmpy_VsfRsf(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_sf)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vsub(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vsub_VV(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vsub_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#endif /* __HVX__ */

#endif
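
[Editor's sketch, not part of the diff: the v79 "f8" (8-bit float) intrinsics follow the same widening pattern as the bf16 group; assumes an HVX v79 target.]
/* Multiply two f8 vectors lanewise, producing half-precision result pairs. */
static HVX_VectorPair f8_mul_demo(HVX_Vector u, HVX_Vector v) {
  return Q6_Whf_vmpy_VV(u, v);
}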
90
lib/include/immintrin.h
vendored
90
lib/include/immintrin.h
vendored
@ -605,6 +605,20 @@ _storebe_i64(void * __P, long long __D) {
#include <movdirintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVRS__)
#include <movrsintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX10_2__) && defined(__MOVRS__))
#include <movrs_avx10_2intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX10_2_512__) && defined(__MOVRS__))
#include <movrs_avx10_2_512intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
#include <pconfigintrin.h>
#endif
@ -620,9 +634,6 @@ _storebe_i64(void * __P, long long __D) {
#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
#include <invpcidintrin.h>
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
#include <amxfp16intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) || \
defined(__WIDEKL__)
@ -634,10 +645,59 @@ _storebe_i64(void * __P, long long __D) {
#include <amxintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
#include <amxfp16intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
#include <amxcomplexintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP8__)
#include <amxfp8intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__)
#include <amxtransposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
#include <amxmovrsintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__))
#include <amxmovrstransposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
#include <amxavx512intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__)
#include <amxtf32intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__))
#include <amxtf32transposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_BF16__) && defined(__AMX_TRANSPOSE__))
#include <amxbf16transposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_FP16__) && defined(__AMX_TRANSPOSE__))
#include <amxfp16transposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_COMPLEX__) && defined(__AMX_TRANSPOSE__))
#include <amxcomplextransposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
@ -648,6 +708,30 @@ _storebe_i64(void * __P, long long __D) {
#include <avx512vlvp2intersectintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
#include <avx10_2bf16intrin.h>
#include <avx10_2convertintrin.h>
#include <avx10_2copyintrin.h>
#include <avx10_2minmaxintrin.h>
#include <avx10_2niintrin.h>
#include <avx10_2satcvtdsintrin.h>
#include <avx10_2satcvtintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
#include <avx10_2_512bf16intrin.h>
#include <avx10_2_512convertintrin.h>
#include <avx10_2_512minmaxintrin.h>
#include <avx10_2_512niintrin.h>
#include <avx10_2_512satcvtdsintrin.h>
#include <avx10_2_512satcvtintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX10_2_512__) && defined(__SM4__))
#include <sm4evexintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
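
[Editor's note, an assumption rather than part of the diff: none of these sub-headers is meant to be included directly. <immintrin.h> pulls each one in once the matching feature macro is set by the target flags; for example, a hypothetical invocation such as zig cc -mmovrs -c demo.c would define __MOVRS__ and thereby expose the movrsintrin.h API through the single include below.]
#include <immintrin.h>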
30
lib/include/intrin.h
vendored
30
lib/include/intrin.h
vendored
@ -94,8 +94,8 @@ void __outwordstring(unsigned short, unsigned short *, unsigned long);
unsigned long __readcr0(void);
unsigned long __readcr2(void);
unsigned __LPTRINT_TYPE__ __readcr3(void);
unsigned long __readcr4(void);
unsigned long __readcr8(void);
unsigned __LPTRINT_TYPE__ __readcr4(void);
unsigned __int64 __readcr8(void);
unsigned int __readdr(unsigned int);
#ifdef __i386__
unsigned char __readfsbyte(unsigned long);
@ -124,8 +124,8 @@ void __vmx_vmptrst(unsigned __int64 *);
void __wbinvd(void);
void __writecr0(unsigned int);
void __writecr3(unsigned __INTPTR_TYPE__);
void __writecr4(unsigned int);
void __writecr8(unsigned int);
void __writecr4(unsigned __INTPTR_TYPE__);
void __writecr8(unsigned __int64);
void __writedr(unsigned int, unsigned int);
void __writefsbyte(unsigned long, unsigned char);
void __writefsdword(unsigned long, unsigned long);
@ -330,33 +330,33 @@ static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
__asm__ volatile("hlt");
}

static inline unsigned char __inbyte(unsigned short port) {
static __inline__ unsigned char __inbyte(unsigned short port) {
unsigned char ret;
__asm__ __volatile__("inb %w1, %b0" : "=a"(ret) : "Nd"(port));
return ret;
}

static inline unsigned short __inword(unsigned short port) {
static __inline__ unsigned short __inword(unsigned short port) {
unsigned short ret;
__asm__ __volatile__("inw %w1, %w0" : "=a"(ret) : "Nd"(port));
return ret;
}

static inline unsigned long __indword(unsigned short port) {
static __inline__ unsigned long __indword(unsigned short port) {
unsigned long ret;
__asm__ __volatile__("inl %w1, %k0" : "=a"(ret) : "Nd"(port));
return ret;
}

static inline void __outbyte(unsigned short port, unsigned char data) {
static __inline__ void __outbyte(unsigned short port, unsigned char data) {
__asm__ __volatile__("outb %b0, %w1" : : "a"(data), "Nd"(port));
}

static inline void __outword(unsigned short port, unsigned short data) {
static __inline__ void __outword(unsigned short port, unsigned short data) {
__asm__ __volatile__("outw %w0, %w1" : : "a"(data), "Nd"(port));
}

static inline void __outdword(unsigned short port, unsigned long data) {
static __inline__ void __outdword(unsigned short port, unsigned long data) {
__asm__ __volatile__("outl %k0, %w1" : : "a"(data), "Nd"(port));
}
#endif
@ -396,6 +396,16 @@ unsigned short __readx18word(unsigned long offset);
unsigned long __readx18dword(unsigned long offset);
unsigned __int64 __readx18qword(unsigned long offset);

void __addx18byte(unsigned long offset, unsigned char data);
void __addx18word(unsigned long offset, unsigned short data);
void __addx18dword(unsigned long offset, unsigned long data);
void __addx18qword(unsigned long offset, unsigned __int64 data);

void __incx18byte(unsigned long offset);
void __incx18word(unsigned long offset);
void __incx18dword(unsigned long offset);
void __incx18qword(unsigned long offset);

double _CopyDoubleFromInt64(__int64);
float _CopyFloatFromInt32(__int32);
__int32 _CopyInt32FromFloat(float);
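
[Editor's sketch, not part of the diff: the port-I/O helpers above are MSVC-compatible ring-0 primitives; a minimal kernel-mode use on x86.]
/* Read the PCI configuration address port (0xCF8); requires I/O privilege. */
static unsigned long read_pci_addr(void) { return __indword(0xCF8); }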
11
lib/include/intrin0.h
vendored
11
lib/include/intrin0.h
vendored
@ -44,7 +44,7 @@ unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
__int64 *_ComparandResult);
#endif

#ifdef __x86_64__ && !defined(__arm64ec__)
#if defined(__x86_64__) && !defined(__arm64ec__)
unsigned __int64 _umul128(unsigned __int64, unsigned __int64,
unsigned __int64 *);
unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
@ -207,6 +207,9 @@ long _InterlockedExchange_rel(long volatile *_Target, long _Value);
__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value);
__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value);
__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value);
void *_InterlockedExchangePointer_acq(void *volatile *_Target, void *_Value);
void *_InterlockedExchangePointer_nf(void *volatile *_Target, void *_Value);
void *_InterlockedExchangePointer_rel(void *volatile *_Target, void *_Value);

/*----------------------------------------------------------------------------*\
|* Interlocked Compare Exchange
@ -237,6 +240,12 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
__int64 _Exchange,
__int64 _Comparand);
void *_InterlockedCompareExchangePointer_acq(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
void *_InterlockedCompareExchangePointer_nf(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
void *_InterlockedCompareExchangePointer_rel(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
#endif

#ifdef __cplusplus
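
[Editor's sketch, not part of the diff: the new pointer-sized variants mirror the existing integer forms; a minimal acquire-ordered claim, assuming MSVC-compatible ARM64 compilation.]
#include <stddef.h>
/* Returns the previous value; the slot was claimed iff that value is NULL. */
static void *try_claim(void *volatile *slot, void *desired) {
  return _InterlockedCompareExchangePointer_acq(slot, desired, NULL);
}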
30
lib/include/larchintrin.h
vendored
30
lib/include/larchintrin.h
vendored
@ -228,17 +228,31 @@ extern __inline void
((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
#endif

#define __frecipe_s(/*float*/ _1) \
(float)__builtin_loongarch_frecipe_s((float)_1)
#ifdef __loongarch_frecipe
extern __inline float
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__frecipe_s(float _1) {
return __builtin_loongarch_frecipe_s(_1);
}

#define __frecipe_d(/*double*/ _1) \
(double)__builtin_loongarch_frecipe_d((double)_1)
extern __inline double
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__frecipe_d(double _1) {
return __builtin_loongarch_frecipe_d(_1);
}

#define __frsqrte_s(/*float*/ _1) \
(float)__builtin_loongarch_frsqrte_s((float)_1)
extern __inline float
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__frsqrte_s(float _1) {
return __builtin_loongarch_frsqrte_s(_1);
}

#define __frsqrte_d(/*double*/ _1) \
(double)__builtin_loongarch_frsqrte_d((double)_1)
extern __inline double
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__frsqrte_d(double _1) {
return __builtin_loongarch_frsqrte_d(_1);
}
#endif

#ifdef __cplusplus
}
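
[Editor's sketch, not part of the diff: with the macros replaced by gnu_inline functions that exist only under __loongarch_frecipe, callers can guard on the same macro, which the -mfrecipe target feature defines.]
#ifdef __loongarch_frecipe
/* Fast approximate reciprocal; trades accuracy for latency. */
static inline float approx_recip(float x) { return __frecipe_s(x); }
#endif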
52
lib/include/lasxintrin.h
vendored
52
lib/include/lasxintrin.h
vendored
@ -1726,18 +1726,6 @@ extern __inline
return (__m256d)__builtin_lasx_xvfrecip_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrecipe_s(__m256 _1) {
return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
__lasx_xvfrecipe_d(__m256d _1) {
return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrint_s(__m256 _1) {
@ -1762,18 +1750,6 @@ extern __inline
return (__m256d)__builtin_lasx_xvfrsqrt_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrsqrte_s(__m256 _1) {
return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
__lasx_xvfrsqrte_d(__m256d _1) {
return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvflogb_s(__m256 _1) {
@ -2585,7 +2561,7 @@ extern __inline
extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
__lasx_xvorn_v(__m256i _1, __m256i _2) {
return (__m256i)__builtin_lasx_xvorn_v((v32i8)_1, (v32i8)_2);
return (__m256i)__builtin_lasx_xvorn_v((v32u8)_1, (v32u8)_2);
}

#define __lasx_xvldi(/*i13*/ _1) ((__m256i)__builtin_lasx_xvldi((_1)))
@ -3866,6 +3842,32 @@ extern __inline
return (__m256i)__builtin_lasx_xvfcmp_sun_s((v8f32)_1, (v8f32)_2);
}

#if defined(__loongarch_frecipe)
extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrecipe_s(__m256 _1) {
return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
__lasx_xvfrecipe_d(__m256d _1) {
return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrsqrte_s(__m256 _1) {
return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
__lasx_xvfrsqrte_d(__m256d _1) {
return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1);
}
#endif

#define __lasx_xvpickve_d_f(/*__m256d*/ _1, /*ui2*/ _2) \
((__m256d)__builtin_lasx_xvpickve_d_f((v4f64)(_1), (_2)))

11
lib/include/limits.h
vendored
11
lib/include/limits.h
vendored
@ -111,11 +111,14 @@
#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
#endif

/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension. It's too bad
that we don't have something like #pragma poison that could be used to
deprecate a macro - the code should just use LLONG_MAX and friends.
/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension. Android's
bionic also defines them. It's too bad that we don't have something like
#pragma poison that could be used to deprecate a macro - the code should just
use LLONG_MAX and friends.
*/
#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
#if (defined(__GNU_LIBRARY__) ? defined(__USE_GNU) \
: !defined(__STRICT_ANSI__)) || \
defined(__BIONIC__)

#undef LONG_LONG_MIN
#undef LONG_LONG_MAX
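
[Editor's sketch, not from the diff: after this change the GNU-style macros are also visible on Android's bionic, not only on glibc with __USE_GNU or outside strict-ANSI mode.]
#include <limits.h>
/* Compiles on bionic even under -std=c11 after this change. */
static const long long demo_max = LONG_LONG_MAX;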
38
lib/include/llvm_libc_wrappers/ctype.h
vendored
38
lib/include/llvm_libc_wrappers/ctype.h
vendored
@ -51,6 +51,19 @@
#pragma push_macro("toascii")
#pragma push_macro("tolower")
#pragma push_macro("toupper")
#pragma push_macro("isalnum_l")
#pragma push_macro("isalpha_l")
#pragma push_macro("isascii_l")
#pragma push_macro("isblank_l")
#pragma push_macro("iscntrl_l")
#pragma push_macro("isdigit_l")
#pragma push_macro("isgraph_l")
#pragma push_macro("islower_l")
#pragma push_macro("isprint_l")
#pragma push_macro("ispunct_l")
#pragma push_macro("isspace_l")
#pragma push_macro("isupper_l")
#pragma push_macro("isxdigit_l")

#undef isalnum
#undef isalpha
@ -68,6 +81,18 @@
#undef toascii
#undef tolower
#undef toupper
#undef isalnum_l
#undef isalpha_l
#undef iscntrl_l
#undef isdigit_l
#undef islower_l
#undef isgraph_l
#undef isprint_l
#undef ispunct_l
#undef isspace_l
#undef isupper_l
#undef isblank_l
#undef isxdigit_l

#pragma omp begin declare target

@ -93,6 +118,19 @@
#pragma pop_macro("toascii")
#pragma pop_macro("tolower")
#pragma pop_macro("toupper")
#pragma pop_macro("isalnum_l")
#pragma pop_macro("isalpha_l")
#pragma pop_macro("isascii_l")
#pragma pop_macro("isblank_l")
#pragma pop_macro("iscntrl_l")
#pragma pop_macro("isdigit_l")
#pragma pop_macro("isgraph_l")
#pragma pop_macro("islower_l")
#pragma pop_macro("isprint_l")
#pragma pop_macro("ispunct_l")
#pragma pop_macro("isspace_l")
#pragma pop_macro("isupper_l")
#pragma pop_macro("isxdigit_l")
#endif

#undef __LIBC_ATTRS
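
[Editor's note, illustrative only: the *_l additions follow the wrapper's usual push/undef/pop idiom, sketched here in isolation for one macro.]
#pragma push_macro("isdigit_l") /* save any host #define */
#undef isdigit_l                /* expose the real declaration */
/* ... declare the offloading overlay while the macro is hidden ... */
#pragma pop_macro("isdigit_l")  /* restore the host definition */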
8
lib/include/llvm_libc_wrappers/stdlib.h
vendored
8
lib/include/llvm_libc_wrappers/stdlib.h
vendored
@ -34,8 +34,16 @@ _Static_assert(__builtin_offsetof(div_t, quot) == 0, "ABI mismatch!");
_Static_assert(__builtin_offsetof(ldiv_t, quot) == 0, "ABI mismatch!");
_Static_assert(__builtin_offsetof(lldiv_t, quot) == 0, "ABI mismatch!");

#if defined(__GLIBC__) && __cplusplus >= 201703L
#define at_quick_exit atexit
#endif

#include <llvm-libc-decls/stdlib.h>

#if defined(__GLIBC__) && __cplusplus >= 201703L
#undef at_quick_exit
#endif

#pragma omp end declare target

#undef __LIBC_ATTRS
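
[Editor's note, an assumption about intent: glibc's C++17 headers declare at_quick_exit themselves, so the wrapper temporarily maps the name onto atexit while importing the LLVM-libc declarations; user code keeps calling it normally.]
#include <stdlib.h>
/* Registers a quick-exit handler; unaffected by the wrapper's rename. */
static int reg_handler(void (*fn)(void)) { return at_quick_exit(fn); }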
52
lib/include/lsxintrin.h
vendored
52
lib/include/lsxintrin.h
vendored
@ -1776,18 +1776,6 @@ extern __inline
return (__m128d)__builtin_lsx_vfrecip_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrecipe_s(__m128 _1) {
return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
__lsx_vfrecipe_d(__m128d _1) {
return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrint_s(__m128 _1) {
@ -1812,18 +1800,6 @@ extern __inline
return (__m128d)__builtin_lsx_vfrsqrt_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrsqrte_s(__m128 _1) {
return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
__lsx_vfrsqrte_d(__m128d _1) {
return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vflogb_s(__m128 _1) {
@ -3425,7 +3401,7 @@ extern __inline
extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i
__lsx_vorn_v(__m128i _1, __m128i _2) {
return (__m128i)__builtin_lsx_vorn_v((v16i8)_1, (v16i8)_2);
return (__m128i)__builtin_lsx_vorn_v((v16u8)_1, (v16u8)_2);
}

#define __lsx_vldi(/*i13*/ _1) ((__m128i)__builtin_lsx_vldi((_1)))
@ -3738,6 +3714,32 @@ extern __inline
return (__m128i)__builtin_lsx_vfcmp_sun_s((v4f32)_1, (v4f32)_2);
}

#if defined(__loongarch_frecipe)
extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrecipe_s(__m128 _1) {
return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
__lsx_vfrecipe_d(__m128d _1) {
return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrsqrte_s(__m128 _1) {
return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
__lsx_vfrsqrte_d(__m128d _1) {
return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1);
}
#endif

#define __lsx_vrepli_b(/*si10*/ _1) ((__m128i)__builtin_lsx_vrepli_b((_1)))

#define __lsx_vrepli_d(/*si10*/ _1) ((__m128i)__builtin_lsx_vrepli_d((_1)))
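
[Editor's sketch, not part of the diff: as in larchintrin.h, the LSX/LASX estimate intrinsics now exist only when -mfrecipe defines __loongarch_frecipe.]
#include <lsxintrin.h>
#ifdef __loongarch_frecipe
/* Per-lane reciprocal square-root estimate on a 128-bit LSX vector. */
static __m128 rsqrt_est_demo(__m128 v) { return __lsx_vfrsqrte_s(v); }
#endif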
17
lib/include/lzcntintrin.h
vendored
17
lib/include/lzcntintrin.h
vendored
@ -15,7 +15,13 @@
#define __LZCNTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
#endif

#ifndef _MSC_VER
/// Counts the number of leading zero bits in the operand.
@ -43,8 +49,7 @@
/// bits in the operand.
/// \see _lzcnt_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__lzcnt32(unsigned int __X)
{
__lzcnt32(unsigned int __X) {
return __builtin_ia32_lzcnt_u32(__X);
}

@ -60,8 +65,7 @@ __lzcnt32(unsigned int __X)
/// bits in the operand.
/// \see __lzcnt32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_lzcnt_u32(unsigned int __X)
{
_lzcnt_u32(unsigned int __X) {
return __builtin_ia32_lzcnt_u32(__X);
}

@ -93,8 +97,7 @@ _lzcnt_u32(unsigned int __X)
/// bits in the operand.
/// \see __lzcnt64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_lzcnt_u64(unsigned long long __X)
{
_lzcnt_u64(unsigned long long __X) {
return __builtin_ia32_lzcnt_u64(__X);
}
#endif
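
[Editor's sketch, not from the diff: with constexpr attached under C++11, the count can fold in constant expressions; C callers are unchanged. Assumes a C++ translation unit built with -mlzcnt.]
#if defined(__cplusplus) && (__cplusplus >= 201103L)
static_assert(_lzcnt_u32(1u) == 31, "usable at compile time after this change");
#endif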
364
lib/include/mmintrin.h
vendored
364
lib/include/mmintrin.h
vendored
@ -21,10 +21,45 @@ typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));

/* Unsigned types */
typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
typedef unsigned char __v8qu __attribute__((__vector_size__(8)));

/* We need an explicitly signed variant for char. Note that this shouldn't
* appear in the interface though. */
typedef signed char __v8qs __attribute__((__vector_size__(8)));

/* SSE/SSE2 types */
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \
__min_vector_width__(64)))
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS_SSE2 \
__attribute__((__always_inline__, __nodebug__, \
__target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS_SSE2 \
__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
__min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#else
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#endif

#define __trunc64(x) \
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __anyext128(x) \
(__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
1, -1, -1)

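[Editor's note, illustrative: these two helpers carry the whole MMX-over-SSE2 rewrite. __trunc64 keeps the low 64 bits of a 128-bit vector; __anyext128 widens a 64-bit vector and leaves the upper lanes unspecified (the -1 shuffle indices).]
/* Equivalent long-hand: the low 64 bits hold m, the upper 64 are undefined. */
static __m128i widen_demo(__m64 m) { return __anyext128(m); }
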
/// Clears the MMX state by setting the state of the x87 stack registers
|
||||
/// to empty.
|
||||
@ -50,10 +85,10 @@ _mm_empty(void) {
|
||||
/// A 32-bit integer value.
|
||||
/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
|
||||
/// parameter. The upper 32 bits are set to 0.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_cvtsi32_si64(int __i)
|
||||
{
|
||||
return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
|
||||
return __extension__ (__m64)(__v2si){__i, 0};
|
||||
}
|
||||
|
||||
/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
|
||||
@ -67,10 +102,10 @@ _mm_cvtsi32_si64(int __i)
|
||||
/// A 64-bit integer vector.
|
||||
/// \returns A 32-bit signed integer value containing the lower 32 bits of the
|
||||
/// parameter.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_cvtsi64_si32(__m64 __m)
|
||||
{
|
||||
return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
|
||||
return ((__v2si)__m)[0];
|
||||
}
|
||||
|
||||
/// Casts a 64-bit signed integer value into a 64-bit integer vector.
|
||||
@ -83,7 +118,7 @@ _mm_cvtsi64_si32(__m64 __m)
|
||||
/// A 64-bit signed integer.
|
||||
/// \returns A 64-bit integer vector containing the same bitwise pattern as the
|
||||
/// parameter.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_cvtsi64_m64(long long __i)
|
||||
{
|
||||
return (__m64)__i;
|
||||
@ -99,7 +134,7 @@ _mm_cvtsi64_m64(long long __i)
|
||||
/// A 64-bit integer vector.
|
||||
/// \returns A 64-bit signed integer containing the same bitwise pattern as the
|
||||
/// parameter.
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_cvtm64_si64(__m64 __m)
|
||||
{
|
||||
return (long long)__m;
|
||||
@ -124,10 +159,11 @@ _mm_cvtm64_si64(__m64 __m)
|
||||
/// written to the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_packs_pi16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
|
||||
return __trunc64(__builtin_ia32_packsswb128(
|
||||
(__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
|
||||
}
|
||||
|
||||
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
|
||||
@ -149,10 +185,11 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
|
||||
/// written to the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_packs_pi32(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
|
||||
return __trunc64(__builtin_ia32_packssdw128(
|
||||
(__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
|
||||
}
|
||||
|
||||
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
|
||||
@ -174,10 +211,11 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
|
||||
/// written to the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_packs_pu16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
|
||||
return __trunc64(__builtin_ia32_packuswb128(
|
||||
(__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
|
||||
}
|
||||
|
||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
|
||||
@ -201,10 +239,11 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
|
||||
/// Bits [63:56] are written to bits [63:56] of the result.
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
|
||||
4, 12, 5, 13, 6, 14, 7, 15);
|
||||
}
|
||||
|
||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
|
||||
@ -224,10 +263,11 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
|
||||
/// Bits [63:48] are written to bits [63:48] of the result.
|
||||
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
|
||||
2, 6, 3, 7);
|
||||
}
|
||||
|
||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
|
||||
@ -245,10 +285,10 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
|
||||
/// the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
|
||||
}
|
||||
|
||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
|
||||
@ -272,10 +312,11 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
|
||||
/// Bits [31:24] are written to bits [63:56] of the result.
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
|
||||
0, 8, 1, 9, 2, 10, 3, 11);
|
||||
}
|
||||
|
||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
|
||||
@ -295,10 +336,11 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
|
||||
/// Bits [31:16] are written to bits [63:48] of the result.
|
||||
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
|
||||
0, 4, 1, 5);
|
||||
}
|
||||
|
||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
|
||||
@ -316,10 +358,10 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
|
||||
/// the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
|
||||
}
|
||||
|
||||
/// Adds each 8-bit integer element of the first 64-bit integer vector
|
||||
@ -337,10 +379,10 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
|
||||
/// A 64-bit integer vector of [8 x i8].
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
}

/// Adds each 16-bit integer element of the first 64-bit integer vector
@ -358,10 +400,10 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
}

/// Adds each 32-bit integer element of the first 64-bit integer vector
@ -379,10 +421,10 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
}

/// Adds, with saturation, each 8-bit signed integer element of the first
@ -403,10 +445,10 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
/// of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
}

/// Adds, with saturation, each 16-bit signed integer element of the first
@ -427,10 +469,10 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
/// of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
}

/// Adds, with saturation, each 8-bit unsigned integer element of the first
@ -450,10 +492,10 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// unsigned sums of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pu8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
}

/// Adds, with saturation, each 16-bit unsigned integer element of the first
@ -473,10 +515,10 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// unsigned sums of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pu16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
}

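[Editor's note: a minimal sketch, not part of the diff, of how the wrapping and saturating forms above differ, assuming an x86-64 compiler shipping these headers:]

#include <mmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  __m64 a = _mm_set1_pi16(0x7000);
  __m64 b = _mm_set1_pi16(0x2000);
  __m64 wrap = _mm_add_pi16(a, b);  /* wraps: 0x7000 + 0x2000 = 0x9000 (negative as i16) */
  __m64 sat = _mm_adds_pi16(a, b);  /* saturates at INT16_MAX: 0x7fff */
  uint64_t w, s;
  memcpy(&w, &wrap, sizeof(w));
  memcpy(&s, &sat, sizeof(s));
  printf("wrap=%016llx sat=%016llx\n", (unsigned long long)w,
         (unsigned long long)s);
  return 0;
}
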
/// Subtracts each 8-bit integer element of the second 64-bit integer
@ -494,10 +536,10 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
/// both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
}

/// Subtracts each 16-bit integer element of the second 64-bit integer
@ -515,10 +557,10 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
/// both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
}

/// Subtracts each 32-bit integer element of the second 64-bit integer
@ -536,10 +578,10 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
/// both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
}

/// Subtracts, with saturation, each 8-bit signed integer element of the second
@ -560,10 +602,10 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// differences of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
}

/// Subtracts, with saturation, each 16-bit signed integer element of the
@ -584,10 +626,10 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// differences of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
}

/// Subtracts each 8-bit unsigned integer element of the second 64-bit
@ -608,10 +650,10 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// differences of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pu8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
}

/// Subtracts each 16-bit unsigned integer element of the second 64-bit
@ -632,10 +674,10 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// differences of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
}

/// Multiplies each 16-bit signed integer element of the first 64-bit
@ -659,10 +701,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
/// products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
(__v8hi)__anyext128(__m2)));
}

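[Editor's note: the __trunc64 and __anyext128 helpers used here are defined earlier in mmintrin.h, outside this hunk. A plausible reading of what they do, sketched as an assumption rather than quoted from the diff: widen an __m64 into the low half of an __m128i with the upper half undefined, and truncate back to the low 64 bits.]

#define __anyext128(x)                                                  \
  (__m128i) __builtin_shufflevector((__v2di)(__m64)(x),                 \
                                    __extension__(__v2di){}, 0, -1)
#define __trunc64(x)                                                    \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
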
/// Multiplies each 16-bit signed integer element of the first 64-bit
@ -680,10 +723,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
/// of the products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
(__v8hi)__anyext128(__m2)));
}

/// Multiplies each 16-bit signed integer element of the first 64-bit
@ -701,10 +745,10 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
/// of the products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
}

/// Left-shifts each 16-bit signed integer element of the first
@ -724,10 +768,11 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
/// values. If \a __count is greater or equal to 16, the result is set to all
/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
(__v8hi)__anyext128(__count)));
}

/// Left-shifts each 16-bit signed integer element of a 64-bit integer
@ -746,10 +791,11 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
/// values. If \a __count is greater or equal to 16, the result is set to all
/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
__count));
}

/// Left-shifts each 32-bit signed integer element of the first
@ -769,10 +815,11 @@ _mm_slli_pi16(__m64 __m, int __count)
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
/// values. If \a __count is greater or equal to 32, the result is set to all
/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
(__v4si)__anyext128(__count)));
}

/// Left-shifts each 32-bit signed integer element of a 64-bit integer
@ -791,10 +838,11 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
/// values. If \a __count is greater or equal to 32, the result is set to all
/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
__count));
}

/// Left-shifts the first 64-bit integer parameter by the number of bits
@ -811,10 +859,11 @@ _mm_slli_pi32(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector containing the left-shifted value. If
/// \a __count is greater or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_si64(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
(__v2di)__anyext128(__count)));
}

/// Left-shifts the first parameter, which is a 64-bit integer, by the
@ -831,10 +880,11 @@ _mm_sll_si64(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector containing the left-shifted value. If
/// \a __count is greater or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_si64(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
__count));
}

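[Editor's note: a small usage sketch, not part of the diff, of the shift-count rule documented above:]

#include <mmintrin.h>

/* Every lane is shifted by the same count; a count >= the element width
   yields all-zero lanes rather than a masked shift. */
__m64 shift_demo(__m64 v) {
  __m64 by3 = _mm_slli_pi16(v, 3);      /* each 16-bit lane << 3 */
  __m64 zeroed = _mm_slli_pi16(v, 16);  /* count >= 16: result is all 0 */
  return _mm_or_si64(by3, zeroed);
}
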
/// Right-shifts each 16-bit integer element of the first parameter,
@ -855,10 +905,11 @@ _mm_slli_si64(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
(__v8hi)__anyext128(__count)));
}

/// Right-shifts each 16-bit integer element of a 64-bit integer vector
@ -878,10 +929,11 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
__count));
}

/// Right-shifts each 32-bit integer element of the first parameter,
@ -902,10 +954,11 @@ _mm_srai_pi16(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
(__v4si)__anyext128(__count)));
}

/// Right-shifts each 32-bit integer element of a 64-bit integer vector
@ -925,10 +978,11 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
__count));
}

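[Editor's note: a sketch, not part of the diff, contrasting these arithmetic right shifts with the logical right shifts that follow below:]

#include <mmintrin.h>

/* Arithmetic shifts replicate the sign bit; logical shifts fill with zeros. */
__m64 sra_vs_srl(void) {
  __m64 v = _mm_set1_pi16(-16);   /* 0xfff0 in every lane */
  __m64 a = _mm_srai_pi16(v, 2);  /* 0xfffc per lane (-4, sign-extended) */
  __m64 l = _mm_srli_pi16(v, 2);  /* 0x3ffc per lane (zero-filled) */
  return _mm_xor_si64(a, l);
}
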
/// Right-shifts each 16-bit integer element of the first parameter,
@ -948,10 +1002,11 @@ _mm_srai_pi32(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
(__v8hi)__anyext128(__count)));
}

/// Right-shifts each 16-bit integer element of a 64-bit integer vector
@ -970,10 +1025,11 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
__count));
}

/// Right-shifts each 32-bit integer element of the first parameter,
@ -993,10 +1049,11 @@ _mm_srli_pi16(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
(__v4si)__anyext128(__count)));
}

/// Right-shifts each 32-bit integer element of a 64-bit integer vector
@ -1015,10 +1072,11 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
__count));
}

/// Right-shifts the first 64-bit integer parameter by the number of bits
@ -1035,10 +1093,11 @@ _mm_srli_pi32(__m64 __m, int __count)
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector containing the right-shifted value.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_si64(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
(__v2di)__anyext128(__count)));
}

/// Right-shifts the first parameter, which is a 64-bit integer, by the
@ -1056,10 +1115,11 @@ _mm_srl_si64(__m64 __m, __m64 __count)
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector containing the right-shifted value.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_si64(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
__count));
}

/// Performs a bitwise AND of two 64-bit integer vectors.
@ -1074,10 +1134,10 @@ _mm_srli_si64(__m64 __m, int __count)
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise AND of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_and_si64(__m64 __m1, __m64 __m2)
{
return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
}

/// Performs a bitwise NOT of the first 64-bit integer vector, and then
@ -1095,10 +1155,10 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise AND of the second
/// parameter and the one's complement of the first parameter.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
}

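[Editor's note: a one-liner sketch, not part of the diff, since the operand order of andnot is a common stumbling block:]

#include <mmintrin.h>

/* The complement applies to the FIRST argument: result = (~m1) & m2. */
__m64 clear_masked_bits(__m64 mask, __m64 value) {
  return _mm_andnot_si64(mask, value);  /* value with mask's set bits cleared */
}
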
/// Performs a bitwise OR of two 64-bit integer vectors.
@ -1113,10 +1173,10 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise OR of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_or_si64(__m64 __m1, __m64 __m2)
{
return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
}

/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@ -1131,10 +1191,10 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
}

/// Compares the 8-bit integer elements of two 64-bit integer vectors of
@ -1153,10 +1213,10 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
}

/// Compares the 16-bit integer elements of two 64-bit integer vectors of
@ -1175,10 +1235,10 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
}

/// Compares the 32-bit integer elements of two 64-bit integer vectors of
@ -1197,10 +1257,10 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
}

/// Compares the 8-bit integer elements of two 64-bit integer vectors of
@ -1219,10 +1279,12 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
/* This function always performs a signed comparison, but __v8qi is a char
which may be signed or unsigned, so use __v8qs. */
return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
}

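[Editor's note: a sketch, not part of the diff, of why the all-ones/all-zeros comparison results documented above are useful as select masks:]

#include <mmintrin.h>

/* Comparison lanes are all-ones where true and zero where false, so the
   result can drive a bitwise select; this computes a per-lane signed max. */
__m64 max_pi8(__m64 a, __m64 b) {
  __m64 gt = _mm_cmpgt_pi8(a, b);              /* 0xff where a > b */
  return _mm_or_si64(_mm_and_si64(gt, a),      /* keep a where a > b */
                     _mm_andnot_si64(gt, b));  /* otherwise keep b */
}
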
/// Compares the 16-bit integer elements of two 64-bit integer vectors of
@ -1241,10 +1303,10 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
}

/// Compares the 32-bit integer elements of two 64-bit integer vectors of
@ -1263,10 +1325,10 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
return (__m64)((__v2si)__m1 > (__v2si)__m2);
}

/// Constructs a 64-bit integer vector initialized to zero.
@ -1276,10 +1338,9 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
/// This intrinsic corresponds to the <c> PXOR </c> instruction.
///
/// \returns An initialized 64-bit integer vector with all elements set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_setzero_si64(void)
{
return __extension__ (__m64){ 0LL };
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setzero_si64(void) {
return __extension__(__m64){0LL};
}

/// Constructs a 64-bit integer vector initialized with the specified
@ -1297,10 +1358,9 @@ _mm_setzero_si64(void)
/// A 32-bit integer value used to initialize the lower 32 bits of the
/// result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set_pi32(int __i1, int __i0)
{
return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi32(int __i1, int __i0) {
return __extension__(__m64)(__v2si){__i0, __i1};
}

/// Constructs a 64-bit integer vector initialized with the specified
@ -1320,10 +1380,9 @@ _mm_set_pi32(int __i1, int __i0)
/// \param __s0
/// A 16-bit integer value used to initialize bits [15:0] of the result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
{
return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) {
return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3};
}

/// Constructs a 64-bit integer vector initialized with the specified
@ -1351,12 +1410,11 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
/// \param __b0
/// An 8-bit integer value used to initialize bits [7:0] of the result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
char __b1, char __b0)
{
return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
__b4, __b5, __b6, __b7);
char __b1, char __b0) {
return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3,
__b4, __b5, __b6, __b7};
}

/// Constructs a 64-bit integer vector of [2 x i32], with each of the
@ -1372,10 +1430,9 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
/// A 32-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [2 x i32].
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set1_pi32(int __i)
{
return _mm_set_pi32(__i, __i);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi32(int __i) {
return _mm_set_pi32(__i, __i);
}

/// Constructs a 64-bit integer vector of [4 x i16], with each of the
@ -1391,10 +1448,9 @@ _mm_set1_pi32(int __i)
/// A 16-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [4 x i16].
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set1_pi16(short __w)
{
return _mm_set_pi16(__w, __w, __w, __w);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi16(short __w) {
return _mm_set_pi16(__w, __w, __w, __w);
}

/// Constructs a 64-bit integer vector of [8 x i8], with each of the
@ -1409,10 +1465,9 @@ _mm_set1_pi16(short __w)
/// An 8-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [8 x i8].
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set1_pi8(char __b)
{
return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi8(char __b) {
return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
}

/// Constructs a 64-bit integer vector, initialized in reverse order with
@ -1430,10 +1485,9 @@ _mm_set1_pi8(char __b)
/// A 32-bit integer value used to initialize the upper 32 bits of the
/// result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_setr_pi32(int __i0, int __i1)
{
return _mm_set_pi32(__i1, __i0);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setr_pi32(int __i0, int __i1) {
return _mm_set_pi32(__i1, __i0);
}

/// Constructs a 64-bit integer vector, initialized in reverse order with
@ -1453,10 +1507,9 @@ _mm_setr_pi32(int __i0, int __i1)
/// \param __w3
/// A 16-bit integer value used to initialize bits [63:48] of the result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
{
return _mm_set_pi16(__w3, __w2, __w1, __w0);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

/// Constructs a 64-bit integer vector, initialized in reverse order with
@ -1484,14 +1537,15 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
/// \param __b7
/// An 8-bit integer value used to initialize bits [63:56] of the result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
char __b6, char __b7)
{
return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
char __b6, char __b7) {
return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

#undef __DEFAULT_FN_ATTRS
#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS_SSE2

/* Aliases for compatibility. */
#define _m_empty _mm_empty

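[Editor's note: a minimal sketch, not part of the diff, of the element ordering of the set/setr constructors above:]

#include <mmintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void) {
  /* _mm_set_pi16 lists elements from highest to lowest; _mm_setr_pi16
     reverses that, so these two vectors are identical. */
  __m64 hi_first = _mm_set_pi16(4, 3, 2, 1);
  __m64 lo_first = _mm_setr_pi16(1, 2, 3, 4);
  uint64_t a, b;
  memcpy(&a, &hi_first, sizeof(a));
  memcpy(&b, &lo_first, sizeof(b));
  assert(a == b && a == 0x0004000300020001ULL);
  return 0;
}
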
6
lib/include/module.modulemap
vendored
6
lib/include/module.modulemap
vendored
@ -66,6 +66,8 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "__wmmintrin_aes.h"
textual header "__wmmintrin_pclmul.h"

textual header "mm3dnow.h"

explicit module mm_malloc {
requires !freestanding
header "mm_malloc.h"
@ -122,10 +124,6 @@ module _Builtin_intrinsics [system] [extern_c] {
header "popcntintrin.h"
}

explicit module mm3dnow {
header "mm3dnow.h"
}

explicit module aes_pclmul {
header "wmmintrin.h"
export aes

98
lib/include/movrs_avx10_2_512intrin.h
vendored
Normal file
98
lib/include/movrs_avx10_2_512intrin.h
vendored
Normal file
@ -0,0 +1,98 @@
/*===----- movrs_avx10_2_512intrin.h - AVX10.2-512-MOVRS intrinsics --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <movrs_avx10_2_512intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __MOVRS_AVX10_2_512INTRIN_H
#define __MOVRS_AVX10_2_512INTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
__attribute__((__always_inline__, __nodebug__, \
__target__("movrs, avx10.2-512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_loadrs_epi8(void const *__A) {
return (__m512i)__builtin_ia32_vmovrsb512((const __v64qi *)(__A));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadrs_epi8(__m512i __W, __mmask64 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectb_512(
(__mmask64)__U, (__v64qi)_mm512_loadrs_epi8(__A), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadrs_epi8(__mmask64 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
(__v64qi)_mm512_loadrs_epi8(__A),
(__v64qi)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_loadrs_epi32(void const *__A) {
return (__m512i)__builtin_ia32_vmovrsd512((const __v16si *)(__A));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadrs_epi32(__m512i __W, __mmask16 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_loadrs_epi32(__A), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadrs_epi32(__mmask16 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_loadrs_epi32(__A),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_loadrs_epi64(void const *__A) {
return (__m512i)__builtin_ia32_vmovrsq512((const __v8di *)(__A));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadrs_epi64(__m512i __W, __mmask8 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectq_512(
(__mmask8)__U, (__v8di)_mm512_loadrs_epi64(__A), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadrs_epi64(__mmask8 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_loadrs_epi64(__A),
(__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_loadrs_epi16(void const *__A) {
return (__m512i)__builtin_ia32_vmovrsw512((const __v32hi *)(__A));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadrs_epi16(__m512i __W, __mmask32 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectw_512(
(__mmask32)__U, (__v32hi)_mm512_loadrs_epi16(__A), (__v32hi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadrs_epi16(__mmask32 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_loadrs_epi16(__A),
(__v32hi)_mm512_setzero_si512());
}

#undef __DEFAULT_FN_ATTRS512

#endif /* __x86_64__ */
#endif /* __MOVRS_AVX10_2_512INTRIN_H */

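[Editor's note: a hypothetical usage sketch, not part of the diff, assuming a CPU and compiler flags enabling both MOVRS and AVX10.2-512:]

#include <immintrin.h>

/* Read-shared 64-byte load where only the lanes selected by the mask
   are kept and the rest are zeroed. */
__m512i load_selected_bytes(const void *src, __mmask64 lanes) {
  return _mm512_maskz_loadrs_epi8(lanes, src);
}
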
174
lib/include/movrs_avx10_2intrin.h
vendored
Normal file
174
lib/include/movrs_avx10_2intrin.h
vendored
Normal file
@ -0,0 +1,174 @@
/*===--------- movrs_avx10_2intrin.h - AVX10.2-MOVRS intrinsics ------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <movrs_avx10_2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __MOVRS_AVX10_2INTRIN_H
#define __MOVRS_AVX10_2INTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("movrs,avx10.2-256"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("movrs,avx10.2-256"), __min_vector_width__(256)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_loadrs_epi8(void const *__A) {
return (__m128i)__builtin_ia32_vmovrsb128((const __v16qi *)(__A));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_loadrs_epi8(__m128i __W, __mmask16 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectb_128(
(__mmask16)__U, (__v16qi)_mm_loadrs_epi8(__A), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_loadrs_epi8(__mmask16 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
(__v16qi)_mm_loadrs_epi8(__A),
(__v16qi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_loadrs_epi8(void const *__A) {
return (__m256i)__builtin_ia32_vmovrsb256((const __v32qi *)(__A));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_loadrs_epi8(__m256i __W, __mmask32 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectb_256(
(__mmask32)__U, (__v32qi)_mm256_loadrs_epi8(__A), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_loadrs_epi8(__mmask32 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
(__v32qi)_mm256_loadrs_epi8(__A),
(__v32qi)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_loadrs_epi32(void const *__A) {
return (__m128i)__builtin_ia32_vmovrsd128((const __v4si *)(__A));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_loadrs_epi32(__m128i __W, __mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectd_128(
(__mmask8)__U, (__v4si)_mm_loadrs_epi32(__A), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_loadrs_epi32(__mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_loadrs_epi32(__A),
(__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_loadrs_epi32(void const *__A) {
return (__m256i)__builtin_ia32_vmovrsd256((const __v8si *)(__A));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_loadrs_epi32(__m256i __W, __mmask8 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectd_256(
(__mmask8)__U, (__v8si)_mm256_loadrs_epi32(__A), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_loadrs_epi32(__mmask8 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_loadrs_epi32(__A),
(__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_loadrs_epi64(void const *__A) {
return (__m128i)__builtin_ia32_vmovrsq128((const __v2di *)(__A));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_loadrs_epi64(__m128i __W, __mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectq_128(
(__mmask8)__U, (__v2di)_mm_loadrs_epi64(__A), (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_loadrs_epi64(__mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_loadrs_epi64(__A),
(__v2di)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_loadrs_epi64(void const *__A) {
return (__m256i)__builtin_ia32_vmovrsq256((const __v4di *)(__A));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_loadrs_epi64(__m256i __W, __mmask8 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectq_256(
(__mmask8)__U, (__v4di)_mm256_loadrs_epi64(__A), (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_loadrs_epi64(__mmask8 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_loadrs_epi64(__A),
(__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_loadrs_epi16(void const *__A) {
return (__m128i)__builtin_ia32_vmovrsw128((const __v8hi *)(__A));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_loadrs_epi16(__m128i __W, __mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectw_128(
(__mmask8)__U, (__v8hi)_mm_loadrs_epi16(__A), (__v8hi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_loadrs_epi16(__mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_loadrs_epi16(__A),
(__v8hi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_loadrs_epi16(void const *__A) {
return (__m256i)__builtin_ia32_vmovrsw256((const __v16hi *)(__A));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_loadrs_epi16(__m256i __W, __mmask16 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectw_256(
(__mmask16)__U, (__v16hi)_mm256_loadrs_epi16(__A), (__v16hi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_loadrs_epi16(__mmask16 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_loadrs_epi16(__A),
(__v16hi)_mm256_setzero_si256());
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif /* __x86_64__ */
#endif /* __MOVRS_AVX10_2INTRIN_H */

59
lib/include/movrsintrin.h
vendored
Normal file
59
lib/include/movrsintrin.h
vendored
Normal file
@ -0,0 +1,59 @@
/*===---------------- movrsintrin.h - MOVRS intrinsics ----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===----------------------------------------------------------------------===*/

#ifndef __IMMINTRIN_H
#error "Never use <movrsintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __MOVRSINTRIN_H
#define __MOVRSINTRIN_H

#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("movrs")))

#ifdef __x86_64__
static __inline__ char __DEFAULT_FN_ATTRS _movrs_i8(const void *__A) {
return (char)__builtin_ia32_movrsqi((const void *)__A);
}

static __inline__ short __DEFAULT_FN_ATTRS _movrs_i16(const void *__A) {
return (short)__builtin_ia32_movrshi((const void *)__A);
}

static __inline__ int __DEFAULT_FN_ATTRS _movrs_i32(const void *__A) {
return (int)__builtin_ia32_movrssi((const void *)__A);
}

static __inline__ long long __DEFAULT_FN_ATTRS _movrs_i64(const void *__A) {
return (long long)__builtin_ia32_movrsdi((const void *)__A);
}
#endif // __x86_64__

/// Loads a memory sequence containing the specified memory address into
/// the L3 data cache. Data will be shared (read/written) to by requesting
/// core and other cores.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHRS instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchrs(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_ia32_prefetchrs((const void *)__P);
#pragma clang diagnostic pop
}

#undef __DEFAULT_FN_ATTRS
#endif // __MOVRSINTRIN_H

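[Editor's note: a hypothetical usage sketch, not part of the diff, assuming an x86-64 target with MOVRS enabled:]

#include <immintrin.h>

/* Hint that the line should be cached read-shared in L3, then load two
   ints through the read-shared path. */
long long sum_shared_pair(const int *p) {
  _m_prefetchrs(p);
  return (long long)_movrs_i32(&p[0]) + (long long)_movrs_i32(&p[1]);
}
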
@ -10,17 +10,15 @@
#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__

#ifndef _OPENMP
#error "This file is for OpenMP compilation only."
#endif

#ifdef __cplusplus
extern "C" {
#endif

#ifdef __NVPTX__
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

#pragma push_macro("__CUDA__")
#define __CUDA__
#define __OPENMP_NVPTX__

@ -31,9 +29,10 @@ extern "C" {
#include <__clang_cuda_device_functions.h>

#undef __OPENMP_NVPTX__
#undef __CUDA__
#pragma pop_macro("__CUDA__")

#pragma omp end declare variant
#endif

#ifdef __AMDGCN__
#pragma omp begin declare variant match(device = {arch(amdgcn)})

9
lib/include/openmp_wrappers/complex_cmath.h
vendored
9
lib/include/openmp_wrappers/complex_cmath.h
vendored
@ -64,8 +64,13 @@ template <class _Tp> __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) {
}

// conj

template <class _Tp> std::complex<_Tp> conj(const std::complex<_Tp> &__c) {
#ifdef _GLIBCXX20_CONSTEXPR
#define CXX20_CONSTEXPR_DEVICE __DEVICE__
#else
#define CXX20_CONSTEXPR_DEVICE
#endif
template <class _Tp>
CXX20_CONSTEXPR_DEVICE std::complex<_Tp> conj(const std::complex<_Tp> &__c) {
return std::complex<_Tp>(__c.real(), -__c.imag());
}

19
lib/include/pmmintrin.h
vendored
19
lib/include/pmmintrin.h
vendored
@ -17,9 +17,21 @@
#include <emmintrin.h>

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("sse3,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse3"), \
__min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif

/// Loads data from an unaligned memory location to elements in a 128-bit
/// vector.
@ -122,7 +134,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movehdup_ps(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
@ -143,7 +155,7 @@ _mm_movehdup_ps(__m128 __a)
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_moveldup_ps(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
@ -244,7 +256,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
/// [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
/// duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movedup_pd(__m128d __a)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
@ -297,5 +309,6 @@ _mm_mwait(unsigned __extensions, unsigned __hints)
}

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __PMMINTRIN_H */

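[Editor's note: a one-line usage sketch, not part of the diff, of one of the SSE3 shuffles that gained the _CONSTEXPR attribute above:]

#include <pmmintrin.h>

/* Duplicates the low double into both lanes, i.e. { v[0], v[0] }; the new
   attribute additionally allows use in C++11 constant expressions. */
__m128d splat_low(__m128d v) {
  return _mm_movedup_pd(v);
}
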
14
lib/include/popcntintrin.h
vendored
14
lib/include/popcntintrin.h
vendored
@ -11,12 +11,13 @@
#define __POPCNTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("popcnt"))) constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
#endif

/// Counts the number of bits in the source operand having a value of 1.
@ -29,7 +30,7 @@
/// An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ int __DEFAULT_FN_ATTRS
_mm_popcnt_u32(unsigned int __A)
{
return __builtin_popcount(__A);
@ -46,7 +47,7 @@ _mm_popcnt_u32(unsigned int __A)
/// An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_popcnt_u64(unsigned long long __A)
{
return __builtin_popcountll(__A);
@ -54,6 +55,5 @@ _mm_popcnt_u64(unsigned long long __A)
#endif /* __x86_64__ */

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __POPCNTINTRIN_H */

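[Editor's note: a minimal sketch, not part of the diff, assuming -mpopcnt is enabled:]

#include <immintrin.h>

/* Lowers to a single POPCNT instruction; with the C++11 constexpr branch
   above, the same call can also fold in constant expressions. */
int count_set_bits(unsigned int v) {
  return _mm_popcnt_u32(v);  /* e.g. 0xf0f0 -> 8 */
}
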
6
lib/include/ptrauth.h
vendored
6
lib/include/ptrauth.h
vendored
@ -42,6 +42,9 @@ typedef enum {
The extra data is always 0. */
ptrauth_key_cxx_vtable_pointer = ptrauth_key_process_independent_data,

/* The key used to sign pointers in ELF .init_array/.fini_array. */
ptrauth_key_init_fini_pointer = ptrauth_key_process_independent_code,

/* Other pointers signed under the ABI use private ABI rules. */

} ptrauth_key;
@ -253,6 +256,9 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
[[clang::ptrauth_vtable_pointer(key, address_discrimination, \
extra_discrimination)]]

/* The value is ptrauth_string_discriminator("init_fini") */
#define __ptrauth_init_fini_discriminator 0xd9d4

#else

#define ptrauth_strip(__value, __key) \

128
lib/include/riscv_corev_alu.h
vendored
Normal file
128
lib/include/riscv_corev_alu.h
vendored
Normal file
@ -0,0 +1,128 @@
/*===---- riscv_corev_alu.h - CORE-V ALU intrinsics ------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/

#ifndef __RISCV_COREV_ALU_H
#define __RISCV_COREV_ALU_H

#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

#if defined(__riscv_xcvalu)

#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_abs(long a) {
return __builtin_abs(a);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_slet(long a, long b) {
return __builtin_riscv_cv_alu_slet(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS
__riscv_cv_alu_sletu(unsigned long a, unsigned long b) {
return __builtin_riscv_cv_alu_sletu(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_min(long a, long b) {
return __builtin_elementwise_min(a, b);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_minu(unsigned long a, unsigned long b) {
return __builtin_elementwise_min(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_max(long a, long b) {
return __builtin_elementwise_max(a, b);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_maxu(unsigned long a, unsigned long b) {
return __builtin_elementwise_max(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_exths(int16_t a) {
return __builtin_riscv_cv_alu_exths(a);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_exthz(uint16_t a) {
return __builtin_riscv_cv_alu_exthz(a);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_extbs(int8_t a) {
return __builtin_riscv_cv_alu_extbs(a);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_extbz(uint8_t a) {
return __builtin_riscv_cv_alu_extbz(a);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_clip(long a,
unsigned long b) {
return __builtin_riscv_cv_alu_clip(a, b);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_clipu(unsigned long a, unsigned long b) {
return __builtin_riscv_cv_alu_clipu(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addN(long a, long b,
uint8_t shft) {
return __builtin_riscv_cv_alu_addN(a, b, shft);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_adduN(unsigned long a, unsigned long b, uint8_t shft) {
return __builtin_riscv_cv_alu_adduN(a, b, shft);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addRN(long a, long b,
uint8_t shft) {
return __builtin_riscv_cv_alu_addRN(a, b, shft);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_adduRN(unsigned long a, unsigned long b, uint8_t shft) {
return __builtin_riscv_cv_alu_adduRN(a, b, shft);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subN(long a, long b,
uint8_t shft) {
return __builtin_riscv_cv_alu_subN(a, b, shft);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_subuN(unsigned long a, unsigned long b, uint8_t shft) {
return __builtin_riscv_cv_alu_subuN(a, b, shft);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subRN(long a, long b,
uint8_t shft) {
return __builtin_riscv_cv_alu_subRN(a, b, shft);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_subuRN(unsigned long a, unsigned long b, uint8_t shft) {
return __builtin_riscv_cv_alu_subuRN(a, b, shft);
}

#endif // defined(__riscv_xcvalu)

#if defined(__cplusplus)
}
#endif

#endif // define __RISCV_COREV_ALU_H

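[Editor's note: a hypothetical usage sketch, not part of the diff, assuming a CORE-V core with the Xcvalu extension enabled:]

#include <riscv_corev_alu.h>
#include <stdint.h>

/* Single-instruction sign/zero extension of a halfword (cv.exths / cv.exthz). */
long widen_signed(int16_t h) { return __riscv_cv_alu_exths(h); }
unsigned long widen_unsigned(uint16_t h) { return __riscv_cv_alu_exthz(h); }
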
1
lib/include/riscv_vector.h
vendored
1
lib/include/riscv_vector.h
vendored
@ -419,7 +419,6 @@ typedef __rvv_bfloat16m2x4_t vbfloat16m2x4_t;
typedef __rvv_bfloat16m4_t vbfloat16m4_t;
typedef __rvv_bfloat16m4x2_t vbfloat16m4x2_t;
typedef __rvv_bfloat16m8_t vbfloat16m8_t;
#define __riscv_v_intrinsic_overloading 1

#ifdef __cplusplus
}

32
lib/include/sm4evexintrin.h
vendored
Normal file
@ -0,0 +1,32 @@
/*===--------------- sm4evexintrin.h - SM4 EVEX intrinsics -----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <sm4evexintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __SM4EVEXINTRIN_H
#define __SM4EVEXINTRIN_H

#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sm4,avx10.2-512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sm4key4_epi32(__m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vsm4key4512((__v16su)__A, (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sm4rnds4_epi32(__m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vsm4rnds4512((__v16su)__A, (__v16su)__B);
}

#undef __DEFAULT_FN_ATTRS512

#endif // __SM4EVEXINTRIN_H
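A hedged usage sketch for the two new 512-bit SM4 intrinsics, assuming a toolchain and CPU with both the sm4 and avx10.2-512 features enabled (for example clang -msm4 -mavx10.2-512; exact flag spellings may vary by release); sm4_step is an illustrative name:

#include <immintrin.h>

/* Derive the next four round keys from four independent SM4 schedules,
   then apply four rounds of the cipher state update; each __m512i carries
   sixteen 32-bit words (four independent 128-bit blocks). */
__m512i sm4_step(__m512i state, __m512i round_keys) {
  __m512i next_keys = _mm512_sm4key4_epi32(state, round_keys);
  return _mm512_sm4rnds4_epi32(state, next_keys);
}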
6
lib/include/smmintrin.h
vendored
@ -17,9 +17,15 @@
#include <tmmintrin.h>

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
                 __min_vector_width__(128)))
#endif

/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
5
lib/include/stdalign.h
vendored
@ -10,10 +10,6 @@
#ifndef __STDALIGN_H
#define __STDALIGN_H

#if defined(__MVS__) && __has_include_next(<stdalign.h>)
#include_next <stdalign.h>
#else

#if defined(__cplusplus) || \
    (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202311L)
#ifndef __cplusplus
@ -25,5 +21,4 @@
#define __alignof_is_defined 1
#endif /* __STDC_VERSION__ */

#endif /* __MVS__ */
#endif /* __STDALIGN_H */
62
lib/include/tbmintrin.h
vendored
@ -15,63 +15,60 @@
#define __TBMINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
#endif

#define __bextri_u32(a, b) \
  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
                                           (unsigned int)(b)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcfill_u32(unsigned int __a)
{
__blcfill_u32(unsigned int __a) {
  return __a & (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blci_u32(unsigned int __a)
{
__blci_u32(unsigned int __a) {
  return __a | ~(__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcic_u32(unsigned int __a)
{
__blcic_u32(unsigned int __a) {
  return ~__a & (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcmsk_u32(unsigned int __a)
{
__blcmsk_u32(unsigned int __a) {
  return __a ^ (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcs_u32(unsigned int __a)
{
__blcs_u32(unsigned int __a) {
  return __a | (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsfill_u32(unsigned int __a)
{
__blsfill_u32(unsigned int __a) {
  return __a | (__a - 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsic_u32(unsigned int __a)
{
__blsic_u32(unsigned int __a) {
  return ~__a | (__a - 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__t1mskc_u32(unsigned int __a)
{
__t1mskc_u32(unsigned int __a) {
  return ~__a | (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__tzmsk_u32(unsigned int __a)
{
__tzmsk_u32(unsigned int __a) {
  return ~__a & (__a - 1);
}

@ -81,56 +78,47 @@ __tzmsk_u32(unsigned int __a)
                                                 (unsigned long long)(b)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcfill_u64(unsigned long long __a)
{
__blcfill_u64(unsigned long long __a) {
  return __a & (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blci_u64(unsigned long long __a)
{
__blci_u64(unsigned long long __a) {
  return __a | ~(__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcic_u64(unsigned long long __a)
{
__blcic_u64(unsigned long long __a) {
  return ~__a & (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcmsk_u64(unsigned long long __a)
{
__blcmsk_u64(unsigned long long __a) {
  return __a ^ (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcs_u64(unsigned long long __a)
{
__blcs_u64(unsigned long long __a) {
  return __a | (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsfill_u64(unsigned long long __a)
{
__blsfill_u64(unsigned long long __a) {
  return __a | (__a - 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsic_u64(unsigned long long __a)
{
__blsic_u64(unsigned long long __a) {
  return ~__a | (__a - 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__t1mskc_u64(unsigned long long __a)
{
__t1mskc_u64(unsigned long long __a) {
  return ~__a | (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__tzmsk_u64(unsigned long long __a)
{
__tzmsk_u64(unsigned long long __a) {
  return ~__a & (__a - 1);
}
#endif
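Because the TBM bodies above are plain bit arithmetic (and, with the newly added constexpr attribute, should also fold in C++11+ constant expressions when TBM is enabled), their effect can be checked with ordinary expressions on a worked example, no TBM hardware required. The asserts below mirror the header's formulas:

#include <assert.h>

int main(void) {
  unsigned a = 0x17; /* 0b10111: a run of trailing ones up to bit 2 */
  assert((a & (a + 1)) == 0x10); /* __blcfill: clear the trailing-ones run */
  assert((a ^ (a + 1)) == 0x0F); /* __blcmsk: mask through lowest clear bit */
  assert((a | (a + 1)) == 0x1F); /* __blcs: set the lowest clear bit */

  unsigned b = 0x18; /* 0b11000: three trailing zeros */
  assert((b | (b - 1)) == 0x1F);  /* __blsfill: fill down from lowest set bit */
  assert((~b & (b - 1)) == 0x07); /* __tzmsk: mask of the trailing zeros */
  return 0;
}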
102
lib/include/tmmintrin.h
vendored
@ -17,13 +17,21 @@
#include <pmmintrin.h>

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("ssse3,no-evex512"), __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("mmx,ssse3,no-evex512"), \
                 __min_vector_width__(64)))
                 __target__("ssse3,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), \
                 __min_vector_width__(128)))
#endif

#define __trunc64(x) \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __anyext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, -1, -1)

/// Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
@ -37,10 +45,10 @@
///    A 64-bit vector of [8 x i8].
/// \returns A 64-bit integer vector containing the absolute values of the
///    elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi8(__m64 __a)
{
    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
    return (__m64)__builtin_elementwise_abs((__v8qs)__a);
}

/// Computes the absolute value of each of the packed 8-bit signed
@ -73,10 +81,10 @@ _mm_abs_epi8(__m128i __a)
///    A 64-bit vector of [4 x i16].
/// \returns A 64-bit integer vector containing the absolute values of the
///    elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi16(__m64 __a)
{
    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
    return (__m64)__builtin_elementwise_abs((__v4hi)__a);
}

/// Computes the absolute value of each of the packed 16-bit signed
@ -109,10 +117,10 @@ _mm_abs_epi16(__m128i __a)
///    A 64-bit vector of [2 x i32].
/// \returns A 64-bit integer vector containing the absolute values of the
///    elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi32(__m64 __a)
{
    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
    return (__m64)__builtin_elementwise_abs((__v2si)__a);
}

/// Computes the absolute value of each of the packed 32-bit signed
@ -177,7 +185,10 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
  ((__m64)__builtin_shufflevector( \
       __builtin_ia32_psrldqi128_byteshift( \
           __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \
           (n)), __extension__ (__v2di){}, 0))

/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].
@ -242,10 +253,11 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
///    destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
///    operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_phaddw128(
        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}

/// Horizontally adds the adjacent pairs of values contained in 2 packed
@ -265,10 +277,11 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
///    destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
///    operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi32(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
    return __trunc64(__builtin_ia32_phaddd128(
        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
}

/// Horizontally adds, with saturation, the adjacent pairs of values contained
@ -317,10 +330,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
///    destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
///    sums of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadds_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_phaddsw128(
        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
@ -386,10 +400,11 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
///    the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
///    of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_phsubw128(
        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
@ -409,10 +424,11 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
///    the destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
///    of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi32(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
    return __trunc64(__builtin_ia32_phsubd128(
        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
}

/// Horizontally subtracts, with saturation, the adjacent pairs of values
@ -461,10 +477,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
///    the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
///    differences of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsubs_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_phsubsw128(
        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}

/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@ -525,10 +542,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
    return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
                                                 (__v16qi)__anyext128(__b)));
}

/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@ -565,10 +583,11 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
///    A 64-bit vector of [4 x i16] containing one of the source operands.
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
///    products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mulhrs_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
                                                (__v8hi)__anyext128(__b)));
}

/// Copies the 8-bit integers from a 128-bit integer vector to the
@ -614,12 +633,15 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
///    1: Clear the corresponding byte in the destination. \n
///    0: Copy the selected source byte to the corresponding byte in the
///    destination. \n
///    Bits [3:0] select the source byte to be copied.
///    Bits [2:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_shuffle_pi8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
    return __trunc64(__builtin_ia32_pshufb128(
        (__v16qi)__builtin_shufflevector(
            (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
        (__v16qi)__anyext128(__b)));
}

/// For each 8-bit integer in the first source operand, perform one of
@ -720,10 +742,11 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
///    A 64-bit integer vector containing control bytes corresponding to
///    positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
    return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
                                              (__v16qi)__anyext128(__b)));
}

/// For each 16-bit integer in the first source operand, perform one of
@ -746,10 +769,11 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
///    A 64-bit integer vector containing control words corresponding to
///    positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
                                              (__v8hi)__anyext128(__b)));
}

/// For each 32-bit integer in the first source operand, perform one of
@ -772,13 +796,15 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
///    A 64-bit integer vector containing two control doublewords corresponding
///    to positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi32(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
    return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
                                              (__v4si)__anyext128(__b)));
}

#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

#endif /* __TMMINTRIN_H */
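The pattern this rewrite applies throughout the file: widen the 64-bit __m64 into the low half of a 128-bit vector (__anyext128 leaves the upper lanes as don't-care), run the 128-bit SSE builtin, then drop back to 64 bits with __trunc64. A generic-vector sketch of the same idea, assuming Clang's __builtin_shufflevector semantics where a -1 index means "undefined lane"; widen_lo and low_half are illustrative names, not the header's macros:

typedef short v4i16 __attribute__((__vector_size__(8)));
typedef short v8i16 __attribute__((__vector_size__(16)));

/* Widen a 4 x i16 vector into the low half of an 8 x i16 vector; the -1
   indices leave the upper lanes undefined, mirroring __anyext128. */
static v8i16 widen_lo(v4i16 v) {
  return __builtin_shufflevector(v, v, 0, 1, 2, 3, -1, -1, -1, -1);
}

/* Keep only the low half of an 8 x i16 vector, mirroring __trunc64. */
static v4i16 low_half(v8i16 v) {
  return __builtin_shufflevector(v, v, 0, 1, 2, 3);
}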
1796
lib/include/vecintrin.h
vendored
File diff suppressed because it is too large
188
lib/include/wasm_simd128.h
vendored
@ -33,6 +33,7 @@ typedef unsigned long long __u64x2
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16)));
typedef __fp16 __f16x8 __attribute__((__vector_size__(16), __aligned__(16)));

typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8)));
typedef unsigned char __u8x8
@ -956,7 +957,7 @@ static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) {
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_popcnt(v128_t __a) {
  return (v128_t)__builtin_wasm_popcnt_i8x16((__i8x16)__a);
  return (v128_t)__builtin_elementwise_popcount((__i8x16)__a);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,
@ -981,12 +982,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_add_sat_s_i8x16((__i8x16)__a, (__i8x16)__b);
  return (v128_t)__builtin_elementwise_add_sat((__i8x16)__a, (__i8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_add_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_add_sat_u_i8x16((__u8x16)__a, (__u8x16)__b);
  return (v128_t)__builtin_elementwise_add_sat((__u8x16)__a, (__u8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a,
@ -996,32 +997,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_sub_sat_s_i8x16((__i8x16)__a, (__i8x16)__b);
  return (v128_t)__builtin_elementwise_sub_sat((__i8x16)__a, (__i8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_sub_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_sub_sat_u_i8x16((__u8x16)__a, (__u8x16)__b);
  return (v128_t)__builtin_elementwise_sub_sat((__u8x16)__a, (__u8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_s_i8x16((__i8x16)__a, (__i8x16)__b);
  return (v128_t)__builtin_elementwise_min((__i8x16)__a, (__i8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_u_i8x16((__u8x16)__a, (__u8x16)__b);
  return (v128_t)__builtin_elementwise_min((__u8x16)__a, (__u8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_s_i8x16((__i8x16)__a, (__i8x16)__b);
  return (v128_t)__builtin_elementwise_max((__i8x16)__a, (__i8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_u_i8x16((__u8x16)__a, (__u8x16)__b);
  return (v128_t)__builtin_elementwise_max((__u8x16)__a, (__u8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_avgr(v128_t __a,
@ -1067,12 +1068,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_add_sat_s_i16x8((__i16x8)__a, (__i16x8)__b);
  return (v128_t)__builtin_elementwise_add_sat((__i16x8)__a, (__i16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_add_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_add_sat_u_i16x8((__u16x8)__a, (__u16x8)__b);
  return (v128_t)__builtin_elementwise_add_sat((__u16x8)__a, (__u16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a,
@ -1082,12 +1083,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_sub_sat_s_i16x8((__i16x8)__a, (__i16x8)__b);
  return (v128_t)__builtin_elementwise_sub_sat((__i16x8)__a, (__i16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_sub_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_sub_sat_u_i16x8((__u16x8)__a, (__u16x8)__b);
  return (v128_t)__builtin_elementwise_sub_sat((__u16x8)__a, (__u16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a,
@ -1097,22 +1098,22 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_s_i16x8((__i16x8)__a, (__i16x8)__b);
  return (v128_t)__builtin_elementwise_min((__i16x8)__a, (__i16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_u_i16x8((__u16x8)__a, (__u16x8)__b);
  return (v128_t)__builtin_elementwise_min((__u16x8)__a, (__u16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_s_i16x8((__i16x8)__a, (__i16x8)__b);
  return (v128_t)__builtin_elementwise_max((__i16x8)__a, (__i16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_u_i16x8((__u16x8)__a, (__u16x8)__b);
  return (v128_t)__builtin_elementwise_max((__u16x8)__a, (__u16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_avgr(v128_t __a,
@ -1168,22 +1169,22 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_mul(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_s_i32x4((__i32x4)__a, (__i32x4)__b);
  return (v128_t)__builtin_elementwise_min((__i32x4)__a, (__i32x4)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_u_i32x4((__u32x4)__a, (__u32x4)__b);
  return (v128_t)__builtin_elementwise_min((__u32x4)__a, (__u32x4)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_s_i32x4((__i32x4)__a, (__i32x4)__b);
  return (v128_t)__builtin_elementwise_max((__i32x4)__a, (__i32x4)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_u_i32x4((__u32x4)__a, (__u32x4)__b);
  return (v128_t)__builtin_elementwise_max((__u32x4)__a, (__u32x4)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_dot_i16x8(v128_t __a,
@ -1878,6 +1879,151 @@ wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t __a, v128_t __b, v128_t __c) {
      (__i8x16)__a, (__i8x16)__b, (__i32x4)__c);
}

// FP16 intrinsics
#define __FP16_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("fp16"), \
                 __min_vector_width__(128)))

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_splat(float __a) {
  return (v128_t)__builtin_wasm_splat_f16x8(__a);
}

#ifdef __wasm_fp16__
// TODO Replace the following macros with regular C functions and use normal
// target-independent vector code like the other replace/extract instructions.

#define wasm_f16x8_extract_lane(__a, __i) \
  (__builtin_wasm_extract_lane_f16x8((__f16x8)(__a), __i))

#define wasm_f16x8_replace_lane(__a, __i, __b) \
  ((v128_t)__builtin_wasm_replace_lane_f16x8((__f16x8)(__a), __i, __b))

#endif

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_abs(v128_t __a) {
  return (v128_t)__builtin_wasm_abs_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_neg(v128_t __a) {
  return (v128_t)(-(__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sqrt(v128_t __a) {
  return (v128_t)__builtin_wasm_sqrt_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ceil(v128_t __a) {
  return (v128_t)__builtin_wasm_ceil_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_floor(v128_t __a) {
  return (v128_t)__builtin_wasm_floor_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_trunc(v128_t __a) {
  return (v128_t)__builtin_wasm_trunc_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_nearest(v128_t __a) {
  return (v128_t)__builtin_wasm_nearest_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_eq(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a == (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ne(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a != (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_lt(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a < (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_gt(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a > (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_le(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a <= (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ge(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a >= (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_add(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)((__f16x8)__a + (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sub(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)((__f16x8)__a - (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_mul(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)((__f16x8)__a * (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_div(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)((__f16x8)__a / (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_min(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)__builtin_wasm_min_f16x8((__f16x8)__a, (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_max(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)__builtin_wasm_max_f16x8((__f16x8)__a, (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmin(v128_t __a,
                                                         v128_t __b) {
  return (v128_t)__builtin_wasm_pmin_f16x8((__f16x8)__a, (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmax(v128_t __a,
                                                         v128_t __b) {
  return (v128_t)__builtin_wasm_pmax_f16x8((__f16x8)__a, (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS
wasm_i16x8_trunc_sat_f16x8(v128_t __a) {
  return (v128_t)__builtin_wasm_trunc_saturate_s_i16x8_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS
wasm_u16x8_trunc_sat_f16x8(v128_t __a) {
  return (v128_t)__builtin_wasm_trunc_saturate_u_i16x8_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_i16x8(v128_t __a) {
  return (v128_t) __builtin_convertvector((__i16x8)__a, __f16x8);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) {
  return (v128_t) __builtin_convertvector((__u16x8)__a, __f16x8);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_madd(v128_t __a,
                                                                 v128_t __b,
                                                                 v128_t __c) {
  return (v128_t)__builtin_wasm_relaxed_madd_f16x8((__f16x8)__a, (__f16x8)__b,
                                                   (__f16x8)__c);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_nmadd(v128_t __a,
                                                                  v128_t __b,
                                                                  v128_t __c) {
  return (v128_t)__builtin_wasm_relaxed_nmadd_f16x8((__f16x8)__a, (__f16x8)__b,
                                                    (__f16x8)__c);
}

// Deprecated intrinsics

static __inline__ v128_t __DEPRECATED_FN_ATTRS("wasm_i8x16_swizzle")
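A hedged usage sketch for the new FP16 intrinsics, assuming a wasm target with the fp16 feature enabled (plus relaxed-simd for the fused multiply-add); fma_clamp_f16x8 is an illustrative name:

#include <wasm_simd128.h>

/* a * b + c over eight half-precision lanes, then clamp each lane to
   [lo, hi] with the lane-wise min/max intrinsics. */
v128_t fma_clamp_f16x8(v128_t a, v128_t b, v128_t c, float lo, float hi) {
  v128_t r = wasm_f16x8_relaxed_madd(a, b, c);
  r = wasm_f16x8_max(r, wasm_f16x8_splat(lo));
  return wasm_f16x8_min(r, wasm_f16x8_splat(hi));
}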
347
lib/include/xmmintrin.h
vendored
@ -32,12 +32,41 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX \
#define __DEFAULT_FN_ATTRS_SSE2 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2 \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
                 __min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#endif

#define __trunc64(x) \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __zext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, 2, 3)
#define __anyext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, -1, -1)
#define __zeroupper64(x) \
  (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0, \
                                    1, 4, 5)

/// Adds the 32-bit float values in the low-order bits of the operands.
///
@ -54,9 +83,8 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
///    of the lower 32 bits of both operands. The upper 96 bits are copied from
///    the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_ss(__m128 __a, __m128 __b) {
  __a[0] += __b[0];
  return __a;
}
@ -74,9 +102,8 @@ _mm_add_ss(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the sums of both
///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4sf)__a + (__v4sf)__b);
}

@ -96,9 +123,8 @@ _mm_add_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    difference of the lower 32 bits of both operands. The upper 96 bits are
///    copied from the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_ss(__m128 __a, __m128 __b) {
  __a[0] -= __b[0];
  return __a;
}
@ -117,9 +143,8 @@ _mm_sub_ss(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the differences between
///    both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4sf)__a - (__v4sf)__b);
}

@ -139,9 +164,8 @@ _mm_sub_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the product of the lower
///    32 bits of both operands. The upper 96 bits are copied from the upper 96
///    bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mul_ss(__m128 __a, __m128 __b) {
  __a[0] *= __b[0];
  return __a;
}
@ -159,9 +183,8 @@ _mm_mul_ss(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the products of both
///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mul_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4sf)__a * (__v4sf)__b);
}

@ -181,9 +204,8 @@ _mm_mul_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the quotients of the
///    lower 32 bits of both operands. The upper 96 bits are copied from the
///    upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_div_ss(__m128 __a, __m128 __b) {
  __a[0] /= __b[0];
  return __a;
}
@ -200,9 +222,8 @@ _mm_div_ss(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing the divisor.
/// \returns A 128-bit vector of [4 x float] containing the quotients of both
///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_div_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4sf)__a / (__v4sf)__b);
}

@ -416,9 +437,8 @@ _mm_max_ps(__m128 __a, __m128 __b)
///    A 128-bit vector containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
///    values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_and_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_and_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4su)__a & (__v4su)__b);
}

@ -438,9 +458,8 @@ _mm_and_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
///    one's complement of the first operand and the values in the second
///    operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_andnot_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_andnot_ps(__m128 __a, __m128 __b) {
  return (__m128)(~(__v4su)__a & (__v4su)__b);
}

@ -456,9 +475,8 @@ _mm_andnot_ps(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
///    values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_or_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_or_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4su)__a | (__v4su)__b);
}

@ -475,9 +493,8 @@ _mm_or_ps(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
///    of the values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_xor_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_xor_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4su)__a ^ (__v4su)__b);
}

@ -1448,10 +1465,10 @@ _mm_cvtss_si64(__m128 __a)
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
  return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
}

/// Converts two low-order float values in a 128-bit vector of
@ -1468,7 +1485,7 @@ _mm_cvtps_pi32(__m128 __a)
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_ps2pi(__m128 __a)
{
  return _mm_cvtps_pi32(__a);
@ -1558,10 +1575,10 @@ _mm_cvttss_si64(__m128 __a)
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvttps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
  return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
}

/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
@ -1579,7 +1596,7 @@ _mm_cvttps_pi32(__m128 __a)
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtt_ps2pi(__m128 __a)
{
  return _mm_cvttps_pi32(__a);
@ -1601,9 +1618,8 @@ _mm_cvtt_ps2pi(__m128 __a)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    converted value of the second operand. The upper 96 bits are copied from
///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi32_ss(__m128 __a, int __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
                                                                     int __b) {
  __a[0] = __b;
  return __a;
}
@ -1624,9 +1640,8 @@ _mm_cvtsi32_ss(__m128 __a, int __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    converted value of the second operand. The upper 96 bits are copied from
///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvt_si2ss(__m128 __a, int __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
                                                                    int __b) {
  return _mm_cvtsi32_ss(__a, __b);
}

@ -1648,9 +1663,8 @@ _mm_cvt_si2ss(__m128 __a, int __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    converted value of the second operand. The upper 96 bits are copied from
///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi64_ss(__m128 __a, long long __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsi64_ss(__m128 __a, long long __b) {
  __a[0] = __b;
  return __a;
}
@ -1674,10 +1688,13 @@ _mm_cvtsi64_ss(__m128 __a, long long __b)
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
  return (__m128)__builtin_shufflevector(
      (__v4sf)__a,
      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
      4, 5, 2, 3);
}

/// Converts two elements of a 64-bit vector of [2 x i32] into two
@ -1697,7 +1714,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value from the second operand. The upper 64 bits are copied
///    from the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  return _mm_cvtpi32_ps(__a, __b);
@ -1714,9 +1731,8 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the extraction.
/// \returns A 32-bit float containing the extracted value.
static __inline__ float __DEFAULT_FN_ATTRS
_mm_cvtss_f32(__m128 __a)
{
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtss_f32(__m128 __a) {
  return __a[0];
}

@ -1907,9 +1923,8 @@ _mm_undefined_ps(void)
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
///    lower 32 bits contain the value provided in the source operand. The
///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ss(float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ss(float __w) {
  return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}

@ -1925,9 +1940,8 @@ _mm_set_ss(float __w)
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set1_ps(float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_ps(float __w) {
  return __extension__ (__m128){ __w, __w, __w, __w };
}

@ -1944,9 +1958,8 @@ _mm_set1_ps(float __w)
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps1(float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps1(float __w) {
  return _mm_set1_ps(__w);
}

@ -1971,9 +1984,8 @@ _mm_set_ps1(float __w)
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps(float __z, float __y, float __x, float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps(float __z, float __y, float __x, float __w) {
  return __extension__ (__m128){ __w, __x, __y, __z };
}

@ -1999,9 +2011,8 @@ _mm_set_ps(float __z, float __y, float __x, float __w)
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setr_ps(float __z, float __y, float __x, float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_ps(float __z, float __y, float __x, float __w) {
  return __extension__ (__m128){ __z, __y, __x, __w };
}

@ -2014,9 +2025,8 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
///    all elements set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setzero_ps(void) {
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

@ -2231,10 +2241,10 @@ _mm_storer_ps(float *__p, __m128 __a)
///    A pointer to an aligned memory location used to store the register value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS_MMX
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pi(void *__p, __m64 __a)
{
  __builtin_ia32_movntq((__m64 *)__p, __a);
  __builtin_nontemporal_store(__a, (__m64 *)__p);
}

/// Moves packed float values from a 128-bit vector of [4 x float] to a
@ -2296,7 +2306,7 @@ void _mm_sfence(void);
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
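The added (unsigned short) cast makes the extracted lane zero-extend rather than sign-extend before widening to int, matching the instruction's documented behavior. A worked example, assuming an x86 target with SSE enabled:

#include <xmmintrin.h>
#include <assert.h>

int main(void) {
  __m64 v = _mm_setr_pi16(0, -1, 2, 3);
  /* Lane 1 holds 0xFFFF; with the cast the macro yields 65535, where the
     uncast builtin result would sign-extend to -1. */
  assert(_mm_extract_pi16(v, 1) == 0xFFFF);
  return 0;
}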

/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
@ -2342,10 +2352,10 @@ void _mm_sfence(void);
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
  return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
}

/// Compares each of the corresponding packed 8-bit unsigned integer
@ -2361,10 +2371,10 @@ _mm_max_pi16(__m64 __a, __m64 __b)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
  return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
}

/// Compares each of the corresponding packed 16-bit integer values of
@ -2380,10 +2390,10 @@ _mm_max_pu8(__m64 __a, __m64 __b)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
  return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
}

/// Compares each of the corresponding packed 8-bit unsigned integer
@ -2399,10 +2409,10 @@ _mm_min_pi16(__m64 __a, __m64 __b)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
  return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
}

/// Takes the most significant bit from each 8-bit element in a 64-bit
@ -2417,10 +2427,10 @@ _mm_min_pu8(__m64 __a, __m64 __b)
///    A 64-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bit from each 8-bit element in \a __a,
///    written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS_MMX
static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_movemask_pi8(__m64 __a)
{
  return __builtin_ia32_pmovmskb((__v8qi)__a);
  return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
}

/// Multiplies packed 16-bit unsigned integer values and writes the
@ -2436,10 +2446,11 @@ _mm_movemask_pi8(__m64 __a)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
                                             (__v8hi)__anyext128(__b)));
}

/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
@ -2476,8 +2487,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3, \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
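Each 2-bit field of n selects one source lane, lowest field first, so n = 0x1B (0b00011011) reverses the four lanes: (0x1B)&3 = 3, (0x1B>>2)&3 = 2, (0x1B>>4)&3 = 1, (0x1B>>6)&3 = 0. A small check, assuming an x86 target with SSE enabled:

#include <xmmintrin.h>
#include <assert.h>

int main(void) {
  __m64 a = _mm_setr_pi16(10, 11, 12, 13);
  __m64 r = _mm_shuffle_pi16(a, 0x1B); /* selects lanes 3, 2, 1, 0 */
  assert(_mm_extract_pi16(r, 0) == 13 && _mm_extract_pi16(r, 3) == 10);
  return 0;
}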
|
||||
/// Conditionally copies the values from each 8-bit element in the first
|
||||
/// 64-bit integer vector operand to the specified memory location, as
|
||||
@ -2502,10 +2515,25 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
|
||||
/// A pointer to a 64-bit memory location that will receive the conditionally
|
||||
/// copied integer values. The address of the memory location does not have
|
||||
/// to be aligned.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_MMX
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
|
||||
{
|
||||
__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
|
||||
// This is complex, because we need to support the case where __p is pointing
|
||||
// within the last 15 to 8 bytes of a page. In that case, using a 128-bit
|
||||
// write might cause a trap where a 64-bit maskmovq would not. (Memory
|
||||
// locations not selected by the mask bits might still cause traps.)
|
||||
__m128i __d128 = __anyext128(__d);
|
||||
__m128i __n128 = __zext128(__n);
|
||||
if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
|
||||
((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
|
||||
// If there's a risk of spurious trap due to a 128-bit write, back up the
|
||||
// pointer by 8 bytes and shift values in registers to match.
|
||||
__p -= 8;
|
||||
__d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
|
||||
__n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
|
||||
}
|
||||
|
||||
__builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
|
||||
}
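
The guard's window is exactly the set of page offsets where the 16-byte maskmovdqu store would spill into the next page while the 8 bytes the 64-bit operation can touch would not. A self-contained check of that claim (illustrative C, assuming the 4 KiB page size implied by the 0xfff mask):

#include <stdint.h>
#include <stdio.h>

/* Model of the page-crossing guard in _mm_maskmove_si64: flag offsets
   where a 16-byte store crosses into the next 4 KiB page although the
   8 bytes the mask can actually select do not. */
int main(void) {
  for (uintptr_t off = 4075; off <= 4094; off++) {
    int wide_crosses  = off + 16 > 4096; /* 128-bit maskmovdqu store  */
    int narrow_fits   = off + 8 <= 4096; /* 64-bit maskmovq footprint */
    int needs_backoff = off >= 4096 - 15 && off <= 4096 - 8;
    if (needs_backoff != (wide_crosses && narrow_fits))
      printf("mismatch at offset %zu\n", (size_t)off);
  }
  printf("guard matches the dangerous window\n");
  return 0;
}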

/// Computes the rounded averages of the packed unsigned 8-bit integer
@ -2521,10 +2549,11 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
  return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
                                           (__v16qi)__anyext128(__b)));
}
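
"Rounded average" here means rounding toward positive infinity, i.e. (a + b + 1) >> 1 computed without intermediate overflow. A scalar model of one lane (illustrative, applies equally to the 16-bit variant below):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one pavgb lane: average rounded up, computed in a
   wider type so the +1 cannot overflow. */
static uint8_t avg_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a + b + 1) >> 1);
}

int main(void) {
  printf("%u\n", avg_u8(1, 2));     /* 2, not 1: rounding is upward */
  printf("%u\n", avg_u8(255, 255)); /* 255, no overflow */
  return 0;
}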

/// Computes the rounded averages of the packed unsigned 16-bit integer
@ -2540,10 +2569,11 @@ _mm_avg_pu8(__m64 __a, __m64 __b)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
  return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
                                           (__v8hi)__anyext128(__b)));
}

/// Subtracts the corresponding 8-bit unsigned integer values of the two
@ -2562,10 +2592,11 @@ _mm_avg_pu16(__m64 __a, __m64 __b)
/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
///    sets of absolute differences between both operands. The upper bits are
///    cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
  return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
                                            (__v16qi)__zext128(__b)));
}
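
The sum-of-absolute-differences reduction collapses eight byte differences into one 16-bit count. A scalar model (illustrative, not from the header):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of _mm_sad_pu8: sum of absolute byte differences,
   accumulated into one 16-bit value. */
static uint16_t sad_u8(const uint8_t a[8], const uint8_t b[8]) {
  unsigned sum = 0;
  for (int i = 0; i < 8; i++)
    sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
  return (uint16_t)sum;
}

int main(void) {
  uint8_t a[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  uint8_t b[8] = {80, 70, 60, 50, 40, 30, 20, 10};
  printf("%u\n", sad_u8(a, b)); /* 70+50+30+10+10+30+50+70 = 320 */
  return 0;
}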

#if defined(__cplusplus)
@ -2741,9 +2772,8 @@ void _mm_setcsr(unsigned int __i);
///    Bits [95:64] are written to bits [63:32] of the destination. \n
///    Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpackhi_ps(__m128 __a, __m128 __b) {
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
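
As a reminder of `__builtin_shufflevector` index semantics, indices 0-3 pick lanes of the first operand and 4-7 lanes of the second. A scalar sketch of the unpackhi pattern above (illustrative C):

#include <stdio.h>

/* Scalar model of __builtin_shufflevector(a, b, 2, 6, 3, 7). */
int main(void) {
  float a[4] = {0.f, 1.f, 2.f, 3.f};
  float b[4] = {4.f, 5.f, 6.f, 7.f};
  int idx[4] = {2, 6, 3, 7}; /* the unpackhi interleave pattern */
  for (int i = 0; i < 4; i++) {
    float lane = idx[i] < 4 ? a[idx[i]] : b[idx[i] - 4];
    printf("dst[%d] = %g\n", i, lane); /* 2 6 3 7 */
  }
  return 0;
}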

@ -2763,9 +2793,8 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b)
///    Bits [31:0] are written to bits [63:32] of the destination. \n
///    Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpacklo_ps(__m128 __a, __m128 __b) {
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}

@ -2785,9 +2814,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
///    written to the lower 32 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_move_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_ss(__m128 __a, __m128 __b) {
  __a[0] = __b[0];
  return __a;
}
@ -2807,9 +2835,8 @@ _mm_move_ss(__m128 __a, __m128 __b)
///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
///    written to the lower 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehl_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movehl_ps(__m128 __a, __m128 __b) {
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}

@ -2828,9 +2855,8 @@ _mm_movehl_ps(__m128 __a, __m128 __b)
///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
///    written to the upper 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movelh_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movelh_ps(__m128 __a, __m128 __b) {
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}

@ -2846,22 +2872,10 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
///    from the corresponding elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi16(__b, __a);
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
  return __builtin_convertvector((__v4hi)__a, __v4sf);
}
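
The deleted body sign-extended by hand (compare against zero to build sign masks, unpack, convert each half); `__builtin_convertvector` expresses the same element-wise signed-int-to-float conversion directly. A scalar model (illustrative C):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of __builtin_convertvector((__v4hi)a, __v4sf): each
   signed 16-bit lane converts to float independently. */
int main(void) {
  int16_t in[4] = {-1, 2, -32768, 32767};
  for (int i = 0; i < 4; i++)
    printf("%g ", (float)in[i]); /* -1 2 -32768 32767 */
  printf("\n");
  return 0;
}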

/// Converts a 64-bit vector of 16-bit unsigned integer values into a
@ -2876,21 +2890,10 @@ _mm_cvtpi16_ps(__m64 __a)
///    destination are copied from the corresponding elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
  return __builtin_convertvector((__v4hu)__a, __v4sf);
}

/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
@ -2905,16 +2908,12 @@ _mm_cvtpu16_ps(__m64 __a)
///    from the corresponding lower 4 elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi8(__b, __a);
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
  return __builtin_convertvector(
      __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
                              0, 1, 2, 3), __v4sf);
}
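
The element type of the cast drives the conversion: `__v8qs` (signed char lanes) makes `__builtin_convertvector` sign-extend, while the `__v8qu` cast in the unsigned variant below makes it zero-extend. A two-line contrast in scalar C (illustrative):

#include <stdio.h>

/* The same byte 0x80 converts differently depending on signedness,
   which is what the __v8qs vs. __v8qu casts select. */
int main(void) {
  unsigned char raw = 0x80;
  printf("%g\n", (float)(signed char)raw);   /* -128 */
  printf("%g\n", (float)(unsigned char)raw); /* 128  */
  return 0;
}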

/// Converts the lower four unsigned 8-bit integer values from a 64-bit
@ -2930,15 +2929,12 @@ _mm_cvtpi8_ps(__m64 __a)
///    operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
  return __builtin_convertvector(
      __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
                              0, 1, 2, 3), __v4sf);
}

/// Converts the two 32-bit signed integer values from each 64-bit vector
@ -2957,16 +2953,12 @@ _mm_cvtpu8_ps(__m64 __a)
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    copied and converted values from the first operand. The upper 64 bits
///    contain the copied and converted values from the second operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
  __m128 __c;

  __c = _mm_setzero_ps();
  __c = _mm_cvtpi32_ps(__c, __b);
  __c = _mm_movelh_ps(__c, __c);

  return _mm_cvtpi32_ps(__c, __a);
  return __builtin_convertvector(
      __builtin_shufflevector((__v2si)__a, (__v2si)__b,
                              0, 1, 2, 3), __v4sf);
}

/// Converts each single-precision floating-point element of a 128-bit
@ -2986,16 +2978,11 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi16(__m128 __a)
{
  __m64 __b, __c;

  __b = _mm_cvtps_pi32(__a);
  __a = _mm_movehl_ps(__a, __a);
  __c = _mm_cvtps_pi32(__a);

  return _mm_packs_pi32(__b, __c);
  return __trunc64(__builtin_ia32_packssdw128(
      (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
}
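
The packssdw step narrows each 32-bit integer to 16 bits with signed saturation, so out-of-range conversion results clamp instead of wrapping. A scalar model of one lane (illustrative C):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one packssdw lane: a signed 32-bit value is clamped
   to the signed 16-bit range on the way down. */
static int16_t pack_sat_i32(int32_t v) {
  if (v > 32767) return 32767;
  if (v < -32768) return -32768;
  return (int16_t)v;
}

int main(void) {
  printf("%d\n", pack_sat_i32(100000));  /* 32767  */
  printf("%d\n", pack_sat_i32(-100000)); /* -32768 */
  printf("%d\n", pack_sat_i32(1234));    /* 1234   */
  return 0;
}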

/// Converts each single-precision floating-point element of a 128-bit
@ -3016,7 +3003,7 @@ _mm_cvtps_pi16(__m128 __a)
///    128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi8(__m128 __a)
{
  __m64 __b, __c;
@ -3196,8 +3183,14 @@ do { \
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

#undef __trunc64
#undef __zext128
#undef __anyext128
#undef __zeroupper64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS_SSE2
#undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)