Mirror of https://github.com/ziglang/zig.git, synced 2025-12-06 06:13:07 +00:00

zig cc: Update intrinsic headers to Clang 20.

commit ce754724b3
parent 0181cfe8ad

lib/include/adcintrin.h (vendored, 5 changes)
@@ -15,7 +15,12 @@
 #endif
 
 /* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__)) constexpr
+#else
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#endif
 
 /* Use C++ inline semantics in C++, GNU inline for C mode. */
 #if defined(__cplusplus)
lib/include/adxintrin.h (vendored, 5 changes)

@@ -15,8 +15,13 @@
 #define __ADXINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("adx"))) constexpr
+#else
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+#endif
 
 /* Use C++ inline semantics in C++, GNU inline for C mode. */
 #if defined(__cplusplus)
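The net effect of these two hunks is that the ADC/ADX carry intrinsics become usable in constant expressions when compiling as C++11 or later. A minimal sketch of what that enables, assuming Clang 20's constant evaluation of these builtins on x86_64; the function name add_mod32 is illustrative and not part of the headers:

  // Hypothetical demo, not part of the diff: the carry intrinsics can now
  // fold at compile time because __DEFAULT_FN_ATTRS carries `constexpr`.
  #include <immintrin.h>

  constexpr unsigned add_mod32(unsigned a, unsigned b) {
    unsigned sum = 0;
    // _addcarry_u32 returns the carry-out; carry-in is 0 here.
    unsigned char carry = _addcarry_u32(0, a, b, &sum);
    return sum + carry; // fold the carry back in, just for the demo
  }

  // Evaluated entirely at compile time (C++14 or later for the local).
  static_assert(add_mod32(0xFFFFFFFFu, 2u) == 2u,
                "0xFFFFFFFF + 2 wraps to 1, plus carry-out 1");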
lib/include/altivec.h (vendored, 16 changes)

@@ -2502,37 +2502,37 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
 
 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_popcnt(vector signed char __a) {
-  return (vector unsigned char)__builtin_altivec_vpopcntb(
+  return (vector unsigned char)__builtin_elementwise_popcount(
       (vector unsigned char)__a);
 }
 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_popcnt(vector unsigned char __a) {
-  return __builtin_altivec_vpopcntb(__a);
+  return __builtin_elementwise_popcount(__a);
 }
 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_popcnt(vector signed short __a) {
-  return (vector unsigned short)__builtin_altivec_vpopcnth(
+  return (vector unsigned short)__builtin_elementwise_popcount(
       (vector unsigned short)__a);
 }
 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_popcnt(vector unsigned short __a) {
-  return __builtin_altivec_vpopcnth(__a);
+  return __builtin_elementwise_popcount(__a);
 }
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_popcnt(vector signed int __a) {
-  return __builtin_altivec_vpopcntw((vector unsigned int)__a);
+  return __builtin_elementwise_popcount((vector unsigned int)__a);
 }
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_popcnt(vector unsigned int __a) {
-  return __builtin_altivec_vpopcntw(__a);
+  return __builtin_elementwise_popcount(__a);
 }
 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_popcnt(vector signed long long __a) {
-  return __builtin_altivec_vpopcntd((vector unsigned long long)__a);
+  return __builtin_elementwise_popcount((vector unsigned long long)__a);
 }
 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_popcnt(vector unsigned long long __a) {
-  return __builtin_altivec_vpopcntd(__a);
+  return __builtin_elementwise_popcount(__a);
 }
 
 #define vec_vclz vec_cntlz
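These hunks change only how vec_popcnt is implemented, not what callers see: the PowerPC-specific __builtin_altivec_vpopcnt* builtins are swapped for the target-independent __builtin_elementwise_popcount, which should lower to the same vpopcnt instructions. A minimal caller sketch, assuming a PowerPC target with AltiVec enabled; the function name is illustrative:

  // demo caller, unchanged by this commit
  #include <altivec.h>

  vector unsigned int bit_counts(vector unsigned int v) {
    return vec_popcnt(v); // still lowers to vpopcntw on POWER8 and later
  }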
lib/include/amxavx512intrin.h (vendored, new file, 382 lines)

@@ -0,0 +1,382 @@
/*===--------------------- amxavx512intrin.h - AMXAVX512 --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_AVX512INTRIN_H
#define __AMX_AVX512INTRIN_H
#if defined(__x86_64__) && defined(__SSE2__)

#define __DEFAULT_FN_ATTRS_AVX512 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-avx512,avx10.2-512")))

/// Moves a row from a tile register to a zmm destination register, converting
/// the int32 source elements to fp32. The row of the tile is selected by a
/// 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowd2ps(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.f32[i] := CONVERT_INT32_TO_FP32(tsrc.row[row_index].dword[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWD2PS instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowd2ps(tsrc, row) __builtin_ia32_tcvtrowd2ps(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to bf16. It places the resulting bf16 elements
/// in the high 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowps2bf16h(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+0] := 0
///     dst.bf16[2*i+1] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16H instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowps2bf16h(tsrc, row) \
  __builtin_ia32_tcvtrowps2bf16h(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to bf16. It places the resulting bf16 elements
/// in the low 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowps2bf16l(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+1] := 0
///     dst.bf16[2*i+0] := CONVERT_FP32_TO_BF16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2BF16L instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowps2bf16l(tsrc, row) \
  __builtin_ia32_tcvtrowps2bf16l(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to fp16. It places the resulting fp16 elements
/// in the high 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowps2phh(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+0] := 0
///     dst.fp16[2*i+1] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHH instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowps2phh(tsrc, row) __builtin_ia32_tcvtrowps2phh(tsrc, row)

/// Moves a row from a tile register to a zmm destination register, converting
/// the fp32 source elements to fp16. It places the resulting fp16 elements
/// in the low 16 bits within each dword. The row of the tile is selected
/// by a 32b GPR.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m512i _tile_cvtrowps2phl(__tile tsrc, unsigned int row);
/// \endcode
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL >> 3
/// row_index := row & 0xffff
/// row_chunk := ((row >> 16) & 0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes / 4) - 1
///   IF i + row_chunk / 4 >= tsrc.colsb / 4
///     dst.dword[i] := 0
///   ELSE
///     dst.word[2*i+1] := 0
///     dst.fp16[2*i+0] := CONVERT_FP32_TO_FP16(tsrc.row[row_index].fp32[row_chunk/4+i], RNE)
///   FI
/// ENDFOR
/// dst[MAX_VL-1:VL] := 0
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCVTROWPS2PHL instruction.
///
/// \param tsrc
///    The source tile. Max size is 1024 Bytes.
/// \param row
///    The row of the source tile.
#define _tile_cvtrowps2phl(tsrc, row) __builtin_ia32_tcvtrowps2phl(tsrc, row)

/// Move one row of tile data to v16f32 data.
/// The row of the tile is selected by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m512 _tile_movrow(__tile a, unsigned b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
///
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16f32 data. Size is 64 Bytes.
///
/// \code{.operation}
/// VL := 512
/// VL_bytes := VL>>3
/// row_index := b&0xffff
/// row_chunk := ((b>>16)&0xffff) * VL_bytes
/// FOR i := 0 TO (VL_bytes-1)
///   IF (row_chunk + i >= a.colsb)
///     dst.byte[i] := 0
///   ELSE
///     dst.byte[i] := a.row[row_index].byte[row_chunk+i]
///   FI
/// ENDFOR
/// \endcode
#define _tile_movrow(a, b) __builtin_ia32_tilemovrow(a, b)

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.

static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
                              _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
                              _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_AVX512 _tile_movrow_internal(
    unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
  return (__m512i)__builtin_ia32_tilemovrow_internal(m, n, src, u);
}

/// Move a row from a tile (src0) to a v16f32 dst, converting the int32 source
/// elements to fp32. No SIMD exceptions are generated. Rounding is done as if
/// MXCSR.RC=RNE. Embedded rounding is not supported.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWD2PS </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16f32 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowd2ps_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
/// elements to bf16 at high 16-bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2BF16H </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32bf16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32bf16 dst, converting the fp32 source
/// elements to bf16 at low 16-bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2BF16L </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32bf16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
/// elements to fp16 at high 16-bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2PHH </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32fp16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move a row from a tile (src0) to a v32fp16 dst, converting the fp32 source
/// elements to fp16 at low 16-bits of each dword.
/// The row and chunk elements of the tile are fetched from the 32-bit src1.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCVTROWPS2PHL </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v32fp16 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
  return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
}

/// Move one row of tile data to v16f32 data.
/// The row of the tile is selected by a 32b GPR.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEMOVROW </c> instruction.
///
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source r32. Size is 4 Bytes.
/// \returns
///    The destination v16i32 data. Size is 64 Bytes.
__DEFAULT_FN_ATTRS_AVX512
static __m512i __tile_movrow(__tile1024i src0, unsigned src1) {
  return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1);
}

#endif // __x86_64__ && __SSE2__
#endif // __AMX_AVX512INTRIN_H
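This new header exposes two layers: raw _tile_* macros that take immediate tile numbers, and __tile_* wrappers that carry shape in a __tile1024i value. A hedged sketch of the macro layer, assuming AMX-AVX512 hardware and a tile configuration already loaded; tile number 0 is illustrative, and per the operation pseudocode the result lanes are fp32:

  #include <immintrin.h>

  // Read row `r` of int32 accumulator tile 0 as 16 packed fp32 lanes.
  // The upper 16 bits of the row argument select a 64-byte chunk; 0 here.
  __m512 accum_row_as_fp32(unsigned r) {
    return _tile_cvtrowd2ps(0, r);
  }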
lib/include/amxbf16transposeintrin.h (vendored, new file, 94 lines)

@@ -0,0 +1,94 @@
/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_BF16TRANSPOSEINTRIN_H
#define __AMX_BF16TRANSPOSEINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-bf16,amx-transpose")))

/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
/// tiles \a a and \a b, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in \a dst, and store the
/// 32-bit result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
///                      FP32(b.row[k].bf16[2*n+0])
///       tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
///                      FP32(b.row[k].bf16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b))

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
}

/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
/// tiles src0 and src1, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in "dst", and store the
/// 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
                                        __tile1024i src1) {
  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                       src0.tile, src1.tile);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __x86_64__ */
#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
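The leading T in TTDPBF16PS means the first source is consumed transposed, i.e. the operation is dst += transpose(a) * b over bf16 pairs. A hedged usage sketch with the C wrappers; tile shapes travel inside __tile1024i and must match a previously loaded tile configuration:

  #include <immintrin.h>

  // C (fp32, M x N) += transpose(A) * B, with A held as K x M bf16 pairs.
  // Shapes are illustrative; a real kernel programs the palette first.
  void bf16t_matmul_step(__tile1024i *c, __tile1024i a, __tile1024i b) {
    __tile_tdpbf16ps(c, a, b); // accumulates into c->tile
  }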
lib/include/amxcomplextransposeintrin.h (vendored, new file, 303 lines)

@@ -0,0 +1,303 @@
/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
#define __AMX_COMPLEXTRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-complex,amx-transpose")))

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The imaginary part of the \a a element
/// is multiplied with the real part of the corresponding \a b element, and
/// the real part of the \a a element is multiplied with the imaginary part
/// of the corresponding \a b elements. The two accumulated results are
/// added, and then accumulated into the corresponding row and column of
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tcmmimfp16ps(dst, a, b) \
  __builtin_ia32_ttcmmimfp16ps((dst), (a), (b))

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The real part of the \a a element is
/// multiplied with the real part of the corresponding \a b element, and the
/// negated imaginary part of the \a a element is multiplied with the
/// imaginary part of the corresponding \a b elements. The two accumulated
/// results are added, and then accumulated into the corresponding row and
/// column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTCMMRLFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tcmmrlfp16ps(dst, a, b) \
  __builtin_ia32_ttcmmrlfp16ps((dst), (a), (b))

/// Perform matrix conjugate transpose and multiplication of two tiles
/// containing complex elements and accumulate the results into a packed
/// single precision tile. Each dword element in input tiles \a a and \a b
/// is interpreted as a complex number with FP16 real part and FP16 imaginary
/// part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (transposed column of \a a, column of \a b), it performs a set of
/// multiplication and accumulations on all corresponding complex numbers
/// (one from \a a and one from \a b). The negated imaginary part of the \a a
/// element is multiplied with the real part of the corresponding \a b
/// element, and the real part of the \a a element is multiplied with the
/// imaginary part of the corresponding \a b elements. The two accumulated
/// results are added, and then accumulated into the corresponding row and
/// column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO a.rows - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_conjtcmmimfp16ps(dst, a, b) \
  __builtin_ia32_tconjtcmmimfp16ps((dst), (a), (b))

/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
/// and write the result to \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_conjtfp16(__tile dst, __tile a);
/// \endcode
///
/// \code{.operation}
/// FOR i := 0 TO dst.rows - 1
///   FOR j := 0 TO (dst.colsb / 4) - 1
///     tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
///     tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
///   ENDFOR
///   write_row_and_zero(dst, i, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The source tile. Max size is 1024 Bytes.
#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16((dst), (a))

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
    _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_conjtfp16_internal(unsigned short m, unsigned short n, _tile1024i src) {
  return __builtin_ia32_tconjtfp16_internal(m, n, src);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
                                __tile1024i src1) {
  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
                                          dst->tile, src0.tile, src1.tile);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// This function calculates the real part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
                                __tile1024i src1) {
  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
                                          dst->tile, src0.tile, src1.tile);
}

/// Perform matrix conjugate transpose and multiplication of two tiles
/// containing complex elements and accumulate the results into a packed
/// single precision tile. Each dword element in input tiles src0 and src1
/// is interpreted as a complex number with FP16 real part and FP16 imaginary
/// part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
                                    __tile1024i src1) {
  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
                                              dst->tile, src0.tile, src1.tile);
}

/// Perform conjugate transpose of an FP16-pair of complex elements from src
/// and write the result to dst.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
}

#undef __DEFAULT_FN_ATTRS

#endif // __x86_64__
#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
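Because the hardware splits a complex product into separate real-part and imaginary-part instructions, a full complex accumulation uses two destination tiles. A hedged sketch of the raw macro layer; the tile numbers are illustrative immediates and a matching tile configuration is assumed:

  #include <immintrin.h>

  // Accumulate transpose(A) * B over FP16 complex pairs, real parts into
  // tile 0 and imaginary parts into tile 1 (A in tile 2, B in tile 3).
  void complex_matmul_step(void) {
    _tile_tcmmrlfp16ps(0, 2, 3); // tile0 += Re(transpose(A) * B)
    _tile_tcmmimfp16ps(1, 2, 3); // tile1 += Im(transpose(A) * B)
  }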
lib/include/amxfp16intrin.h (vendored, 35 changes)

@@ -15,6 +15,10 @@
 #define __AMX_FP16INTRIN_H
 #ifdef __x86_64__
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
+
 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
 /// and \a b, accumulating the intermediate single-precision (32-bit)
 /// floating-point elements with elements in \a dst, and store the 32-bit
@@ -54,5 +58,36 @@
 #define _tile_dpfp16ps(dst, a, b) \
   __builtin_ia32_tdpfp16ps(dst, a, b)
 
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
+                                       __tile1024i src1) {
+  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                      src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
 #endif /* __x86_64__ */
 #endif /* __AMX_FP16INTRIN_H */
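This diff moves the FP16 dot-product helpers out of amxintrin.h and into their own header (the matching removal appears in the amxintrin.h hunks below). The caller-visible API is unchanged; a hedged sketch with illustrative shapes:

  #include <immintrin.h>

  // C (fp32, M x N) += A (fp16 pairs, M x K) * B (fp16 pairs, K x N).
  // Shapes travel in __tile1024i and must match the loaded tile config.
  void fp16_dot_step(__tile1024i *c, __tile1024i a, __tile1024i b) {
    __tile_dpfp16ps(c, a, b);
  }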
lib/include/amxfp16transposeintrin.h (vendored, new file, 94 lines)

@@ -0,0 +1,94 @@
/*===----- amxfp16transposeintrin.h - AMX-FP16 and AMX-TRANSPOSE ------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxfp16transposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_FP16TRANSPOSEINTRIN_H
#define __AMX_FP16TRANSPOSEINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-fp16,amx-transpose")))

/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
/// tiles \a a and \a b, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in \a dst, and store the
/// 32-bit result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_tdpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TTDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps((dst), (a), (b))

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS
_tile_tdpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttdpfp16ps_internal(m, n, k, dst, src1, src2);
}

/// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in
/// tiles src0 and src1, accumulating the intermediate single-precision
/// (32-bit) floating-point elements with elements in "dst", and store the
/// 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTDPFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS
static __inline__ void __tile_tdpfp16ps(__tile1024i *dst, __tile1024i src0,
                                        __tile1024i src1) {
  dst->tile = _tile_tdpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                       src0.tile, src1.tile);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __x86_64__ */
#endif /* __AMX_FP16TRANSPOSEINTRIN_H */
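This header mirrors amxbf16transposeintrin.h with fp16 inputs: _tile_tdpfp16ps consumes its first source transposed, so a kernel can keep A column-major in its tile and skip an explicit transpose pass. A hedged sketch:

  #include <immintrin.h>

  // With AMX-TRANSPOSE, A may stay column-major in its tile:
  // C += transpose(A_colmajor) * B is C += A * B in row-major terms.
  void fp16t_step(__tile1024i *c, __tile1024i a_colmajor, __tile1024i b) {
    __tile_tdpfp16ps(c, a_colmajor, b);
  }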
lib/include/amxfp8intrin.h (vendored, new file, 230 lines)

@@ -0,0 +1,230 @@
/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXFP8INTRIN_H
#define __AMXFP8INTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_FP8 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8")))

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPBF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                     src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of a BF8 value \a src1 by an HF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dpbhf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPBHF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dpbhf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dpbhf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                      src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dphbf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdphbf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of an HF8 value \a src1 by a BF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dphbf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPHBF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.

__DEFAULT_FN_ATTRS_FP8 static void
__tile_dphbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dphbf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                      src1.tile, src2.tile);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8
_tile_dphf8ps_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdphf8ps_internal(m, n, k, dst, src1, src2);
}

/// Perform the dot product of an HF8 value \a src1 by an HF8 value \a src2
/// accumulating into a Single Precision (FP32) source/dest \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void __tile_dphf8ps (__tile1024i dst, __tile1024i src1, __tile1024i src2)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   temp1[(dst.colsb / 4 - 1) : 0] = 0
///   FOR k := 0 TO src1.colsb / 4 - 1
///     FOR n := 0 TO dst.colsb / 4 - 1
///       temp1[n] +=
///         INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0])
///         + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1])
///         + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2])
///         + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3])
///     ENDFOR
///   ENDFOR
///   FOR n := 0 TO dst.colsb / 4 - 1
///     tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n])
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPHF8PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src1
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src2
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP8 static void
__tile_dphf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) {
  dst->tile = _tile_dphf8ps_internal(src1.row, src2.col, src1.col, dst->tile,
                                     src1.tile, src2.tile);
}

#define _tile_dpbf8ps(dst, src1, src2) \
  __builtin_ia32_tdpbf8ps((dst), (src1), (src2))
#define _tile_dpbhf8ps(dst, src1, src2) \
  __builtin_ia32_tdpbhf8ps((dst), (src1), (src2))
#define _tile_dphbf8ps(dst, src1, src2) \
  __builtin_ia32_tdphbf8ps((dst), (src1), (src2))
#define _tile_dphf8ps(dst, src1, src2) \
  __builtin_ia32_tdphf8ps((dst), (src1), (src2))

#undef __DEFAULT_FN_ATTRS_FP8

#endif /* __x86_64__ */
#endif /* __AMXFP8INTRIN_H */
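All four FP8 variants share one shape contract and differ only in how the src1/src2 bytes are decoded; under Intel's naming, which this header appears to follow, BF8 is the E5M2 format and HF8 the E4M3 format (treat that mapping as an assumption). A hedged sketch mixing the two formats:

  #include <immintrin.h>

  // C (fp32) += A (BF8) * B (HF8); the "bh" infix encodes the format of
  // src1 then src2. Shapes are carried by the loaded tile configuration.
  void fp8_mixed_step(__tile1024i *c, __tile1024i a_bf8, __tile1024i b_hf8) {
    __tile_dpbhf8ps(c, a_bf8, b_hf8);
  }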
lib/include/amxintrin.h (vendored, 40 changes)

@@ -22,8 +22,6 @@
   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
 #define __DEFAULT_FN_ATTRS_BF16 \
   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
-#define __DEFAULT_FN_ATTRS_FP16 \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
 
 /// Load tile configuration from a 64-byte memory location specified by
 /// "mem_addr". The tile configuration includes the tile type palette, the
@@ -232,9 +230,11 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// bytes. Since there is no 2D type in llvm IR, we use vector type to
 /// represent 2D tile and the fixed size is maximum amx tile register size.
 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+typedef int _tile1024i_1024a
+    __attribute__((__vector_size__(1024), __aligned__(1024)));
 
 /// This is an internal intrinsic. C/C++ users should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
                      __SIZE_TYPE__ stride) {
   return __builtin_ia32_tileloadd64_internal(m, n, base,
@@ -242,7 +242,7 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
 }
 
 /// This is an internal intrinsic. C/C++ users should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
 _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
                        __SIZE_TYPE__ stride) {
   return __builtin_ia32_tileloaddt164_internal(m, n, base,
@@ -278,7 +278,7 @@ _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
 }
 
 /// This is an internal intrinsic. C/C++ users should avoid calling it directly.
-static __inline__ void __DEFAULT_FN_ATTRS_INT8
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
                       __SIZE_TYPE__ stride, _tile1024i tile) {
   return __builtin_ia32_tilestored64_internal(m, n, base,
@@ -292,13 +292,6 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
 }
 
-/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
-_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
-                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
-  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
-}
-
 /// This struct packs the shape and tile data together for the user. We suggest
 /// initializing the struct as early as possible, because the compiler depends
 /// on the shape information to do the configuration. The constant value is preferred
@@ -493,32 +486,9 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                                        src0.tile, src1.tile);
 }
 
-/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
-/// src1, accumulating the intermediate single-precision (32-bit) floating-point
-/// elements with elements in "dst", and store the 32-bit result back to tile
-/// "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
-///
-/// \param dst
-///    The destination tile. Max size is 1024 Bytes.
-/// \param src0
-///    The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-///    The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_FP16
-static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
-                                       __tile1024i src1) {
-  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
-                                      src0.tile, src1.tile);
-}
-
 #undef __DEFAULT_FN_ATTRS_TILE
 #undef __DEFAULT_FN_ATTRS_INT8
 #undef __DEFAULT_FN_ATTRS_BF16
-#undef __DEFAULT_FN_ATTRS_FP16
 
 #endif /* __x86_64__ */
 #endif /* __AMXINTRIN_H */
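Two independent changes here: the FP16 helpers move to amxfp16transposeintrin.h's sibling header (see the amxfp16intrin.h hunks above), and the plain load/store internals are retargeted from __DEFAULT_FN_ATTRS_INT8 to __DEFAULT_FN_ATTRS_TILE, so they no longer drag in amx-int8 for kernels that only need amx-tile. For context, a hedged end-to-end sketch of the C API this header builds up; the config bytes follow the documented tileconfig layout (byte 0 palette, bytes 16+2*i colsb, bytes 48+i rows), and an OS that permits AMX state (e.g. Linux arch_prctl XTILEDATA) is assumed:

  #include <immintrin.h>
  #include <string.h>

  // 16x16 int32 accumulator tile = A (16x64 int8) * B (16x64 int8).
  // All three tiles use 64-byte rows; strides below match that.
  static void int8_matmul_16x16(const void *a, const void *b, void *c) {
    unsigned char cfg[64];
    memset(cfg, 0, sizeof(cfg));
    cfg[0] = 1;                  // palette 1
    cfg[16] = 64; cfg[17] = 0;   // tile 0: 64 bytes per row
    cfg[18] = 64; cfg[19] = 0;   // tile 1
    cfg[20] = 64; cfg[21] = 0;   // tile 2
    cfg[48] = 16;                // tile 0: 16 rows
    cfg[49] = 16;                // tile 1
    cfg[50] = 16;                // tile 2
    _tile_loadconfig(cfg);
    _tile_zero(0);               // C accumulator
    _tile_loadd(1, a, 64);       // A
    _tile_loadd(2, b, 64);       // B
    _tile_dpbssd(0, 1, 2);       // C += A(int8) * B(int8), signed x signed
    _tile_stored(0, c, 64);
    _tile_release();
  }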
48
lib/include/amxmovrsintrin.h
vendored
Normal file
48
lib/include/amxmovrsintrin.h
vendored
Normal file
@ -0,0 +1,48 @@
/*===-------- amxmovrsintrin.h - AMX MOVRS intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-------------------------------------------------------------------=== */

#ifndef __IMMINTRIN_H
#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXMOVRSINTRIN_H
#define __AMXMOVRSINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_MOVRS \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-movrs")))

#define _tile_loaddrs(dst, base, stride) \
  __builtin_ia32_tileloaddrs64((dst), ((const void *)(base)), \
                               (__SIZE_TYPE__)(stride))
#define _tile_stream_loaddrs(dst, base, stride) \
  __builtin_ia32_tileloaddrst164((dst), ((const void *)(base)), \
                                 (__SIZE_TYPE__)(stride))
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
_tile_loaddrs_internal(unsigned short m, unsigned short n, const void *base,
                       __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloaddrs64_internal(m, n, base,
                                               (__SIZE_TYPE__)(stride));
}
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_MOVRS
_tile_loaddrst1_internal(unsigned short m, unsigned short n, const void *base,
                         __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloaddrst164_internal(m, n, base,
                                                 (__SIZE_TYPE__)(stride));
}
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS
__tile_loaddrs(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddrs_internal(dst->row, dst->col, base, stride);
}
static __inline__ void __DEFAULT_FN_ATTRS_MOVRS __tile_stream_loaddrs(
    __tile1024i *dst, const void *base, __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddrst1_internal(dst->row, dst->col, base, stride);
}
#undef __DEFAULT_FN_ATTRS_MOVRS
#endif /* __x86_64__ */
#endif /* __AMXMOVRSINTRIN_H */
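A minimal usage sketch of the MOVRS loads above, assuming an amx-movrs target and a tile configuration already derived from the shape fields; the function name is illustrative only:

#include <immintrin.h>

void copy_tile_movrs(void *dst, const void *src, __SIZE_TYPE__ stride) {
  __tile1024i t = {16, 64};        /* 16 rows x 64 bytes per row */
  __tile_loaddrs(&t, src, stride); /* load with the read-shared hint */
  __tile_stored(dst, stride, t);   /* plain AMX store from amxintrin.h */
}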
200
lib/include/amxmovrstransposeintrin.h
vendored
Normal file
@ -0,0 +1,200 @@
/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
#define __AMX_MOVRS_TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-transpose,amx-movrs")))

#define _tile_2rpntlvwz0rs(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
#define _tile_2rpntlvwz0rst1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
#define _tile_2rpntlvwz1rs(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
#define _tile_2rpntlvwz1rst1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  // Use __tile1024i_1024a* to escape the alignment check in
  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
  __builtin_ia32_t2rpntlvwz0rs_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz0rst1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1rs_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1rst1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
                                  const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                                &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely become
/// read shared in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS
static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
                                  const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                                &dst1->tile, base, stride);
}

#undef __DEFAULT_FN_ATTRS
#endif /* __x86_64__ */
#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */
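A minimal usage sketch of the tile-pair loads above, assuming an amx-transpose/amx-movrs target with tile shapes already configured; the function name is illustrative only:

#include <immintrin.h>

void load_tile_pair_movrs(const void *base, __SIZE_TYPE__ stride) {
  __tile1024i dst0 = {16, 64}; /* first tile of the pair */
  __tile1024i dst1 = {16, 64}; /* second tile of the pair */
  __tile_2rpntlvwz0rs(&dst0, &dst1, base, stride);
  /* dst0/dst1 now hold the VNNI-converted pair, loaded read-shared */
}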
108
lib/include/amxtf32intrin.h
vendored
Normal file
@ -0,0 +1,108 @@
/*===------------- amxtf32intrin.h - AMX_TF32 intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxtf32intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_TF32INTRIN_H
#define __AMX_TF32INTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TF32 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tf32")))

/// Do Matrix Multiplication of \a a and \a b, and then do Matrix Plus
/// with \a srcdst.
/// All the calculation is based on float32, but with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_mmultf32ps(constexpr int srcdst, constexpr int a, \
///                       constexpr int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
///
/// \param srcdst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
///    dword[12:0] := 0
///    dword[31:13] := x[31:13]
///    return dword
/// }
///
/// DEFINE silence_snan_fp32(x[31:0]) {
///    IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
///       x.fraction[22] := 1
///    return x
/// }
///
/// elements_a := a.colsb / 4
/// elements_dest := srcdst.colsb / 4
///
/// FOR m = 0 TO (srcdst.rows-1)
///    tmp[511:0] := 0
///    FOR k = 0 TO (elements_a-1)
///       FOR n = 0 TO (elements_dest-1)
///          af := silence_snan_fp32(a.row[m].fp32[k])
///          bf := silence_snan_fp32(b.row[k].fp32[n])
///          tmp.fp32[n] += zero_lower_mantissa_bits_fp32(af)
///                         * zero_lower_mantissa_bits_fp32(bf)
///       ENDFOR
///    ENDFOR
///
///    FOR n = 0 TO (elements_dest-1)
///       tmp.fp32[n] += srcdst.row[m].fp32[n]
///    ENDFOR
///    write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
///
/// ENDFOR
///
/// zero_upper_rows(srcdst, srcdst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_mmultf32ps(srcdst, a, b) \
  __builtin_ia32_tmmultf32ps((srcdst), (a), (b))

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32
_tile_mmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
                          _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tmmultf32ps_internal(m, n, k, dst, src1, src2);
}

/// Do Matrix Multiplication of src0 and src1, and then do Matrix Plus with dst.
/// All the calculation is based on float32, but with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TF32
static void __tile_mmultf32ps(__tile1024i *dst, __tile1024i src0,
                              __tile1024i src1) {
  dst->tile = _tile_mmultf32ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                        src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_TF32INTRIN_H
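The zero_lower_mantissa_bits_fp32() step above is just a mask on the IEEE-754 bit pattern; a host-side sketch for illustration, not part of the header:

#include <stdint.h>
#include <string.h>

static float zero_lower_mantissa_bits_fp32(float x) {
  uint32_t bits;
  memcpy(&bits, &x, sizeof(bits)); /* type-pun via memcpy to avoid UB */
  bits &= ~(uint32_t)0x1FFF;       /* clear mantissa bits [12:0] -> TF32 */
  memcpy(&x, &bits, sizeof(bits));
  return x;
}
/* e.g. 0x3F800001 (the float just above 1.0f) maps back to 0x3F800000 (1.0f) */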
105
lib/include/amxtf32transposeintrin.h
vendored
Normal file
@ -0,0 +1,105 @@
/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_TF32TRANSPOSEINTRIN_H
#define __AMX_TF32TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("amx-tf32,amx-transpose")))

/// \code
/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \
///                        constexpr int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
///
/// \param srcdst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
///    dword[12:0] := 0
///    dword[31:13] := x[31:13]
///    return dword
/// }
///
/// DEFINE silence_snan_fp32(x[31:0]) {
///    IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
///       x.fraction[22] := 1
///    return x
/// }
///
/// elements_dest := srcdst.colsb/4
///
/// FOR m := 0 TO (srcdst.rows-1)
///    tmp[511:0] := 0
///    FOR k := 0 TO (a.rows-1)
///       FOR n := 0 TO (elements_dest-1)
///          a1e := silence_snan_fp32(a.row[k].fp32[m])
///          a2e := silence_snan_fp32(b.row[k].fp32[n])
///          s1e := zero_lower_mantissa_bits_fp32(a1e)
///          s2e := zero_lower_mantissa_bits_fp32(a2e)
///          tmp.fp32[n] += s1e * s2e
///       ENDFOR
///    ENDFOR
///
///    FOR n := 0 TO (elements_dest-1)
///       tmp.fp32[n] += srcdst.row[m].fp32[n]
///    ENDFOR
///    write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
///
/// ENDFOR
///
/// zero_upper_rows(srcdst, srcdst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_tmmultf32ps(srcdst, a, b) \
  __builtin_ia32_ttmmultf32ps((srcdst), (a), (b))

// dst = m x n (srcdest), src1 = k x m, src2 = k x n
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE
_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2);
}

/// Compute the transpose and do Matrix Multiplication of src0 and src1, and
/// then do Matrix Plus with dst. All the calculation is based on float32, but
/// with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TF32_TRANSPOSE
static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0,
                               __tile1024i src1) {
  dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col,
                                         dst->tile, src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_TF32TRANSPOSEINTRIN_H
248
lib/include/amxtransposeintrin.h
vendored
Normal file
@ -0,0 +1,248 @@
/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * ===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_TRANSPOSEINTRIN_H
#define __AMX_TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TRANSPOSE \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))

#define _tile_2rpntlvwz0(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
#define _tile_2rpntlvwz0t1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
#define _tile_2rpntlvwz1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
#define _tile_2rpntlvwz1t1(tdst, base, stride) \
  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)

/// Transpose 32-bit elements from \a src and write the result to \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_transposed(__tile dst, __tile src);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
///
/// FOR i := 0 TO (dst.rows-1)
///    tmp[511:0] := 0
///    FOR j := 0 TO (dst.colsb/4-1)
///       tmp.dword[j] := src.row[j].dword[i]
///    ENDFOR
///    dst.row[i] := tmp
/// ENDFOR
///
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  // Use __tile1024i_1024a* to escape the alignment check in
  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
                                      (_tile1024i_1024a *)dst1, base,
                                      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz0t1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
                                      (_tile1024i_1024a *)dst1, base,
                                      (__SIZE_TYPE__)(stride));
}

static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
    unsigned short row, unsigned short col0, unsigned short col1,
    _tile1024i *dst0, _tile1024i *dst1, const void *base,
    __SIZE_TYPE__ stride) {
  __builtin_ia32_t2rpntlvwz1t1_internal(
      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
      (__SIZE_TYPE__)(stride));
}

// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
  return __builtin_ia32_ttransposed_internal(m, n, src);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
                              const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                            &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
                              const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                            &dst1->tile, base, stride);
}

/// Converts a pair of tiles from memory into VNNI format, and places the
/// results in a pair of destinations specified by dst. The pair of tiles
/// in memory is specified via a tsib; the second tile is after the first
/// one, separated by the same stride that separates each row.
/// The tile configuration for the destination tiles indicates the amount
/// of data to read from memory. The instruction will load a number of rows
/// that is equal to twice the number of rows in tmm1. The size of each row
/// is equal to the average width of the destination tiles. If the second
/// tile is configured with zero rows and columns, only the first tile will
/// be written. The last row will not be read from memory but instead
/// filled with zeros.
/// Provides a hint to the implementation that the data will likely not be
/// reused in the near future and the data caching can be optimized.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
///
/// \param dst0
///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param dst1
///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
                                const void *base, __SIZE_TYPE__ stride) {
  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
                              &dst1->tile, base, stride);
}

/// Transpose 32-bit elements from src and write the result to dst.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src
///    The source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TRANSPOSE
static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
  dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
}

#endif /* __x86_64__ */
#endif /* __AMX_TRANSPOSEINTRIN_H */
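A minimal usage sketch of __tile_transposed() above, assuming an amx-transpose target with the tile configuration already loaded; the function name is illustrative only:

#include <immintrin.h>

void transpose_tile(const void *src_mem, __SIZE_TYPE__ stride) {
  __tile1024i src = {16, 64}; /* 16 rows x 64 bytes (16 dwords) per row */
  __tile1024i dst = {16, 64};
  __tile_loadd(&src, src_mem, stride);
  __tile_transposed(&dst, src); /* dst.row[i].dword[j] = src.row[j].dword[i] */
}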
39
lib/include/arm_acle.h
vendored
@ -264,28 +264,28 @@ __rbitl(unsigned long __t) {
}

/* 8.3 16-bit multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__, target("dsp")))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
@ -304,46 +304,46 @@ __smulwt(int32_t __a, int32_t __b) {
#endif

/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif

/* 8.4.3 Accumulating multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("dsp")))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
@ -621,8 +621,6 @@ __rintnf(float __a) {
#endif

/* 8.8 CRC32 intrinsics */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
@ -662,7 +660,6 @@ static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif

/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
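For illustration (not part of the diff), the saturating intrinsics above clamp instead of wrapping; a minimal sketch assuming a 32-bit Arm target built with DSP support:

#include <arm_acle.h>
#include <stdint.h>

int32_t saturating_sum(int32_t a, int32_t b) {
  return __qadd(a, b); /* e.g. __qadd(INT32_MAX, 1) == INT32_MAX */
}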
5415
lib/include/arm_neon.h
vendored
File diff suppressed because it is too large
416
lib/include/arm_sme.h
vendored
@ -35,12 +35,6 @@ __ai bool __arm_has_sme(void) __arm_streaming_compatible {
  return x0 & (1ULL << 63);
}

__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible {
  uint64_t x0, x1;
  __builtin_arm_get_sme_state(&x0, &x1);
  return x0 & 1;
}

void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;
void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;
void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;
@ -48,6 +42,8 @@ void *__arm_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible;

__ai __attribute__((target("sme"))) void svundef_za(void) __arm_streaming_compatible __arm_out("za") { }

__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme___arm_in_streaming_mode)))
bool __arm_in_streaming_mode(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m)))
void svaddha_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m)))
@ -604,6 +600,94 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_
void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m)))
void svwrite_ver_za8_m(uint64_t, uint32_t, svbool_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
void svadd_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
void svadd_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
void svsub_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
void svsub_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
void svadd_za16_vg1x2(uint32_t, svfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
void svadd_za16_vg1x4(uint32_t, svfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
void svsub_za16_vg1x2(uint32_t, svfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
void svsub_za16_vg1x4(uint32_t, svfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
void svadd_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
void svadd_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
void svmla_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
void svmla_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
void svmla_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
void svmla_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
void svmla_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
void svmla_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
void svmls_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
void svmls_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
void svmls_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
void svmls_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
void svmls_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
void svmls_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
void svmopa_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
void svmops_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
void svsub_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
void svsub_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
void svadd_za16_vg1x2(uint32_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
void svadd_za16_vg1x4(uint32_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
void svmla_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
void svmla_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
void svmls_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
void svmls_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
void svmops_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
void svsub_za16_vg1x2(uint32_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
void svsub_za16_vg1x4(uint32_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x2)))
void svmla_single_za16_f16_vg1x2(uint32_t, svfloat16x2_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_f16_vg1x4)))
@ -660,22 +744,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_f16_m))
void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_f16_m)))
void svmops_za16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
void svadd_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
void svadd_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
void svsub_za16_f16_vg1x2(uint32_t, svfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
void svsub_za16_f16_vg1x4(uint32_t, svfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x2)))
void svadd_za16_vg1x2(uint32_t, svfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_f16_vg1x4)))
void svadd_za16_vg1x4(uint32_t, svfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x2)))
void svsub_za16_vg1x2(uint32_t, svfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_f16_vg1x4)))
void svsub_za16_vg1x4(uint32_t, svfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m)))
void svmopa_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m)))
@ -684,6 +752,138 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m))
void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m)))
void svmops_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x2_fpm)))
void svdot_single_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x4_fpm)))
void svdot_single_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x2_fpm)))
void svdot_lane_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x4_fpm)))
void svdot_lane_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x2_fpm)))
void svdot_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x4_fpm)))
void svdot_za16_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x1_fpm)))
void svmla_single_za16_mf8_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x2_fpm)))
void svmla_single_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x4_fpm)))
void svmla_single_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x1_fpm)))
void svmla_lane_za16_mf8_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x2_fpm)))
void svmla_lane_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x4_fpm)))
void svmla_lane_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x2_fpm)))
void svmla_za16_mf8_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x4_fpm)))
void svmla_za16_mf8_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_mf8_m_fpm)))
void svmopa_za16_mf8_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za16_mf8_vg1x2_fpm)))
void svvdot_lane_za16_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x2_fpm)))
void svdot_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za16_mf8_vg1x4_fpm)))
void svdot_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x2_fpm)))
void svdot_lane_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za16_mf8_vg1x4_fpm)))
void svdot_lane_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x2_fpm)))
void svdot_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za16_mf8_vg1x4_fpm)))
void svdot_za16_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x1_fpm)))
void svmla_za16_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x2_fpm)))
void svmla_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_mf8_vg2x4_fpm)))
void svmla_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x1_fpm)))
void svmla_lane_za16_vg2x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x2_fpm)))
void svmla_lane_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_mf8_vg2x4_fpm)))
void svmla_lane_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x2_fpm)))
void svmla_za16_vg2x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_mf8_vg2x4_fpm)))
void svmla_za16_vg2x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_mf8_m_fpm)))
void svmopa_za16_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdot_lane_za16_mf8_vg1x2_fpm)))
void svvdot_lane_za16_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x2_fpm)))
void svdot_single_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x4_fpm)))
void svdot_single_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x2_fpm)))
void svdot_lane_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x4_fpm)))
void svdot_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x2_fpm)))
void svdot_za32_mf8_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x4_fpm)))
void svdot_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x1_fpm)))
void svmla_single_za32_mf8_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x2_fpm)))
void svmla_single_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x4_fpm)))
void svmla_single_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x1_fpm)))
void svmla_lane_za32_mf8_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x2_fpm)))
void svmla_lane_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x4_fpm)))
void svmla_lane_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x2_fpm)))
void svmla_za32_mf8_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x4_fpm)))
void svmla_za32_mf8_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_mf8_m_fpm)))
void svmopa_za32_mf8_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdotb_lane_za32_mf8_vg1x4_fpm)))
void svvdotb_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdott_lane_za32_mf8_vg1x4_fpm)))
void svvdott_lane_za32_mf8_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x2_fpm)))
void svdot_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_single_za32_mf8_vg1x4_fpm)))
void svdot_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x2_fpm)))
void svdot_lane_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_lane_za32_mf8_vg1x4_fpm)))
void svdot_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x2_fpm)))
void svdot_za32_vg1x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svdot_za32_mf8_vg1x4_fpm)))
void svdot_za32_vg1x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x1_fpm)))
void svmla_za32_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x2_fpm)))
|
||||
void svmla_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za32_mf8_vg4x4_fpm)))
|
||||
void svmla_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x1_fpm)))
|
||||
void svmla_lane_za32_vg4x1_fpm(uint32_t, svmfloat8_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x2_fpm)))
|
||||
void svmla_lane_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za32_mf8_vg4x4_fpm)))
|
||||
void svmla_lane_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x2_fpm)))
|
||||
void svmla_za32_vg4x2_fpm(uint32_t, svmfloat8x2_t, svmfloat8x2_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za32_mf8_vg4x4_fpm)))
|
||||
void svmla_za32_vg4x4_fpm(uint32_t, svmfloat8x4_t, svmfloat8x4_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_mf8_m_fpm)))
|
||||
void svmopa_za32_m_fpm(uint64_t, svbool_t, svbool_t, svmfloat8_t, svmfloat8_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdotb_lane_za32_mf8_vg1x4_fpm)))
|
||||
void svvdotb_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svvdott_lane_za32_mf8_vg1x4_fpm)))
|
||||
void svvdott_lane_za32_vg1x4_fpm(uint32_t, svmfloat8x2_t, svmfloat8_t, uint64_t, fpm_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m)))
|
||||
void svaddha_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m)))
|
||||
@ -732,6 +932,106 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m
|
||||
void svusmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m)))
|
||||
void svusmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_zt_u8_x4)))
|
||||
svuint8x4_t svluti4_zt_u8_x4(uint64_t, svuint8x2_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svluti4_zt_s8_x4)))
|
||||
svint8x4_t svluti4_zt_s8_x4(uint64_t, svuint8x2_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u8)))
|
||||
void svwrite_lane_zt_u8(uint64_t, svuint8_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u32)))
|
||||
void svwrite_lane_zt_u32(uint64_t, svuint32_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u64)))
|
||||
void svwrite_lane_zt_u64(uint64_t, svuint64_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u16)))
|
||||
void svwrite_lane_zt_u16(uint64_t, svuint16_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_bf16)))
|
||||
void svwrite_lane_zt_bf16(uint64_t, svbfloat16_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s8)))
|
||||
void svwrite_lane_zt_s8(uint64_t, svint8_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f64)))
|
||||
void svwrite_lane_zt_f64(uint64_t, svfloat64_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f32)))
|
||||
void svwrite_lane_zt_f32(uint64_t, svfloat32_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f16)))
|
||||
void svwrite_lane_zt_f16(uint64_t, svfloat16_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s32)))
|
||||
void svwrite_lane_zt_s32(uint64_t, svint32_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s64)))
|
||||
void svwrite_lane_zt_s64(uint64_t, svint64_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s16)))
|
||||
void svwrite_lane_zt_s16(uint64_t, svint16_t, uint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u8)))
|
||||
void svwrite_zt_u8(uint64_t, svuint8_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u32)))
|
||||
void svwrite_zt_u32(uint64_t, svuint32_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u64)))
|
||||
void svwrite_zt_u64(uint64_t, svuint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u16)))
|
||||
void svwrite_zt_u16(uint64_t, svuint16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_bf16)))
|
||||
void svwrite_zt_bf16(uint64_t, svbfloat16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s8)))
|
||||
void svwrite_zt_s8(uint64_t, svint8_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f64)))
|
||||
void svwrite_zt_f64(uint64_t, svfloat64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f32)))
|
||||
void svwrite_zt_f32(uint64_t, svfloat32_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f16)))
|
||||
void svwrite_zt_f16(uint64_t, svfloat16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s32)))
|
||||
void svwrite_zt_s32(uint64_t, svint32_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s64)))
|
||||
void svwrite_zt_s64(uint64_t, svint64_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s16)))
|
||||
void svwrite_zt_s16(uint64_t, svint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u8)))
|
||||
void svwrite_lane_zt(uint64_t, svuint8_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u32)))
|
||||
void svwrite_lane_zt(uint64_t, svuint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u64)))
|
||||
void svwrite_lane_zt(uint64_t, svuint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_u16)))
|
||||
void svwrite_lane_zt(uint64_t, svuint16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_bf16)))
|
||||
void svwrite_lane_zt(uint64_t, svbfloat16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s8)))
|
||||
void svwrite_lane_zt(uint64_t, svint8_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f64)))
|
||||
void svwrite_lane_zt(uint64_t, svfloat64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f32)))
|
||||
void svwrite_lane_zt(uint64_t, svfloat32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_f16)))
|
||||
void svwrite_lane_zt(uint64_t, svfloat16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s32)))
|
||||
void svwrite_lane_zt(uint64_t, svint32_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s64)))
|
||||
void svwrite_lane_zt(uint64_t, svint64_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_lane_zt_s16)))
|
||||
void svwrite_lane_zt(uint64_t, svint16_t, uint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u8)))
|
||||
void svwrite_zt(uint64_t, svuint8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u32)))
|
||||
void svwrite_zt(uint64_t, svuint32_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u64)))
|
||||
void svwrite_zt(uint64_t, svuint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_u16)))
|
||||
void svwrite_zt(uint64_t, svuint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_bf16)))
|
||||
void svwrite_zt(uint64_t, svbfloat16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s8)))
|
||||
void svwrite_zt(uint64_t, svint8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f64)))
|
||||
void svwrite_zt(uint64_t, svfloat64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f32)))
|
||||
void svwrite_zt(uint64_t, svfloat32_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_f16)))
|
||||
void svwrite_zt(uint64_t, svfloat16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s32)))
|
||||
void svwrite_zt(uint64_t, svint32_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s64)))
|
||||
void svwrite_zt(uint64_t, svint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_zt_s16)))
|
||||
void svwrite_zt(uint64_t, svint16_t);
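All of the `svwrite_zt`/`svwrite_lane_zt` overloads above alias the same ZT0 builtins; only the element type differs. A rough usage sketch, assuming an SME2.1-capable toolchain and the ACLE `__arm_inout("zt0")` state attribute (an assumption on my part, not shown in this diff; the first argument is the ZT register number, which must be the constant 0):

#include <arm_sme.h>

/* Hypothetical helper: write `bytes` into ZT0, then overwrite one indexed
 * portion of it with `patch` via the lane variant. */
void zt0_seed(svuint8_t bytes, svuint8_t patch)
    __arm_streaming __arm_inout("zt0") {
  svwrite_zt(0, bytes);         /* overload resolves to svwrite_zt_u8 */
  svwrite_lane_zt(0, patch, 1); /* resolves to svwrite_lane_zt_u8 */
}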
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_u32_vg1x2)))
void svadd_write_single_za32_u32_vg1x2(uint32_t, svuint32x2_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_write_single_za32_s32_vg1x2)))
@ -2138,78 +2438,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_u8_vg1x
void svwrite_za8_vg1x4(uint32_t, svuint8x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_za8_s8_vg1x4)))
void svwrite_za8_vg1x4(uint32_t, svint8x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
void svadd_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
void svadd_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
void svmla_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
void svmla_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
void svmla_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
void svmla_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
void svmla_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
void svmla_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
void svmls_single_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
void svmls_single_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
void svmls_lane_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
void svmls_lane_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
void svmls_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
void svmls_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
void svmopa_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
void svmops_za16_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
void svsub_za16_bf16_vg1x2(uint32_t, svbfloat16x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
void svsub_za16_bf16_vg1x4(uint32_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x2)))
void svadd_za16_vg1x2(uint32_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za16_bf16_vg1x4)))
void svadd_za16_vg1x4(uint32_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x2)))
void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_single_za16_bf16_vg1x4)))
void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x2)))
void svmla_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_lane_za16_bf16_vg1x4)))
void svmla_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x2)))
void svmla_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmla_za16_bf16_vg1x4)))
void svmla_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x2)))
void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_single_za16_bf16_vg1x4)))
void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x2)))
void svmls_lane_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_lane_za16_bf16_vg1x4)))
void svmls_lane_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x2)))
void svmls_za16_vg1x2(uint32_t, svbfloat16x2_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmls_za16_bf16_vg1x4)))
void svmls_za16_vg1x4(uint32_t, svbfloat16x4_t, svbfloat16x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za16_bf16_m)))
void svmopa_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za16_bf16_m)))
void svmops_za16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x2)))
void svsub_za16_vg1x2(uint32_t, svbfloat16x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsub_za16_bf16_vg1x4)))
void svsub_za16_vg1x4(uint32_t, svbfloat16x4_t);
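As with the FP8 group, every `_bf16_`-mangled function gains an `__aio` overload without the type suffix, so the same spelling works across element types. A sketch of the overload in use, assuming a compiler with the SME B16B16 extension and the usual ZA keyword attributes (both assumptions on my part):

#include <arm_sme.h>

/* Hypothetical: multiply-accumulate two bf16 vector pairs into ZA. The
 * unsuffixed overload dispatches to svmla_za16_bf16_vg1x2 by argument type. */
void bf16_mla_rows(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm)
    __arm_streaming __arm_inout("za") {
  svmla_za16_vg1x2(slice, zn, zm);
}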
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x2)))
void svadd_za64_f64_vg1x2(uint32_t, svfloat64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svadd_za64_f64_vg1x4)))
1189
lib/include/arm_sve.h
vendored
File diff suppressed because it is too large
87
lib/include/arm_vector_types.h
vendored
@ -17,9 +17,62 @@
typedef float float32_t;
typedef __fp16 float16_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __mfp8 mfloat8_t;
typedef double float64_t;
#endif

typedef uint64_t fpm_t;

enum __ARM_FPM_FORMAT { __ARM_FPM_E5M2, __ARM_FPM_E4M3 };

enum __ARM_FPM_OVERFLOW { __ARM_FPM_INFNAN, __ARM_FPM_SATURATE };

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_fpm_init(void) {
  return 0;
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_src1_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~7ull) | (fpm_t)__format;
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_src2_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~0x38ull) | ((fpm_t)__format << 3u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_dst_format(fpm_t __fpm, enum __ARM_FPM_FORMAT __format) {
  return (__fpm & ~0x1c0ull) | ((fpm_t)__format << 6u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_overflow_mul(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) {
  return (__fpm & ~0x4000ull) | ((fpm_t)__behaviour << 14u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_overflow_cvt(fpm_t __fpm, enum __ARM_FPM_OVERFLOW __behaviour) {
  return (__fpm & ~0x8000ull) | ((fpm_t)__behaviour << 15u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_lscale(fpm_t __fpm, uint64_t __scale) {
  return (__fpm & ~0x7f0000ull) | (__scale << 16u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_nscale(fpm_t __fpm, int64_t __scale) {
  return (__fpm & ~0xff000000ull) | (((fpm_t)__scale & 0xffu) << 24u);
}

static __inline__ fpm_t __attribute__((__always_inline__, __nodebug__))
__arm_set_fpm_lscale2(fpm_t __fpm, uint64_t __scale) {
  return (uint32_t)__fpm | (__scale << 32u);
}
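These helpers build the 64-bit FPMR image field by field: each one clears its field with a mask, then ORs in the new value at the field's shift. A caller might compose a mode word like this (a minimal sketch; the helpers and enum values are exactly the ones defined above, and including arm_neon.h to reach them is an assumption, since it pulls in arm_vector_types.h):

#include <arm_neon.h>

/* Build an fpm_t: E4M3 inputs, E5M2 results, saturating conversions. */
static inline fpm_t make_fp8_mode(void) {
  fpm_t fpm = __arm_fpm_init();                             /* all fields 0 */
  fpm = __arm_set_fpm_src1_format(fpm, __ARM_FPM_E4M3);     /* bits 0-2 */
  fpm = __arm_set_fpm_src2_format(fpm, __ARM_FPM_E4M3);     /* bits 3-5 */
  fpm = __arm_set_fpm_dst_format(fpm, __ARM_FPM_E5M2);      /* bits 6-8 */
  fpm = __arm_set_fpm_overflow_cvt(fpm, __ARM_FPM_SATURATE); /* bit 15 */
  return fpm;
}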
typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
@ -36,6 +89,10 @@ typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t;
typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t;
#endif
typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
@ -109,6 +166,16 @@ typedef struct uint64x2x2_t {
  uint64x2_t val[2];
} uint64x2x2_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x2_t {
  mfloat8x8_t val[2];
} mfloat8x8x2_t;

typedef struct mfloat8x16x2_t {
  mfloat8x16_t val[2];
} mfloat8x16x2_t;

#endif
typedef struct float16x4x2_t {
  float16x4_t val[2];
} float16x4x2_t;
@ -199,6 +266,16 @@ typedef struct uint64x2x3_t {
  uint64x2_t val[3];
} uint64x2x3_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x3_t {
  mfloat8x8_t val[3];
} mfloat8x8x3_t;

typedef struct mfloat8x16x3_t {
  mfloat8x16_t val[3];
} mfloat8x16x3_t;

#endif
typedef struct float16x4x3_t {
  float16x4_t val[3];
} float16x4x3_t;
@ -289,6 +366,16 @@ typedef struct uint64x2x4_t {
  uint64x2_t val[4];
} uint64x2x4_t;

#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct mfloat8x8x4_t {
  mfloat8x8_t val[4];
} mfloat8x8x4_t;

typedef struct mfloat8x16x4_t {
  mfloat8x16_t val[4];
} mfloat8x16x4_t;

#endif
typedef struct float16x4x4_t {
  float16x4_t val[4];
} float16x4x4_t;
561
lib/include/avx10_2_512bf16intrin.h
vendored
Normal file
@ -0,0 +1,561 @@
/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2_512BF16INTRIN_H
#define __AVX10_2_512BF16INTRIN_H

/* An unaligned 512-bit bf16 vector type for the unaligned load/store forms. */
typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
  return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
  return (__m512bh)__builtin_ia32_undef512();
}

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) {
  return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
}

static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh(
    __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
    __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
    __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
    __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
    __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
    __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
  return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
                             bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
                             bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
                             bf8, bf7, bf6, bf5, bf4, bf3, bf2, bf1};
}

#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \
                        bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19, \
                        bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28, \
                        bf29, bf30, bf31, bf32) \
  _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26), \
                 (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19), \
                 (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12), \
                 (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), \
                 (bf3), (bf2), (bf1))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castbf16_ps(__m512bh __a) {
  return (__m512)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castbf16_pd(__m512bh __a) {
  return (__m512d)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castbf16_si512(__m512bh __a) {
  return (__m512i)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castpd_pbh(__m512d __a) {
  return (__m512bh)__a;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castsi512_pbh(__m512i __a) {
  return (__m512bh)__a;
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS512
_mm512_castbf16512_pbh128(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_castbf16512_pbh256(__m512bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_castbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_zextbf16128_pbh512(__m128bh __a) {
  return __builtin_shufflevector(
      __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_zextbf16256_pbh512(__m256bh __a) {
  return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) {
  return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF),
                                    (__m512i)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_load_pbh(void const *__p) {
  return *(const __m512bh *)__p;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_loadu_pbh(void const *__p) {
  struct __loadu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pbh *)__p)->__v;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
                                                              __m512bh __A) {
  *(__m512bh *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
                                                               __m512bh __A) {
  struct __storeu_pbh {
    __m512bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pbh *)__P)->__v = __A;
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
                                                (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_add_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A + (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_add_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_add_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}
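With the load/store helpers and the masked add above, the typical pattern is load, arithmetic, masked select, store. A small sketch of user code (illustrative; per the guard at the top of the file, immintrin.h is the only supported entry point):

#include <immintrin.h>

/* out[i] += a[i] for lanes selected by m; other lanes keep their value.
 * Unaligned pointers are fine: the *_loadu/_storeu forms go through the
 * packed __m512bh_u type defined above. */
void bf16_masked_acc(__bf16 *out, const __bf16 *a, __mmask32 m) {
  __m512bh acc = _mm512_loadu_pbh(out);
  __m512bh va = _mm512_loadu_pbh(a);
  _mm512_storeu_pbh(out, _mm512_mask_add_pbh(acc, m, acc, va));
}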

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sub_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A - (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sub_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mul_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A * (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_mul_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_div_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)((__v32bf)__A / (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_div_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_div_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_max_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)__builtin_ia32_vmaxbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_min_pbh(__m512bh __A,
                                                                __m512bh __B) {
  return (__m512bh)__builtin_ia32_vminbf16512((__v32bf)__A, (__v32bf)__B);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B),
      (__v32bf)_mm512_setzero_pbh());
}

#define _mm512_cmp_pbh_mask(__A, __B, __P) \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A), \
                                              (__v32bf)(__m512bh)(__B), \
                                              (int)(__P), (__mmask32)-1))

#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P) \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask((__v32bf)(__m512bh)(__A), \
                                              (__v32bf)(__m512bh)(__B), \
                                              (int)(__P), (__mmask32)(__U)))
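The compare macros return a plain __mmask32, so they compose directly with the select/blend intrinsics defined earlier. For example, an elementwise minimum can be built from compare plus blend (purely illustrative; _mm512_min_pbh above does this in one instruction, and _CMP_LT_OQ is the standard AVX predicate constant):

#include <immintrin.h>

/* x < y ? x : y, per lane. _mm512_mask_blend_pbh takes its third operand
 * where the mask bit is set and its second where it is clear. */
static inline __m512bh bf16_min_via_cmp(__m512bh x, __m512bh y) {
  __mmask32 lt = _mm512_cmp_pbh_mask(x, y, _CMP_LT_OQ);
  return _mm512_mask_blend_pbh(lt, y, x);
}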

#define _mm512_mask_fpclass_pbh_mask(__U, __A, imm) \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)(__U)))

#define _mm512_fpclass_pbh_mask(__A, imm) \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__mmask32)-1))

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_scalef_pbh(__m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(),
      (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh(
    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(),
      (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                   (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_getexp_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)__W, (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_rsqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask((__v32bf)__A, (__v32bf)__W,
                                                     (__mmask32)__U);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
}

#define _mm512_reduce_pbh(__A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_undefined_pbh(), \
      (__mmask32)-1))

#define _mm512_mask_reduce_pbh(__W, __U, __A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \
      (__mmask32)(__U)))

#define _mm512_maskz_reduce_pbh(__U, __A, imm) \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32)(__U)))

#define _mm512_roundscale_pbh(__A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32)-1))

#define _mm512_mask_roundscale_pbh(__W, __U, __A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)(__m512bh)(__W), \
      (__mmask32)(__U)))

#define _mm512_maskz_roundscale_pbh(__U, __A, imm) \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask( \
      (__v32bf)(__m512bh)(__A), (int)(imm), (__v32bf)_mm512_setzero_pbh(), \
      (__mmask32)(__U)))

#define _mm512_getmant_pbh(__A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1))

#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)(__m512bh)(__W), (__mmask32)(__U)))

#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C) \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask( \
      (__v32bf)(__m512bh)(__A), (int)(((__C) << 2) | (__B)), \
      (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
                                                (__v32bf)_mm512_sqrt_pbh(__A),
                                                (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B,
                                                 (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B,
                                                 -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B,
                                                 (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B,
                                                 -(__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh(
    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsub_pbh(
    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)__C);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
  return (__m512bh)__builtin_ia32_selectpbf_512(
      (__mmask32)__U,
      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
      (__v32bf)_mm512_setzero_pbh());
}
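All four FMA flavours reuse the same vfmaddnepbh512 builtin, differing only in which operands are negated, and the mask/mask3/maskz wrappers select against the first accumulator operand, the third, or zero. A quick sketch of the zero-masking form (illustrative user code):

#include <immintrin.h>

/* r[i] = a[i] * b[i] + c[i] where m selects the lane, else 0.0 in bf16. */
static inline __m512bh bf16_fma_z(__mmask32 m, __m512bh a, __m512bh b,
                                  __m512bh c) {
  return _mm512_maskz_fmadd_pbh(m, a, b, c);
}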

#undef __DEFAULT_FN_ATTRS512

#endif
#endif
320
lib/include/avx10_2_512convertintrin.h
vendored
Normal file
@ -0,0 +1,320 @@
/*===--------- avx10_2_512convertintrin.h - AVX10_2_512CONVERT -------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512convertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifdef __SSE2__

#ifndef __AVX10_2_512CONVERTINTRIN_H
#define __AVX10_2_512CONVERTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtx2ps_ph(__m512 __A,
                                                                  __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)(-1),
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtx2ps_ph(__m512h __W, __mmask32 __U, __m512 __A, __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)__W, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtx2ps_ph(__mmask32 __U, __m512 __A, __m512 __B) {
  return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
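_mm512_cvtx2ps_ph narrows two 16-element single-precision vectors into one 32-element half-precision vector using the current rounding mode; the _round macros below take an explicit rounding argument instead. A minimal sketch (the lane ordering of the two inputs in the result is left to the builtin):

#include <immintrin.h>

/* Pack 32 floats (two __m512) into one __m512h of FP16 values. */
static inline __m512h pack_ps_to_ph(__m512 a, __m512 b) {
  return _mm512_cvtx2ps_ph(a, b);
}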

#define _mm512_cvtx_round2ps_ph(A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \
      (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_undefined_ph(), \
      (__mmask32)(-1), (const int)(R)))

#define _mm512_mask_cvtx_round2ps_ph(W, U, A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask((__v16sf)(A), (__v16sf)(B), \
                                               (__v32hf)(W), (__mmask32)(U), \
                                               (const int)(R)))

#define _mm512_maskz_cvtx_round2ps_ph(U, A, B, R) \
  ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \
      (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_setzero_ph(), \
      (__mmask32)(U), (const int)(R)))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiasph_bf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_bf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiasph_bf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiassph_bf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_bf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiassph_bf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiasph_hf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_hf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiasph_hf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtbiassph_hf8(__m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
      (__mmask32)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_hf8(
    __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtbiassph_hf8(__mmask32 __U, __m512i __A, __m512h __B) {
  return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
      (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
      (__mmask32)__U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_bf8(__m512h __A,
                                                                  __m512h __B) {
  return (__m512i)__builtin_ia32_vcvt2ph2bf8_512((__v32hf)(__A),
                                                 (__v32hf)(__B));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvt2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvt2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvt2ph_bf8(__A, __B),
      (__v64qi)(__m512i)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvts2ph_bf8(__m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_vcvt2ph2bf8s_512((__v32hf)(__A),
                                                  (__v32hf)(__B));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvts2ph_bf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvts2ph_bf8(__A, __B), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvts2ph_bf8(__mmask64 __U, __m512h __A, __m512h __B) {
  return (__m512i)__builtin_ia32_selectb_512(
      (__mmask64)__U, (__v64qi)_mm512_cvts2ph_bf8(__A, __B),
      (__v64qi)(__m512i)_mm512_setzero_si512());
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvt2ph_hf8(__m512h __A,
|
||||
__m512h __B) {
|
||||
return (__m512i)__builtin_ia32_vcvt2ph2hf8_512((__v32hf)(__A),
|
||||
(__v32hf)(__B));
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvt2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_selectb_512(
|
||||
(__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B), (__v64qi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvt2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_selectb_512(
|
||||
(__mmask64)__U, (__v64qi)_mm512_cvt2ph_hf8(__A, __B),
|
||||
(__v64qi)(__m512i)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_cvts2ph_hf8(__m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_vcvt2ph2hf8s_512((__v32hf)(__A),
|
||||
(__v32hf)(__B));
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvts2ph_hf8(__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_selectb_512(
|
||||
(__mmask64)__U, (__v64qi)_mm512_cvts2ph_hf8(__A, __B), (__v64qi)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) {
|
||||
return (__m512i)__builtin_ia32_selectb_512(
|
||||
(__mmask64)__U, (__v64qi)_mm512_cvts2ph_hf8(__A, __B),
|
||||
(__v64qi)(__m512i)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8(__m256i __A) {
|
||||
return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
|
||||
(__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m512h __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvthf8(__m512h __W, __mmask32 __U, __m256i __A) {
|
||||
return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
|
||||
(__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m512h __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvthf8(__mmask32 __U, __m256i __A) {
|
||||
return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask(
|
||||
(__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_bf8(__m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtph_bf8(__m256i __W, __mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtph_bf8(__mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsph_bf8(__m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtsph_bf8(__m256i __W, __mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtsph_bf8(__mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2bf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtph_hf8(__m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtph_hf8(__m256i __W, __mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtph_hf8(__mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsph_hf8(__m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtsph_hf8(__m256i __W, __mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtsph_hf8(__mmask32 __U, __m512h __A) {
|
||||
return (__m256i)__builtin_ia32_vcvtph2hf8s_512_mask(
|
||||
(__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
|
||||
}
|
||||
|
||||
static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtbf8_ph(__m256i __A) {
|
||||
return _mm512_castsi512_ph(_mm512_slli_epi16(_mm512_cvtepi8_epi16(__A), 8));
|
||||
}
|
||||
|
||||
static __inline __m512h __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_cvtbf8_ph(__m512h __S, __mmask32 __U, __m256i __A) {
|
||||
return _mm512_castsi512_ph(
|
||||
_mm512_mask_slli_epi16((__m512i)__S, __U, _mm512_cvtepi8_epi16(__A), 8));
|
||||
}
|
||||
|
||||
static __inline __m512h __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_cvtbf8_ph(__mmask32 __U, __m256i __A) {
|
||||
return _mm512_castsi512_ph(
|
||||
_mm512_slli_epi16(_mm512_maskz_cvtepi8_epi16(__U, __A), 8));
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS512
|
||||
|
||||
#endif // __AVX10_2_512CONVERTINTRIN_H
|
||||
#endif // __SSE2__
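
For orientation, a minimal usage sketch of the FP8 conversion intrinsics
above (editorial illustration, not part of the vendored header; assumes a
toolchain and CPU with AVX10.2-512 enabled, e.g. via -mavx10.2-512):

/* Convert 32 fp16 lanes to bf8, then a masked variant that keeps the
 * previously converted bytes wherever the mask bit is clear. */
#include <immintrin.h>
#include <stdint.h>

static void cvt_demo(uint8_t out[32]) {
  __m512h h = _mm512_set1_ph((_Float16)1.5f); /* 32 x fp16 */
  __m256i bf8 = _mm512_cvtph_bf8(h);          /* 32 x bf8 bytes */
  /* Even lanes are re-converted; odd lanes keep the bytes in bf8. */
  __m256i mixed = _mm512_mask_cvtph_bf8(bf8, 0x55555555u, h);
  _mm256_storeu_si256((__m256i *)out, mixed);
}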
127
lib/include/avx10_2_512minmaxintrin.h
vendored
Normal file
@ -0,0 +1,127 @@
/*===---- avx10_2_512minmaxintrin.h - AVX10_2_512MINMAX intrinsics ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512minmaxintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2_512MINMAXINTRIN_H
#define __AVX10_2_512MINMAXINTRIN_H

#define _mm512_minmax_pbh(A, B, C) \
  ((__m512bh)__builtin_ia32_vminmaxbf16512((__v32bf)(__m512bh)(A), \
      (__v32bf)(__m512bh)(B), (int)(C)))

#define _mm512_mask_minmax_pbh(W, U, A, B, C) \
  ((__m512bh)__builtin_ia32_selectpbf_512( \
      (__mmask32)(U), \
      (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \
          (__v32bf)(__m512bh)(B), (int)(C)), \
      (__v32bf)(__m512bh)(W)))

#define _mm512_maskz_minmax_pbh(U, A, B, C) \
  ((__m512bh)__builtin_ia32_selectpbf_512( \
      (__mmask32)(U), \
      (__v32bf)_mm512_minmax_pbh((__v32bf)(__m512bh)(A), \
          (__v32bf)(__m512bh)(B), (int)(C)), \
      (__v32bf)__builtin_bit_cast(__m512bh, _mm512_setzero_ps())))

#define _mm512_minmax_pd(A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm512_mask_minmax_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)))

#define _mm512_maskz_minmax_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vminmaxpd512_round_mask( \
      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \
      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm512_minmax_ph(A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_ph(W, U, A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)(__m512h)(W), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_ph(U, A, B, C) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_ph(A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_minmax_round_ph(W, U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_minmax_round_ph(U, A, B, C, R) \
  ((__m512h)__builtin_ia32_vminmaxph512_round_mask( \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (int)(C), \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

#define _mm512_minmax_ps(A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_minmax_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_minmax_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_minmax_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_minmax_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), (__v16sf)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_minmax_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vminmaxps512_round_mask( \
      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \
      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
#endif // __AVX10_2_512MINMAXINTRIN_H
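
A quick illustration of the minmax family above (editorial sketch, not part
of the vendored header): the third argument is the immediate minmax control
byte defined by the AVX10.2 specification; the 0 below is only a placeholder
value, not a recommendation.

#include <immintrin.h>

static __m512 minmax_demo(__m512 a, __m512 b) {
  /* Unmasked form; the mask/maskz and _round variants follow the usual
   * W/U and rounding-operand patterns used throughout these headers. */
  return _mm512_minmax_ps(a, b, 0);
}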
314
lib/include/avx10_2_512niintrin.h
vendored
Normal file
@ -0,0 +1,314 @@
/*===---- avx10_2_512niintrin.h - AVX10.2-512 new instruction intrinsics ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512niintrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2_512NIINTRIN_H
#define __AVX10_2_512NIINTRIN_H

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

/* VNNI FP16 */
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_dpph_ps(__m512 __W, __m512h __A, __m512h __B) {
  return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A,
      (__v32hf)__B);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_dpph_ps(__m512 __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_dpph_ps(__mmask16 __U, __m512 __W, __m512h __A, __m512h __B) {
  return (__m512)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B),
      (__v16sf)_mm512_setzero_ps());
}

/* VMPSADBW */
#define _mm512_mpsadbw_epu8(A, B, imm) \
  ((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
      (__v64qi)(__m512i)(B), (int)(imm)))

#define _mm512_mask_mpsadbw_epu8(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectw_512( \
      (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_mpsadbw_epu8(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectw_512( \
      (__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
      (__v32hi)_mm512_setzero_si512()))

/* VNNI INT8 */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbssd_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbssd_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbssds_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbssds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbssds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbsud_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbsud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbsuds_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbsuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbsuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbuud_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbuud_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbuuds_epi32(__m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A,
      (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbuuds_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectd_512(
      __U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B),
      (__v16si)_mm512_setzero_si512());
}

/* VNNI INT16 */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwsuds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwusds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
      (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
      (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwuuds_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
      (__v16si)_mm512_setzero_si512());
}

#undef __DEFAULT_FN_ATTRS

#endif /* __AVX10_2_512NIINTRIN_H */
#endif /* __SSE2__ */
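
A short usage sketch for the VNNI INT8 intrinsics above (editorial
illustration, not part of the vendored header; assumes AVX10.2-512
support): each dpbssd step multiplies four signed bytes per 32-bit lane
from the two sources and accumulates the products into the destination.

#include <immintrin.h>

static __m512i dot_demo(void) {
  __m512i a = _mm512_set1_epi8(3);
  __m512i b = _mm512_set1_epi8(-2);
  /* Every 32-bit lane becomes 0 + 4 * (3 * -2) = -24. */
  return _mm512_dpbssd_epi32(_mm512_setzero_si512(), a, b);
}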
303
lib/include/avx10_2_512satcvtdsintrin.h
vendored
Normal file
@ -0,0 +1,303 @@
/*===----- avx10_2_512satcvtdsintrin.h - AVX10_2_512SATCVTDS intrinsics ----===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512satcvtdsintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX10_2_512SATCVTDSINTRIN_H
#define __AVX10_2_512SATCVTDSINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
                 __min_vector_width__(512)))

// 512 bit : Double -> Int
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvttspd_epi32(__m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvttspd_epi32(__m256i __W, __mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttspd_epi32(__mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epi32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_undefined_si256(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epi32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)(__m256i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epi32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_setzero_si256(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> uInt
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvttspd_epu32(__m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvttspd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttspd_epu32(__mmask8 __U, __m512d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epu32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_undefined_si256(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epu32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)(__m256i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epu32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8si)_mm256_setzero_si256(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> Long

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttspd_epi64(__m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttspd_epi64(__m512i __W, __mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttspd_epi64(__mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epi64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_undefined_epi32(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epi64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epi64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : Double -> ULong

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttspd_epu64(__m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttspd_epu64(__m512i __W, __mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttspd_epu64(__mmask8 __U, __m512d __A) {
  return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
      (__v8df)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundpd_epu64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_undefined_epi32(), \
      (__mmask8) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundpd_epu64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundpd_epu64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
      (__v8df)(__m512d)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit: Float -> int
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epi32(__m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttsps_epi32(__m512i __W, __mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)(__W), __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttsps_epi32(__mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epi32(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epi32(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)(__m512i)(__W), (__mmask16)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epi32(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(__U), (const int)(__R)))

// 512 bit: Float -> uint
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epu32(__m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttsps_epu32(__m512i __W, __mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)(__W), __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttsps_epu32(__mmask16 __U, __m512 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
      (__v16sf)(__A), (__v16si)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epu32(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_undefined_epi32(), \
      (__mmask16) - 1, (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epu32(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)(__m512i)(__W), (__mmask16)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epu32(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
      (__v16sf)(__m512)(__A), (__v16si)_mm512_setzero_si512(), \
      (__mmask16)(__U), (const int)(__R)))

// 512 bit : float -> long
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epi64(__m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttsps_epi64(__m512i __W, __mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttsps_epi64(__mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epi64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
      (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epi64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epi64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

// 512 bit : float -> ulong
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epu64(__m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_cvttsps_epu64(__m512i __W, __mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_cvttsps_epu64(__mmask8 __U, __m256 __A) {
  return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
      (__v8sf)__A, (__v8di)_mm512_setzero_si512(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm512_cvtts_roundps_epu64(__A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
      (const int)(__R)))

#define _mm512_mask_cvtts_roundps_epu64(__W, __U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)(__m512i)(__W), (__mmask8)(__U), \
      (const int)(__R)))

#define _mm512_maskz_cvtts_roundps_epu64(__U, __A, __R) \
  ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
      (__v8sf)(__m256)(__A), (__v8di)_mm512_setzero_si512(), (__mmask8)(__U), \
      (const int)(__R)))

#undef __DEFAULT_FN_ATTRS
#endif // __AVX10_2_512SATCVTDSINTRIN_H
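
A minimal sketch of the saturating truncation intrinsics above (editorial
illustration, not part of the vendored header; assumes AVX10.2-512
support): unlike the classic cvttpd path, the "s" forms clamp out-of-range
inputs to the integer limits instead of producing the integer-indefinite
pattern.

#include <immintrin.h>

static __m256i satcvt_demo(void) {
  __m512d big = _mm512_set1_pd(1e300); /* far above INT32_MAX */
  return _mm512_cvttspd_epi32(big);    /* lanes clamp to INT32_MAX */
}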
301
lib/include/avx10_2_512satcvtintrin.h
vendored
Normal file
@ -0,0 +1,301 @@
/*===------ avx10_2_512satcvtintrin.h - AVX10_2_512SATCVT intrinsics -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2_512satcvtintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2_512SATCVTINTRIN_H
#define __AVX10_2_512SATCVTINTRIN_H

#define _mm512_ipcvtbf16_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtbf162ibs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvtbf16_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvtbf16_epi8(A), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvtbf16_epi8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvtbf16_epi8(A), \
      (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvtbf16_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtbf162iubs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvtbf16_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvtbf16_epu8(A), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvtbf16_epu8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvtbf16_epu8(A), \
      (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvttbf16_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttbf162ibs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvttbf16_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvttbf16_epi8(A), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvttbf16_epi8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvttbf16_epi8(A), \
      (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvttbf16_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttbf162iubs512((__v32bf)(__m512bh)(A)))

#define _mm512_mask_ipcvttbf16_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvttbf16_epu8(A), \
      (__v32hi)(__m512i)(W)))

#define _mm512_maskz_ipcvttbf16_epu8(U, A) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
      (__v32hi)_mm512_ipcvttbf16_epu8(A), \
      (__v32hi)_mm512_setzero_si512()))

#define _mm512_ipcvtph_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtph_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)(W), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtph_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvt_roundph_epi8(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)_mm512_setzero_si512(), \
      (__mmask32)-1, (const int)R))

#define _mm512_mask_ipcvt_roundph_epi8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), (const int)R))

#define _mm512_maskz_ipcvt_roundph_epi8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2ibs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)_mm512_setzero_si512(), \
      (__mmask32)(U), (const int)R))

#define _mm512_ipcvtph_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtph_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)(W), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtph_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvt_roundph_epu8(A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      (const int)R))

#define _mm512_mask_ipcvt_roundph_epu8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), (const int)R))

#define _mm512_maskz_ipcvt_roundph_epu8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      (const int)R))

#define _mm512_ipcvtps_epi8(A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtps_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
      (__v16su)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtps_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvt_roundps_epi8(A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
      (__v16su)_mm512_setzero_si512(), \
      (__mmask16)-1, (const int)R))

#define _mm512_mask_ipcvt_roundps_epi8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), (const int)R))

#define _mm512_maskz_ipcvt_roundps_epi8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2ibs512_mask((__v16sf)(__m512)(A), \
      (__v16su)_mm512_setzero_si512(), \
      (__mmask16)(U), (const int)R))

#define _mm512_ipcvtps_epu8(A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvtps_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask((__v16sf)(__m512)(A), \
      (__v16su)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvtps_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvt_roundps_epu8(A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      (const int)R))

#define _mm512_mask_ipcvt_roundps_epu8(W, U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), (const int)R))

#define _mm512_maskz_ipcvt_roundps_epu8(U, A, R) \
  ((__m512i)__builtin_ia32_vcvtps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      (const int)R))

#define _mm512_ipcvttph_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvttph_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)(W), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvttph_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtt_roundph_epi8(A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      S))

#define _mm512_mask_ipcvtt_roundph_epi8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), S))

#define _mm512_maskz_ipcvtt_roundph_epi8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2ibs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      S))

#define _mm512_ipcvttph_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvttph_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask((__v32hf)(__m512h)(A), \
      (__v32hu)(W), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvttph_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtt_roundph_epu8(A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)-1, \
      S))

#define _mm512_mask_ipcvtt_roundph_epu8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)(W), (__mmask32)(U), S))

#define _mm512_maskz_ipcvtt_roundph_epu8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttph2iubs512_mask( \
      (__v32hf)(__m512h)(A), (__v32hu)_mm512_setzero_si512(), (__mmask32)(U), \
      S))

#define _mm512_ipcvttps_epi8(A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvttps_epi8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask((__v16sf)(__m512)(A), \
      (__v16su)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvttps_epi8(U, A) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtt_roundps_epi8(A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      S))

#define _mm512_mask_ipcvtt_roundps_epi8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), S))

#define _mm512_maskz_ipcvtt_roundps_epi8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2ibs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      S))

#define _mm512_ipcvttps_epu8(A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_ipcvttps_epu8(W, U, A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask((__v16sf)(__m512)(A), \
      (__v16su)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_ipcvttps_epu8(U, A) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_ipcvtt_roundps_epu8(A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)-1, \
      S))

#define _mm512_mask_ipcvtt_roundps_epu8(W, U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)(W), (__mmask16)(U), S))

#define _mm512_maskz_ipcvtt_roundps_epu8(U, A, S) \
  ((__m512i)__builtin_ia32_vcvttps2iubs512_mask( \
      (__v16sf)(__m512)(A), (__v16su)_mm512_setzero_si512(), (__mmask16)(U), \
      S))

#endif // __AVX10_2_512SATCVTINTRIN_H
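
A small sketch of the ipcvt family above (editorial illustration, not part
of the vendored header; assumes AVX10.2-512 support): each fp16 lane is
converted to a saturated 8-bit integer, delivered in the corresponding
16-bit lane of the result.

#include <immintrin.h>

static __m512i ipcvt_demo(void) {
  __m512h h = _mm512_set1_ph((_Float16)300.0f); /* above INT8_MAX */
  return _mm512_ipcvtph_epi8(h); /* expected to saturate to 127 */
}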
1085
lib/include/avx10_2bf16intrin.h
vendored
Normal file
File diff suppressed because it is too large
590
lib/include/avx10_2convertintrin.h
vendored
Normal file
@ -0,0 +1,590 @@
/*===--------------- avx10_2convertintrin.h - AVX10_2CONVERT ---------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error \
|
||||
"Never use <avx10_2convertintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif // __IMMINTRIN_H
|
||||
|
||||
#ifdef __SSE2__
|
||||
|
||||
#ifndef __AVX10_2CONVERTINTRIN_H
|
||||
#define __AVX10_2CONVERTINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS128 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
|
||||
__min_vector_width__(128)))
|
||||
#define __DEFAULT_FN_ATTRS256 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
|
||||
__min_vector_width__(256)))
|
||||
|
||||
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtx2ps_ph(__m128 __A,
|
||||
__m128 __B) {
|
||||
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
|
||||
(__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)(-1));
|
||||
}
|
||||
|
||||
static __inline__ __m128h __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
|
||||
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
|
||||
(__v4sf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m128h __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
|
||||
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
|
||||
(__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
|
||||
}
|
||||
|
||||
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
|
||||
__m256 __B) {
|
||||
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
|
||||
(__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)(-1),
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
static __inline__ __m256h __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
|
||||
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
|
||||
(__v8sf)__A, (__v8sf)__B, (__v16hf)__W, (__mmask16)__U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
static __inline__ __m256h __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
|
||||
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
|
||||
(__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
|
||||
#define _mm256_cvtx_round2ps_ph(A, B, R) \
|
||||
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
|
||||
(__v8sf)(A), (__v8sf)(B), (__v16hf)_mm256_undefined_ph(), \
|
||||
(__mmask16)(-1), (const int)(R)))
|
||||
|
||||
#define _mm256_mask_cvtx_round2ps_ph(W, U, A, B, R) \
|
||||
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
|
||||
(__v8sf)(A), (__v8sf)(B), (__v16hf)(W), (__mmask16)(U), (const int)(R)))
|
||||
|
||||
#define _mm256_maskz_cvtx_round2ps_ph(U, A, B, R) \
|
||||
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
|
||||
(__v8sf)(A), (__v8sf)(B), (__v16hf)(_mm256_setzero_ph()), \
|
||||
(__mmask16)(U), (const int)(R)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtbiasph_bf8(__m128i __A,
                                                                  __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiasph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiasph_bf8(__m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
      (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_bf8(
    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbiasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtbiassph_bf8(__m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiassph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiassph_bf8(__mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiassph_bf8(__m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
      (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_bf8(
    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbiassph_bf8(__mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtbiasph_hf8(__m128i __A,
                                                                  __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiasph_hf8(__m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
      (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_hf8(
    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbiasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_cvtbiassph_hf8(__m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtbiassph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbiassph_hf8(__mmask8 __U, __m128i __A, __m128h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvtbiassph_hf8(__m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
      (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiassph_hf8(
    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbiassph_hf8(__mmask16 __U, __m256i __A, __m256h __B) {
  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
      (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_bf8(__m128h __A,
                                                               __m128h __B) {
  return (__m128i)__builtin_ia32_vcvt2ph2bf8_128((__v8hf)(__A), (__v8hf)(__B));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvt2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvt2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B),
      (__v16qi)(__m128i)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_bf8(__m256h __A,
                                                                  __m256h __B) {
  return (__m256i)__builtin_ia32_vcvt2ph2bf8_256((__v16hf)(__A),
                                                 (__v16hf)(__B));
}

/* Note: the 256-bit masked forms below select over 32 byte lanes, so the
   write mask is passed through as __mmask32; the (__mmask16) casts in the
   original would silently drop the upper 16 mask bits. */
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvt2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvt2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B),
      (__v32qi)(__m256i)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts2ph_bf8(__m128h __A,
                                                                __m128h __B) {
  return (__m128i)__builtin_ia32_vcvt2ph2bf8s_128((__v8hf)(__A), (__v8hf)(__B));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvts2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvts2ph_bf8(__A, __B), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvts2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvts2ph_bf8(__A, __B),
      (__v16qi)(__m128i)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvts2ph_bf8(__m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_vcvt2ph2bf8s_256((__v16hf)(__A),
                                                  (__v16hf)(__B));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvts2ph_bf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvts2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_bf8(__A, __B),
      (__v32qi)(__m256i)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_hf8(__m128h __A,
                                                               __m128h __B) {
  return (__m128i)__builtin_ia32_vcvt2ph2hf8_128((__v8hf)(__A), (__v8hf)(__B));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvt2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvt2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B),
      (__v16qi)(__m128i)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvt2ph_hf8(__m256h __A,
                                                                  __m256h __B) {
  return (__m256i)__builtin_ia32_vcvt2ph2hf8_256((__v16hf)(__A),
                                                 (__v16hf)(__B));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvt2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvt2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B),
      (__v32qi)(__m256i)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts2ph_hf8(__m128h __A,
                                                                __m128h __B) {
  return (__m128i)__builtin_ia32_vcvt2ph2hf8s_128((__v8hf)(__A), (__v8hf)(__B));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvts2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvts2ph_hf8(__A, __B), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvts2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) {
  return (__m128i)__builtin_ia32_selectb_128(
      (__mmask16)__U, (__v16qi)_mm_cvts2ph_hf8(__A, __B),
      (__v16qi)(__m128i)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvts2ph_hf8(__m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_vcvt2ph2hf8s_256((__v16hf)(__A),
                                                  (__v16hf)(__B));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvts2ph_hf8(__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
  return (__m256i)__builtin_ia32_selectb_256(
      (__mmask32)__U, (__v32qi)_mm256_cvts2ph_hf8(__A, __B),
      (__v32qi)(__m256i)_mm256_setzero_si256());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
      (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvthf8(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128i __A) {
  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
      (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvthf8(__mmask8 __U,
                                                                 __m128i __A) {
  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
      (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8(__m128i __A) {
  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
      (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvthf8(__m256h __W, __mmask16 __U, __m128i __A) {
  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
      (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvthf8(__mmask16 __U, __m128i __A) {
  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
      (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_bf8(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_bf8(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_bf8(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtph_bf8(__m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_bf8(__m128i __W, __mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_bf8(__mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsph_bf8(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtsph_bf8(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsph_bf8(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsph_bf8(__m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtsph_bf8(__m128i __W, __mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtsph_bf8(__mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_hf8(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_hf8(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_hf8(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtph_hf8(__m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_hf8(__m128i __W, __mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_hf8(__mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsph_hf8(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtsph_hf8(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsph_hf8(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsph_hf8(__m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtsph_hf8(__m128i __W, __mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtsph_hf8(__mmask16 __U, __m256h __A) {
  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtbf8_ph(__m128i __A) {
  return _mm_castsi128_ph(_mm_slli_epi16(_mm_cvtepi8_epi16(__A), 8));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtbf8_ph(__m128h __S, __mmask8 __U, __m128i __A) {
  return _mm_castsi128_ph(
      _mm_mask_slli_epi16((__m128i)__S, __U, _mm_cvtepi8_epi16(__A), 8));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtbf8_ph(__mmask8 __U, __m128i __A) {
  return _mm_castsi128_ph(_mm_slli_epi16(_mm_maskz_cvtepi8_epi16(__U, __A), 8));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtbf8_ph(__m128i __A) {
  return _mm256_castsi256_ph(_mm256_slli_epi16(_mm256_cvtepi8_epi16(__A), 8));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtbf8_ph(__m256h __S, __mmask16 __U, __m128i __A) {
  return _mm256_castsi256_ph(
      _mm256_mask_slli_epi16((__m256i)__S, __U, _mm256_cvtepi8_epi16(__A), 8));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtbf8_ph(__mmask16 __U, __m128i __A) {
  return _mm256_castsi256_ph(
      _mm256_slli_epi16(_mm256_maskz_cvtepi8_epi16(__U, __A), 8));
}
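
/* Note: no dedicated instruction is needed for the bf8 -> ph conversions
   above. A bf8 (E5M2) value has the same sign/exponent/mantissa layout as the
   high byte of an IEEE binary16 value, so widening each byte to 16 bits and
   shifting it into the high byte reconstructs the exact half-precision
   number; the extension bits produced by _mm_cvtepi8_epi16 are shifted out. */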

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVX10_2CONVERTINTRIN_H
#endif // __SSE2__
66
lib/include/avx10_2copyintrin.h
vendored
Normal file
@ -0,0 +1,66 @@

/*===---- avx10_2copyintrin.h - AVX10.2 Copy intrinsics -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2copyintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2COPYINTRIN_H
#define __AVX10_2COPYINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(128)))

/// Constructs a 128-bit integer vector, setting the lower 32 bits to the
/// lower 32 bits of the parameter \a __A; the upper bits are zeroed.
///
/// \code{.operation}
/// result[31:0] := __A[31:0]
/// result[MAX:32] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD </c> instruction.
///
/// \param __A
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector. The lower 32 bits are copied from the
///    parameter \a __A; the upper bits are zeroed.
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi32(__m128i __A) {
  return (__m128i)__builtin_shufflevector(
      (__v4si)__A, (__v4si)_mm_setzero_si128(), 0, 4, 4, 4);
}

/// Constructs a 128-bit integer vector, setting the lower 16 bits to the
/// lower 16 bits of the parameter \a __A; the upper bits are zeroed.
///
/// \code{.operation}
/// result[15:0] := __A[15:0]
/// result[MAX:16] := 0
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> VMOVW </c> instruction.
///
/// \param __A
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector. The lower 16 bits are copied from the
///    parameter \a __A; the upper bits are zeroed.
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_move_epi16(__m128i __A) {
  return (__m128i)__builtin_shufflevector(
      (__v8hi)__A, (__v8hi)_mm_setzero_si128(), 0, 8, 8, 8, 8, 8, 8, 8);
}
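
/* Usage sketch (illustrative): both helpers mirror the existing SSE2
   _mm_move_epi64, just at narrower element widths:

     __m128i v    = _mm_set1_epi32(-1);
     __m128i lo32 = _mm_move_epi32(v); // only bits [31:0] survive
     __m128i lo16 = _mm_move_epi16(v); // only bits [15:0] survive
*/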

#undef __DEFAULT_FN_ATTRS128

#endif // __AVX10_2COPYINTRIN_H
277
lib/include/avx10_2minmaxintrin.h
vendored
Normal file
@ -0,0 +1,277 @@

/*===-------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics -------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2minmaxintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2MINMAXINTRIN_H
#define __AVX10_2MINMAXINTRIN_H

#define _mm_minmax_pbh(A, B, C) \
  ((__m128bh)__builtin_ia32_vminmaxbf16128((__m128bh)(__v8bf)(A), \
                                           (__m128bh)(__v8bf)(B), (int)(C)))

#define _mm_mask_minmax_pbh(W, U, A, B, C) \
  ((__m128bh)__builtin_ia32_selectpbf_128( \
      (__mmask8)(U), \
      (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \
                             (int)(C)), \
      (__v8bf)(W)))

#define _mm_maskz_minmax_pbh(U, A, B, C) \
  ((__m128bh)__builtin_ia32_selectpbf_128( \
      (__mmask8)(U), \
      (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B), \
                             (int)(C)), \
      (__v8bf)__builtin_bit_cast(__m128bh, _mm_setzero_ps())))

#define _mm256_minmax_pbh(A, B, C) \
  ((__m256bh)__builtin_ia32_vminmaxbf16256((__m256bh)(__v16bf)(A), \
                                           (__m256bh)(__v16bf)(B), (int)(C)))

#define _mm256_mask_minmax_pbh(W, U, A, B, C) \
  ((__m256bh)__builtin_ia32_selectpbf_256( \
      (__mmask16)(U), \
      (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \
                                 (__m256bh)(__v16bf)(B), (int)(C)), \
      (__v16bf)(W)))

#define _mm256_maskz_minmax_pbh(U, A, B, C) \
  ((__m256bh)__builtin_ia32_selectpbf_256( \
      (__mmask16)(U), \
      (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A), \
                                 (__m256bh)(__v16bf)(B), (int)(C)), \
      (__v16bf)__builtin_bit_cast(__m256bh, _mm256_setzero_ps())))

#define _mm_minmax_pd(A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)-1))

#define _mm_mask_minmax_pd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U)))

#define _mm_maskz_minmax_pd(U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxpd128_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U)))

#define _mm256_minmax_pd(A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)-1, _MM_FROUND_NO_EXC))

#define _mm256_mask_minmax_pd(W, U, A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_maskz_minmax_pd(U, A, B, C) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_minmax_round_pd(W, U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_minmax_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vminmaxpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_ph(A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1))

#define _mm_mask_minmax_ph(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_minmax_ph(U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxph128_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U)))

#define _mm256_minmax_ph(A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, _MM_FROUND_NO_EXC))

#define _mm256_mask_minmax_ph(W, U, A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)(__m256h)(W), (__mmask16)(U), _MM_FROUND_NO_EXC))

#define _mm256_maskz_minmax_ph(U, A, B, C) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm256_mask_minmax_round_ph(W, U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_minmax_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vminmaxph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm_minmax_ps(A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)-1))

#define _mm_mask_minmax_ps(W, U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
      (__mmask8)(U)))

#define _mm_maskz_minmax_ps(U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxps128_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))

#define _mm256_minmax_ps(A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, _MM_FROUND_NO_EXC))

#define _mm256_mask_minmax_ps(W, U, A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
      (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_maskz_minmax_ps(U, A, B, C) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), _MM_FROUND_NO_EXC))

#define _mm256_minmax_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_minmax_round_ps(W, U, A, B, C, R) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_minmax_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vminmaxps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_sd(A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_sd(W, U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_sd(U, A, B, C) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_sd(W, U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask( \
      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_sh(A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_undefined_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_sh(W, U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_sh(U, A, B, C) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_sh(A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_sh(W, U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_sh(U, A, B, C, R) \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask( \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C), \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm_minmax_ss(A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_undefined_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_minmax_ss(W, U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_minmax_ss(U, A, B, C) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_minmax_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm_mask_minmax_round_ss(W, U, A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_minmax_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vminmaxss_round_mask( \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)))
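
/* Usage sketch (illustrative): the C immediate selects which IEEE 754-2019
   minimum/maximum operation is performed, as encoded by the AVX10.2 MINMAX
   specification. Assuming immediate 0 selects minimum and 1 selects maximum
   in their basic forms, with a and b as __m128 values defined elsewhere:

     __m128 mins = _mm_minmax_ps(a, b, 0);
     __m128 maxs = _mm_minmax_ps(a, b, 1);
*/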
#endif // __AVX10_2MINMAXINTRIN_H
2075
lib/include/avx10_2niintrin.h
vendored
Normal file
File diff suppressed because it is too large
496
lib/include/avx10_2satcvtdsintrin.h
vendored
Normal file
@ -0,0 +1,496 @@

/*===----------- avx10_2satcvtdsintrin.h - AVX512SATCVTDS intrinsics --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2satcvtdsintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2SATCVTDSINTRIN_H
#define __AVX10_2SATCVTDSINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(256)))

#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
                 __min_vector_width__(128)))

#define _mm_cvtts_roundsd_i32(__A, __R) \
  ((int)__builtin_ia32_vcvttsd2sis32((__v2df)(__m128d)(__A), (const int)(__R)))

#define _mm_cvtts_roundsd_si32(__A, __R) \
  ((int)__builtin_ia32_vcvttsd2sis32((__v2df)(__m128d)(__A), (const int)(__R)))

#define _mm_cvtts_roundsd_u32(__A, __R) \
  ((unsigned int)__builtin_ia32_vcvttsd2usis32((__v2df)(__m128d)(__A), \
                                               (const int)(__R)))

#define _mm_cvtts_roundss_i32(__A, __R) \
  ((int)__builtin_ia32_vcvttss2sis32((__v4sf)(__m128)(__A), (const int)(__R)))

#define _mm_cvtts_roundss_si32(__A, __R) \
  ((int)__builtin_ia32_vcvttss2sis32((__v4sf)(__m128)(__A), (const int)(__R)))

#define _mm_cvtts_roundss_u32(__A, __R) \
  ((unsigned int)__builtin_ia32_vcvttss2usis32((__v4sf)(__m128)(__A), \
                                               (const int)(__R)))

#ifdef __x86_64__
#define _mm_cvtts_roundss_u64(__A, __R) \
  ((unsigned long long)__builtin_ia32_vcvttss2usis64((__v4sf)(__m128)(__A), \
                                                     (const int)(__R)))

#define _mm_cvtts_roundsd_u64(__A, __R) \
  ((unsigned long long)__builtin_ia32_vcvttsd2usis64((__v2df)(__m128d)(__A), \
                                                     (const int)(__R)))

#define _mm_cvtts_roundss_i64(__A, __R) \
  ((long long)__builtin_ia32_vcvttss2sis64((__v4sf)(__m128)(__A), \
                                           (const int)(__R)))

#define _mm_cvtts_roundss_si64(__A, __R) \
  ((long long)__builtin_ia32_vcvttss2sis64((__v4sf)(__m128)(__A), \
                                           (const int)(__R)))

#define _mm_cvtts_roundsd_si64(__A, __R) \
  ((long long)__builtin_ia32_vcvttsd2sis64((__v2df)(__m128d)(__A), \
                                           (const int)(__R)))

#define _mm_cvtts_roundsd_i64(__A, __R) \
  ((long long)__builtin_ia32_vcvttsd2sis64((__v2df)(__m128d)(__A), \
                                           (const int)(__R)))
#endif /* __x86_64__ */
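
/* Usage sketch (illustrative): the scalar forms convert element 0 only, with
   an SAE immediate controlling exception suppression, e.g.

     int i = _mm_cvtts_roundsd_i32(_mm_set_sd(3.7), _MM_FROUND_NO_EXC);
     // i == 3: truncation toward zero, saturating instead of returning the
     // x86 integer indefinite on out-of-range inputs
*/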

// 128 Bit : Double -> int
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epi32(__m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1)));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttspd_epi32(__m128i __W, __mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask((__v2df)__A, (__v4si)__W,
                                                      __U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttspd_epi32(__mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)_mm_setzero_si128(), __U));
}
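
/* Note (illustrative): unlike _mm_cvttpd_epi32, which returns the x86
   "integer indefinite" value 0x80000000 for out-of-range inputs, the
   saturating forms clamp to the destination range, e.g.

     __m128i r = _mm_cvttspd_epi32(_mm_set1_pd(1e300));
     // both converted lanes saturate to INT32_MAX
*/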

// 256 Bit : Double -> int
static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvttspd_epi32(__m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask(
      (__v4df)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttspd_epi32(__m128i __W, __mmask8 __U, __m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask(
      (__v4df)__A, (__v4si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttspd_epi32(__mmask8 __U, __m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask(
      (__v4df)__A, (__v4si)_mm_setzero_si128(), __U, _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundpd_epi32(__A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_undefined_si128(), \
      (__mmask8) - 1, (int)(__R)))

#define _mm256_mask_cvtts_roundpd_epi32(__W, __U, __A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)__W, (__mmask8)__U, (int)(__R)))

#define _mm256_maskz_cvtts_roundpd_epi32(__U, __A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2dqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_setzero_si128(), \
      (__mmask8)__U, (int)(__R)))

// 128 Bit : Double -> uint
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epu32(__m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1)));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttspd_epu32(__m128i __W, __mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)__W, (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttspd_epu32(__mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs128_mask(
      (__v2df)__A, (__v4si)(__m128i)_mm_setzero_si128(), __U));
}

// 256 Bit : Double -> uint
static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_cvttspd_epu32(__m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask(
      (__v4df)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttspd_epu32(__m128i __W, __mmask8 __U, __m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask(
      (__v4df)__A, (__v4si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttspd_epu32(__mmask8 __U, __m256d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask(
      (__v4df)__A, (__v4si)_mm_setzero_si128(), __U, _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundpd_epu32(__A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_undefined_si128(), \
      (__mmask8) - 1, (int)(__R)))

#define _mm256_mask_cvtts_roundpd_epu32(__W, __U, __A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)__W, (__mmask8)__U, (int)(__R)))

#define _mm256_maskz_cvtts_roundpd_epu32(__U, __A, __R) \
  ((__m128i)__builtin_ia32_vcvttpd2udqs256_round_mask( \
      (__v4df)(__m256d)__A, (__v4si)(__m128i)_mm_setzero_si128(), \
      (__mmask8)__U, (int)(__R)))

// 128 Bit : Double -> long
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epi64(__m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask(
      (__v2df)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttspd_epi64(__m128i __W, __mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask((__v2df)__A, (__v2di)__W,
                                                      (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttspd_epi64(__mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2qqs128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U));
}

// 256 Bit : Double -> long
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttspd_epi64(__m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask(
      (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttspd_epi64(__m256i __W, __mmask8 __U, __m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask(
      (__v4df)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttspd_epi64(__mmask8 __U, __m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundpd_epi64(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( \
      (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \
      (int)__R))

#define _mm256_mask_cvtts_roundpd_epi64(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask((__v4df)__A, (__v4di)__W, \
                                                     (__mmask8)__U, (int)__R))

#define _mm256_maskz_cvtts_roundpd_epi64(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2qqs256_round_mask( \
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, (int)__R))

// 128 Bit : Double -> ulong
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttspd_epu64(__m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask(
      (__v2df)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttspd_epu64(__m128i __W, __mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask((__v2df)__A, (__v2di)__W,
                                                       (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttspd_epu64(__mmask8 __U, __m128d __A) {
  return ((__m128i)__builtin_ia32_vcvttpd2uqqs128_mask(
      (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U));
}

// 256 Bit : Double -> ulong

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttspd_epu64(__m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask(
      (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttspd_epu64(__m256i __W, __mmask8 __U, __m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask(
      (__v4df)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttspd_epu64(__mmask8 __U, __m256d __A) {
  return ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask(
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundpd_epu64(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \
      (__v4df)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \
      (int)__R))

#define _mm256_mask_cvtts_roundpd_epu64(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \
      (__v4df)__A, (__v4di)__W, (__mmask8)__U, (int)__R))

#define _mm256_maskz_cvtts_roundpd_epu64(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttpd2uqqs256_round_mask( \
      (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, (int)__R))

// 128 Bit : float -> int
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epi32(__m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2dqs128_mask(
      (__v4sf)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1)));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttsps_epi32(__m128i __W, __mmask8 __U, __m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2dqs128_mask((__v4sf)__A, (__v4si)__W,
                                                      (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttsps_epi32(__mmask8 __U, __m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2dqs128_mask(
      (__v4sf)__A, (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)__U));
}

// 256 Bit : float -> int
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttsps_epi32(__m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask(
      (__v8sf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttsps_epi32(__m256i __W, __mmask8 __U, __m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask(
      (__v8sf)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttsps_epi32(__mmask8 __U, __m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask(
      (__v8sf)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundps_epi32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_undefined_si256(), \
      (__mmask8) - 1, (int)(__R)))

#define _mm256_mask_cvtts_roundps_epi32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)__W, (__mmask8)__U, (int)(__R)))

#define _mm256_maskz_cvtts_roundps_epi32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2dqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_setzero_si256(), \
      (__mmask8)__U, (int)(__R)))

// 128 Bit : float -> uint
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epu32(__m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask(
      (__v4sf)__A, (__v4si)(__m128i)_mm_undefined_si128(), (__mmask8)(-1)));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttsps_epu32(__m128i __W, __mmask8 __U, __m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask((__v4sf)__A, (__v4si)__W,
                                                       (__mmask8)__U));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttsps_epu32(__mmask8 __U, __m128 __A) {
  return ((__m128i)__builtin_ia32_vcvttps2udqs128_mask(
      (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U));
}

// 256 Bit : float -> uint

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttsps_epu32(__m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask(
      (__v8sf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttsps_epu32(__m256i __W, __mmask8 __U, __m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask(
      (__v8sf)__A, (__v8si)__W, __U, _MM_FROUND_CUR_DIRECTION));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttsps_epu32(__mmask8 __U, __m256 __A) {
  return ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask(
      (__v8sf)__A, (__v8si)_mm256_setzero_si256(), __U,
      _MM_FROUND_CUR_DIRECTION));
}

#define _mm256_cvtts_roundps_epu32(__A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_undefined_si256(), \
      (__mmask8) - 1, (int)(__R)))

#define _mm256_mask_cvtts_roundps_epu32(__W, __U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)__W, (__mmask8)__U, (int)(__R)))

#define _mm256_maskz_cvtts_roundps_epu32(__U, __A, __R) \
  ((__m256i)__builtin_ia32_vcvttps2udqs256_round_mask( \
      (__v8sf)(__m256)__A, (__v8si)(__m256i)_mm256_setzero_si256(), \
      (__mmask8)__U, (int)(__R)))
|
||||
|
||||
// 128 bit : float -> long
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epi64(__m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask(
|
||||
(__v4sf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_cvttsps_epi64(__m128i __W, __mmask8 __U, __m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask(
|
||||
(__v4sf)__A, (__v2di)(__m128i)__W, (__mmask8)__U));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_cvttsps_epi64(__mmask8 __U, __m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2qqs128_mask(
|
||||
(__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U));
|
||||
}
|
||||
// 256 bit : float -> long
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_cvttsps_epi64(__m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1,
|
||||
_MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_cvttsps_epi64(__m256i __W, __mmask8 __U, __m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_cvttsps_epi64(__mmask8 __U, __m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)_mm256_setzero_si256(), __U,
|
||||
_MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
#define _mm256_cvtts_roundps_epi64(__A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \
|
||||
(int)__R))
|
||||
|
||||
#define _mm256_mask_cvtts_roundps_epi64(__W, __U, __A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)__W, (__mmask8)__U, (int)__R))
|
||||
|
||||
#define _mm256_maskz_cvtts_roundps_epi64(__U, __A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2qqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, \
|
||||
(int)__R))
|
||||
|
||||
// 128 bit : float -> ulong
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttsps_epu64(__m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask(
|
||||
(__v4sf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_cvttsps_epu64(__m128i __W, __mmask8 __U, __m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask(
|
||||
(__v4sf)__A, (__v2di)(__m128i)__W, (__mmask8)__U));
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_cvttsps_epu64(__mmask8 __U, __m128 __A) {
|
||||
return ((__m128i)__builtin_ia32_vcvttps2uqqs128_mask(
|
||||
(__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U));
|
||||
}
|
||||
// 256 bit : float -> ulong
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_cvttsps_epu64(__m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1,
|
||||
_MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_cvttsps_epu64(__m256i __W, __mmask8 __U, __m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)__W, __U, _MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_cvttsps_epu64(__mmask8 __U, __m128 __A) {
|
||||
return ((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask(
|
||||
(__v4sf)__A, (__v4di)_mm256_setzero_si256(), __U,
|
||||
_MM_FROUND_CUR_DIRECTION));
|
||||
}
|
||||
|
||||
#define _mm256_cvtts_roundps_epu64(__A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)_mm256_undefined_si256(), (__mmask8) - 1, \
|
||||
(int)__R))
|
||||
|
||||
#define _mm256_mask_cvtts_roundps_epu64(__W, __U, __A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)__W, (__mmask8)__U, (int)__R))
|
||||
|
||||
#define _mm256_maskz_cvtts_roundps_epu64(__U, __A, __R) \
|
||||
((__m256i)__builtin_ia32_vcvttps2uqqs256_round_mask( \
|
||||
(__v4sf)(__m128)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U, \
|
||||
(int)__R))
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS128
|
||||
#undef __DEFAULT_FN_ATTRS256
|
||||
#endif // __AVX10_2SATCVTDSINTRIN_H
|
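For context, a minimal usage sketch of the saturating truncating converts above (illustrative only, not part of the header; the helper names are invented and an AVX10.2-capable compiler and CPU are assumed):

    #include <immintrin.h>

    /* The "s" suffix means out-of-range inputs saturate instead of
       producing the integer indefinite value. */
    static inline __m256i floats_to_u32_sat(__m256 v) {
      return _mm256_cvttsps_epu32(v);
    }

    /* Merge-masked form: lanes whose mask bit is clear keep the value
       already in w. */
    static inline __m256i floats_to_u32_sat_masked(__m256i w, __mmask8 m,
                                                   __m256 v) {
      return _mm256_mask_cvttsps_epu32(w, m, v);
    }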
444
lib/include/avx10_2satcvtintrin.h
vendored
Normal file
@ -0,0 +1,444 @@
/*===----------- avx10_2satcvtintrin.h - AVX10_2SATCVT intrinsics ----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx10_2satcvtintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVX10_2SATCVTINTRIN_H
#define __AVX10_2SATCVTINTRIN_H

#define _mm_ipcvtbf16_epi8(A) \
  ((__m128i)__builtin_ia32_vcvtbf162ibs128((__v8bf)(__m128bh)(A)))

#define _mm_mask_ipcvtbf16_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_ipcvtbf16_epi8(A), (__v8hi)(__m128i)(W)))

#define _mm_maskz_ipcvtbf16_epi8(U, A) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_ipcvtbf16_epi8(A), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_ipcvtbf16_epi8(A) \
  ((__m256i)__builtin_ia32_vcvtbf162ibs256((__v16bf)(__m256bh)(A)))

#define _mm256_mask_ipcvtbf16_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvtbf16_epi8(A), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_ipcvtbf16_epi8(U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvtbf16_epi8(A), \
      (__v16hi)_mm256_setzero_si256()))

#define _mm_ipcvtbf16_epu8(A) \
  ((__m128i)__builtin_ia32_vcvtbf162iubs128((__v8bf)(__m128bh)(A)))

#define _mm_mask_ipcvtbf16_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_ipcvtbf16_epu8(A), (__v8hi)(__m128i)(W)))

#define _mm_maskz_ipcvtbf16_epu8(U, A) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_ipcvtbf16_epu8(A), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_ipcvtbf16_epu8(A) \
  ((__m256i)__builtin_ia32_vcvtbf162iubs256((__v16bf)(__m256bh)(A)))

#define _mm256_mask_ipcvtbf16_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvtbf16_epu8(A), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_ipcvtbf16_epu8(U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvtbf16_epu8(A), \
      (__v16hi)_mm256_setzero_si256()))

#define _mm_ipcvtph_epi8(A) \
  ((__m128i)__builtin_ia32_vcvtph2ibs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvtph_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvtph2ibs128_mask((__v8hf)(__m128h)(A), \
      (__v8hu)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvtph_epi8(U, A) \
  ((__m128i)__builtin_ia32_vcvtph2ibs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvtph_epi8(A) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvtph_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvtph_epi8(U, A) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvt_roundph_epi8(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)_mm256_setzero_si256(), \
      (__mmask16)-1, (const int)R))

#define _mm256_mask_ipcvt_roundph_epi8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R))

#define _mm256_maskz_ipcvt_roundph_epi8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2ibs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)_mm256_setzero_si256(), \
      (__mmask16)(U), (const int)R))

#define _mm_ipcvtph_epu8(A) \
  ((__m128i)__builtin_ia32_vcvtph2iubs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvtph_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvtph2iubs128_mask((__v8hf)(__m128h)(A), \
      (__v8hu)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvtph_epu8(U, A) \
  ((__m128i)__builtin_ia32_vcvtph2iubs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvtph_epu8(A) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvtph_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvtph_epu8(U, A) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvt_roundph_epu8(A, R) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      (const int)R))

#define _mm256_mask_ipcvt_roundph_epu8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R))

#define _mm256_maskz_ipcvt_roundph_epu8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (const int)R))

#define _mm_ipcvtps_epi8(A) \
  ((__m128i)__builtin_ia32_vcvtps2ibs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvtps_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvtps2ibs128_mask((__v4sf)(__m128)(A), \
      (__v4su)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvtps_epi8(U, A) \
  ((__m128i)__builtin_ia32_vcvtps2ibs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvtps_epi8(A) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvtps_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvtps_epi8(U, A) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvt_roundps_epi8(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)-1, (const int)R))

#define _mm256_mask_ipcvt_roundps_epi8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R))

#define _mm256_maskz_ipcvt_roundps_epi8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(U), (const int)R))

#define _mm_ipcvtps_epu8(A) \
  ((__m128i)__builtin_ia32_vcvtps2iubs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvtps_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvtps2iubs128_mask((__v4sf)(__m128)(A), \
      (__v4su)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvtps_epu8(U, A) \
  ((__m128i)__builtin_ia32_vcvtps2iubs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvtps_epu8(A) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvtps_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \
      (__v8su)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvtps_epu8(U, A) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvt_roundps_epu8(A, R) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)-1, (const int)R))

#define _mm256_mask_ipcvt_roundps_epu8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R))

#define _mm256_maskz_ipcvt_roundps_epu8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvtps2iubs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(U), (const int)R))

#define _mm_ipcvttbf16_epi8(A) \
  ((__m128i)__builtin_ia32_vcvttbf162ibs128((__v8bf)(__m128bh)(A)))

#define _mm_mask_ipcvttbf16_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_ipcvttbf16_epi8(A), (__v8hi)(__m128i)(W)))

#define _mm_maskz_ipcvttbf16_epi8(U, A) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_ipcvttbf16_epi8(A), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_ipcvttbf16_epi8(A) \
  ((__m256i)__builtin_ia32_vcvttbf162ibs256((__v16bf)(__m256bh)(A)))

#define _mm256_mask_ipcvttbf16_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvttbf16_epi8(A), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_ipcvttbf16_epi8(U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvttbf16_epi8(A), \
      (__v16hi)_mm256_setzero_si256()))

#define _mm_ipcvttbf16_epu8(A) \
  ((__m128i)__builtin_ia32_vcvttbf162iubs128((__v8bf)(__m128bh)(A)))

#define _mm_mask_ipcvttbf16_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_selectw_128( \
      (__mmask8)(U), (__v8hi)_mm_ipcvttbf16_epu8(A), (__v8hi)(__m128i)(W)))

#define _mm_maskz_ipcvttbf16_epu8(U, A) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_ipcvttbf16_epu8(A), \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_ipcvttbf16_epu8(A) \
  ((__m256i)__builtin_ia32_vcvttbf162iubs256((__v16bf)(__m256bh)(A)))

#define _mm256_mask_ipcvttbf16_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvttbf16_epu8(A), \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_ipcvttbf16_epu8(U, A) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_ipcvttbf16_epu8(A), \
      (__v16hi)_mm256_setzero_si256()))

#define _mm_ipcvttph_epi8(A) \
  ((__m128i)__builtin_ia32_vcvttph2ibs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvttph_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvttph2ibs128_mask((__v8hf)(__m128h)(A), \
      (__v8hu)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvttph_epi8(U, A) \
  ((__m128i)__builtin_ia32_vcvttph2ibs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvttph_epi8(A) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvttph_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvttph_epi8(U, A) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvtt_roundph_epi8(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      (const int)R))

#define _mm256_mask_ipcvtt_roundph_epi8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R))

#define _mm256_maskz_ipcvtt_roundph_epi8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2ibs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (const int)R))

#define _mm_ipcvttph_epu8(A) \
  ((__m128i)__builtin_ia32_vcvttph2iubs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvttph_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvttph2iubs128_mask((__v8hf)(__m128h)(A), \
      (__v8hu)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvttph_epu8(U, A) \
  ((__m128i)__builtin_ia32_vcvttph2iubs128_mask( \
      (__v8hf)(__m128h)(A), (__v8hu)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvttph_epu8(A) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvttph_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask((__v16hf)(__m256h)(A), \
      (__v16hu)(W), (__mmask16)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvttph_epu8(U, A) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(_mm256_setzero_si256()), \
      (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvtt_roundph_epu8(A, R) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)-1, \
      (const int)R))

#define _mm256_mask_ipcvtt_roundph_epu8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)(W), (__mmask16)(U), (const int)R))

#define _mm256_maskz_ipcvtt_roundph_epu8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttph2iubs256_mask( \
      (__v16hf)(__m256h)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \
      (const int)R))

#define _mm_ipcvttps_epi8(A) \
  ((__m128i)__builtin_ia32_vcvttps2ibs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvttps_epi8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvttps2ibs128_mask((__v4sf)(__m128)(A), \
      (__v4su)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvttps_epi8(U, A) \
  ((__m128i)__builtin_ia32_vcvttps2ibs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvttps_epi8(A) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvttps_epi8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvttps_epi8(U, A) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvtt_roundps_epi8(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)-1, (const int)R))

#define _mm256_mask_ipcvtt_roundps_epi8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R))

#define _mm256_maskz_ipcvtt_roundps_epi8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2ibs256_mask((__v8sf)(__m256)(A), \
      (__v8su)_mm256_setzero_si256(), \
      (__mmask8)(U), (const int)R))

#define _mm_ipcvttps_epu8(A) \
  ((__m128i)__builtin_ia32_vcvttps2iubs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1))

#define _mm_mask_ipcvttps_epu8(W, U, A) \
  ((__m128i)__builtin_ia32_vcvttps2iubs128_mask((__v4sf)(__m128)(A), \
      (__v4su)(W), (__mmask8)(U)))

#define _mm_maskz_ipcvttps_epu8(U, A) \
  ((__m128i)__builtin_ia32_vcvttps2iubs128_mask( \
      (__v4sf)(__m128)(A), (__v4su)(_mm_setzero_si128()), (__mmask8)(U)))

#define _mm256_ipcvttps_epu8(A) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_mask_ipcvttps_epu8(W, U, A) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask((__v8sf)(__m256)(A), \
      (__v8su)(W), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_maskz_ipcvttps_epu8(U, A) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(_mm256_setzero_si256()), (__mmask8)(U), \
      _MM_FROUND_CUR_DIRECTION))

#define _mm256_ipcvtt_roundps_epu8(A, R) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \
      (const int)R))

#define _mm256_mask_ipcvtt_roundps_epu8(W, U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)(W), (__mmask8)(U), (const int)R))

#define _mm256_maskz_ipcvtt_roundps_epu8(U, A, R) \
  ((__m256i)__builtin_ia32_vcvttps2iubs256_mask( \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \
      (const int)R))
#endif // __AVX10_2SATCVTINTRIN_H
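In the same spirit, a hedged sketch of the new ipcvt conversions declared above (helper names invented; AVX10.2 support assumed):

    #include <immintrin.h>

    /* Per-element FP16 -> signed 8-bit conversion with saturation. */
    static inline __m128i half_to_i8_sat(__m128h v) {
      return _mm_ipcvtph_epi8(v);
    }

    /* Zero-masking variant: lanes with a clear mask bit become zero. */
    static inline __m128i half_to_i8_sat_z(__mmask8 m, __m128h v) {
      return _mm_maskz_ipcvtph_epi8(m, v);
    }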
9
lib/include/avx2intrin.h
vendored
@ -15,12 +15,21 @@
#define __AVX2INTRIN_H

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx2"), \
                 __min_vector_width__(128)))
#endif

/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// Computes sixteen sum of absolute difference (SAD) operations on sets of
4
lib/include/avx512bitalgintrin.h
vendored
@ -23,7 +23,7 @@
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
  return (__m512i)__builtin_elementwise_popcount((__v32hu)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
@ -45,7 +45,7 @@ _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
  return (__m512i)__builtin_elementwise_popcount((__v64qu)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
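The builtin swap above changes no public API; a short sketch of the 512-bit popcounts (helper names invented; assumes AVX512BITALG, e.g. building with -mavx512bitalg):

    #include <immintrin.h>

    /* Population count of each 16-bit element. */
    static inline __m512i popcnt_words(__m512i v) {
      return _mm512_popcnt_epi16(v);
    }

    /* Population count of each 8-bit element. */
    static inline __m512i popcnt_bytes(__m512i v) {
      return _mm512_popcnt_epi8(v);
    }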
36
lib/include/avx512fintrin.h
vendored
@ -175,12 +175,21 @@ typedef enum
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512f,no-evex512")))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

/* Create vectors with repeated elements */

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_setzero_si512(void)
{
  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_si512(void) {
  return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0};
}

#define _mm512_setzero_epi32 _mm512_setzero_si512
@ -256,20 +265,16 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
      (__v8di) _mm512_setzero_si512());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
  return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                                 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_ps(void) {
  return __extension__(__m512){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                               0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
}

#define _mm512_setzero _mm512_setzero_ps

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_setzero_pd(void)
{
  return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_pd(void) {
  return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

static __inline __m512 __DEFAULT_FN_ATTRS512
@ -9775,5 +9780,8 @@ _mm512_cvtsi512_si32(__m512i __A) {
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __AVX512FINTRIN_H */
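Note: the pre-edit text mapped the `#else` fallbacks to the wrong base macros (`_CONSTEXPR` to the 128-bit attributes and `128_CONSTEXPR` to the plain ones); the corrected mapping above matches the C++ branch. Because the initializers are now constexpr under C++11 and later, they can participate in constant initialization; a small sketch (assumes these Clang 20 headers and -mavx512f; C mode is unaffected):

    #include <immintrin.h>

    #if defined(__cplusplus) && (__cplusplus >= 201103L)
    /* Folded at compile time; the initializer needs no runtime zeroing. */
    constexpr __m512d kZeroPd = _mm512_setzero_pd();
    constexpr __m512 kZeroPs = _mm512_setzero_ps();
    #endif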
8
lib/include/avx512vlbitalgintrin.h
vendored
@ -27,7 +27,7 @@
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi16(__m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
  return (__m256i)__builtin_elementwise_popcount((__v16hu)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
@ -49,7 +49,7 @@ _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi16(__m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
  return (__m128i)__builtin_elementwise_popcount((__v8hu)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
@ -71,7 +71,7 @@ _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi8(__m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
  return (__m256i)__builtin_elementwise_popcount((__v32qu)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
@ -93,7 +93,7 @@ _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi8(__m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
  return (__m128i)__builtin_elementwise_popcount((__v16qu)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
16
lib/include/avx512vpopcntdqintrin.h
vendored
@ -21,8 +21,15 @@
__target__("avx512vpopcntdq,evex512"), \
|
||||
__min_vector_width__(512)))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
|
||||
return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
|
||||
#if defined(__cplusplus) && (__cplusplus >= 201103L)
|
||||
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
|
||||
#else
|
||||
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
|
||||
#endif
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm512_popcnt_epi64(__m512i __A) {
|
||||
return (__m512i)__builtin_elementwise_popcount((__v8du)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
@ -36,8 +43,9 @@ _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
|
||||
return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
|
||||
return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
_mm512_popcnt_epi32(__m512i __A) {
|
||||
return (__m512i)__builtin_elementwise_popcount((__v16su)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||
|
||||
24
lib/include/avx512vpopcntdqvlintrin.h
vendored
@ -25,9 +25,17 @@
__target__("avx512vpopcntdq,avx512vl,no-evex512"), \
|
||||
__min_vector_width__(256)))
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
#if defined(__cplusplus) && (__cplusplus >= 201103L)
|
||||
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
|
||||
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
|
||||
#else
|
||||
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
|
||||
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
|
||||
#endif
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
|
||||
_mm_popcnt_epi64(__m128i __A) {
|
||||
return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
|
||||
return (__m128i)__builtin_elementwise_popcount((__v2du)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
@ -41,9 +49,9 @@ _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
|
||||
return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
|
||||
_mm_popcnt_epi32(__m128i __A) {
|
||||
return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
|
||||
return (__m128i)__builtin_elementwise_popcount((__v4su)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
@ -57,9 +65,9 @@ _mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
|
||||
return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_popcnt_epi64(__m256i __A) {
|
||||
return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
|
||||
return (__m256i)__builtin_elementwise_popcount((__v4du)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
@ -73,9 +81,9 @@ _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
|
||||
return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
|
||||
_mm256_popcnt_epi32(__m256i __A) {
|
||||
return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
|
||||
return (__m256i)__builtin_elementwise_popcount((__v8su)__A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
|
||||
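The 128- and 256-bit popcounts get the same generic builtin plus an optional constexpr qualifier; call sites are unchanged (helper name invented; assumes -mavx512vpopcntdq -mavx512vl):

    #include <immintrin.h>

    /* Per-element popcount of four 32-bit lanes. */
    static inline __m128i popcnt_dwords(__m128i v) {
      return _mm_popcnt_epi32(v);
    }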
46
lib/include/avxintrin.h
vendored
@ -50,12 +50,29 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
#endif

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
                 __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx"), \
                 __min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
#endif

/* Arithmetic */
/// Adds two 256-bit vectors of [4 x double].
@ -3689,7 +3706,7 @@ _mm256_undefined_si256(void)
/// A double-precision floating-point value used to initialize bits [63:0]
/// of the result.
/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set_pd(double __a, double __b, double __c, double __d)
{
  return __extension__ (__m256d){ __d, __c, __b, __a };
@ -3728,7 +3745,7 @@ _mm256_set_pd(double __a, double __b, double __c, double __d)
/// A single-precision floating-point value used to initialize bits [31:0]
/// of the result.
/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set_ps(float __a, float __b, float __c, float __d,
              float __e, float __f, float __g, float __h)
{
@ -3955,7 +3972,7 @@ _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
/// A double-precision floating-point value used to initialize bits [255:192]
/// of the result.
/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_setr_pd(double __a, double __b, double __c, double __d)
{
  return _mm256_set_pd(__d, __c, __b, __a);
@ -3995,7 +4012,7 @@ _mm256_setr_pd(double __a, double __b, double __c, double __d)
/// A single-precision floating-point value used to initialize bits [255:224]
/// of the result.
/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_setr_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
{
@ -4212,7 +4229,7 @@ _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
/// A double-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 256-bit floating-point vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set1_pd(double __w)
{
  return _mm256_set_pd(__w, __w, __w, __w);
@ -4231,7 +4248,7 @@ _mm256_set1_pd(double __w)
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 256-bit floating-point vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_set1_ps(float __w)
{
  return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
@ -4322,10 +4339,8 @@ _mm256_set1_epi64x(long long __q)
/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void) {
  return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
}

/// Constructs a 256-bit floating-point vector of [8 x float] with all
@ -4336,9 +4351,7 @@ _mm256_setzero_pd(void)
/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)
{
static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void) {
  return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

@ -4349,9 +4362,8 @@ _mm256_setzero_ps(void)
/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
///
/// \returns A 256-bit integer vector initialized to zero.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setzero_si256(void)
{
static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm256_setzero_si256(void) {
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

@ -5121,6 +5133,8 @@ _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
}

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR

#endif /* __AVXINTRIN_H */
113
lib/include/avxvnniint16intrin.h
vendored
@ -15,14 +15,6 @@
#ifndef __AVXVNNIINT16INTRIN_H
#define __AVXVNNIINT16INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
                 __min_vector_width__(256)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 16-bit results. Sum these 2 results with the corresponding
@ -53,12 +45,9 @@
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}
#define _mm_dpwsud_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -90,11 +79,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}
#define _mm256_dpwsud_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -127,12 +114,9 @@ _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}
#define _mm_dpwsuds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -165,11 +149,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}
#define _mm256_dpwsuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@ -201,12 +183,9 @@ _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}
#define _mm_dpwusd_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@ -238,11 +217,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}
#define _mm256_dpwusd_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@ -274,12 +251,9 @@ _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}
#define _mm_dpwusds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
@ -313,11 +287,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}
#define _mm256_dpwusds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -349,12 +321,9 @@ _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}
#define _mm_dpwuud_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -386,11 +355,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}
#define _mm256_dpwuud_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -423,12 +390,9 @@ _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}
#define _mm_dpwuuds_epi32(__W, __A, __B) \
  ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \
      (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
@ -461,13 +425,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#define _mm256_dpwuuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \
      (__v8si)(__B)))

#endif // __AVXVNNIINT16INTRIN_H
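Replacing the static inline wrappers with macros, as above, keeps call sites source-compatible; an illustrative call (helper name invented; assumes -mavxvnniint16):

    #include <immintrin.h>

    /* Multiply signed 16-bit elements of a by unsigned 16-bit elements
       of b, summing each adjacent pair into a 32-bit lane of acc. */
    static inline __m128i dpw_accumulate(__m128i acc, __m128i a, __m128i b) {
      return _mm_dpwsud_epi32(acc, a, b);
    }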
113
lib/include/avxvnniint8intrin.h
vendored
@ -14,14 +14,6 @@
#ifndef __AVXVNNIINT8INTRIN_H
|
||||
#define __AVXVNNIINT8INTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS256 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
|
||||
__min_vector_width__(256)))
|
||||
#define __DEFAULT_FN_ATTRS128 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
|
||||
__min_vector_width__(128)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
@ -52,12 +44,9 @@
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbssd_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -89,11 +78,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbssd_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -126,12 +113,9 @@ _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbssds_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -164,11 +148,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbssds_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -200,12 +182,9 @@ _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbsud_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -237,11 +216,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbsud_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -274,12 +251,9 @@ _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbsuds_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -312,11 +286,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbsuds_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -348,12 +320,9 @@ _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbuud_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -385,11 +354,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
|
||||
/// ENDFOR
|
||||
/// dst[MAX:256] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
|
||||
(__v8si)__B);
|
||||
}
|
||||
#define _mm256_dpbuud_epi32(__W, __A, __B) \
|
||||
((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \
|
||||
(__v8si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
|
||||
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
|
||||
@ -422,14 +389,10 @@ _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
|
||||
#define _mm_dpbuuds_epi32(__W, __A, __B) \
|
||||
((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \
|
||||
(__v4si)(__B)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
@ -460,12 +423,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
///    ENDFOR
///    dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#define _mm256_dpbuuds_epi32(__W, __A, __B) \
  ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \
                                        (__v8si)(__B)))

#endif // __AVXVNNIINT8INTRIN_H

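For reference, the saturating byte dot-product these vpdpb*ds intrinsics expose can be modeled one 32-bit lane at a time in scalar C. This is a sketch of the semantics described in the doc comments above, not code from the header; the helper name is hypothetical.

#include <stdint.h>

/* Hypothetical scalar model of one lane of _mm_dpbsuds_epi32: signed bytes
   from a, unsigned bytes from b, signed-saturating accumulation into w. */
static int32_t dpbsuds_lane(int32_t w, uint32_t a, uint32_t b) {
  int64_t sum = w;
  for (int i = 0; i < 4; ++i) {
    int8_t sa = (int8_t)(a >> (8 * i));   /* signed byte of a   */
    uint8_t ub = (uint8_t)(b >> (8 * i)); /* unsigned byte of b */
    sum += (int64_t)sa * ub;              /* widened 16-bit intermediate */
  }
  if (sum > INT32_MAX) return INT32_MAX;  /* signed saturation */
  if (sum < INT32_MIN) return INT32_MIN;
  return (int32_t)sum;
}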
32
lib/include/bmi2intrin.h
vendored
@ -15,7 +15,13 @@
#define __BMI2INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
#endif

/// Copies the unsigned 32-bit integer \a __X and zeroes the upper bits
///    starting at bit number \a __Y.
@ -38,8 +44,7 @@
///    The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 32-bit value.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bzhi_u32(unsigned int __X, unsigned int __Y)
{
_bzhi_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_bzhi_si(__X, __Y);
}

@ -68,8 +73,7 @@ _bzhi_u32(unsigned int __X, unsigned int __Y)
///    The 32-bit mask specifying where to deposit source bits.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pdep_u32(unsigned int __X, unsigned int __Y)
{
_pdep_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_pdep_si(__X, __Y);
}

@ -98,8 +102,7 @@ _pdep_u32(unsigned int __X, unsigned int __Y)
///    The 32-bit mask specifying which source bits to extract.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pext_u32(unsigned int __X, unsigned int __Y)
{
_pext_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_pext_si(__X, __Y);
}

@ -124,8 +127,7 @@ _pext_u32(unsigned int __X, unsigned int __Y)
///    A pointer to memory for storing the upper half of the product.
/// \returns The lower half of the product.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
{
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int)(__res >> 32);
  return (unsigned int)__res;
@ -154,8 +156,7 @@ _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
///    The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 64-bit value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bzhi_u64(unsigned long long __X, unsigned long long __Y)
{
_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_bzhi_di(__X, __Y);
}

@ -184,8 +185,7 @@ _bzhi_u64(unsigned long long __X, unsigned long long __Y)
///    The 64-bit mask specifying where to deposit source bits.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pdep_u64(unsigned long long __X, unsigned long long __Y)
{
_pdep_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_pdep_di(__X, __Y);
}

@ -214,8 +214,7 @@ _pdep_u64(unsigned long long __X, unsigned long long __Y)
///    The 64-bit mask specifying which source bits to extract.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pext_u64(unsigned long long __X, unsigned long long __Y)
{
_pext_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_pext_di(__X, __Y);
}

@ -241,8 +240,7 @@ _pext_u64(unsigned long long __X, unsigned long long __Y)
/// \returns The lower half of the product.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
           unsigned long long *__P)
{
          unsigned long long *__P) {
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
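A minimal usage sketch for the widening multiply above: _mulx_u64 returns the low half of the 128-bit product and stores the high half through the pointer. Assumes a BMI2 target (compile with -mbmi2); the wrapper name is illustrative.

#include <immintrin.h>

unsigned long long mul128_lo(unsigned long long x, unsigned long long y,
                             unsigned long long *hi) {
  /* Low 64 bits returned, high 64 bits written through *hi. */
  return _mulx_u64(x, y, hi);
}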
68
lib/include/bmiintrin.h
vendored
@ -17,7 +17,12 @@
/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
   instruction behaves as BSF on non-BMI targets, there is code that expects
   to use it as a potentially faster version of BSF. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __RELAXED_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__)) constexpr
#else
#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#endif

/// Counts the number of trailing zero bits in the operand.
///
@ -31,8 +36,7 @@
///    bits in the operand.
/// \see _tzcnt_u16
static __inline__ unsigned short __RELAXED_FN_ATTRS
__tzcnt_u16(unsigned short __X)
{
__tzcnt_u16(unsigned short __X) {
  return __builtin_ia32_tzcnt_u16(__X);
}

@ -65,8 +69,7 @@ __tzcnt_u16(unsigned short __X)
///    bits in the operand.
/// \see { _mm_tzcnt_32 _tzcnt_u32 }
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
__tzcnt_u32(unsigned int __X) {
  return __builtin_ia32_tzcnt_u32(__X);
}

@ -82,8 +85,7 @@ __tzcnt_u32(unsigned int __X)
///    the operand.
/// \see { __tzcnt_u32 _tzcnt_u32 }
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
_mm_tzcnt_32(unsigned int __X) {
  return (int)__builtin_ia32_tzcnt_u32(__X);
}

@ -118,8 +120,7 @@ _mm_tzcnt_32(unsigned int __X)
///    bits in the operand.
/// \see { _mm_tzcnt_64 _tzcnt_u64 }
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
__tzcnt_u64(unsigned long long __X) {
  return __builtin_ia32_tzcnt_u64(__X);
}

@ -135,8 +136,7 @@ __tzcnt_u64(unsigned long long __X)
///    the operand.
/// \see { __tzcnt_u64 _tzcnt_u64 }
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
_mm_tzcnt_64(unsigned long long __X) {
  return (long long)__builtin_ia32_tzcnt_u64(__X);
}

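As the comment at the top of this file notes, the tzcnt intrinsics are usable even without BMI because TZCNT executes as BSF on older processors. One behavioral detail worth showing: on a zero input the count equals the operand width. A small sketch:

#include <immintrin.h>

unsigned int lowest_set_bit_index(unsigned int m) {
  /* Returns 32 when m == 0, otherwise the index of the lowest set bit. */
  return __tzcnt_u32(m);
}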
@ -164,7 +164,13 @@ _mm_tzcnt_64(unsigned long long __X)
#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
#endif

/// Performs a bitwise AND of the second operand with the one's
///    complement of the first operand.
@ -181,8 +187,7 @@ _mm_tzcnt_64(unsigned long long __X)
///    operand with the one's complement of the first operand.
/// \see _andn_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__andn_u32(unsigned int __X, unsigned int __Y)
{
__andn_u32(unsigned int __X, unsigned int __Y) {
  return ~__X & __Y;
}

@ -224,8 +229,7 @@ __andn_u32(unsigned int __X, unsigned int __Y)
///    extracted bits.
/// \see _bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__bextr_u32(unsigned int __X, unsigned int __Y)
{
__bextr_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_bextr_u32(__X, __Y);
}

@ -249,9 +253,8 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
///    extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
{
  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) {
  return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}

/* Intel-specified, single-leading-underscore version of BEXTR2 */
@ -289,8 +292,7 @@ _bextr2_u32(unsigned int __X, unsigned int __Y) {
///    the source operand.
/// \see _blsi_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsi_u32(unsigned int __X)
{
__blsi_u32(unsigned int __X) {
  return __X & -__X;
}

@ -325,8 +327,7 @@ __blsi_u32(unsigned int __X)
/// \returns An unsigned integer containing the newly created mask.
/// \see _blsmsk_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsmsk_u32(unsigned int __X)
{
__blsmsk_u32(unsigned int __X) {
  return __X ^ (__X - 1);
}

@ -361,8 +362,7 @@ __blsmsk_u32(unsigned int __X)
///    operand.
/// \see _blsr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsr_u32(unsigned int __X)
{
__blsr_u32(unsigned int __X) {
  return __X & (__X - 1);
}

@ -401,8 +401,7 @@ __blsr_u32(unsigned int __X)
///    operand with the one's complement of the first operand.
/// \see _andn_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__andn_u64 (unsigned long long __X, unsigned long long __Y)
{
__andn_u64 (unsigned long long __X, unsigned long long __Y) {
  return ~__X & __Y;
}

@ -445,8 +444,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
///    extracted bits.
/// \see _bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__bextr_u64(unsigned long long __X, unsigned long long __Y)
{
__bextr_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_bextr_u64(__X, __Y);
}

@ -470,9 +468,8 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
///    extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
{
  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) {
  return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}

/* Intel-specified, single-leading-underscore version of BEXTR2 */
@ -510,8 +507,7 @@ _bextr2_u64(unsigned long long __X, unsigned long long __Y) {
///    bits from the source operand.
/// \see _blsi_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsi_u64(unsigned long long __X)
{
__blsi_u64(unsigned long long __X) {
  return __X & -__X;
}

@ -546,8 +542,7 @@ __blsi_u64(unsigned long long __X)
/// \returns An unsigned 64-bit integer containing the newly created mask.
/// \see _blsmsk_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsmsk_u64(unsigned long long __X)
{
__blsmsk_u64(unsigned long long __X) {
  return __X ^ (__X - 1);
}

@ -582,8 +577,7 @@ __blsmsk_u64(unsigned long long __X)
///    source operand.
/// \see _blsr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsr_u64(unsigned long long __X)
{
__blsr_u64(unsigned long long __X) {
  return __X & (__X - 1);
}

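The BLSI/BLSMSK/BLSR wrappers above are thin veneers over classic two's-complement identities, so their relationships can be checked in plain C with no target flags. A sketch restating the identities the intrinsics compile down to:

#include <assert.h>

void blsi_identities(unsigned int x) {
  unsigned int lowest = x & -x;     /* BLSI: isolate lowest set bit        */
  unsigned int mask = x ^ (x - 1);  /* BLSMSK: mask up through that bit    */
  unsigned int rest = x & (x - 1);  /* BLSR: clear lowest set bit          */
  assert((lowest | rest) == x);     /* holds for every x, including 0     */
  assert((mask & x) == lowest);     /* the mask covers exactly that bit   */
}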
2
lib/include/cmpccxaddintrin.h
vendored
@ -63,7 +63,7 @@ typedef enum {
                                          (int)(__D))))

#define _cmpccxadd_epi64(__A, __B, __C, __D) \
  ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B), \
  ((long long)(__builtin_ia32_cmpccxadd64((__A), (long long)(__B), \
                                          (long long)(__C), (int)(__D))))

#endif // __x86_64__

23
lib/include/cpuid.h
vendored
@ -187,17 +187,18 @@
#define bit_ENQCMD 0x20000000

/* Features in %edx for leaf 7 sub-leaf 0 */
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
#define bit_UINTR 0x00000020
#define bit_SERIALIZE 0x00004000
#define bit_TSXLDTRK 0x00010000
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
#define bit_AMXBF16 0x00400000
#define bit_AVX512FP16 0x00800000
#define bit_AMXTILE 0x01000000
#define bit_AMXINT8 0x02000000
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
#define bit_UINTR 0x00000020
#define bit_AVX512VP2INTERSECT 0x00000100
#define bit_SERIALIZE 0x00004000
#define bit_TSXLDTRK 0x00010000
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
#define bit_AMXBF16 0x00400000
#define bit_AVX512FP16 0x00800000
#define bit_AMXTILE 0x01000000
#define bit_AMXINT8 0x02000000

/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_SHA512 0x00000001

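The newly added bit_AVX512VP2INTERSECT can be tested with the __get_cpuid_count helper defined elsewhere in this header. A sketch, assuming an x86 target where <cpuid.h> is available:

#include <cpuid.h>

int has_vp2intersect(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return 0;                                 /* leaf 7 not supported */
  return (edx & bit_AVX512VP2INTERSECT) != 0; /* leaf 7.0, %edx bit 8 */
}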
248
lib/include/emmintrin.h
vendored
@ -49,12 +49,27 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
                 __min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif

#define __trunc64(x) \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __anyext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, -1, -1)

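The __trunc64 and __anyext128 helpers added above let the MMX-flavored intrinsics in this file be lowered through 128-bit builtins: __trunc64 keeps element 0 of a [2 x i64] vector as an __m64, and __anyext128 places a [2 x i32] value in the low half of an __m128i, leaving the upper lanes undefined (the -1 shuffle indices) for instructions that ignore them. An illustrative restatement of __trunc64, using the header's internal __v2di typedef; the function name is hypothetical:

static __m64 low_half_as_m64(__m128i v) {
  /* Keep only element 0 of the [2 x i64] vector, as __trunc64 does. */
  return (__m64)__builtin_shufflevector((__v2di)v, (__v2di){}, 0);
}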
/// Adds lower double-precision values in both operands and returns the
///    sum in the lower 64 bits of the result. The upper 64 bits of the result
@ -71,8 +86,8 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
///    from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a,
                                                                  __m128d __b) {
  __a[0] += __b[0];
  return __a;
}
@ -89,8 +104,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the sums of both
///    operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2df)__a + (__v2df)__b);
}

@ -111,8 +126,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    difference of the lower 64 bits of both operands. The upper 64 bits are
///    copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a,
                                                                  __m128d __b) {
  __a[0] -= __b[0];
  return __a;
}
@ -129,8 +144,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
///    A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the differences between
///    both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2df)__a - (__v2df)__b);
}

@ -150,8 +165,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    product of the lower 64 bits of both operands. The upper 64 bits are
///    copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a,
                                                                  __m128d __b) {
  __a[0] *= __b[0];
  return __a;
}
@ -168,8 +183,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the operands.
/// \returns A 128-bit vector of [2 x double] containing the products of both
///    operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2df)__a * (__v2df)__b);
}

@ -190,8 +205,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    quotient of the lower 64 bits of both operands. The upper 64 bits are
///    copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a,
                                                                  __m128d __b) {
  __a[0] /= __b[0];
  return __a;
}
@ -209,8 +224,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
///    A 128-bit vector of [2 x double] containing the divisor.
/// \returns A 128-bit vector of [2 x double] containing the quotients of both
///    operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2df)__a / (__v2df)__b);
}

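Because the arithmetic above is now expressed as element-wise vector operations rather than opaque builtins, the compiler can constant-fold it. A minimal demonstration; with optimization this should reduce to the literal vector {3.0, 7.0}:

#include <emmintrin.h>

static __m128d add_demo(void) {
  __m128d a = _mm_set_pd(5.0, 1.0); /* lanes {1.0, 5.0}: last arg is element 0 */
  __m128d b = _mm_set_pd(2.0, 2.0);
  return _mm_add_pd(a, b);          /* {3.0, 7.0} */
}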
@ -358,8 +373,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
///    values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2du)__a & (__v2du)__b);
}

@ -378,8 +393,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
///    values in the second operand and the one's complement of the first
///    operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
                                                           __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_andnot_pd(__m128d __a, __m128d __b) {
  return (__m128d)(~(__v2du)__a & (__v2du)__b);
}

@ -395,8 +410,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
///    values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
                                                       __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a,
                                                                 __m128d __b) {
  return (__m128d)((__v2du)__a | (__v2du)__b);
}

@ -412,8 +427,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
///    values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
                                                        __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a,
                                                                  __m128d __b) {
  return (__m128d)((__v2du)__a ^ (__v2du)__b);
}

@ -1291,7 +1306,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
///    floating-point elements are converted to double-precision values. The
///    upper two elements are unused.
/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtps_pd(__m128 __a) {
  return (__m128d) __builtin_convertvector(
      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
}
@ -1312,7 +1328,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
///
/// The upper two elements are unused.
/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtepi32_pd(__m128i __a) {
  return (__m128d) __builtin_convertvector(
      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
}
@ -1398,8 +1415,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
///    converted value from the second parameter. The upper 64 bits are copied
///    from the upper 64 bits of the first parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
                                                            int __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsi32_sd(__m128d __a, int __b) {
  __a[0] = __b;
  return __a;
}
@ -1423,8 +1440,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
///    converted value from the second parameter. The upper 64 bits are copied
///    from the upper 64 bits of the first parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
                                                          __m128 __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtss_sd(__m128d __a, __m128 __b) {
  __a[0] = __b[0];
  return __a;
}
@ -1486,8 +1503,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
/// \param __a
///    A 128-bit vector of [2 x double].
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
  return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
}

/// Converts the two double-precision floating-point elements of a
@ -1505,8 +1522,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
/// \param __a
///    A 128-bit vector of [2 x double].
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
  return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
}

/// Converts the two signed 32-bit integer elements of a 64-bit vector of
@ -1520,8 +1537,9 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
/// \param __a
///    A 64-bit vector of [2 x i32].
/// \returns A 128-bit vector of [2 x double] containing the converted values.
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
  return __builtin_ia32_cvtpi2pd((__v2si)__a);
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtpi32_pd(__m64 __a) {
  return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
}

/// Returns the low-order element of a 128-bit vector of [2 x double] as
@ -1535,7 +1553,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
/// \returns A double-precision floating-point value copied from the lower 64
///    bits of \a __a.
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsd_f64(__m128d __a) {
  return __a[0];
}

@ -1770,7 +1789,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
///    lower 64 bits contain the value of the parameter. The upper 64 bits are
///    set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w) {
  return __extension__(__m128d){__w, 0.0};
}

@ -1786,7 +1805,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
///    A double-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w) {
  return __extension__(__m128d){__w, __w};
}

@ -1802,7 +1821,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
///    A double-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w) {
  return _mm_set1_pd(__w);
}

@ -1820,8 +1839,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
///    A double-precision floating-point value used to initialize the lower 64
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
                                                        double __x) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w,
                                                                  double __x) {
  return __extension__(__m128d){__x, __w};
}

@ -1840,8 +1859,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
///    A double-precision floating-point value used to initialize the upper 64
///    bits of the result.
/// \returns An initialized 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
                                                         double __x) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w,
                                                                   double __x) {
  return __extension__(__m128d){__w, __x};
}

@ -1854,7 +1873,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
///
/// \returns An initialized 128-bit floating-point vector of [2 x double] with
///    all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void) {
  return __extension__(__m128d){0.0, 0.0};
}

@ -1873,8 +1892,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
///    lower 64 bits of the result.
/// \returns A 128-bit vector of [2 x double] containing the moved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
                                                         __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_sd(__m128d __a, __m128d __b) {
  __a[0] = __b[0];
  return __a;
}
@ -2091,8 +2110,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
///    A 128-bit vector of [4 x i32].
/// \returns A 128-bit vector of [4 x i32] containing the sums of both
///    parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
                                                           __m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_epi32(__m128i __a, __m128i __b) {
  return (__m128i)((__v4su)__a + (__v4su)__b);
}

@ -2108,9 +2127,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
/// \param __b
///    A 64-bit integer.
/// \returns A 64-bit integer containing the sum of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
                                                            __m64 __b) {
  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
}

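The rewritten _mm_add_si64 above replaces the MMX paddq builtin with a plain 64-bit addition on the __m64 bit pattern, which is equivalent because the operation is a single wraparound add. A sketch of the wraparound behavior:

#include <emmintrin.h>

static __m64 add_si64_demo(void) {
  __m64 a = (__m64)0x7fffffffffffffffLL; /* INT64_MAX bit pattern */
  __m64 b = (__m64)1LL;
  return _mm_add_si64(a, b);             /* wraps to 0x8000000000000000 */
}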
/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
@ -2129,8 +2147,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
///    A 128-bit vector of [2 x i64].
/// \returns A 128-bit vector of [2 x i64] containing the sums of both
///    parameters.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
                                                           __m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_epi64(__m128i __a, __m128i __b) {
  return (__m128i)((__v2du)__a + (__v2du)__b);
}

@ -2431,9 +2449,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
/// \param __b
///    A 64-bit integer containing one of the source operands.
/// \returns A 64-bit integer vector containing the product of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
                                                            __m64 __b) {
  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
                                             (__v4si)__anyext128(__b)));
}

/// Multiplies 32-bit unsigned integer values contained in the lower
@ -2521,8 +2539,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
///    A 128-bit integer vector containing the subtrahends.
/// \returns A 128-bit integer vector containing the differences of the values
///    in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
                                                           __m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_epi32(__m128i __a, __m128i __b) {
  return (__m128i)((__v4su)__a - (__v4su)__b);
}

@ -2539,9 +2557,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
///    A 64-bit integer vector containing the subtrahend.
/// \returns A 64-bit integer vector containing the difference of the values in
///    the operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
                                                            __m64 __b) {
  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
}

/// Subtracts the corresponding elements of two [2 x i64] vectors.
@ -2556,8 +2573,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
///    A 128-bit integer vector containing the subtrahends.
/// \returns A 128-bit integer vector containing the differences of the values
///    in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
                                                           __m128i __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_epi64(__m128i __a, __m128i __b) {
  return (__m128i)((__v2du)__a - (__v2du)__b);
}

@ -3255,8 +3272,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
                                                            long long __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsi64_sd(__m128d __a, long long __b) {
  __a[0] = __b;
  return __a;
}
@ -3310,7 +3327,8 @@ static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
/// \param __a
///    A 128-bit integer vector.
/// \returns A 128-bit vector of [4 x float] containing the converted values.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtepi32_ps(__m128i __a) {
  return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
}

@ -3494,8 +3512,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
///    destination vector of [2 x i64].
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
                                                            long long __q0) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi64x(long long __q1, long long __q0) {
  return __extension__(__m128i)(__v2di){__q0, __q1};
}

@ -3515,9 +3533,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
///    destination vector of [2 x i64].
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
                                                           __m64 __q0) {
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi64(__m64 __q1, __m64 __q0) {
  return _mm_set_epi64x((long long)__q1[0], (long long)__q0[0]);
}

/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
@ -3542,8 +3560,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
///    vector.
/// \returns An initialized 128-bit vector of [4 x i32] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
                                                           int __i1, int __i0) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3,
                                                                     int __i2,
                                                                     int __i1,
                                                                     int __i0) {
  return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
}

@ -3581,7 +3601,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
///    vector.
/// \returns An initialized 128-bit vector of [8 x i16] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
              short __w2, short __w1, short __w0) {
  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
@ -3630,7 +3650,7 @@ _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
///    Initializes bits [7:0] of the destination vector.
/// \returns An initialized 128-bit vector of [16 x i8] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
             char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
             char __b4, char __b3, char __b2, char __b1, char __b0) {
@ -3652,7 +3672,8 @@ _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
///    vector.
/// \returns An initialized 128-bit integer vector of [2 x i64] with both
///    elements containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_epi64x(long long __q) {
  return _mm_set_epi64x(__q, __q);
}

@ -3669,7 +3690,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
///    vector.
/// \returns An initialized 128-bit vector of [2 x i64] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_epi64(__m64 __q) {
  return _mm_set_epi64(__q, __q);
}

@ -3686,7 +3708,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
///    vector.
/// \returns An initialized 128-bit vector of [4 x i32] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i) {
  return _mm_set_epi32(__i, __i, __i, __i);
}

@ -3703,7 +3725,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
///    vector.
/// \returns An initialized 128-bit vector of [8 x i16] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_epi16(short __w) {
  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
}

@ -3720,7 +3743,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
///    vector.
/// \returns An initialized 128-bit vector of [16 x i8] with all elements
///    containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b) {
  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
                      __b, __b, __b, __b, __b);
}
@ -3739,8 +3762,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
///    A 64-bit integral value used to initialize the upper 64 bits of the
///    result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
                                                            __m64 __q1) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_epi64(__m64 __q0, __m64 __q1) {
  return _mm_set_epi64(__q1, __q0);
}

@ -3761,9 +3784,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
/// \param __i3
///    A 32-bit integral value used to initialize bits [127:96] of the result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
                                                            int __i2,
                                                            int __i3) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) {
  return _mm_set_epi32(__i3, __i2, __i1, __i0);
}

@ -3792,7 +3814,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
/// \param __w7
///    A 16-bit integral value used to initialize bits [127:112] of the result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
               short __w5, short __w6, short __w7) {
  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
@ -3839,7 +3861,7 @@ _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
/// \param __b15
///    An 8-bit integral value used to initialize bits [127:120] of the result.
/// \returns An initialized 128-bit integer vector.
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7, char __b8, char __b9, char __b10,
              char __b11, char __b12, char __b13, char __b14, char __b15) {
@ -3855,7 +3877,7 @@ _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
///
/// \returns An initialized 128-bit integer vector with all elements set to
///    zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void) {
  return __extension__(__m128i)(__v2di){0LL, 0LL};
}

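One detail the set/setr pairs above make easy to miss: _mm_set_epi32 takes its arguments from the highest element down, while _mm_setr_epi32 takes them from element 0 up, so the two calls below build identical vectors. A minimal sketch:

#include <emmintrin.h>

static void set_order_demo(void) {
  __m128i a = _mm_set_epi32(3, 2, 1, 0);  /* element 0 = 0, element 3 = 3 */
  __m128i b = _mm_setr_epi32(0, 1, 2, 3); /* same vector, reversed order  */
  (void)a;
  (void)b;
}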
@ -4588,7 +4610,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
///    A 128-bit integer vector operand. The lower 64 bits are moved to the
///    destination.
/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movepi64_pi64(__m128i __a) {
  return (__m64)__a[0];
}

@ -4603,8 +4626,9 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
///    A 64-bit value.
/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
///    the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
  return __extension__(__m128i)(__v2di){(long long)__a, 0};
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movpi64_epi64(__m64 __a) {
  return __builtin_shufflevector((__v1di)__a, _mm_setzero_si64(), 0, 1);
}

/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
@ -4619,7 +4643,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
///    destination.
/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
///    the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_epi64(__m128i __a) {
  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
}

@ -4638,8 +4663,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
///    A 128-bit vector of [2 x double]. \n
///    Bits [127:64] are written to bits [127:64] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
                                                             __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpackhi_pd(__m128d __a, __m128d __b) {
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
}

@ -4658,8 +4683,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
///    A 128-bit vector of [2 x double]. \n
///    Bits [63:0] are written to bits [127:64] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
                                                             __m128d __b) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpacklo_pd(__m128d __a, __m128d __b) {
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
}

@ -4722,7 +4747,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
///    A 128-bit floating-point vector of [2 x double].
/// \returns A 128-bit floating-point vector of [4 x float] containing the same
///    bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castpd_ps(__m128d __a) {
  return (__m128)__a;
}

@ -4737,7 +4763,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
///    A 128-bit floating-point vector of [2 x double].
/// \returns A 128-bit integer vector containing the same bitwise pattern as the
///    parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castpd_si128(__m128d __a) {
  return (__m128i)__a;
}

@ -4752,7 +4779,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 128-bit floating-point vector of [2 x double] containing the same
///    bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castps_pd(__m128 __a) {
  return (__m128d)__a;
}

@ -4767,7 +4795,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 128-bit integer vector containing the same bitwise pattern as the
///    parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castps_si128(__m128 __a) {
  return (__m128i)__a;
}

@ -4782,7 +4811,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
///    A 128-bit integer vector.
/// \returns A 128-bit floating-point vector of [4 x float] containing the same
///    bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castsi128_ps(__m128i __a) {
  return (__m128)__a;
}

@ -4797,7 +4827,8 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
///    A 128-bit integer vector.
/// \returns A 128-bit floating-point vector of [2 x double] containing the same
///    bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castsi128_pd(__m128i __a) {
  return (__m128d)__a;
}

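The _mm_cast* family above only reinterprets bits; no conversion instruction is generated, so a round trip is an identity. A sketch:

#include <emmintrin.h>

static __m128d cast_roundtrip(__m128d v) {
  __m128i bits = _mm_castpd_si128(v); /* reinterpret, no value change */
  return _mm_castsi128_pd(bits);      /* identical bit pattern back   */
}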
@ -4889,8 +4920,11 @@ void _mm_pause(void);
#if defined(__cplusplus)
} // extern "C"
#endif

#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

42
lib/include/gfniintrin.h
vendored
@ -14,6 +14,7 @@
#ifndef __GFNIINTRIN_H
#define __GFNIINTRIN_H

#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
@ -25,6 +26,37 @@
                 __target__("avx,gfni,no-evex512"), \
                 __min_vector_width__(256)))

/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"), \
                 __min_vector_width__(256)))
#else
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("gfni"), \
                 __min_vector_width__(128)))

/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y \
  __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), \
                 __min_vector_width__(256)))

/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni"), \
                 __min_vector_width__(256)))
#endif

/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z \
  __attribute__((__always_inline__, __nodebug__, \
@ -36,16 +68,6 @@
                 __target__("avx512bw,evex512,gfni"), \
                 __min_vector_width__(512)))

/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"), \
                 __min_vector_width__(256)))

#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
                                                   (__v16qi)(__m128i)(B), \

12
lib/include/hexagon_types.h
vendored
@ -1,7 +1,11 @@
/******************************************************************************/
/*   (c) 2020 Qualcomm Innovation Center, Inc. All rights reserved.           */
/*                                                                            */
/******************************************************************************/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef HEXAGON_TYPES_H
#define HEXAGON_TYPES_H

427
lib/include/hvx_hexagon_protos.h
vendored
@ -5178,6 +5178,433 @@
#define Q6_Vuh_vmpy_VuhVuh_rs16(Vu,Vv) __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpyuhvs)(Vu,Vv)
#endif /* __HEXAGON_ARCH___ >= 69 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vdd32.sf=vadd(Vu32.bf,Vv32.bf)
   C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vadd_VbfVbf(HVX_Vector Vu,
   HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
   ========================================================================== */

#define Q6_Wsf_vadd_VbfVbf(Vu, Vv) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vadd_sf_bf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vd32.h=Vu32.hf
   C Intrinsic Prototype: HVX_Vector Q6_Vh_equals_Vhf(HVX_Vector Vu)
   Instruction Type:      CVI_VS
   Execution Slots:       SLOT0123
   ========================================================================== */

#define Q6_Vh_equals_Vhf(Vu) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_h_hf)(Vu)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vd32.hf=Vu32.h
   C Intrinsic Prototype: HVX_Vector Q6_Vhf_equals_Vh(HVX_Vector Vu)
   Instruction Type:      CVI_VS
   Execution Slots:       SLOT0123
   ========================================================================== */

#define Q6_Vhf_equals_Vh(Vu) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_hf_h)(Vu)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vd32.sf=Vu32.w
   C Intrinsic Prototype: HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector Vu)
   Instruction Type:      CVI_VS
   Execution Slots:       SLOT0123
   ========================================================================== */

#define Q6_Vsf_equals_Vw(Vu) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_sf_w)(Vu)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
/* ==========================================================================
   Assembly Syntax:       Vd32.w=Vu32.sf
   C Intrinsic Prototype: HVX_Vector Q6_Vw_equals_Vsf(HVX_Vector Vu)
   Instruction Type:      CVI_VS
   Execution Slots:       SLOT0123
   ========================================================================== */

#define Q6_Vw_equals_Vsf(Vu) \
  __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vconv_w_sf)(Vu)
#endif /* __HEXAGON_ARCH___ >= 73 */

#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vd32.bf=vcvt(Vu32.sf,Vv32.sf)
|
||||
C Intrinsic Prototype: HVX_Vector Q6_Vbf_vcvt_VsfVsf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Vbf_vcvt_VsfVsf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_bf_sf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Qd4=vcmp.gt(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gt_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VA Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Q_vcmp_gt_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \
|
||||
((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf)(Vu, Vv)), -1)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Qx4&=vcmp.gt(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtand_QVbfVbf(HVX_VectorPred
|
||||
Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution
|
||||
Slots: SLOT0123
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Q_vcmp_gtand_QVbfVbf(Qx, Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \
|
||||
((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_and)( \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \
|
||||
Vv)), \
|
||||
-1)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Qx4|=vcmp.gt(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtor_QVbfVbf(HVX_VectorPred
|
||||
Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution
|
||||
Slots: SLOT0123
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Q_vcmp_gtor_QVbfVbf(Qx, Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \
|
||||
((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_or)( \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \
|
||||
Vv)), \
|
||||
-1)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Qx4^=vcmp.gt(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPred Q6_Q_vcmp_gtxacc_QVbfVbf(HVX_VectorPred
|
||||
Qx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VA Execution
|
||||
Slots: SLOT0123
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Q_vcmp_gtxacc_QVbfVbf(Qx, Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandqrt) \
|
||||
((__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vgtbf_xor)( \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vandvrt)((Qx), -1), Vu, \
|
||||
Vv)), \
|
||||
-1)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vd32.bf=vmax(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_Vector Q6_Vbf_vmax_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX_LATE Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Vbf_vmax_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmax_bf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vd32.bf=vmin(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_Vector Q6_Vbf_vmin_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX_LATE Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Vbf_vmin_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmin_bf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vdd32.sf=vmpy(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vmpy_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Wsf_vmpy_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_sf_bf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vxx32.sf+=vmpy(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vmpyacc_WsfVbfVbf(HVX_VectorPair
|
||||
Vxx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution
|
||||
Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Wsf_vmpyacc_WsfVbfVbf(Vxx, Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_sf_bf_acc)(Vxx, Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
#if __HVX_ARCH__ >= 73
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Vdd32.sf=vsub(Vu32.bf,Vv32.bf)
|
||||
C Intrinsic Prototype: HVX_VectorPair Q6_Wsf_vsub_VbfVbf(HVX_Vector Vu,
|
||||
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_Wsf_vsub_VbfVbf(Vu, Vv) \
|
||||
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_sf_bf)(Vu, Vv)
|
||||
#endif /* __HEXAGON_ARCH___ >= 73 */
|
||||
|
||||
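
[Editor's sketch, not part of the diff: the new v73 bf16 intrinsics above compose into a widening multiply-accumulate as below. Assumes a Hexagon toolchain invoked with -mhvx, targeting HVX v73 or later.]
#include <hexagon_types.h>
#include <hvx_hexagon_protos.h>
/* acc += a * b, widening each bf16 product to single precision. */
static HVX_VectorPair bf16_mac(HVX_VectorPair acc, HVX_Vector a, HVX_Vector b) {
  return Q6_Wsf_vmpyacc_WsfVbfVbf(acc, a, b);
}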
#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32=vgetqfext(Vu32.x,Rt32)
C Intrinsic Prototype: HVX_Vector Q6_V_vgetqfext_VR(HVX_Vector Vu, Word32 Rt)
Instruction Type: CVI_VX
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vgetqfext_VR(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_get_qfext)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vx32|=vgetqfext(Vu32.x,Rt32)
C Intrinsic Prototype: HVX_Vector Q6_V_vgetqfextor_VVR(HVX_Vector Vx,
HVX_Vector Vu, Word32 Rt) Instruction Type: CVI_VX Execution Slots:
SLOT23
========================================================================== */

#define Q6_V_vgetqfextor_VVR(Vx, Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_get_qfext_oracc)(Vx, Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.x=vsetqfext(Vu32,Rt32)
C Intrinsic Prototype: HVX_Vector Q6_V_vsetqfext_VR(HVX_Vector Vu, Word32 Rt)
Instruction Type: CVI_VX
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vsetqfext_VR(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_set_qfext)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vabs(Vu32.f8)
C Intrinsic Prototype: HVX_Vector Q6_V_vabs_V(HVX_Vector Vu)
Instruction Type: CVI_VX_LATE
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vabs_V(Vu) __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vabs_f8)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vadd(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vadd_VV(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vadd_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vadd_hf_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.b=vcvt2(Vu32.hf,Vv32.hf)
C Intrinsic Prototype: HVX_Vector Q6_Vb_vcvt2_VhfVhf(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23
========================================================================== */

#define Q6_Vb_vcvt2_VhfVhf(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_b_hf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vcvt2(Vu32.b)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt2_Vb(HVX_Vector Vu)
Instruction Type: CVI_VX_DV
Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vcvt2_Vb(Vu) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_hf_b)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vcvt2(Vu32.ub)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt2_Vub(HVX_Vector Vu)
Instruction Type: CVI_VX_DV
Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vcvt2_Vub(Vu) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_hf_ub)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.ub=vcvt2(Vu32.hf,Vv32.hf)
C Intrinsic Prototype: HVX_Vector Q6_Vub_vcvt2_VhfVhf(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX Execution Slots: SLOT23
========================================================================== */

#define Q6_Vub_vcvt2_VhfVhf(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt2_ub_hf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vcvt(Vu32.hf,Vv32.hf)
C Intrinsic Prototype: HVX_Vector Q6_V_vcvt_VhfVhf(HVX_Vector Vu, HVX_Vector
Vv) Instruction Type: CVI_VX Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vcvt_VhfVhf(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_f8_hf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vcvt(Vu32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vcvt_V(HVX_Vector Vu)
Instruction Type: CVI_VX_DV
Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vcvt_V(Vu) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vcvt_hf_f8)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vfmax(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_Vector Q6_V_vfmax_VV(HVX_Vector Vu, HVX_Vector Vv)
Instruction Type: CVI_VX_LATE
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vfmax_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfmax_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vfmin(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_Vector Q6_V_vfmin_VV(HVX_Vector Vu, HVX_Vector Vv)
Instruction Type: CVI_VX_LATE
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vfmin_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfmin_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.f8=vfneg(Vu32.f8)
C Intrinsic Prototype: HVX_Vector Q6_V_vfneg_V(HVX_Vector Vu)
Instruction Type: CVI_VX_LATE
Execution Slots: SLOT23
========================================================================== */

#define Q6_V_vfneg_V(Vu) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vfneg_f8)(Vu)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32=vmerge(Vu32.x,Vv32.w)
C Intrinsic Prototype: HVX_Vector Q6_V_vmerge_VVw(HVX_Vector Vu, HVX_Vector
Vv) Instruction Type: CVI_VS Execution Slots: SLOT0123
========================================================================== */

#define Q6_V_vmerge_VVw(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmerge_qf)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vmpy(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vmpy_VV(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vmpy_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_hf_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vxx32.hf+=vmpy(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vmpyacc_WhfVV(HVX_VectorPair
Vxx, HVX_Vector Vu, HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution
Slots: SLOT23
========================================================================== */

#define Q6_Whf_vmpyacc_WhfVV(Vxx, Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_hf_f8_acc)(Vxx, Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.qf16=vmpy(Vu32.hf,Rt32.hf)
C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vmpy_VhfRhf(HVX_Vector Vu, Word32
Rt) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Vqf16_vmpy_VhfRhf(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_hf)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.qf16=vmpy(Vu32.qf16,Rt32.hf)
C Intrinsic Prototype: HVX_Vector Q6_Vqf16_vmpy_Vqf16Rhf(HVX_Vector Vu,
Word32 Rt) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Vqf16_vmpy_Vqf16Rhf(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_qf16)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vd32.qf32=vmpy(Vu32.sf,Rt32.sf)
C Intrinsic Prototype: HVX_Vector Q6_Vqf32_vmpy_VsfRsf(HVX_Vector Vu, Word32
Rt) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Vqf32_vmpy_VsfRsf(Vu, Rt) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmpy_rt_sf)(Vu, Rt)
#endif /* __HEXAGON_ARCH___ >= 79 */

#if __HVX_ARCH__ >= 79
/* ==========================================================================
Assembly Syntax: Vdd32.hf=vsub(Vu32.f8,Vv32.f8)
C Intrinsic Prototype: HVX_VectorPair Q6_Whf_vsub_VV(HVX_Vector Vu,
HVX_Vector Vv) Instruction Type: CVI_VX_DV Execution Slots: SLOT23
========================================================================== */

#define Q6_Whf_vsub_VV(Vu, Vv) \
__BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vsub_hf_f8)(Vu, Vv)
#endif /* __HEXAGON_ARCH___ >= 79 */

#endif /* __HVX__ */

#endif
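
[Editor's sketch, not part of the diff: the v79 "f8" (8-bit float) intrinsics follow the same widening pattern as the bf16 group; assumes an HVX v79 target.]
/* Multiply two f8 vectors lanewise, producing half-precision result pairs. */
static HVX_VectorPair f8_mul_demo(HVX_Vector u, HVX_Vector v) {
  return Q6_Whf_vmpy_VV(u, v);
}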
90
lib/include/immintrin.h
vendored
90
lib/include/immintrin.h
vendored
@ -605,6 +605,20 @@ _storebe_i64(void * __P, long long __D) {
#include <movdirintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVRS__)
#include <movrsintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX10_2__) && defined(__MOVRS__))
#include <movrs_avx10_2intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX10_2_512__) && defined(__MOVRS__))
#include <movrs_avx10_2_512intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
#include <pconfigintrin.h>
#endif
@ -620,9 +634,6 @@ _storebe_i64(void * __P, long long __D) {
#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
#include <invpcidintrin.h>
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
#include <amxfp16intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) || \
defined(__WIDEKL__)
@ -634,10 +645,59 @@ _storebe_i64(void * __P, long long __D) {
#include <amxintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
#include <amxfp16intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
#include <amxcomplexintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP8__)
#include <amxfp8intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TRANSPOSE__)
#include <amxtransposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_MOVRS__)
#include <amxmovrsintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_MOVRS__) && defined(__AMX_TRANSPOSE__))
#include <amxmovrstransposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_AVX512__)
#include <amxavx512intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__)
#include <amxtf32intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__))
#include <amxtf32transposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_BF16__) && defined(__AMX_TRANSPOSE__))
#include <amxbf16transposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_FP16__) && defined(__AMX_TRANSPOSE__))
#include <amxfp16transposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_COMPLEX__) && defined(__AMX_TRANSPOSE__))
#include <amxcomplextransposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
@ -648,6 +708,30 @@ _storebe_i64(void * __P, long long __D) {
#include <avx512vlvp2intersectintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2__)
#include <avx10_2bf16intrin.h>
#include <avx10_2convertintrin.h>
#include <avx10_2copyintrin.h>
#include <avx10_2minmaxintrin.h>
#include <avx10_2niintrin.h>
#include <avx10_2satcvtdsintrin.h>
#include <avx10_2satcvtintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX10_2_512__)
#include <avx10_2_512bf16intrin.h>
#include <avx10_2_512convertintrin.h>
#include <avx10_2_512minmaxintrin.h>
#include <avx10_2_512niintrin.h>
#include <avx10_2_512satcvtdsintrin.h>
#include <avx10_2_512satcvtintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX10_2_512__) && defined(__SM4__))
#include <sm4evexintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include <enqcmdintrin.h>
#endif
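
[Editor's note, an assumption rather than part of the diff: none of these sub-headers is meant to be included directly. <immintrin.h> pulls each one in once the matching feature macro is set by the target flags; for example, a hypothetical invocation such as zig cc -mmovrs -c demo.c would define __MOVRS__ and thereby expose the movrsintrin.h API through the single include below.]
#include <immintrin.h>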
30
lib/include/intrin.h
vendored
30
lib/include/intrin.h
vendored
@ -94,8 +94,8 @@ void __outwordstring(unsigned short, unsigned short *, unsigned long);
unsigned long __readcr0(void);
unsigned long __readcr2(void);
unsigned __LPTRINT_TYPE__ __readcr3(void);
unsigned long __readcr4(void);
unsigned long __readcr8(void);
unsigned __LPTRINT_TYPE__ __readcr4(void);
unsigned __int64 __readcr8(void);
unsigned int __readdr(unsigned int);
#ifdef __i386__
unsigned char __readfsbyte(unsigned long);
@ -124,8 +124,8 @@ void __vmx_vmptrst(unsigned __int64 *);
void __wbinvd(void);
void __writecr0(unsigned int);
void __writecr3(unsigned __INTPTR_TYPE__);
void __writecr4(unsigned int);
void __writecr8(unsigned int);
void __writecr4(unsigned __INTPTR_TYPE__);
void __writecr8(unsigned __int64);
void __writedr(unsigned int, unsigned int);
void __writefsbyte(unsigned long, unsigned char);
void __writefsdword(unsigned long, unsigned long);
@ -330,33 +330,33 @@ static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
__asm__ volatile("hlt");
}

static inline unsigned char __inbyte(unsigned short port) {
static __inline__ unsigned char __inbyte(unsigned short port) {
unsigned char ret;
__asm__ __volatile__("inb %w1, %b0" : "=a"(ret) : "Nd"(port));
return ret;
}

static inline unsigned short __inword(unsigned short port) {
static __inline__ unsigned short __inword(unsigned short port) {
unsigned short ret;
__asm__ __volatile__("inw %w1, %w0" : "=a"(ret) : "Nd"(port));
return ret;
}

static inline unsigned long __indword(unsigned short port) {
static __inline__ unsigned long __indword(unsigned short port) {
unsigned long ret;
__asm__ __volatile__("inl %w1, %k0" : "=a"(ret) : "Nd"(port));
return ret;
}

static inline void __outbyte(unsigned short port, unsigned char data) {
static __inline__ void __outbyte(unsigned short port, unsigned char data) {
__asm__ __volatile__("outb %b0, %w1" : : "a"(data), "Nd"(port));
}

static inline void __outword(unsigned short port, unsigned short data) {
static __inline__ void __outword(unsigned short port, unsigned short data) {
__asm__ __volatile__("outw %w0, %w1" : : "a"(data), "Nd"(port));
}

static inline void __outdword(unsigned short port, unsigned long data) {
static __inline__ void __outdword(unsigned short port, unsigned long data) {
__asm__ __volatile__("outl %k0, %w1" : : "a"(data), "Nd"(port));
}
#endif
@ -396,6 +396,16 @@ unsigned short __readx18word(unsigned long offset);
unsigned long __readx18dword(unsigned long offset);
unsigned __int64 __readx18qword(unsigned long offset);

void __addx18byte(unsigned long offset, unsigned char data);
void __addx18word(unsigned long offset, unsigned short data);
void __addx18dword(unsigned long offset, unsigned long data);
void __addx18qword(unsigned long offset, unsigned __int64 data);

void __incx18byte(unsigned long offset);
void __incx18word(unsigned long offset);
void __incx18dword(unsigned long offset);
void __incx18qword(unsigned long offset);

double _CopyDoubleFromInt64(__int64);
float _CopyFloatFromInt32(__int32);
__int32 _CopyInt32FromFloat(float);
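
[Editor's sketch, not part of the diff: the port-I/O helpers above are MSVC-compatible ring-0 primitives; a minimal kernel-mode use on x86.]
/* Read the PCI configuration address port (0xCF8); requires I/O privilege. */
static unsigned long read_pci_addr(void) { return __indword(0xCF8); }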
11
lib/include/intrin0.h
vendored
11
lib/include/intrin0.h
vendored
@ -44,7 +44,7 @@ unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
__int64 *_ComparandResult);
#endif

#ifdef __x86_64__ && !defined(__arm64ec__)
#if defined(__x86_64__) && !defined(__arm64ec__)
unsigned __int64 _umul128(unsigned __int64, unsigned __int64,
unsigned __int64 *);
unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
@ -207,6 +207,9 @@ long _InterlockedExchange_rel(long volatile *_Target, long _Value);
__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value);
__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value);
__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value);
void *_InterlockedExchangePointer_acq(void *volatile *_Target, void *_Value);
void *_InterlockedExchangePointer_nf(void *volatile *_Target, void *_Value);
void *_InterlockedExchangePointer_rel(void *volatile *_Target, void *_Value);

/*----------------------------------------------------------------------------*\
|* Interlocked Compare Exchange
@ -237,6 +240,12 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
__int64 _Exchange,
__int64 _Comparand);
void *_InterlockedCompareExchangePointer_acq(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
void *_InterlockedCompareExchangePointer_nf(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
void *_InterlockedCompareExchangePointer_rel(void *volatile *_Destination,
void *_Exchange, void *_Comparand);
#endif

#ifdef __cplusplus
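
[Editor's sketch, not part of the diff: the new pointer-sized variants mirror the existing integer forms; a minimal acquire-ordered claim, assuming MSVC-compatible ARM64 compilation.]
#include <stddef.h>
/* Returns the previous value; the slot was claimed iff that value is NULL. */
static void *try_claim(void *volatile *slot, void *desired) {
  return _InterlockedCompareExchangePointer_acq(slot, desired, NULL);
}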
30
lib/include/larchintrin.h
vendored
30
lib/include/larchintrin.h
vendored
@ -228,17 +228,31 @@ extern __inline void
((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
#endif

#define __frecipe_s(/*float*/ _1) \
(float)__builtin_loongarch_frecipe_s((float)_1)
#ifdef __loongarch_frecipe
extern __inline float
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__frecipe_s(float _1) {
return __builtin_loongarch_frecipe_s(_1);
}

#define __frecipe_d(/*double*/ _1) \
(double)__builtin_loongarch_frecipe_d((double)_1)
extern __inline double
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__frecipe_d(double _1) {
return __builtin_loongarch_frecipe_d(_1);
}

#define __frsqrte_s(/*float*/ _1) \
(float)__builtin_loongarch_frsqrte_s((float)_1)
extern __inline float
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__frsqrte_s(float _1) {
return __builtin_loongarch_frsqrte_s(_1);
}

#define __frsqrte_d(/*double*/ _1) \
(double)__builtin_loongarch_frsqrte_d((double)_1)
extern __inline double
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__frsqrte_d(double _1) {
return __builtin_loongarch_frsqrte_d(_1);
}
#endif

#ifdef __cplusplus
}
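
[Editor's sketch, not part of the diff: with the macros replaced by gnu_inline functions that exist only under __loongarch_frecipe, callers can guard on the same macro, which the -mfrecipe target feature defines.]
#ifdef __loongarch_frecipe
/* Fast approximate reciprocal; trades accuracy for latency. */
static inline float approx_recip(float x) { return __frecipe_s(x); }
#endif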
52
lib/include/lasxintrin.h
vendored
52
lib/include/lasxintrin.h
vendored
@ -1726,18 +1726,6 @@ extern __inline
return (__m256d)__builtin_lasx_xvfrecip_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrecipe_s(__m256 _1) {
return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
__lasx_xvfrecipe_d(__m256d _1) {
return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrint_s(__m256 _1) {
@ -1762,18 +1750,6 @@ extern __inline
return (__m256d)__builtin_lasx_xvfrsqrt_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrsqrte_s(__m256 _1) {
return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
__lasx_xvfrsqrte_d(__m256d _1) {
return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvflogb_s(__m256 _1) {
@ -2585,7 +2561,7 @@ extern __inline
extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
__lasx_xvorn_v(__m256i _1, __m256i _2) {
return (__m256i)__builtin_lasx_xvorn_v((v32i8)_1, (v32i8)_2);
return (__m256i)__builtin_lasx_xvorn_v((v32u8)_1, (v32u8)_2);
}

#define __lasx_xvldi(/*i13*/ _1) ((__m256i)__builtin_lasx_xvldi((_1)))
@ -3866,6 +3842,32 @@ extern __inline
return (__m256i)__builtin_lasx_xvfcmp_sun_s((v8f32)_1, (v8f32)_2);
}

#if defined(__loongarch_frecipe)
extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrecipe_s(__m256 _1) {
return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
__lasx_xvfrecipe_d(__m256d _1) {
return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
__lasx_xvfrsqrte_s(__m256 _1) {
return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
__lasx_xvfrsqrte_d(__m256d _1) {
return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1);
}
#endif

#define __lasx_xvpickve_d_f(/*__m256d*/ _1, /*ui2*/ _2) \
((__m256d)__builtin_lasx_xvpickve_d_f((v4f64)(_1), (_2)))

11
lib/include/limits.h
vendored
11
lib/include/limits.h
vendored
@ -111,11 +111,14 @@
#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
#endif

/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension. It's too bad
that we don't have something like #pragma poison that could be used to
deprecate a macro - the code should just use LLONG_MAX and friends.
/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension. Android's
bionic also defines them. It's too bad that we don't have something like
#pragma poison that could be used to deprecate a macro - the code should just
use LLONG_MAX and friends.
*/
#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
#if (defined(__GNU_LIBRARY__) ? defined(__USE_GNU) \
: !defined(__STRICT_ANSI__)) || \
defined(__BIONIC__)

#undef LONG_LONG_MIN
#undef LONG_LONG_MAX
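
[Editor's sketch, not from the diff: after this change the GNU-style macros are also visible on Android's bionic, not only on glibc with __USE_GNU or outside strict-ANSI mode.]
#include <limits.h>
/* Compiles on bionic even under -std=c11 after this change. */
static const long long demo_max = LONG_LONG_MAX;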
38
lib/include/llvm_libc_wrappers/ctype.h
vendored
38
lib/include/llvm_libc_wrappers/ctype.h
vendored
@ -51,6 +51,19 @@
#pragma push_macro("toascii")
#pragma push_macro("tolower")
#pragma push_macro("toupper")
#pragma push_macro("isalnum_l")
#pragma push_macro("isalpha_l")
#pragma push_macro("isascii_l")
#pragma push_macro("isblank_l")
#pragma push_macro("iscntrl_l")
#pragma push_macro("isdigit_l")
#pragma push_macro("isgraph_l")
#pragma push_macro("islower_l")
#pragma push_macro("isprint_l")
#pragma push_macro("ispunct_l")
#pragma push_macro("isspace_l")
#pragma push_macro("isupper_l")
#pragma push_macro("isxdigit_l")

#undef isalnum
#undef isalpha
@ -68,6 +81,18 @@
#undef toascii
#undef tolower
#undef toupper
#undef isalnum_l
#undef isalpha_l
#undef iscntrl_l
#undef isdigit_l
#undef islower_l
#undef isgraph_l
#undef isprint_l
#undef ispunct_l
#undef isspace_l
#undef isupper_l
#undef isblank_l
#undef isxdigit_l

#pragma omp begin declare target

@ -93,6 +118,19 @@
#pragma pop_macro("toascii")
#pragma pop_macro("tolower")
#pragma pop_macro("toupper")
#pragma pop_macro("isalnum_l")
#pragma pop_macro("isalpha_l")
#pragma pop_macro("isascii_l")
#pragma pop_macro("isblank_l")
#pragma pop_macro("iscntrl_l")
#pragma pop_macro("isdigit_l")
#pragma pop_macro("isgraph_l")
#pragma pop_macro("islower_l")
#pragma pop_macro("isprint_l")
#pragma pop_macro("ispunct_l")
#pragma pop_macro("isspace_l")
#pragma pop_macro("isupper_l")
#pragma pop_macro("isxdigit_l")
#endif

#undef __LIBC_ATTRS
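
[Editor's note, illustrative only: the *_l additions follow the wrapper's usual push/undef/pop idiom, sketched here in isolation for one macro.]
#pragma push_macro("isdigit_l") /* save any host #define */
#undef isdigit_l                /* expose the real declaration */
/* ... declare the offloading overlay while the macro is hidden ... */
#pragma pop_macro("isdigit_l")  /* restore the host definition */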
8
lib/include/llvm_libc_wrappers/stdlib.h
vendored
8
lib/include/llvm_libc_wrappers/stdlib.h
vendored
@ -34,8 +34,16 @@ _Static_assert(__builtin_offsetof(div_t, quot) == 0, "ABI mismatch!");
_Static_assert(__builtin_offsetof(ldiv_t, quot) == 0, "ABI mismatch!");
_Static_assert(__builtin_offsetof(lldiv_t, quot) == 0, "ABI mismatch!");

#if defined(__GLIBC__) && __cplusplus >= 201703L
#define at_quick_exit atexit
#endif

#include <llvm-libc-decls/stdlib.h>

#if defined(__GLIBC__) && __cplusplus >= 201703L
#undef at_quick_exit
#endif

#pragma omp end declare target

#undef __LIBC_ATTRS
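
[Editor's note, an assumption about intent: glibc's C++17 headers declare at_quick_exit themselves, so the wrapper temporarily maps the name onto atexit while importing the LLVM-libc declarations; user code keeps calling it normally.]
#include <stdlib.h>
/* Registers a quick-exit handler; unaffected by the wrapper's rename. */
static int reg_handler(void (*fn)(void)) { return at_quick_exit(fn); }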
52
lib/include/lsxintrin.h
vendored
52
lib/include/lsxintrin.h
vendored
@ -1776,18 +1776,6 @@ extern __inline
return (__m128d)__builtin_lsx_vfrecip_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrecipe_s(__m128 _1) {
return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
__lsx_vfrecipe_d(__m128d _1) {
return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrint_s(__m128 _1) {
@ -1812,18 +1800,6 @@ extern __inline
return (__m128d)__builtin_lsx_vfrsqrt_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrsqrte_s(__m128 _1) {
return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
__lsx_vfrsqrte_d(__m128d _1) {
return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vflogb_s(__m128 _1) {
@ -3425,7 +3401,7 @@ extern __inline
extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i
__lsx_vorn_v(__m128i _1, __m128i _2) {
return (__m128i)__builtin_lsx_vorn_v((v16i8)_1, (v16i8)_2);
return (__m128i)__builtin_lsx_vorn_v((v16u8)_1, (v16u8)_2);
}

#define __lsx_vldi(/*i13*/ _1) ((__m128i)__builtin_lsx_vldi((_1)))
@ -3738,6 +3714,32 @@ extern __inline
return (__m128i)__builtin_lsx_vfcmp_sun_s((v4f32)_1, (v4f32)_2);
}

#if defined(__loongarch_frecipe)
extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrecipe_s(__m128 _1) {
return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
__lsx_vfrecipe_d(__m128d _1) {
return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
__lsx_vfrsqrte_s(__m128 _1) {
return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1);
}

extern __inline
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
__lsx_vfrsqrte_d(__m128d _1) {
return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1);
}
#endif

#define __lsx_vrepli_b(/*si10*/ _1) ((__m128i)__builtin_lsx_vrepli_b((_1)))

#define __lsx_vrepli_d(/*si10*/ _1) ((__m128i)__builtin_lsx_vrepli_d((_1)))
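
[Editor's sketch, not part of the diff: as in larchintrin.h, the LSX/LASX estimate intrinsics now exist only when -mfrecipe defines __loongarch_frecipe.]
#include <lsxintrin.h>
#ifdef __loongarch_frecipe
/* Per-lane reciprocal square-root estimate on a 128-bit LSX vector. */
static __m128 rsqrt_est_demo(__m128 v) { return __lsx_vfrsqrte_s(v); }
#endif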
17
lib/include/lzcntintrin.h
vendored
17
lib/include/lzcntintrin.h
vendored
@ -15,7 +15,13 @@
#define __LZCNTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
#endif

#ifndef _MSC_VER
/// Counts the number of leading zero bits in the operand.
@ -43,8 +49,7 @@
/// bits in the operand.
/// \see _lzcnt_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__lzcnt32(unsigned int __X)
{
__lzcnt32(unsigned int __X) {
return __builtin_ia32_lzcnt_u32(__X);
}

@ -60,8 +65,7 @@ __lzcnt32(unsigned int __X)
/// bits in the operand.
/// \see __lzcnt32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_lzcnt_u32(unsigned int __X)
{
_lzcnt_u32(unsigned int __X) {
return __builtin_ia32_lzcnt_u32(__X);
}

@ -93,8 +97,7 @@ _lzcnt_u32(unsigned int __X)
/// bits in the operand.
/// \see __lzcnt64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_lzcnt_u64(unsigned long long __X)
{
_lzcnt_u64(unsigned long long __X) {
return __builtin_ia32_lzcnt_u64(__X);
}
#endif
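
[Editor's sketch, not from the diff: with constexpr attached under C++11, the count can fold in constant expressions; C callers are unchanged. Assumes a C++ translation unit built with -mlzcnt.]
#if defined(__cplusplus) && (__cplusplus >= 201103L)
static_assert(_lzcnt_u32(1u) == 31, "usable at compile time after this change");
#endif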
364
lib/include/mmintrin.h
vendored
364
lib/include/mmintrin.h
vendored
@ -21,10 +21,45 @@ typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));

/* Unsigned types */
typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8)));
typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
typedef unsigned char __v8qu __attribute__((__vector_size__(8)));

/* We need an explicitly signed variant for char. Note that this shouldn't
* appear in the interface though. */
typedef signed char __v8qs __attribute__((__vector_size__(8)));

/* SSE/SSE2 types */
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \
__min_vector_width__(64)))
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS_SSE2 \
__attribute__((__always_inline__, __nodebug__, \
__target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS_SSE2 \
__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
__min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#else
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#endif

#define __trunc64(x) \
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __anyext128(x) \
(__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
1, -1, -1)

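[Editor's note, illustrative: these two helpers carry the whole MMX-over-SSE2 rewrite. __trunc64 keeps the low 64 bits of a 128-bit vector; __anyext128 widens a 64-bit vector and leaves the upper lanes unspecified (the -1 shuffle indices).]
/* Equivalent long-hand: the low 64 bits hold m, the upper 64 are undefined. */
static __m128i widen_demo(__m64 m) { return __anyext128(m); }
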
/// Clears the MMX state by setting the state of the x87 stack registers
|
||||
/// to empty.
|
||||
@ -50,10 +85,10 @@ _mm_empty(void) {
|
||||
/// A 32-bit integer value.
|
||||
/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
|
||||
/// parameter. The upper 32 bits are set to 0.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_cvtsi32_si64(int __i)
|
||||
{
|
||||
return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
|
||||
return __extension__ (__m64)(__v2si){__i, 0};
|
||||
}
|
||||
|
||||
/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
|
||||
@ -67,10 +102,10 @@ _mm_cvtsi32_si64(int __i)
|
||||
/// A 64-bit integer vector.
|
||||
/// \returns A 32-bit signed integer value containing the lower 32 bits of the
|
||||
/// parameter.
|
||||
static __inline__ int __DEFAULT_FN_ATTRS
|
||||
static __inline__ int __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_cvtsi64_si32(__m64 __m)
|
||||
{
|
||||
return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
|
||||
return ((__v2si)__m)[0];
|
||||
}
|
||||
|
||||
/// Casts a 64-bit signed integer value into a 64-bit integer vector.
|
||||
@ -83,7 +118,7 @@ _mm_cvtsi64_si32(__m64 __m)
|
||||
/// A 64-bit signed integer.
|
||||
/// \returns A 64-bit integer vector containing the same bitwise pattern as the
|
||||
/// parameter.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_cvtsi64_m64(long long __i)
|
||||
{
|
||||
return (__m64)__i;
|
||||
@ -99,7 +134,7 @@ _mm_cvtsi64_m64(long long __i)
|
||||
/// A 64-bit integer vector.
|
||||
/// \returns A 64-bit signed integer containing the same bitwise pattern as the
|
||||
/// parameter.
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS
|
||||
static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_cvtm64_si64(__m64 __m)
|
||||
{
|
||||
return (long long)__m;
|
||||
@ -124,10 +159,11 @@ _mm_cvtm64_si64(__m64 __m)
|
||||
/// written to the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_packs_pi16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
|
||||
return __trunc64(__builtin_ia32_packsswb128(
|
||||
(__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
|
||||
}
|
||||
|
||||
/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
|
||||
@ -149,10 +185,11 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
|
||||
/// written to the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_packs_pi32(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
|
||||
return __trunc64(__builtin_ia32_packssdw128(
|
||||
(__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){}));
|
||||
}
|
||||
|
||||
/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
|
||||
@ -174,10 +211,11 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
|
||||
/// written to the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_packs_pu16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
|
||||
return __trunc64(__builtin_ia32_packuswb128(
|
||||
(__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){}));
|
||||
}
|
||||
|
||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
|
||||
@ -201,10 +239,11 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
|
||||
/// Bits [63:56] are written to bits [63:56] of the result.
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
|
||||
4, 12, 5, 13, 6, 14, 7, 15);
|
||||
}
|
||||
|
||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
|
||||
@ -224,10 +263,11 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
|
||||
/// Bits [63:48] are written to bits [63:48] of the result.
|
||||
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
|
||||
2, 6, 3, 7);
|
||||
}
|
||||
|
||||
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
|
||||
@ -245,10 +285,10 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
|
||||
/// the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
|
||||
}
|
||||
|
||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
|
||||
@ -272,10 +312,11 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
|
||||
/// Bits [31:24] are written to bits [63:56] of the result.
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
|
||||
0, 8, 1, 9, 2, 10, 3, 11);
|
||||
}
|
||||
|
||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
|
||||
@ -295,10 +336,11 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
|
||||
/// Bits [31:16] are written to bits [63:48] of the result.
|
||||
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
|
||||
0, 4, 1, 5);
|
||||
}
|
||||
|
||||
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
|
||||
@ -316,10 +358,10 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
|
||||
/// the upper 32 bits of the result.
|
||||
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
|
||||
/// values.
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS
|
||||
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
|
||||
{
|
||||
return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
|
||||
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
|
||||
}
|
||||
|
||||
/// Adds each 8-bit integer element of the first 64-bit integer vector
|
||||
@ -337,10 +379,10 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
|
||||
/// A 64-bit integer vector of [8 x i8].
|
||||
/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
}

/// Adds each 16-bit integer element of the first 64-bit integer vector
@ -358,10 +400,10 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
}

/// Adds each 32-bit integer element of the first 64-bit integer vector
@ -379,10 +421,10 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
}

/// Adds, with saturation, each 8-bit signed integer element of the first
@ -403,10 +445,10 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
/// of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2);
}

/// Adds, with saturation, each 16-bit signed integer element of the first
@ -427,10 +469,10 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
/// of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2);
}

/// Adds, with saturation, each 8-bit unsigned integer element of the first
@ -450,10 +492,10 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// unsigned sums of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pu8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2);
}

/// Adds, with saturation, each 16-bit unsigned integer element of the first
@ -473,10 +515,10 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// unsigned sums of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pu16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2);
}

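[Editor's note: a minimal sketch, not part of the diff, of how the wrapping and saturating forms above differ, assuming an x86-64 compiler shipping these headers:]

#include <mmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  __m64 a = _mm_set1_pi16(0x7000);
  __m64 b = _mm_set1_pi16(0x2000);
  __m64 wrap = _mm_add_pi16(a, b);  /* wraps: 0x7000 + 0x2000 = 0x9000 (negative as i16) */
  __m64 sat = _mm_adds_pi16(a, b);  /* saturates at INT16_MAX: 0x7fff */
  uint64_t w, s;
  memcpy(&w, &wrap, sizeof(w));
  memcpy(&s, &sat, sizeof(s));
  printf("wrap=%016llx sat=%016llx\n", (unsigned long long)w,
         (unsigned long long)s);
  return 0;
}
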
/// Subtracts each 8-bit integer element of the second 64-bit integer
@ -494,10 +536,10 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
/// both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
}

/// Subtracts each 16-bit integer element of the second 64-bit integer
@ -515,10 +557,10 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
/// both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
}

/// Subtracts each 32-bit integer element of the second 64-bit integer
@ -536,10 +578,10 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
/// both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
}

/// Subtracts, with saturation, each 8-bit signed integer element of the second
@ -560,10 +602,10 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// differences of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2);
}

/// Subtracts, with saturation, each 16-bit signed integer element of the
@ -584,10 +626,10 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// differences of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2);
}

/// Subtracts each 8-bit unsigned integer element of the second 64-bit
@ -608,10 +650,10 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// differences of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pu8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2);
}

/// Subtracts each 16-bit unsigned integer element of the second 64-bit
@ -632,10 +674,10 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// differences of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2);
}

/// Multiplies each 16-bit signed integer element of the first 64-bit
@ -659,10 +701,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
/// products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
(__v8hi)__anyext128(__m2)));
}

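[Editor's note: the __trunc64 and __anyext128 helpers used here are defined earlier in mmintrin.h, outside this hunk. A plausible reading of what they do, sketched as an assumption rather than quoted from the diff: widen an __m64 into the low half of an __m128i with the upper half undefined, and truncate back to the low 64 bits.]

#define __anyext128(x)                                                  \
  (__m128i) __builtin_shufflevector((__v2di)(__m64)(x),                 \
                                    __extension__(__v2di){}, 0, -1)
#define __trunc64(x)                                                    \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
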
/// Multiplies each 16-bit signed integer element of the first 64-bit
@ -680,10 +723,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
/// of the products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
(__v8hi)__anyext128(__m2)));
}

/// Multiplies each 16-bit signed integer element of the first 64-bit
@ -701,10 +745,10 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
/// of the products of both parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
}

/// Left-shifts each 16-bit signed integer element of the first
@ -724,10 +768,11 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
/// values. If \a __count is greater or equal to 16, the result is set to all
/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
(__v8hi)__anyext128(__count)));
}

/// Left-shifts each 16-bit signed integer element of a 64-bit integer
@ -746,10 +791,11 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
/// values. If \a __count is greater or equal to 16, the result is set to all
/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
__count));
}

/// Left-shifts each 32-bit signed integer element of the first
@ -769,10 +815,11 @@ _mm_slli_pi16(__m64 __m, int __count)
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
/// values. If \a __count is greater or equal to 32, the result is set to all
/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
(__v4si)__anyext128(__count)));
}

/// Left-shifts each 32-bit signed integer element of a 64-bit integer
@ -791,10 +838,11 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
/// values. If \a __count is greater or equal to 32, the result is set to all
/// 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
__count));
}

/// Left-shifts the first 64-bit integer parameter by the number of bits
@ -811,10 +859,11 @@ _mm_slli_pi32(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector containing the left-shifted value. If
/// \a __count is greater or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_si64(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
(__v2di)__anyext128(__count)));
}

/// Left-shifts the first parameter, which is a 64-bit integer, by the
@ -831,10 +880,11 @@ _mm_sll_si64(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector containing the left-shifted value. If
/// \a __count is greater or equal to 64, the result is set to 0.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_si64(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
__count));
}

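[Editor's note: a small usage sketch, not part of the diff, of the shift-count rule documented above:]

#include <mmintrin.h>

/* Every lane is shifted by the same count; a count >= the element width
   yields all-zero lanes rather than a masked shift. */
__m64 shift_demo(__m64 v) {
  __m64 by3 = _mm_slli_pi16(v, 3);      /* each 16-bit lane << 3 */
  __m64 zeroed = _mm_slli_pi16(v, 16);  /* count >= 16: result is all 0 */
  return _mm_or_si64(by3, zeroed);
}
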
/// Right-shifts each 16-bit integer element of the first parameter,
@ -855,10 +905,11 @@ _mm_slli_si64(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
(__v8hi)__anyext128(__count)));
}

/// Right-shifts each 16-bit integer element of a 64-bit integer vector
@ -878,10 +929,11 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
__count));
}

/// Right-shifts each 32-bit integer element of the first parameter,
@ -902,10 +954,11 @@ _mm_srai_pi16(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
(__v4si)__anyext128(__count)));
}

/// Right-shifts each 32-bit integer element of a 64-bit integer vector
@ -925,10 +978,11 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
__count));
}

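[Editor's note: a sketch, not part of the diff, contrasting these arithmetic right shifts with the logical right shifts that follow below:]

#include <mmintrin.h>

/* Arithmetic shifts replicate the sign bit; logical shifts fill with zeros. */
__m64 sra_vs_srl(void) {
  __m64 v = _mm_set1_pi16(-16);   /* 0xfff0 in every lane */
  __m64 a = _mm_srai_pi16(v, 2);  /* 0xfffc per lane (-4, sign-extended) */
  __m64 l = _mm_srli_pi16(v, 2);  /* 0x3ffc per lane (zero-filled) */
  return _mm_xor_si64(a, l);
}
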
/// Right-shifts each 16-bit integer element of the first parameter,
@ -948,10 +1002,11 @@ _mm_srai_pi32(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
(__v8hi)__anyext128(__count)));
}

/// Right-shifts each 16-bit integer element of a 64-bit integer vector
@ -970,10 +1025,11 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
__count));
}

/// Right-shifts each 32-bit integer element of the first parameter,
@ -993,10 +1049,11 @@ _mm_srli_pi16(__m64 __m, int __count)
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
(__v4si)__anyext128(__count)));
}

/// Right-shifts each 32-bit integer element of a 64-bit integer vector
@ -1015,10 +1072,11 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
__count));
}

/// Right-shifts the first 64-bit integer parameter by the number of bits
@ -1035,10 +1093,11 @@ _mm_srli_pi32(__m64 __m, int __count)
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector containing the right-shifted value.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_si64(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
(__v2di)__anyext128(__count)));
}

/// Right-shifts the first parameter, which is a 64-bit integer, by the
@ -1056,10 +1115,11 @@ _mm_srl_si64(__m64 __m, __m64 __count)
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector containing the right-shifted value.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_si64(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
__count));
}

/// Performs a bitwise AND of two 64-bit integer vectors.
@ -1074,10 +1134,10 @@ _mm_srli_si64(__m64 __m, int __count)
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise AND of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_and_si64(__m64 __m1, __m64 __m2)
{
return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
}

/// Performs a bitwise NOT of the first 64-bit integer vector, and then
@ -1095,10 +1155,10 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise AND of the second
/// parameter and the one's complement of the first parameter.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
}

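[Editor's note: a one-liner sketch, not part of the diff, since the operand order of andnot is a common stumbling block:]

#include <mmintrin.h>

/* The complement applies to the FIRST argument: result = (~m1) & m2. */
__m64 clear_masked_bits(__m64 mask, __m64 value) {
  return _mm_andnot_si64(mask, value);  /* value with mask's set bits cleared */
}
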
/// Performs a bitwise OR of two 64-bit integer vectors.
@ -1113,10 +1173,10 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise OR of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_or_si64(__m64 __m1, __m64 __m2)
{
return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
}

/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@ -1131,10 +1191,10 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
/// parameters.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
}

/// Compares the 8-bit integer elements of two 64-bit integer vectors of
@ -1153,10 +1213,10 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
}

/// Compares the 16-bit integer elements of two 64-bit integer vectors of
@ -1175,10 +1235,10 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
}

/// Compares the 32-bit integer elements of two 64-bit integer vectors of
@ -1197,10 +1257,10 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
}

/// Compares the 8-bit integer elements of two 64-bit integer vectors of
@ -1219,10 +1279,12 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
/* This function always performs a signed comparison, but __v8qi is a char
which may be signed or unsigned, so use __v8qs. */
return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
}

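[Editor's note: a sketch, not part of the diff, of why the all-ones/all-zeros comparison results documented above are useful as select masks:]

#include <mmintrin.h>

/* Comparison lanes are all-ones where true and zero where false, so the
   result can drive a bitwise select; this computes a per-lane signed max. */
__m64 max_pi8(__m64 a, __m64 b) {
  __m64 gt = _mm_cmpgt_pi8(a, b);              /* 0xff where a > b */
  return _mm_or_si64(_mm_and_si64(gt, a),      /* keep a where a > b */
                     _mm_andnot_si64(gt, b));  /* otherwise keep b */
}
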
/// Compares the 16-bit integer elements of two 64-bit integer vectors of
@ -1241,10 +1303,10 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
}

/// Compares the 32-bit integer elements of two 64-bit integer vectors of
@ -1263,10 +1325,10 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
/// results.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
return (__m64)((__v2si)__m1 > (__v2si)__m2);
}

/// Constructs a 64-bit integer vector initialized to zero.
@ -1276,10 +1338,9 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
/// This intrinsic corresponds to the <c> PXOR </c> instruction.
///
/// \returns An initialized 64-bit integer vector with all elements set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_setzero_si64(void)
{
return __extension__ (__m64){ 0LL };
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setzero_si64(void) {
return __extension__(__m64){0LL};
}

/// Constructs a 64-bit integer vector initialized with the specified
@ -1297,10 +1358,9 @@ _mm_setzero_si64(void)
/// A 32-bit integer value used to initialize the lower 32 bits of the
/// result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set_pi32(int __i1, int __i0)
{
return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi32(int __i1, int __i0) {
return __extension__(__m64)(__v2si){__i0, __i1};
}

/// Constructs a 64-bit integer vector initialized with the specified
@ -1320,10 +1380,9 @@ _mm_set_pi32(int __i1, int __i0)
/// \param __s0
/// A 16-bit integer value used to initialize bits [15:0] of the result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
{
return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) {
return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3};
}

/// Constructs a 64-bit integer vector initialized with the specified
@ -1351,12 +1410,11 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
/// \param __b0
/// An 8-bit integer value used to initialize bits [7:0] of the result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
char __b1, char __b0)
{
return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
__b4, __b5, __b6, __b7);
char __b1, char __b0) {
return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3,
__b4, __b5, __b6, __b7};
}

/// Constructs a 64-bit integer vector of [2 x i32], with each of the
@ -1372,10 +1430,9 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
/// A 32-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [2 x i32].
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set1_pi32(int __i)
{
return _mm_set_pi32(__i, __i);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi32(int __i) {
return _mm_set_pi32(__i, __i);
}

/// Constructs a 64-bit integer vector of [4 x i16], with each of the
@ -1391,10 +1448,9 @@ _mm_set1_pi32(int __i)
/// A 16-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [4 x i16].
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set1_pi16(short __w)
{
return _mm_set_pi16(__w, __w, __w, __w);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi16(short __w) {
return _mm_set_pi16(__w, __w, __w, __w);
}

/// Constructs a 64-bit integer vector of [8 x i8], with each of the
@ -1409,10 +1465,9 @@ _mm_set1_pi16(short __w)
/// An 8-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [8 x i8].
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_set1_pi8(char __b)
{
return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_set1_pi8(char __b) {
return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
}

/// Constructs a 64-bit integer vector, initialized in reverse order with
@ -1430,10 +1485,9 @@ _mm_set1_pi8(char __b)
/// A 32-bit integer value used to initialize the upper 32 bits of the
/// result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_setr_pi32(int __i0, int __i1)
{
return _mm_set_pi32(__i1, __i0);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setr_pi32(int __i0, int __i1) {
return _mm_set_pi32(__i1, __i0);
}

/// Constructs a 64-bit integer vector, initialized in reverse order with
@ -1453,10 +1507,9 @@ _mm_setr_pi32(int __i0, int __i1)
/// \param __w3
/// A 16-bit integer value used to initialize bits [63:48] of the result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
{
return _mm_set_pi16(__w3, __w2, __w1, __w0);
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

/// Constructs a 64-bit integer vector, initialized in reverse order with
@ -1484,14 +1537,15 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
/// \param __b7
/// An 8-bit integer value used to initialize bits [63:56] of the result.
/// \returns An initialized 64-bit integer vector.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
char __b6, char __b7)
{
return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
char __b6, char __b7) {
return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

#undef __DEFAULT_FN_ATTRS
#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS_SSE2

/* Aliases for compatibility. */
#define _m_empty _mm_empty

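[Editor's note: a minimal sketch, not part of the diff, of the element ordering of the set/setr constructors above:]

#include <mmintrin.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

int main(void) {
  /* _mm_set_pi16 lists elements from highest to lowest; _mm_setr_pi16
     reverses that, so these two vectors are identical. */
  __m64 hi_first = _mm_set_pi16(4, 3, 2, 1);
  __m64 lo_first = _mm_setr_pi16(1, 2, 3, 4);
  uint64_t a, b;
  memcpy(&a, &hi_first, sizeof(a));
  memcpy(&b, &lo_first, sizeof(b));
  assert(a == b && a == 0x0004000300020001ULL);
  return 0;
}
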
6
lib/include/module.modulemap
vendored
6
lib/include/module.modulemap
vendored
@ -66,6 +66,8 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "__wmmintrin_aes.h"
textual header "__wmmintrin_pclmul.h"

textual header "mm3dnow.h"

explicit module mm_malloc {
requires !freestanding
header "mm_malloc.h"
@ -122,10 +124,6 @@ module _Builtin_intrinsics [system] [extern_c] {
header "popcntintrin.h"
}

explicit module mm3dnow {
header "mm3dnow.h"
}

explicit module aes_pclmul {
header "wmmintrin.h"
export aes

98
lib/include/movrs_avx10_2_512intrin.h
vendored
Normal file
98
lib/include/movrs_avx10_2_512intrin.h
vendored
Normal file
@ -0,0 +1,98 @@
/*===----- movrs_avx10_2_512intrin.h - AVX10.2-512-MOVRS intrinsics --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <movrs_avx10_2_512intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __MOVRS_AVX10_2_512INTRIN_H
#define __MOVRS_AVX10_2_512INTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
__attribute__((__always_inline__, __nodebug__, \
__target__("movrs, avx10.2-512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_loadrs_epi8(void const *__A) {
return (__m512i)__builtin_ia32_vmovrsb512((const __v64qi *)(__A));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadrs_epi8(__m512i __W, __mmask64 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectb_512(
(__mmask64)__U, (__v64qi)_mm512_loadrs_epi8(__A), (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadrs_epi8(__mmask64 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
(__v64qi)_mm512_loadrs_epi8(__A),
(__v64qi)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_loadrs_epi32(void const *__A) {
return (__m512i)__builtin_ia32_vmovrsd512((const __v16si *)(__A));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadrs_epi32(__m512i __W, __mmask16 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_loadrs_epi32(__A), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadrs_epi32(__mmask16 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_loadrs_epi32(__A),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_loadrs_epi64(void const *__A) {
return (__m512i)__builtin_ia32_vmovrsq512((const __v8di *)(__A));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadrs_epi64(__m512i __W, __mmask8 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectq_512(
(__mmask8)__U, (__v8di)_mm512_loadrs_epi64(__A), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadrs_epi64(__mmask8 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_loadrs_epi64(__A),
(__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_loadrs_epi16(void const *__A) {
return (__m512i)__builtin_ia32_vmovrsw512((const __v32hi *)(__A));
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadrs_epi16(__m512i __W, __mmask32 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectw_512(
(__mmask32)__U, (__v32hi)_mm512_loadrs_epi16(__A), (__v32hi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadrs_epi16(__mmask32 __U, void const *__A) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_loadrs_epi16(__A),
(__v32hi)_mm512_setzero_si512());
}

#undef __DEFAULT_FN_ATTRS512

#endif /* __x86_64__ */
#endif /* __MOVRS_AVX10_2_512INTRIN_H */

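[Editor's note: a hypothetical usage sketch, not part of the diff, assuming a CPU and compiler flags enabling both MOVRS and AVX10.2-512:]

#include <immintrin.h>

/* Read-shared 64-byte load where only the lanes selected by the mask
   are kept and the rest are zeroed. */
__m512i load_selected_bytes(const void *src, __mmask64 lanes) {
  return _mm512_maskz_loadrs_epi8(lanes, src);
}
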
174
lib/include/movrs_avx10_2intrin.h
vendored
Normal file
174
lib/include/movrs_avx10_2intrin.h
vendored
Normal file
@ -0,0 +1,174 @@
/*===--------- movrs_avx10_2intrin.h - AVX10.2-MOVRS intrinsics ------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <movrs_avx10_2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __MOVRS_AVX10_2INTRIN_H
#define __MOVRS_AVX10_2INTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("movrs,avx10.2-256"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("movrs,avx10.2-256"), __min_vector_width__(256)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_loadrs_epi8(void const *__A) {
return (__m128i)__builtin_ia32_vmovrsb128((const __v16qi *)(__A));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_loadrs_epi8(__m128i __W, __mmask16 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectb_128(
(__mmask16)__U, (__v16qi)_mm_loadrs_epi8(__A), (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_loadrs_epi8(__mmask16 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
(__v16qi)_mm_loadrs_epi8(__A),
(__v16qi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_loadrs_epi8(void const *__A) {
return (__m256i)__builtin_ia32_vmovrsb256((const __v32qi *)(__A));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_loadrs_epi8(__m256i __W, __mmask32 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectb_256(
(__mmask32)__U, (__v32qi)_mm256_loadrs_epi8(__A), (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_loadrs_epi8(__mmask32 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
(__v32qi)_mm256_loadrs_epi8(__A),
(__v32qi)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_loadrs_epi32(void const *__A) {
return (__m128i)__builtin_ia32_vmovrsd128((const __v4si *)(__A));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_loadrs_epi32(__m128i __W, __mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectd_128(
(__mmask8)__U, (__v4si)_mm_loadrs_epi32(__A), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_loadrs_epi32(__mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_loadrs_epi32(__A),
(__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_loadrs_epi32(void const *__A) {
return (__m256i)__builtin_ia32_vmovrsd256((const __v8si *)(__A));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_loadrs_epi32(__m256i __W, __mmask8 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectd_256(
(__mmask8)__U, (__v8si)_mm256_loadrs_epi32(__A), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_loadrs_epi32(__mmask8 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_loadrs_epi32(__A),
(__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_loadrs_epi64(void const *__A) {
return (__m128i)__builtin_ia32_vmovrsq128((const __v2di *)(__A));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_loadrs_epi64(__m128i __W, __mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectq_128(
(__mmask8)__U, (__v2di)_mm_loadrs_epi64(__A), (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_loadrs_epi64(__mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_loadrs_epi64(__A),
(__v2di)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_loadrs_epi64(void const *__A) {
return (__m256i)__builtin_ia32_vmovrsq256((const __v4di *)(__A));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_loadrs_epi64(__m256i __W, __mmask8 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectq_256(
(__mmask8)__U, (__v4di)_mm256_loadrs_epi64(__A), (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_loadrs_epi64(__mmask8 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_loadrs_epi64(__A),
(__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_loadrs_epi16(void const *__A) {
return (__m128i)__builtin_ia32_vmovrsw128((const __v8hi *)(__A));
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_loadrs_epi16(__m128i __W, __mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectw_128(
(__mmask8)__U, (__v8hi)_mm_loadrs_epi16(__A), (__v8hi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_loadrs_epi16(__mmask8 __U, void const *__A) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_loadrs_epi16(__A),
(__v8hi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_loadrs_epi16(void const *__A) {
return (__m256i)__builtin_ia32_vmovrsw256((const __v16hi *)(__A));
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_loadrs_epi16(__m256i __W, __mmask16 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectw_256(
(__mmask16)__U, (__v16hi)_mm256_loadrs_epi16(__A), (__v16hi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_loadrs_epi16(__mmask16 __U, void const *__A) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_loadrs_epi16(__A),
(__v16hi)_mm256_setzero_si256());
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif /* __x86_64__ */
#endif /* __MOVRS_AVX10_2INTRIN_H */

59
lib/include/movrsintrin.h
vendored
Normal file
59
lib/include/movrsintrin.h
vendored
Normal file
@ -0,0 +1,59 @@
/*===---------------- movrsintrin.h - MOVRS intrinsics ----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===----------------------------------------------------------------------===*/

#ifndef __IMMINTRIN_H
#error "Never use <movrsintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __MOVRSINTRIN_H
#define __MOVRSINTRIN_H

#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("movrs")))

#ifdef __x86_64__
static __inline__ char __DEFAULT_FN_ATTRS _movrs_i8(const void *__A) {
return (char)__builtin_ia32_movrsqi((const void *)__A);
}

static __inline__ short __DEFAULT_FN_ATTRS _movrs_i16(const void *__A) {
return (short)__builtin_ia32_movrshi((const void *)__A);
}

static __inline__ int __DEFAULT_FN_ATTRS _movrs_i32(const void *__A) {
return (int)__builtin_ia32_movrssi((const void *)__A);
}

static __inline__ long long __DEFAULT_FN_ATTRS _movrs_i64(const void *__A) {
return (long long)__builtin_ia32_movrsdi((const void *)__A);
}
#endif // __x86_64__

/// Loads a memory sequence containing the specified memory address into
/// the L3 data cache. Data will be shared (read/written) to by requesting
/// core and other cores.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHRS instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchrs(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_ia32_prefetchrs((const void *)__P);
#pragma clang diagnostic pop
}

#undef __DEFAULT_FN_ATTRS
#endif // __MOVRSINTRIN_H

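[Editor's note: a hypothetical usage sketch, not part of the diff, assuming an x86-64 target with MOVRS enabled:]

#include <immintrin.h>

/* Hint that the line should be cached read-shared in L3, then load two
   ints through the read-shared path. */
long long sum_shared_pair(const int *p) {
  _m_prefetchrs(p);
  return (long long)_movrs_i32(&p[0]) + (long long)_movrs_i32(&p[1]);
}
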
@ -10,17 +10,15 @@
#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__

#ifndef _OPENMP
#error "This file is for OpenMP compilation only."
#endif

#ifdef __cplusplus
extern "C" {
#endif

#ifdef __NVPTX__
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

#pragma push_macro("__CUDA__")
#define __CUDA__
#define __OPENMP_NVPTX__

@ -31,9 +29,10 @@ extern "C" {
#include <__clang_cuda_device_functions.h>

#undef __OPENMP_NVPTX__
#undef __CUDA__
#pragma pop_macro("__CUDA__")

#pragma omp end declare variant
#endif

#ifdef __AMDGCN__
#pragma omp begin declare variant match(device = {arch(amdgcn)})

9
lib/include/openmp_wrappers/complex_cmath.h
vendored
9
lib/include/openmp_wrappers/complex_cmath.h
vendored
@ -64,8 +64,13 @@ template <class _Tp> __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) {
}

// conj

template <class _Tp> std::complex<_Tp> conj(const std::complex<_Tp> &__c) {
#ifdef _GLIBCXX20_CONSTEXPR
#define CXX20_CONSTEXPR_DEVICE __DEVICE__
#else
#define CXX20_CONSTEXPR_DEVICE
#endif
template <class _Tp>
CXX20_CONSTEXPR_DEVICE std::complex<_Tp> conj(const std::complex<_Tp> &__c) {
return std::complex<_Tp>(__c.real(), -__c.imag());
}

19
lib/include/pmmintrin.h
vendored
19
lib/include/pmmintrin.h
vendored
@ -17,9 +17,21 @@
#include <emmintrin.h>

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("sse3,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse3"), \
__min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif

/// Loads data from an unaligned memory location to elements in a 128-bit
/// vector.
@ -122,7 +134,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movehdup_ps(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
@ -143,7 +155,7 @@ _mm_movehdup_ps(__m128 __a)
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_moveldup_ps(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
@ -244,7 +256,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
/// [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
/// duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movedup_pd(__m128d __a)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
@ -297,5 +309,6 @@ _mm_mwait(unsigned __extensions, unsigned __hints)
}

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __PMMINTRIN_H */

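[Editor's note: a one-line usage sketch, not part of the diff, of one of the SSE3 shuffles that gained the _CONSTEXPR attribute above:]

#include <pmmintrin.h>

/* Duplicates the low double into both lanes, i.e. { v[0], v[0] }; the new
   attribute additionally allows use in C++11 constant expressions. */
__m128d splat_low(__m128d v) {
  return _mm_movedup_pd(v);
}
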
14
lib/include/popcntintrin.h
vendored
14
lib/include/popcntintrin.h
vendored
@ -11,12 +11,13 @@
#define __POPCNTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("popcnt"))) constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
#endif

/// Counts the number of bits in the source operand having a value of 1.
@ -29,7 +30,7 @@
/// An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ int __DEFAULT_FN_ATTRS
_mm_popcnt_u32(unsigned int __A)
{
return __builtin_popcount(__A);
@ -46,7 +47,7 @@ _mm_popcnt_u32(unsigned int __A)
/// An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_popcnt_u64(unsigned long long __A)
{
return __builtin_popcountll(__A);
@ -54,6 +55,5 @@ _mm_popcnt_u64(unsigned long long __A)
#endif /* __x86_64__ */

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __POPCNTINTRIN_H */

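[Editor's note: a minimal sketch, not part of the diff, assuming -mpopcnt is enabled:]

#include <immintrin.h>

/* Lowers to a single POPCNT instruction; with the C++11 constexpr branch
   above, the same call can also fold in constant expressions. */
int count_set_bits(unsigned int v) {
  return _mm_popcnt_u32(v);  /* e.g. 0xf0f0 -> 8 */
}
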
6
lib/include/ptrauth.h
vendored
6
lib/include/ptrauth.h
vendored
@ -42,6 +42,9 @@ typedef enum {
The extra data is always 0. */
ptrauth_key_cxx_vtable_pointer = ptrauth_key_process_independent_data,

/* The key used to sign pointers in ELF .init_array/.fini_array. */
ptrauth_key_init_fini_pointer = ptrauth_key_process_independent_code,

/* Other pointers signed under the ABI use private ABI rules. */

} ptrauth_key;
@ -253,6 +256,9 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
[[clang::ptrauth_vtable_pointer(key, address_discrimination, \
extra_discrimination)]]

/* The value is ptrauth_string_discriminator("init_fini") */
#define __ptrauth_init_fini_discriminator 0xd9d4

#else

#define ptrauth_strip(__value, __key) \

128
lib/include/riscv_corev_alu.h
vendored
Normal file
128
lib/include/riscv_corev_alu.h
vendored
Normal file
@ -0,0 +1,128 @@
/*===---- riscv_corev_alu.h - CORE-V ALU intrinsics ------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/

#ifndef __RISCV_COREV_ALU_H
#define __RISCV_COREV_ALU_H

#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

#if defined(__riscv_xcvalu)

#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_abs(long a) {
return __builtin_abs(a);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_slet(long a, long b) {
return __builtin_riscv_cv_alu_slet(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS
__riscv_cv_alu_sletu(unsigned long a, unsigned long b) {
return __builtin_riscv_cv_alu_sletu(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_min(long a, long b) {
return __builtin_elementwise_min(a, b);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_minu(unsigned long a, unsigned long b) {
return __builtin_elementwise_min(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_max(long a, long b) {
return __builtin_elementwise_max(a, b);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_maxu(unsigned long a, unsigned long b) {
return __builtin_elementwise_max(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_exths(int16_t a) {
return __builtin_riscv_cv_alu_exths(a);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_exthz(uint16_t a) {
return __builtin_riscv_cv_alu_exthz(a);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_extbs(int8_t a) {
return __builtin_riscv_cv_alu_extbs(a);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_extbz(uint8_t a) {
return __builtin_riscv_cv_alu_extbz(a);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_clip(long a,
unsigned long b) {
return __builtin_riscv_cv_alu_clip(a, b);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_clipu(unsigned long a, unsigned long b) {
return __builtin_riscv_cv_alu_clipu(a, b);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addN(long a, long b,
uint8_t shft) {
return __builtin_riscv_cv_alu_addN(a, b, shft);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_adduN(unsigned long a, unsigned long b, uint8_t shft) {
return __builtin_riscv_cv_alu_adduN(a, b, shft);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_addRN(long a, long b,
uint8_t shft) {
return __builtin_riscv_cv_alu_addRN(a, b, shft);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_adduRN(unsigned long a, unsigned long b, uint8_t shft) {
return __builtin_riscv_cv_alu_adduRN(a, b, shft);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subN(long a, long b,
uint8_t shft) {
return __builtin_riscv_cv_alu_subN(a, b, shft);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_subuN(unsigned long a, unsigned long b, uint8_t shft) {
return __builtin_riscv_cv_alu_subuN(a, b, shft);
}

static __inline__ long __DEFAULT_FN_ATTRS __riscv_cv_alu_subRN(long a, long b,
uint8_t shft) {
return __builtin_riscv_cv_alu_subRN(a, b, shft);
}

static __inline__ unsigned long __DEFAULT_FN_ATTRS
__riscv_cv_alu_subuRN(unsigned long a, unsigned long b, uint8_t shft) {
return __builtin_riscv_cv_alu_subuRN(a, b, shft);
}

#endif // defined(__riscv_xcvalu)

#if defined(__cplusplus)
}
#endif

#endif // define __RISCV_COREV_ALU_H

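[Editor's note: a hypothetical usage sketch, not part of the diff, assuming a CORE-V core with the Xcvalu extension enabled:]

#include <riscv_corev_alu.h>
#include <stdint.h>

/* Single-instruction sign/zero extension of a halfword (cv.exths / cv.exthz). */
long widen_signed(int16_t h) { return __riscv_cv_alu_exths(h); }
unsigned long widen_unsigned(uint16_t h) { return __riscv_cv_alu_exthz(h); }
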
1
lib/include/riscv_vector.h
vendored
1
lib/include/riscv_vector.h
vendored
@ -419,7 +419,6 @@ typedef __rvv_bfloat16m2x4_t vbfloat16m2x4_t;
typedef __rvv_bfloat16m4_t vbfloat16m4_t;
typedef __rvv_bfloat16m4x2_t vbfloat16m4x2_t;
typedef __rvv_bfloat16m8_t vbfloat16m8_t;
#define __riscv_v_intrinsic_overloading 1

#ifdef __cplusplus
}

32
lib/include/sm4evexintrin.h
vendored
Normal file
@ -0,0 +1,32 @@
/*===--------------- sm4evexintrin.h - SM4 EVEX intrinsics -----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <sm4evexintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __SM4EVEXINTRIN_H
#define __SM4EVEXINTRIN_H

#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sm4,avx10.2-512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sm4key4_epi32(__m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vsm4key4512((__v16su)__A, (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sm4rnds4_epi32(__m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_vsm4rnds4512((__v16su)__A, (__v16su)__B);
}

#undef __DEFAULT_FN_ATTRS512

#endif // __SM4EVEXINTRIN_H
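A hedged usage sketch for the two new 512-bit SM4 intrinsics, assuming a toolchain and CPU with both the sm4 and avx10.2-512 features enabled (for example clang -msm4 -mavx10.2-512; exact flag spellings may vary by release); sm4_step is an illustrative name:

#include <immintrin.h>

/* Derive the next four round keys from four independent SM4 schedules,
   then apply four rounds of the cipher state update; each __m512i carries
   sixteen 32-bit words (four independent 128-bit blocks). */
__m512i sm4_step(__m512i state, __m512i round_keys) {
  __m512i next_keys = _mm512_sm4key4_epi32(state, round_keys);
  return _mm512_sm4rnds4_epi32(state, next_keys);
}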
6
lib/include/smmintrin.h
vendored
@ -17,9 +17,15 @@
#include <tmmintrin.h>

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
                 __min_vector_width__(128)))
#endif

/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
5
lib/include/stdalign.h
vendored
@ -10,10 +10,6 @@
#ifndef __STDALIGN_H
#define __STDALIGN_H

#if defined(__MVS__) && __has_include_next(<stdalign.h>)
#include_next <stdalign.h>
#else

#if defined(__cplusplus) || \
    (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202311L)
#ifndef __cplusplus
@ -25,5 +21,4 @@
#define __alignof_is_defined 1
#endif /* __STDC_VERSION__ */

#endif /* __MVS__ */
#endif /* __STDALIGN_H */
62
lib/include/tbmintrin.h
vendored
@ -15,63 +15,60 @@
#define __TBMINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) constexpr
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
#endif

#define __bextri_u32(a, b) \
  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
                                           (unsigned int)(b)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcfill_u32(unsigned int __a)
{
__blcfill_u32(unsigned int __a) {
  return __a & (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blci_u32(unsigned int __a)
{
__blci_u32(unsigned int __a) {
  return __a | ~(__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcic_u32(unsigned int __a)
{
__blcic_u32(unsigned int __a) {
  return ~__a & (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcmsk_u32(unsigned int __a)
{
__blcmsk_u32(unsigned int __a) {
  return __a ^ (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcs_u32(unsigned int __a)
{
__blcs_u32(unsigned int __a) {
  return __a | (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsfill_u32(unsigned int __a)
{
__blsfill_u32(unsigned int __a) {
  return __a | (__a - 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsic_u32(unsigned int __a)
{
__blsic_u32(unsigned int __a) {
  return ~__a | (__a - 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__t1mskc_u32(unsigned int __a)
{
__t1mskc_u32(unsigned int __a) {
  return ~__a | (__a + 1);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS
__tzmsk_u32(unsigned int __a)
{
__tzmsk_u32(unsigned int __a) {
  return ~__a & (__a - 1);
}

@ -81,56 +78,47 @@ __tzmsk_u32(unsigned int __a)
                                                 (unsigned long long)(b)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcfill_u64(unsigned long long __a)
{
__blcfill_u64(unsigned long long __a) {
  return __a & (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blci_u64(unsigned long long __a)
{
__blci_u64(unsigned long long __a) {
  return __a | ~(__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcic_u64(unsigned long long __a)
{
__blcic_u64(unsigned long long __a) {
  return ~__a & (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcmsk_u64(unsigned long long __a)
{
__blcmsk_u64(unsigned long long __a) {
  return __a ^ (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcs_u64(unsigned long long __a)
{
__blcs_u64(unsigned long long __a) {
  return __a | (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsfill_u64(unsigned long long __a)
{
__blsfill_u64(unsigned long long __a) {
  return __a | (__a - 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsic_u64(unsigned long long __a)
{
__blsic_u64(unsigned long long __a) {
  return ~__a | (__a - 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__t1mskc_u64(unsigned long long __a)
{
__t1mskc_u64(unsigned long long __a) {
  return ~__a | (__a + 1);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__tzmsk_u64(unsigned long long __a)
{
__tzmsk_u64(unsigned long long __a) {
  return ~__a & (__a - 1);
}
#endif
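Because the TBM bodies above are plain bit arithmetic (and, with the newly added constexpr attribute, should also fold in C++11+ constant expressions when TBM is enabled), their effect can be checked with ordinary expressions on a worked example, no TBM hardware required. The asserts below mirror the header's formulas:

#include <assert.h>

int main(void) {
  unsigned a = 0x17; /* 0b10111: a run of trailing ones up to bit 2 */
  assert((a & (a + 1)) == 0x10); /* __blcfill: clear the trailing-ones run */
  assert((a ^ (a + 1)) == 0x0F); /* __blcmsk: mask through lowest clear bit */
  assert((a | (a + 1)) == 0x1F); /* __blcs: set the lowest clear bit */

  unsigned b = 0x18; /* 0b11000: three trailing zeros */
  assert((b | (b - 1)) == 0x1F);  /* __blsfill: fill down from lowest set bit */
  assert((~b & (b - 1)) == 0x07); /* __tzmsk: mask of the trailing zeros */
  return 0;
}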
102
lib/include/tmmintrin.h
vendored
@ -17,13 +17,21 @@
#include <pmmintrin.h>

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("ssse3,no-evex512"), __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("mmx,ssse3,no-evex512"), \
                 __min_vector_width__(64)))
                 __target__("ssse3,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), \
                 __min_vector_width__(128)))
#endif

#define __trunc64(x) \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __anyext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, -1, -1)

/// Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
@ -37,10 +45,10 @@
///    A 64-bit vector of [8 x i8].
/// \returns A 64-bit integer vector containing the absolute values of the
///    elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi8(__m64 __a)
{
    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
    return (__m64)__builtin_elementwise_abs((__v8qs)__a);
}

/// Computes the absolute value of each of the packed 8-bit signed
@ -73,10 +81,10 @@ _mm_abs_epi8(__m128i __a)
///    A 64-bit vector of [4 x i16].
/// \returns A 64-bit integer vector containing the absolute values of the
///    elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi16(__m64 __a)
{
    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
    return (__m64)__builtin_elementwise_abs((__v4hi)__a);
}

/// Computes the absolute value of each of the packed 16-bit signed
@ -109,10 +117,10 @@ _mm_abs_epi16(__m128i __a)
///    A 64-bit vector of [2 x i32].
/// \returns A 64-bit integer vector containing the absolute values of the
///    elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi32(__m64 __a)
{
    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
    return (__m64)__builtin_elementwise_abs((__v2si)__a);
}

/// Computes the absolute value of each of the packed 32-bit signed
@ -177,7 +185,10 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
  ((__m64)__builtin_shufflevector( \
       __builtin_ia32_psrldqi128_byteshift( \
           __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \
           (n)), __extension__ (__v2di){}, 0))

/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].
@ -242,10 +253,11 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
///    destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
///    operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_phaddw128(
        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}

/// Horizontally adds the adjacent pairs of values contained in 2 packed
@ -265,10 +277,11 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
///    destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
///    operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi32(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
    return __trunc64(__builtin_ia32_phaddd128(
        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
}

/// Horizontally adds, with saturation, the adjacent pairs of values contained
@ -317,10 +330,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
///    destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
///    sums of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadds_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_phaddsw128(
        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
@ -386,10 +400,11 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
///    the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
///    of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_phsubw128(
        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}

/// Horizontally subtracts the adjacent pairs of values contained in 2
@ -409,10 +424,11 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
///    the destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
///    of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi32(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
    return __trunc64(__builtin_ia32_phsubd128(
        (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){}));
}

/// Horizontally subtracts, with saturation, the adjacent pairs of values
@ -461,10 +477,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
///    the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
///    differences of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsubs_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_phsubsw128(
        (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){}));
}

/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@ -525,10 +542,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
    return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
                                                 (__v16qi)__anyext128(__b)));
}

/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@ -565,10 +583,11 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
///    A 64-bit vector of [4 x i16] containing one of the source operands.
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
///    products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mulhrs_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
                                                (__v8hi)__anyext128(__b)));
}

/// Copies the 8-bit integers from a 128-bit integer vector to the
@ -614,12 +633,15 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
///    1: Clear the corresponding byte in the destination. \n
///    0: Copy the selected source byte to the corresponding byte in the
///    destination. \n
///    Bits [3:0] select the source byte to be copied.
///    Bits [2:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_shuffle_pi8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
    return __trunc64(__builtin_ia32_pshufb128(
        (__v16qi)__builtin_shufflevector(
            (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
        (__v16qi)__anyext128(__b)));
}

/// For each 8-bit integer in the first source operand, perform one of
@ -720,10 +742,11 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
///    A 64-bit integer vector containing control bytes corresponding to
///    positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
    return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
                                              (__v16qi)__anyext128(__b)));
}

/// For each 16-bit integer in the first source operand, perform one of
@ -746,10 +769,11 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
///    A 64-bit integer vector containing control words corresponding to
///    positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
    return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
                                              (__v8hi)__anyext128(__b)));
}

/// For each 32-bit integer in the first source operand, perform one of
@ -772,13 +796,15 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
///    A 64-bit integer vector containing two control doublewords corresponding
///    to positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi32(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
    return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
                                              (__v4si)__anyext128(__b)));
}

#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

#endif /* __TMMINTRIN_H */
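The pattern this rewrite applies throughout the file: widen the 64-bit __m64 into the low half of a 128-bit vector (__anyext128 leaves the upper lanes as don't-care), run the 128-bit SSE builtin, then drop back to 64 bits with __trunc64. A generic-vector sketch of the same idea, assuming Clang's __builtin_shufflevector semantics where a -1 index means "undefined lane"; widen_lo and low_half are illustrative names, not the header's macros:

typedef short v4i16 __attribute__((__vector_size__(8)));
typedef short v8i16 __attribute__((__vector_size__(16)));

/* Widen a 4 x i16 vector into the low half of an 8 x i16 vector; the -1
   indices leave the upper lanes undefined, mirroring __anyext128. */
static v8i16 widen_lo(v4i16 v) {
  return __builtin_shufflevector(v, v, 0, 1, 2, 3, -1, -1, -1, -1);
}

/* Keep only the low half of an 8 x i16 vector, mirroring __trunc64. */
static v4i16 low_half(v8i16 v) {
  return __builtin_shufflevector(v, v, 0, 1, 2, 3);
}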
1796
lib/include/vecintrin.h
vendored
File diff suppressed because it is too large
188
lib/include/wasm_simd128.h
vendored
@ -33,6 +33,7 @@ typedef unsigned long long __u64x2
    __attribute__((__vector_size__(16), __aligned__(16)));
typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16)));
typedef __fp16 __f16x8 __attribute__((__vector_size__(16), __aligned__(16)));

typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8)));
typedef unsigned char __u8x8
@ -956,7 +957,7 @@ static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) {
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_popcnt(v128_t __a) {
  return (v128_t)__builtin_wasm_popcnt_i8x16((__i8x16)__a);
  return (v128_t)__builtin_elementwise_popcount((__i8x16)__a);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,
@ -981,12 +982,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_add_sat_s_i8x16((__i8x16)__a, (__i8x16)__b);
  return (v128_t)__builtin_elementwise_add_sat((__i8x16)__a, (__i8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_add_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_add_sat_u_i8x16((__u8x16)__a, (__u8x16)__b);
  return (v128_t)__builtin_elementwise_add_sat((__u8x16)__a, (__u8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a,
@ -996,32 +997,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_sub_sat_s_i8x16((__i8x16)__a, (__i8x16)__b);
  return (v128_t)__builtin_elementwise_sub_sat((__i8x16)__a, (__i8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_sub_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_sub_sat_u_i8x16((__u8x16)__a, (__u8x16)__b);
  return (v128_t)__builtin_elementwise_sub_sat((__u8x16)__a, (__u8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_s_i8x16((__i8x16)__a, (__i8x16)__b);
  return (v128_t)__builtin_elementwise_min((__i8x16)__a, (__i8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_u_i8x16((__u8x16)__a, (__u8x16)__b);
  return (v128_t)__builtin_elementwise_min((__u8x16)__a, (__u8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_s_i8x16((__i8x16)__a, (__i8x16)__b);
  return (v128_t)__builtin_elementwise_max((__i8x16)__a, (__i8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_u_i8x16((__u8x16)__a, (__u8x16)__b);
  return (v128_t)__builtin_elementwise_max((__u8x16)__a, (__u8x16)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_avgr(v128_t __a,
@ -1067,12 +1068,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_add_sat_s_i16x8((__i16x8)__a, (__i16x8)__b);
  return (v128_t)__builtin_elementwise_add_sat((__i16x8)__a, (__i16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_add_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_add_sat_u_i16x8((__u16x8)__a, (__u16x8)__b);
  return (v128_t)__builtin_elementwise_add_sat((__u16x8)__a, (__u16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a,
@ -1082,12 +1083,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_sub_sat_s_i16x8((__i16x8)__a, (__i16x8)__b);
  return (v128_t)__builtin_elementwise_sub_sat((__i16x8)__a, (__i16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_sub_sat(v128_t __a,
                                                               v128_t __b) {
  return (v128_t)__builtin_wasm_sub_sat_u_i16x8((__u16x8)__a, (__u16x8)__b);
  return (v128_t)__builtin_elementwise_sub_sat((__u16x8)__a, (__u16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a,
@ -1097,22 +1098,22 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_s_i16x8((__i16x8)__a, (__i16x8)__b);
  return (v128_t)__builtin_elementwise_min((__i16x8)__a, (__i16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_u_i16x8((__u16x8)__a, (__u16x8)__b);
  return (v128_t)__builtin_elementwise_min((__u16x8)__a, (__u16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_s_i16x8((__i16x8)__a, (__i16x8)__b);
  return (v128_t)__builtin_elementwise_max((__i16x8)__a, (__i16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_u_i16x8((__u16x8)__a, (__u16x8)__b);
  return (v128_t)__builtin_elementwise_max((__u16x8)__a, (__u16x8)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_avgr(v128_t __a,
@ -1168,22 +1169,22 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_mul(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_s_i32x4((__i32x4)__a, (__i32x4)__b);
  return (v128_t)__builtin_elementwise_min((__i32x4)__a, (__i32x4)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_min(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_min_u_i32x4((__u32x4)__a, (__u32x4)__b);
  return (v128_t)__builtin_elementwise_min((__u32x4)__a, (__u32x4)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_s_i32x4((__i32x4)__a, (__i32x4)__b);
  return (v128_t)__builtin_elementwise_max((__i32x4)__a, (__i32x4)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_max(v128_t __a,
                                                           v128_t __b) {
  return (v128_t)__builtin_wasm_max_u_i32x4((__u32x4)__a, (__u32x4)__b);
  return (v128_t)__builtin_elementwise_max((__u32x4)__a, (__u32x4)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_dot_i16x8(v128_t __a,
@ -1878,6 +1879,151 @@ wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t __a, v128_t __b, v128_t __c) {
      (__i8x16)__a, (__i8x16)__b, (__i32x4)__c);
}

// FP16 intrinsics
#define __FP16_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("fp16"), \
                 __min_vector_width__(128)))

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_splat(float __a) {
  return (v128_t)__builtin_wasm_splat_f16x8(__a);
}

#ifdef __wasm_fp16__
// TODO Replace the following macros with regular C functions and use normal
// target-independent vector code like the other replace/extract instructions.

#define wasm_f16x8_extract_lane(__a, __i) \
  (__builtin_wasm_extract_lane_f16x8((__f16x8)(__a), __i))

#define wasm_f16x8_replace_lane(__a, __i, __b) \
  ((v128_t)__builtin_wasm_replace_lane_f16x8((__f16x8)(__a), __i, __b))

#endif

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_abs(v128_t __a) {
  return (v128_t)__builtin_wasm_abs_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_neg(v128_t __a) {
  return (v128_t)(-(__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sqrt(v128_t __a) {
  return (v128_t)__builtin_wasm_sqrt_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ceil(v128_t __a) {
  return (v128_t)__builtin_wasm_ceil_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_floor(v128_t __a) {
  return (v128_t)__builtin_wasm_floor_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_trunc(v128_t __a) {
  return (v128_t)__builtin_wasm_trunc_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_nearest(v128_t __a) {
  return (v128_t)__builtin_wasm_nearest_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_eq(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a == (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ne(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a != (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_lt(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a < (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_gt(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a > (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_le(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a <= (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ge(v128_t __a, v128_t __b) {
  return (v128_t)((__f16x8)__a >= (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_add(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)((__f16x8)__a + (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sub(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)((__f16x8)__a - (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_mul(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)((__f16x8)__a * (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_div(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)((__f16x8)__a / (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_min(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)__builtin_wasm_min_f16x8((__f16x8)__a, (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_max(v128_t __a,
                                                        v128_t __b) {
  return (v128_t)__builtin_wasm_max_f16x8((__f16x8)__a, (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmin(v128_t __a,
                                                         v128_t __b) {
  return (v128_t)__builtin_wasm_pmin_f16x8((__f16x8)__a, (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmax(v128_t __a,
                                                         v128_t __b) {
  return (v128_t)__builtin_wasm_pmax_f16x8((__f16x8)__a, (__f16x8)__b);
}

static __inline__ v128_t __FP16_FN_ATTRS
wasm_i16x8_trunc_sat_f16x8(v128_t __a) {
  return (v128_t)__builtin_wasm_trunc_saturate_s_i16x8_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS
wasm_u16x8_trunc_sat_f16x8(v128_t __a) {
  return (v128_t)__builtin_wasm_trunc_saturate_u_i16x8_f16x8((__f16x8)__a);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_i16x8(v128_t __a) {
  return (v128_t) __builtin_convertvector((__i16x8)__a, __f16x8);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) {
  return (v128_t) __builtin_convertvector((__u16x8)__a, __f16x8);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_madd(v128_t __a,
                                                                 v128_t __b,
                                                                 v128_t __c) {
  return (v128_t)__builtin_wasm_relaxed_madd_f16x8((__f16x8)__a, (__f16x8)__b,
                                                   (__f16x8)__c);
}

static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_nmadd(v128_t __a,
                                                                  v128_t __b,
                                                                  v128_t __c) {
  return (v128_t)__builtin_wasm_relaxed_nmadd_f16x8((__f16x8)__a, (__f16x8)__b,
                                                    (__f16x8)__c);
}

// Deprecated intrinsics

static __inline__ v128_t __DEPRECATED_FN_ATTRS("wasm_i8x16_swizzle")
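A hedged usage sketch for the new FP16 intrinsics, assuming a wasm target with the fp16 feature enabled (plus relaxed-simd for the fused multiply-add); fma_clamp_f16x8 is an illustrative name:

#include <wasm_simd128.h>

/* a * b + c over eight half-precision lanes, then clamp each lane to
   [lo, hi] with the lane-wise min/max intrinsics. */
v128_t fma_clamp_f16x8(v128_t a, v128_t b, v128_t c, float lo, float hi) {
  v128_t r = wasm_f16x8_relaxed_madd(a, b, c);
  r = wasm_f16x8_max(r, wasm_f16x8_splat(lo));
  return wasm_f16x8_min(r, wasm_f16x8_splat(hi));
}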
347
lib/include/xmmintrin.h
vendored
@ -32,12 +32,41 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX \
#define __DEFAULT_FN_ATTRS_SSE2 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2 \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
                 __min_vector_width__(128)))
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#endif

#define __trunc64(x) \
  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __zext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, 2, 3)
#define __anyext128(x) \
  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
                                    1, -1, -1)
#define __zeroupper64(x) \
  (__m128i) __builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0, \
                                    1, 4, 5)

/// Adds the 32-bit float values in the low-order bits of the operands.
///
@ -54,9 +83,8 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
///    of the lower 32 bits of both operands. The upper 96 bits are copied from
///    the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_ss(__m128 __a, __m128 __b) {
  __a[0] += __b[0];
  return __a;
}
@ -74,9 +102,8 @@ _mm_add_ss(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the sums of both
///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4sf)__a + (__v4sf)__b);
}

@ -96,9 +123,8 @@ _mm_add_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    difference of the lower 32 bits of both operands. The upper 96 bits are
///    copied from the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_ss(__m128 __a, __m128 __b) {
  __a[0] -= __b[0];
  return __a;
}
@ -117,9 +143,8 @@ _mm_sub_ss(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the differences between
///    both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4sf)__a - (__v4sf)__b);
}

@ -139,9 +164,8 @@ _mm_sub_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the product of the lower
///    32 bits of both operands. The upper 96 bits are copied from the upper 96
///    bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mul_ss(__m128 __a, __m128 __b) {
  __a[0] *= __b[0];
  return __a;
}
@ -159,9 +183,8 @@ _mm_mul_ss(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the products of both
///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mul_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4sf)__a * (__v4sf)__b);
}

@ -181,9 +204,8 @@ _mm_mul_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the quotients of the
///    lower 32 bits of both operands. The upper 96 bits are copied from the
///    upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_div_ss(__m128 __a, __m128 __b) {
  __a[0] /= __b[0];
  return __a;
}
@ -200,9 +222,8 @@ _mm_div_ss(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing the divisor.
/// \returns A 128-bit vector of [4 x float] containing the quotients of both
///    operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_div_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4sf)__a / (__v4sf)__b);
}

@ -416,9 +437,8 @@ _mm_max_ps(__m128 __a, __m128 __b)
///    A 128-bit vector containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
///    values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_and_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_and_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4su)__a & (__v4su)__b);
}

@ -438,9 +458,8 @@ _mm_and_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
///    one's complement of the first operand and the values in the second
///    operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_andnot_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_andnot_ps(__m128 __a, __m128 __b) {
  return (__m128)(~(__v4su)__a & (__v4su)__b);
}

@ -456,9 +475,8 @@ _mm_andnot_ps(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
///    values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_or_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_or_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4su)__a | (__v4su)__b);
}

@ -475,9 +493,8 @@ _mm_or_ps(__m128 __a, __m128 __b)
///    A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
///    of the values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_xor_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_xor_ps(__m128 __a, __m128 __b) {
  return (__m128)((__v4su)__a ^ (__v4su)__b);
}

@ -1448,10 +1465,10 @@ _mm_cvtss_si64(__m128 __a)
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
  return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
}

/// Converts two low-order float values in a 128-bit vector of
@ -1468,7 +1485,7 @@ _mm_cvtps_pi32(__m128 __a)
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_ps2pi(__m128 __a)
{
  return _mm_cvtps_pi32(__a);
@ -1558,10 +1575,10 @@ _mm_cvttss_si64(__m128 __a)
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvttps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
  return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
}

/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
@ -1579,7 +1596,7 @@ _mm_cvttps_pi32(__m128 __a)
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtt_ps2pi(__m128 __a)
{
  return _mm_cvttps_pi32(__a);
@ -1601,9 +1618,8 @@ _mm_cvtt_ps2pi(__m128 __a)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    converted value of the second operand. The upper 96 bits are copied from
///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi32_ss(__m128 __a, int __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
                                                                     int __b) {
  __a[0] = __b;
  return __a;
}
@ -1624,9 +1640,8 @@ _mm_cvtsi32_ss(__m128 __a, int __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    converted value of the second operand. The upper 96 bits are copied from
///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvt_si2ss(__m128 __a, int __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
                                                                    int __b) {
  return _mm_cvtsi32_ss(__a, __b);
}

@ -1648,9 +1663,8 @@ _mm_cvt_si2ss(__m128 __a, int __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
///    converted value of the second operand. The upper 96 bits are copied from
///    the upper 96 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi64_ss(__m128 __a, long long __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsi64_ss(__m128 __a, long long __b) {
  __a[0] = __b;
  return __a;
}
@ -1674,10 +1688,13 @@ _mm_cvtsi64_ss(__m128 __a, long long __b)
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value of the second operand. The upper 64 bits are copied from
///    the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
  return (__m128)__builtin_shufflevector(
      (__v4sf)__a,
      __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
      4, 5, 2, 3);
}

/// Converts two elements of a 64-bit vector of [2 x i32] into two
@ -1697,7 +1714,7 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b)
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    converted value from the second operand. The upper 64 bits are copied
///    from the upper 64 bits of the first operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
  return _mm_cvtpi32_ps(__a, __b);
@ -1714,9 +1731,8 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the extraction.
/// \returns A 32-bit float containing the extracted value.
static __inline__ float __DEFAULT_FN_ATTRS
_mm_cvtss_f32(__m128 __a)
{
static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtss_f32(__m128 __a) {
  return __a[0];
}

@ -1907,9 +1923,8 @@ _mm_undefined_ps(void)
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
///    lower 32 bits contain the value provided in the source operand. The
///    upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ss(float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ss(float __w) {
  return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}

@ -1925,9 +1940,8 @@ _mm_set_ss(float __w)
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set1_ps(float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_ps(float __w) {
  return __extension__ (__m128){ __w, __w, __w, __w };
}

@ -1944,9 +1958,8 @@ _mm_set1_ps(float __w)
///    A single-precision floating-point value used to initialize each vector
///    element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps1(float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps1(float __w) {
  return _mm_set1_ps(__w);
}

@ -1971,9 +1984,8 @@ _mm_set_ps1(float __w)
///    A single-precision floating-point value used to initialize bits [31:0]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps(float __z, float __y, float __x, float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps(float __z, float __y, float __x, float __w) {
  return __extension__ (__m128){ __w, __x, __y, __z };
}

@ -1999,9 +2011,8 @@ _mm_set_ps(float __z, float __y, float __x, float __w)
///    A single-precision floating-point value used to initialize bits [127:96]
///    of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setr_ps(float __z, float __y, float __x, float __w)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_ps(float __z, float __y, float __x, float __w) {
  return __extension__ (__m128){ __z, __y, __x, __w };
}

@ -2014,9 +2025,8 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
///    all elements set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setzero_ps(void) {
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

@ -2231,10 +2241,10 @@ _mm_storer_ps(float *__p, __m128 __a)
///    A pointer to an aligned memory location used to store the register value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS_MMX
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pi(void *__p, __m64 __a)
{
  __builtin_ia32_movntq((__m64 *)__p, __a);
  __builtin_nontemporal_store(__a, (__m64 *)__p);
}

/// Moves packed float values from a 128-bit vector of [4 x float] to a
@ -2296,7 +2306,7 @@ void _mm_sfence(void);
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
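The added (unsigned short) cast makes the extracted lane zero-extend rather than sign-extend before widening to int, matching the instruction's documented behavior. A worked example, assuming an x86 target with SSE enabled:

#include <xmmintrin.h>
#include <assert.h>

int main(void) {
  __m64 v = _mm_setr_pi16(0, -1, 2, 3);
  /* Lane 1 holds 0xFFFF; with the cast the macro yields 65535, where the
     uncast builtin result would sign-extend to -1. */
  assert(_mm_extract_pi16(v, 1) == 0xFFFF);
  return 0;
}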

/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
@ -2342,10 +2352,10 @@ void _mm_sfence(void);
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
  return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b);
}

/// Compares each of the corresponding packed 8-bit unsigned integer
@ -2361,10 +2371,10 @@ _mm_max_pi16(__m64 __a, __m64 __b)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
  return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b);
}

/// Compares each of the corresponding packed 16-bit integer values of
@ -2380,10 +2390,10 @@ _mm_max_pu8(__m64 __a, __m64 __b)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
  return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b);
}

/// Compares each of the corresponding packed 8-bit unsigned integer
@ -2399,10 +2409,10 @@ _mm_min_pi16(__m64 __a, __m64 __b)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
  return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b);
}

/// Takes the most significant bit from each 8-bit element in a 64-bit
@ -2417,10 +2427,10 @@ _mm_min_pu8(__m64 __a, __m64 __b)
///    A 64-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bit from each 8-bit element in \a __a,
///    written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS_MMX
static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_movemask_pi8(__m64 __a)
{
  return __builtin_ia32_pmovmskb((__v8qi)__a);
  return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
}

/// Multiplies packed 16-bit unsigned integer values and writes the
@ -2436,10 +2446,11 @@ _mm_movemask_pi8(__m64 __a)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
  return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
                                             (__v8hi)__anyext128(__b)));
}

/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
@ -2476,8 +2487,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
                                  (n) & 0x3, ((n) >> 2) & 0x3, \
                                  ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
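Each 2-bit field of n selects one source lane, lowest field first, so n = 0x1B (0b00011011) reverses the four lanes: (0x1B)&3 = 3, (0x1B>>2)&3 = 2, (0x1B>>4)&3 = 1, (0x1B>>6)&3 = 0. A small check, assuming an x86 target with SSE enabled:

#include <xmmintrin.h>
#include <assert.h>

int main(void) {
  __m64 a = _mm_setr_pi16(10, 11, 12, 13);
  __m64 r = _mm_shuffle_pi16(a, 0x1B); /* selects lanes 3, 2, 1, 0 */
  assert(_mm_extract_pi16(r, 0) == 13 && _mm_extract_pi16(r, 3) == 10);
  return 0;
}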
|
||||
/// Conditionally copies the values from each 8-bit element in the first
|
||||
/// 64-bit integer vector operand to the specified memory location, as
|
||||
@ -2502,10 +2515,25 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
|
||||
/// A pointer to a 64-bit memory location that will receive the conditionally
|
||||
/// copied integer values. The address of the memory location does not have
|
||||
/// to be aligned.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_MMX
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_SSE2
|
||||
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
|
||||
{
|
||||
__builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
|
||||
// This is complex, because we need to support the case where __p is pointing
|
||||
// within the last 15 to 8 bytes of a page. In that case, using a 128-bit
|
||||
// write might cause a trap where a 64-bit maskmovq would not. (Memory
|
||||
// locations not selected by the mask bits might still cause traps.)
|
||||
__m128i __d128 = __anyext128(__d);
|
||||
__m128i __n128 = __zext128(__n);
|
||||
if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
|
||||
((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
|
||||
// If there's a risk of spurious trap due to a 128-bit write, back up the
|
||||
// pointer by 8 bytes and shift values in registers to match.
|
||||
__p -= 8;
|
||||
__d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
|
||||
__n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
|
||||
}
|
||||
|
||||
__builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
|
||||
}
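
The guard's window is exactly the set of page offsets where the 16-byte maskmovdqu store would spill into the next page while the 8 bytes the 64-bit operation can touch would not. A self-contained check of that claim (illustrative C, assuming the 4 KiB page size implied by the 0xfff mask):

#include <stdint.h>
#include <stdio.h>

/* Model of the page-crossing guard in _mm_maskmove_si64: flag offsets
   where a 16-byte store crosses into the next 4 KiB page although the
   8 bytes the mask can actually select do not. */
int main(void) {
  for (uintptr_t off = 4075; off <= 4094; off++) {
    int wide_crosses  = off + 16 > 4096; /* 128-bit maskmovdqu store  */
    int narrow_fits   = off + 8 <= 4096; /* 64-bit maskmovq footprint */
    int needs_backoff = off >= 4096 - 15 && off <= 4096 - 8;
    if (needs_backoff != (wide_crosses && narrow_fits))
      printf("mismatch at offset %zu\n", (size_t)off);
  }
  printf("guard matches the dangerous window\n");
  return 0;
}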

/// Computes the rounded averages of the packed unsigned 8-bit integer
@ -2521,10 +2549,11 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
  return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
                                           (__v16qi)__anyext128(__b)));
}
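
"Rounded average" here means rounding toward positive infinity, i.e. (a + b + 1) >> 1 computed without intermediate overflow. A scalar model of one lane (illustrative, applies equally to the 16-bit variant below):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one pavgb lane: average rounded up, computed in a
   wider type so the +1 cannot overflow. */
static uint8_t avg_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a + b + 1) >> 1);
}

int main(void) {
  printf("%u\n", avg_u8(1, 2));     /* 2, not 1: rounding is upward */
  printf("%u\n", avg_u8(255, 255)); /* 255, no overflow */
  return 0;
}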

/// Computes the rounded averages of the packed unsigned 16-bit integer
@ -2540,10 +2569,11 @@ _mm_avg_pu8(__m64 __a, __m64 __b)
/// \param __b
///    A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
  return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
                                           (__v8hi)__anyext128(__b)));
}

/// Subtracts the corresponding 8-bit unsigned integer values of the two
@ -2562,10 +2592,11 @@ _mm_avg_pu16(__m64 __a, __m64 __b)
/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
///    sets of absolute differences between both operands. The upper bits are
///    cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
  return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
                                            (__v16qi)__zext128(__b)));
}
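
The sum-of-absolute-differences reduction collapses eight byte differences into one 16-bit count. A scalar model (illustrative, not from the header):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of _mm_sad_pu8: sum of absolute byte differences,
   accumulated into one 16-bit value. */
static uint16_t sad_u8(const uint8_t a[8], const uint8_t b[8]) {
  unsigned sum = 0;
  for (int i = 0; i < 8; i++)
    sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
  return (uint16_t)sum;
}

int main(void) {
  uint8_t a[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  uint8_t b[8] = {80, 70, 60, 50, 40, 30, 20, 10};
  printf("%u\n", sad_u8(a, b)); /* 70+50+30+10+10+30+50+70 = 320 */
  return 0;
}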

#if defined(__cplusplus)
@ -2741,9 +2772,8 @@ void _mm_setcsr(unsigned int __i);
///    Bits [95:64] are written to bits [63:32] of the destination. \n
///    Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpackhi_ps(__m128 __a, __m128 __b) {
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
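
As a reminder of `__builtin_shufflevector` index semantics, indices 0-3 pick lanes of the first operand and 4-7 lanes of the second. A scalar sketch of the unpackhi pattern above (illustrative C):

#include <stdio.h>

/* Scalar model of __builtin_shufflevector(a, b, 2, 6, 3, 7). */
int main(void) {
  float a[4] = {0.f, 1.f, 2.f, 3.f};
  float b[4] = {4.f, 5.f, 6.f, 7.f};
  int idx[4] = {2, 6, 3, 7}; /* the unpackhi interleave pattern */
  for (int i = 0; i < 4; i++) {
    float lane = idx[i] < 4 ? a[idx[i]] : b[idx[i] - 4];
    printf("dst[%d] = %g\n", i, lane); /* 2 6 3 7 */
  }
  return 0;
}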

@ -2763,9 +2793,8 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b)
///    Bits [31:0] are written to bits [63:32] of the destination. \n
///    Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpacklo_ps(__m128 __a, __m128 __b) {
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}

@ -2785,9 +2814,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
///    written to the lower 32 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_move_ss(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_ss(__m128 __a, __m128 __b) {
  __a[0] = __b[0];
  return __a;
}
@ -2807,9 +2835,8 @@ _mm_move_ss(__m128 __a, __m128 __b)
///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
///    written to the lower 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehl_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movehl_ps(__m128 __a, __m128 __b) {
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}

@ -2828,9 +2855,8 @@ _mm_movehl_ps(__m128 __a, __m128 __b)
///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
///    written to the upper 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movelh_ps(__m128 __a, __m128 __b)
{
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movelh_ps(__m128 __a, __m128 __b) {
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}

@ -2846,22 +2872,10 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
///    from the corresponding elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi16(__b, __a);
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
  return __builtin_convertvector((__v4hi)__a, __v4sf);
}
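
The deleted body sign-extended by hand (compare against zero to build sign masks, unpack, convert each half); `__builtin_convertvector` expresses the same element-wise signed-int-to-float conversion directly. A scalar model (illustrative C):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of __builtin_convertvector((__v4hi)a, __v4sf): each
   signed 16-bit lane converts to float independently. */
int main(void) {
  int16_t in[4] = {-1, 2, -32768, 32767};
  for (int i = 0; i < 4; i++)
    printf("%g ", (float)in[i]); /* -1 2 -32768 32767 */
  printf("\n");
  return 0;
}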

/// Converts a 64-bit vector of 16-bit unsigned integer values into a
@ -2876,21 +2890,10 @@ _mm_cvtpi16_ps(__m64 __a)
///    destination are copied from the corresponding elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu16_ps(__m64 __a)
{
  __m64 __b, __c;
  __m128 __r;

  __b = _mm_setzero_si64();
  __c = _mm_unpackhi_pi16(__a, __b);
  __r = _mm_setzero_ps();
  __r = _mm_cvtpi32_ps(__r, __c);
  __r = _mm_movelh_ps(__r, __r);
  __c = _mm_unpacklo_pi16(__a, __b);
  __r = _mm_cvtpi32_ps(__r, __c);

  return __r;
  return __builtin_convertvector((__v4hu)__a, __v4sf);
}

/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
@ -2905,16 +2908,12 @@ _mm_cvtpu16_ps(__m64 __a)
///    from the corresponding lower 4 elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_cmpgt_pi8(__b, __a);
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
  return __builtin_convertvector(
      __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
                              0, 1, 2, 3), __v4sf);
}
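
The element type of the cast drives the conversion: `__v8qs` (signed char lanes) makes `__builtin_convertvector` sign-extend, while the `__v8qu` cast in the unsigned variant below makes it zero-extend. A two-line contrast in scalar C (illustrative):

#include <stdio.h>

/* The same byte 0x80 converts differently depending on signedness,
   which is what the __v8qs vs. __v8qu casts select. */
int main(void) {
  unsigned char raw = 0x80;
  printf("%g\n", (float)(signed char)raw);   /* -128 */
  printf("%g\n", (float)(unsigned char)raw); /* 128  */
  return 0;
}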

/// Converts the lower four unsigned 8-bit integer values from a 64-bit
@ -2930,15 +2929,12 @@ _mm_cvtpi8_ps(__m64 __a)
///    operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
///    values from the source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu8_ps(__m64 __a)
{
  __m64 __b;

  __b = _mm_setzero_si64();
  __b = _mm_unpacklo_pi8(__a, __b);

  return _mm_cvtpi16_ps(__b);
  return __builtin_convertvector(
      __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
                              0, 1, 2, 3), __v4sf);
}

/// Converts the two 32-bit signed integer values from each 64-bit vector
@ -2957,16 +2953,12 @@ _mm_cvtpu8_ps(__m64 __a)
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
///    copied and converted values from the first operand. The upper 64 bits
///    contain the copied and converted values from the second operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
  __m128 __c;

  __c = _mm_setzero_ps();
  __c = _mm_cvtpi32_ps(__c, __b);
  __c = _mm_movelh_ps(__c, __c);

  return _mm_cvtpi32_ps(__c, __a);
  return __builtin_convertvector(
      __builtin_shufflevector((__v2si)__a, (__v2si)__b,
                              0, 1, 2, 3), __v4sf);
}

/// Converts each single-precision floating-point element of a 128-bit
@ -2986,16 +2978,11 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
///    A 128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
///    values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi16(__m128 __a)
{
  __m64 __b, __c;

  __b = _mm_cvtps_pi32(__a);
  __a = _mm_movehl_ps(__a, __a);
  __c = _mm_cvtps_pi32(__a);

  return _mm_packs_pi32(__b, __c);
  return __trunc64(__builtin_ia32_packssdw128(
      (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
}
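
The packssdw step narrows each 32-bit integer to 16 bits with signed saturation, so out-of-range conversion results clamp instead of wrapping. A scalar model of one lane (illustrative C):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one packssdw lane: a signed 32-bit value is clamped
   to the signed 16-bit range on the way down. */
static int16_t pack_sat_i32(int32_t v) {
  if (v > 32767) return 32767;
  if (v < -32768) return -32768;
  return (int16_t)v;
}

int main(void) {
  printf("%d\n", pack_sat_i32(100000));  /* 32767  */
  printf("%d\n", pack_sat_i32(-100000)); /* -32768 */
  printf("%d\n", pack_sat_i32(1234));    /* 1234   */
  return 0;
}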

/// Converts each single-precision floating-point element of a 128-bit
@ -3016,7 +3003,7 @@ _mm_cvtps_pi16(__m128 __a)
///    128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi8(__m128 __a)
{
  __m64 __b, __c;
@ -3196,8 +3183,14 @@ do { \
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

#undef __trunc64
#undef __zext128
#undef __anyext128
#undef __zeroupper64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS_SSE2
#undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR

/* Ugly hack for backwards-compatibility (compatible with gcc) */
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)