update C headers to LLVM 15

release/15.x 37007475ca1b345b4c5d340e228bcd7a62732d81
Andrew Kelley 2022-07-28 11:54:23 -07:00
parent adb4a95302
commit d3389eadf4
60 changed files with 14251 additions and 107228 deletions


@ -71,8 +71,8 @@
} \
inline __device__ unsigned long long __FnName( \
unsigned long long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
static_cast<unsigned long long>(__val), __offset, __width)); \
return static_cast<unsigned long long>( \
::__FnName(static_cast<long long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(double __val, __Type __offset, \
int __width = warpSize) { \
@ -139,8 +139,8 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long long>(::__FnName( \
__mask, static_cast<unsigned long long>(__val), __offset, __width)); \
return static_cast<unsigned long long>( \
::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
} \
inline __device__ long __FnName(unsigned int __mask, long __val, \
__Type __offset, int __width = warpSize) { \
@ -234,7 +234,7 @@ inline __device__ unsigned int __match32_any_sync(unsigned int mask,
return __nvvm_match_any_sync_i32(mask, value);
}
inline __device__ unsigned long long
inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
return __nvvm_match_any_sync_i64(mask, value);
}
@ -244,7 +244,7 @@ __match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
return __nvvm_match_all_sync_i32p(mask, value, pred);
}
inline __device__ unsigned long long
inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
return __nvvm_match_all_sync_i64p(mask, value, pred);
}


@ -22,23 +22,23 @@
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param __X
/// \param X
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param __Y
/// \param Y
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param __I
/// \param I
/// An immediate value specifying which 64-bit values to select from the
/// operands. Bit 0 is used to select a value from operand \a __X, and bit
/// 4 is used to select a value from operand \a __Y: \n
/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
/// operands. Bit 0 is used to select a value from operand \a X, and bit
/// 4 is used to select a value from operand \a Y: \n
/// Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
/// Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
/// Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(X, Y, I) \
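For context, a minimal usage sketch of the intrinsic documented above (not part of this commit), assuming a PCLMUL-enabled build such as `cc -mpclmul example.c`:

```c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Immediate 0x00: bit 0 and bit 4 are clear, so the low 64-bit half
       of each operand is selected. */
    __m128i x = _mm_set_epi64x(0, 0x87); /* element 1 = 0, element 0 = 0x87 */
    __m128i y = _mm_set_epi64x(0, 0x03);
    __m128i r = _mm_clmulepi64_si128(x, y, 0x00); /* carry-less 0x87 * 0x03 over GF(2) */

    unsigned long long out[2];
    _mm_storeu_si128((__m128i *)out, r);
    printf("low = %016llx, high = %016llx\n", out[0], out[1]);
    return 0;
}
```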

lib/include/altivec.h (vendored, 733 lines changed)

File diff suppressed because it is too large.


@ -439,8 +439,6 @@ static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride

lib/include/arm_sve.h (vendored, 122 lines changed)

@ -2407,15 +2407,15 @@ svuint64_t svcnt_s64_z(svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_z)))
svuint16_t svcnt_s16_z(svbool_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntb)))
uint64_t svcntb();
uint64_t svcntb(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntb_pat)))
uint64_t svcntb_pat(enum svpattern);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntd)))
uint64_t svcntd();
uint64_t svcntd(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntd_pat)))
uint64_t svcntd_pat(enum svpattern);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnth)))
uint64_t svcnth();
uint64_t svcnth(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnth_pat)))
uint64_t svcnth_pat(enum svpattern);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b8)))
@ -2427,7 +2427,7 @@ uint64_t svcntp_b64(svbool_t, svbool_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b16)))
uint64_t svcntp_b16(svbool_t, svbool_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntw)))
uint64_t svcntw();
uint64_t svcntw(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntw_pat)))
uint64_t svcntw_pat(enum svpattern);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_u32)))
@ -6521,7 +6521,7 @@ int64_t svorv_s64(svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s16)))
int16_t svorv_s16(svbool_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
svbool_t svpfalse_b();
svbool_t svpfalse_b(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
svbool_t svpfirst_b(svbool_t, svbool_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpnext_b8)))
@ -6627,13 +6627,13 @@ svbool_t svptrue_pat_b64(enum svpattern);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_pat_b16)))
svbool_t svptrue_pat_b16(enum svpattern);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b8)))
svbool_t svptrue_b8();
svbool_t svptrue_b8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b32)))
svbool_t svptrue_b32();
svbool_t svptrue_b32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b64)))
svbool_t svptrue_b64();
svbool_t svptrue_b64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b16)))
svbool_t svptrue_b16();
svbool_t svptrue_b16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8)))
svint8_t svqadd_n_s8(svint8_t, int8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32)))
@ -7011,7 +7011,7 @@ svint64_t svrbit_s64_z(svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_z)))
svint16_t svrbit_s16_z(svbool_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrdffr)))
svbool_t svrdffr();
svbool_t svrdffr(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrdffr_z)))
svbool_t svrdffr_z(svbool_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f64)))
@ -7411,7 +7411,7 @@ svint64x4_t svset4_s64(svint64x4_t, uint64_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s16)))
svint16x4_t svset4_s16(svint16x4_t, uint64_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsetffr)))
void svsetffr();
void svsetffr(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u8)))
svuint8_t svsplice_u8(svbool_t, svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u32)))
@ -8285,93 +8285,93 @@ svfloat32_t svtssel_f32(svfloat32_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f16)))
svfloat16_t svtssel_f16(svfloat16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u8)))
svuint8x2_t svundef2_u8();
svuint8x2_t svundef2_u8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u32)))
svuint32x2_t svundef2_u32();
svuint32x2_t svundef2_u32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u64)))
svuint64x2_t svundef2_u64();
svuint64x2_t svundef2_u64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u16)))
svuint16x2_t svundef2_u16();
svuint16x2_t svundef2_u16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s8)))
svint8x2_t svundef2_s8();
svint8x2_t svundef2_s8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f64)))
svfloat64x2_t svundef2_f64();
svfloat64x2_t svundef2_f64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f32)))
svfloat32x2_t svundef2_f32();
svfloat32x2_t svundef2_f32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f16)))
svfloat16x2_t svundef2_f16();
svfloat16x2_t svundef2_f16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s32)))
svint32x2_t svundef2_s32();
svint32x2_t svundef2_s32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s64)))
svint64x2_t svundef2_s64();
svint64x2_t svundef2_s64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s16)))
svint16x2_t svundef2_s16();
svint16x2_t svundef2_s16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u8)))
svuint8x3_t svundef3_u8();
svuint8x3_t svundef3_u8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u32)))
svuint32x3_t svundef3_u32();
svuint32x3_t svundef3_u32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u64)))
svuint64x3_t svundef3_u64();
svuint64x3_t svundef3_u64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u16)))
svuint16x3_t svundef3_u16();
svuint16x3_t svundef3_u16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s8)))
svint8x3_t svundef3_s8();
svint8x3_t svundef3_s8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f64)))
svfloat64x3_t svundef3_f64();
svfloat64x3_t svundef3_f64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f32)))
svfloat32x3_t svundef3_f32();
svfloat32x3_t svundef3_f32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f16)))
svfloat16x3_t svundef3_f16();
svfloat16x3_t svundef3_f16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s32)))
svint32x3_t svundef3_s32();
svint32x3_t svundef3_s32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s64)))
svint64x3_t svundef3_s64();
svint64x3_t svundef3_s64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s16)))
svint16x3_t svundef3_s16();
svint16x3_t svundef3_s16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u8)))
svuint8x4_t svundef4_u8();
svuint8x4_t svundef4_u8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u32)))
svuint32x4_t svundef4_u32();
svuint32x4_t svundef4_u32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u64)))
svuint64x4_t svundef4_u64();
svuint64x4_t svundef4_u64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u16)))
svuint16x4_t svundef4_u16();
svuint16x4_t svundef4_u16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s8)))
svint8x4_t svundef4_s8();
svint8x4_t svundef4_s8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f64)))
svfloat64x4_t svundef4_f64();
svfloat64x4_t svundef4_f64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f32)))
svfloat32x4_t svundef4_f32();
svfloat32x4_t svundef4_f32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f16)))
svfloat16x4_t svundef4_f16();
svfloat16x4_t svundef4_f16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s32)))
svint32x4_t svundef4_s32();
svint32x4_t svundef4_s32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s64)))
svint64x4_t svundef4_s64();
svint64x4_t svundef4_s64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s16)))
svint16x4_t svundef4_s16();
svint16x4_t svundef4_s16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u8)))
svuint8_t svundef_u8();
svuint8_t svundef_u8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u32)))
svuint32_t svundef_u32();
svuint32_t svundef_u32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u64)))
svuint64_t svundef_u64();
svuint64_t svundef_u64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u16)))
svuint16_t svundef_u16();
svuint16_t svundef_u16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s8)))
svint8_t svundef_s8();
svint8_t svundef_s8(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f64)))
svfloat64_t svundef_f64();
svfloat64_t svundef_f64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f32)))
svfloat32_t svundef_f32();
svfloat32_t svundef_f32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f16)))
svfloat16_t svundef_f16();
svfloat16_t svundef_f16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s32)))
svint32_t svundef_s32();
svint32_t svundef_s32(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s64)))
svint64_t svundef_s64();
svint64_t svundef_s64(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s16)))
svint16_t svundef_s16();
svint16_t svundef_s16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_b)))
svbool_t svunpkhi_b(svbool_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s32)))
@ -13830,8 +13830,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s64)))
int64_t svorv(svbool_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s16)))
int16_t svorv(svbool_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
svbool_t svpfalse();
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
svbool_t svpfalse(void);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
svbool_t svpfirst(svbool_t, svbool_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base)))
@ -23456,13 +23456,13 @@ svbfloat16_t svtrn1_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
svbfloat16_t svtrn2_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_bf16)))
svbfloat16x2_t svundef2_bf16();
svbfloat16x2_t svundef2_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_bf16)))
svbfloat16x3_t svundef3_bf16();
svbfloat16x3_t svundef3_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_bf16)))
svbfloat16x4_t svundef4_bf16();
svbfloat16x4_t svundef4_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_bf16)))
svbfloat16_t svundef_bf16();
svbfloat16_t svundef_bf16(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
svbfloat16_t svuzp1_bf16(svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
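The hunks above mechanically change empty parameter lists to `(void)`. A generic C sketch of why that matters, using hypothetical function names (not SVE, not from the header): before C23, `()` leaves the parameters unspecified, while `(void)` declares a proper no-argument prototype.

```c
/* Hypothetical declarations, for illustration only. */
unsigned long long byte_count();        /* old style: parameter list left unspecified */
unsigned long long byte_count_v2(void); /* prototype: the function takes no arguments */

void demo(void) {
    byte_count(123);        /* accepted by pre-C23 compilers: the argument is unchecked */
    /* byte_count_v2(123);     constraint violation: too many arguments */
}
```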


@ -92,25 +92,25 @@ _mm256_add_epi64(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_adds_epu16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
}
#define _mm256_alignr_epi8(a, b, n) \
@ -628,25 +628,25 @@ _mm256_sub_epi64(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_subs_epu16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
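The hunks above swap the target-specific `__builtin_ia32_padds*`/`__builtin_ia32_psubs*` builtins for the generic `__builtin_elementwise_add_sat`/`__builtin_elementwise_sub_sat`; the observable behavior of intrinsics such as `_mm256_adds_epu8` is unchanged. A minimal sketch of that saturation behavior (not from the commit), assuming an AVX2 build such as `cc -mavx2 example.c`:

```c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i a = _mm256_set1_epi8((char)250);
    __m256i b = _mm256_set1_epi8(10);
    __m256i sat  = _mm256_adds_epu8(a, b); /* every byte saturates to 255 */
    __m256i wrap = _mm256_add_epi8(a, b);  /* every byte wraps around to 4 */

    unsigned char s[32], w[32];
    _mm256_storeu_si256((__m256i *)s, sat);
    _mm256_storeu_si256((__m256i *)w, wrap);
    printf("saturated: %d, wrapped: %d\n", s[0], w[0]);
    return 0;
}
```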


@ -617,7 +617,7 @@ _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B);
return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -639,7 +639,7 @@ _mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epi16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B);
return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -661,7 +661,7 @@ _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epu8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B);
return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -683,7 +683,7 @@ _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_adds_epu16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B);
return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -950,7 +950,7 @@ _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B);
return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -972,7 +972,7 @@ _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epi16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B);
return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -994,7 +994,7 @@ _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epu8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B);
return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -1016,7 +1016,7 @@ _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_subs_epu16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B);
return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -1506,7 +1506,7 @@ _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi16(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -1598,7 +1598,7 @@ _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi16(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -1643,7 +1643,7 @@ _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi16(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -1659,7 +1659,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
{
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
(__v32hi)_mm512_srli_epi16(__A, __B),
(__v32hi)_mm512_srli_epi16(__A, (unsigned int)__B),
(__v32hi)_mm512_setzero_si512());
}


@ -1780,7 +1780,7 @@ _mm512_floor_ps(__m512 __A)
{
return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
_MM_FROUND_FLOOR,
(__v16sf) __A, -1,
(__v16sf) __A, (unsigned short)-1,
_MM_FROUND_CUR_DIRECTION);
}
@ -1798,7 +1798,7 @@ _mm512_floor_pd(__m512d __A)
{
return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
_MM_FROUND_FLOOR,
(__v8df) __A, -1,
(__v8df) __A, (unsigned char)-1,
_MM_FROUND_CUR_DIRECTION);
}
@ -1825,7 +1825,7 @@ _mm512_ceil_ps(__m512 __A)
{
return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
_MM_FROUND_CEIL,
(__v16sf) __A, -1,
(__v16sf) __A, (unsigned short)-1,
_MM_FROUND_CUR_DIRECTION);
}
@ -1834,7 +1834,7 @@ _mm512_ceil_pd(__m512d __A)
{
return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
_MM_FROUND_CEIL,
(__v8df) __A, -1,
(__v8df) __A, (unsigned char)-1,
_MM_FROUND_CUR_DIRECTION);
}
@ -5117,7 +5117,7 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi32(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -5139,7 +5139,7 @@ _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi64(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -5161,7 +5161,7 @@ _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi32(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -5183,7 +5183,7 @@ _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi64(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -5929,41 +5929,44 @@ _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
(__v8di)_mm512_setzero_si512());
}
#define _mm512_ternarylogic_epi32(A, B, C, imm) \
((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), \
(__v16si)(__m512i)(C), (int)(imm), \
(__mmask16)-1))
/// \enum _MM_TERNLOG_ENUM
/// A helper to represent the ternary logic operations among vector \a A,
/// \a B and \a C. The representation is passed to \a imm.
typedef enum {
_MM_TERNLOG_A = 0xF0,
_MM_TERNLOG_B = 0xCC,
_MM_TERNLOG_C = 0xAA
} _MM_TERNLOG_ENUM;
#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), \
(__v16si)(__m512i)(C), (int)(imm), \
(__mmask16)(U)))
#define _mm512_ternarylogic_epi32(A, B, C, imm) \
((__m512i)__builtin_ia32_pternlogd512_mask( \
(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
(unsigned char)(imm), (__mmask16)-1))
#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), \
(__v16si)(__m512i)(C), \
(int)(imm), (__mmask16)(U)))
#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
((__m512i)__builtin_ia32_pternlogd512_mask( \
(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
(unsigned char)(imm), (__mmask16)(U)))
#define _mm512_ternarylogic_epi64(A, B, C, imm) \
((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(__v8di)(__m512i)(C), (int)(imm), \
(__mmask8)-1))
#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
((__m512i)__builtin_ia32_pternlogd512_maskz( \
(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
(unsigned char)(imm), (__mmask16)(U)))
#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(__v8di)(__m512i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm512_ternarylogic_epi64(A, B, C, imm) \
((__m512i)__builtin_ia32_pternlogq512_mask( \
(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
(unsigned char)(imm), (__mmask8)-1))
#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(__v8di)(__m512i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
((__m512i)__builtin_ia32_pternlogq512_mask( \
(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
((__m512i)__builtin_ia32_pternlogq512_maskz( \
(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
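The `_MM_TERNLOG_*` constants introduced above encode the truth-table byte passed as `imm`, so they can be combined with ordinary bitwise operators to describe a ternary expression. A hedged sketch (illustrative only, not from the header), assuming an AVX-512F build such as `cc -mavx512f example.c`:

```c
#include <immintrin.h>

/* Computes per-bit (A & B) | ~C in a single instruction. Composing the
   truth table: (0xF0 & 0xCC) = 0xC0, (~0xAA & 0xFF) = 0x55, so imm = 0xD5. */
__m512i and_a_b_or_not_c(__m512i a, __m512i b, __m512i c) {
    return _mm512_ternarylogic_epi32(
        a, b, c, ((_MM_TERNLOG_A & _MM_TERNLOG_B) | ~_MM_TERNLOG_C) & 0xFF);
}
```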
#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) \
@ -6603,7 +6606,7 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi32(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -6626,7 +6629,7 @@ _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi64(__m512i __A, unsigned int __B)
{
return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -9316,11 +9319,11 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
*/
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
return __builtin_ia32_reduce_add_q512(__W);
return __builtin_reduce_add((__v8di)__W);
}
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
return __builtin_ia32_reduce_mul_q512(__W);
return __builtin_reduce_mul((__v8di)__W);
}
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
@ -9334,18 +9337,18 @@ static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
__W = _mm512_maskz_mov_epi64(__M, __W);
return __builtin_ia32_reduce_add_q512(__W);
return __builtin_reduce_add((__v8di)__W);
}
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
return __builtin_ia32_reduce_mul_q512(__W);
return __builtin_reduce_mul((__v8di)__W);
}
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
return __builtin_reduce_and((__v8di)__W);
}
@ -9380,12 +9383,12 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_add_epi32(__m512i __W) {
return __builtin_ia32_reduce_add_d512((__v16si)__W);
return __builtin_reduce_add((__v16si)__W);
}
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_epi32(__m512i __W) {
return __builtin_ia32_reduce_mul_d512((__v16si)__W);
return __builtin_reduce_mul((__v16si)__W);
}
static __inline__ int __DEFAULT_FN_ATTRS512
@ -9401,18 +9404,18 @@ _mm512_reduce_or_epi32(__m512i __W) {
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
__W = _mm512_maskz_mov_epi32(__M, __W);
return __builtin_ia32_reduce_add_d512((__v16si)__W);
return __builtin_reduce_add((__v16si)__W);
}
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
return __builtin_ia32_reduce_mul_d512((__v16si)__W);
return __builtin_reduce_mul((__v16si)__W);
}
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
return __builtin_reduce_and((__v16si)__W);
}
@ -9484,7 +9487,7 @@ _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
return __builtin_reduce_min((__v8du)__V);
}
static __inline__ int __DEFAULT_FN_ATTRS512
@ -9527,7 +9530,7 @@ _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
return __builtin_reduce_min((__v16su)__V);
}
@ -9598,7 +9601,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
@ -9606,7 +9609,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
/// dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
/// \endcode
#define _mm512_i32logather_pd(vindex, base_addr, scale) \
_mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
@ -9618,7 +9621,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
@ -9630,7 +9633,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
/// FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
/// \endcode
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
_mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \
(base_addr), (scale))
@ -9641,7 +9644,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
@ -9649,7 +9652,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
/// dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
/// \endcode
#define _mm512_i32logather_epi64(vindex, base_addr, scale) \
_mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
@ -9660,7 +9663,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
@ -9672,7 +9675,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
/// FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
/// \endcode
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
_mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \
(base_addr), (scale))
@ -9683,14 +9686,14 @@ _mm512_cvtsi512_si32(__m512i __A) {
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endoperation
/// \endcode
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
_mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
@ -9702,7 +9705,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
@ -9711,7 +9714,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
/// MEM[addr+63:addr] := a[i+63:i]
/// FI
/// ENDFOR
/// \endoperation
/// \endcode
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
_mm512_mask_i32scatter_pd((base_addr), (mask), \
_mm512_castsi512_si256(vindex), (v1), (scale))
@ -9722,14 +9725,14 @@ _mm512_cvtsi512_si32(__m512i __A) {
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := a[i+63:i]
/// ENDFOR
/// \endoperation
/// \endcode
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
_mm512_i32scatter_epi64((base_addr), \
_mm512_castsi512_si256(vindex), (v1), (scale))
@ -9741,7 +9744,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
@ -9750,7 +9753,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
/// MEM[addr+63:addr] := a[i+63:i]
/// FI
/// ENDFOR
/// \endoperation
/// \endcode
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
_mm512_mask_i32scatter_epi64((base_addr), (mask), \
_mm512_castsi512_si256(vindex), (v1), (scale))
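The `\code{.operation}` blocks above spell out the gather/scatter pseudocode. As a hedged usage sketch of the `_mm512_i32logather_pd` wrapper defined above (illustrative only, not from the header), assuming an AVX-512F build such as `cc -mavx512f example.c`:

```c
#include <immintrin.h>

double table[64];

/* Gathers table[0], table[2], ..., table[14] into one vector of 8 doubles. */
__m512d gather_even(void) {
    /* Only the low eight 32-bit lanes of vindex are consumed. */
    __m512i vindex = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0,
                                      14, 12, 10, 8, 6, 4, 2, 0);
    /* scale = 8: the indices are in units of sizeof(double). */
    return _mm512_i32logather_pd(vindex, table, 8);
}
```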


@ -417,7 +417,7 @@ static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
__v4sf __V = {__A, 0, 0, 0};
__v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
(__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
return __R[0];
return (__bfloat16)__R[0];
}
/// Convert Packed BF16 Data to Packed float Data.


@ -1942,7 +1942,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_slli_epi16(__A, __B),
(__v8hi)_mm_slli_epi16(__A, (int)__B),
(__v8hi)__W);
}
@ -1950,7 +1950,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_slli_epi16(__A, __B),
(__v8hi)_mm_slli_epi16(__A, (int)__B),
(__v8hi)_mm_setzero_si128());
}
@ -1959,7 +1959,7 @@ _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A,
unsigned int __B)
{
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_slli_epi16(__A, __B),
(__v16hi)_mm256_slli_epi16(__A, (int)__B),
(__v16hi)__W);
}
@ -1967,7 +1967,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_slli_epi16(__A, __B),
(__v16hi)_mm256_slli_epi16(__A, (int)__B),
(__v16hi)_mm256_setzero_si256());
}
@ -2095,7 +2095,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_srai_epi16(__A, __B),
(__v8hi)_mm_srai_epi16(__A, (int)__B),
(__v8hi)__W);
}
@ -2103,7 +2103,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_srai_epi16(__A, __B),
(__v8hi)_mm_srai_epi16(__A, (int)__B),
(__v8hi)_mm_setzero_si128());
}
@ -2112,7 +2112,7 @@ _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A,
unsigned int __B)
{
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_srai_epi16(__A, __B),
(__v16hi)_mm256_srai_epi16(__A, (int)__B),
(__v16hi)__W);
}
@ -2120,7 +2120,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_srai_epi16(__A, __B),
(__v16hi)_mm256_srai_epi16(__A, (int)__B),
(__v16hi)_mm256_setzero_si256());
}


@ -4525,7 +4525,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_slli_epi32(__A, __B),
(__v4si)_mm_slli_epi32(__A, (int)__B),
(__v4si)__W);
}
@ -4533,7 +4533,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_slli_epi32(__A, __B),
(__v4si)_mm_slli_epi32(__A, (int)__B),
(__v4si)_mm_setzero_si128());
}
@ -4541,7 +4541,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_slli_epi32(__A, __B),
(__v8si)_mm256_slli_epi32(__A, (int)__B),
(__v8si)__W);
}
@ -4549,7 +4549,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_slli_epi32(__A, __B),
(__v8si)_mm256_slli_epi32(__A, (int)__B),
(__v8si)_mm256_setzero_si256());
}
@ -4589,7 +4589,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_slli_epi64(__A, __B),
(__v2di)_mm_slli_epi64(__A, (int)__B),
(__v2di)__W);
}
@ -4597,7 +4597,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_slli_epi64(__A, __B),
(__v2di)_mm_slli_epi64(__A, (int)__B),
(__v2di)_mm_setzero_si128());
}
@ -4605,7 +4605,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_slli_epi64(__A, __B),
(__v4di)_mm256_slli_epi64(__A, (int)__B),
(__v4di)__W);
}
@ -4613,7 +4613,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_slli_epi64(__A, __B),
(__v4di)_mm256_slli_epi64(__A, (int)__B),
(__v4di)_mm256_setzero_si256());
}
@ -4869,7 +4869,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_srli_epi32(__A, __B),
(__v4si)_mm_srli_epi32(__A, (int)__B),
(__v4si)__W);
}
@ -4877,7 +4877,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_srli_epi32(__A, __B),
(__v4si)_mm_srli_epi32(__A, (int)__B),
(__v4si)_mm_setzero_si128());
}
@ -4885,7 +4885,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_srli_epi32(__A, __B),
(__v8si)_mm256_srli_epi32(__A, (int)__B),
(__v8si)__W);
}
@ -4893,7 +4893,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_srli_epi32(__A, __B),
(__v8si)_mm256_srli_epi32(__A, (int)__B),
(__v8si)_mm256_setzero_si256());
}
@ -4933,7 +4933,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_srli_epi64(__A, __B),
(__v2di)_mm_srli_epi64(__A, (int)__B),
(__v2di)__W);
}
@ -4941,7 +4941,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_srli_epi64(__A, __B),
(__v2di)_mm_srli_epi64(__A, (int)__B),
(__v2di)_mm_setzero_si128());
}
@ -4949,7 +4949,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_srli_epi64(__A, __B),
(__v4di)_mm256_srli_epi64(__A, (int)__B),
(__v4di)__W);
}
@ -4957,7 +4957,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_srli_epi64(__A, __B),
(__v4di)_mm256_srli_epi64(__A, (int)__B),
(__v4di)_mm256_setzero_si256());
}
@ -6408,7 +6408,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_srai_epi32(__A, __B),
(__v4si)_mm_srai_epi32(__A, (int)__B),
(__v4si)__W);
}
@ -6416,7 +6416,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_srai_epi32(__A, __B),
(__v4si)_mm_srai_epi32(__A, (int)__B),
(__v4si)_mm_setzero_si128());
}
@ -6424,7 +6424,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_srai_epi32(__A, __B),
(__v8si)_mm256_srai_epi32(__A, (int)__B),
(__v8si)__W);
}
@ -6432,7 +6432,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_srai_epi32(__A, __B),
(__v8si)_mm256_srai_epi32(__A, (int)__B),
(__v8si)_mm256_setzero_si256());
}
@ -6483,7 +6483,7 @@ _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_srai_epi64(__m128i __A, unsigned int __imm)
{
return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, (int)__imm);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
@ -6505,7 +6505,7 @@ _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srai_epi64(__m256i __A, unsigned int __imm)
{
return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, (int)__imm);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
@ -6525,79 +6525,65 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
(__v4di)_mm256_setzero_si256());
}
#define _mm_ternarylogic_epi32(A, B, C, imm) \
((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), \
(__v4si)(__m128i)(C), (int)(imm), \
(__mmask8)-1))
#define _mm_ternarylogic_epi32(A, B, C, imm) \
((__m128i)__builtin_ia32_pternlogd128_mask( \
(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
(unsigned char)(imm), (__mmask8)-1))
#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \
((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), \
(__v4si)(__m128i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \
((__m128i)__builtin_ia32_pternlogd128_mask( \
(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \
((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), \
(__v4si)(__m128i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \
((__m128i)__builtin_ia32_pternlogd128_maskz( \
(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm256_ternarylogic_epi32(A, B, C, imm) \
((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), \
(__v8si)(__m256i)(C), (int)(imm), \
(__mmask8)-1))
#define _mm256_ternarylogic_epi32(A, B, C, imm) \
((__m256i)__builtin_ia32_pternlogd256_mask( \
(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
(unsigned char)(imm), (__mmask8)-1))
#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \
((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), \
(__v8si)(__m256i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \
((__m256i)__builtin_ia32_pternlogd256_mask( \
(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \
((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), \
(__v8si)(__m256i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \
((__m256i)__builtin_ia32_pternlogd256_maskz( \
(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm_ternarylogic_epi64(A, B, C, imm) \
((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), \
(__v2di)(__m128i)(C), (int)(imm), \
(__mmask8)-1))
#define _mm_ternarylogic_epi64(A, B, C, imm) \
((__m128i)__builtin_ia32_pternlogq128_mask( \
(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
(unsigned char)(imm), (__mmask8)-1))
#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \
((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), \
(__v2di)(__m128i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \
((__m128i)__builtin_ia32_pternlogq128_mask( \
(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \
((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), \
(__v2di)(__m128i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \
((__m128i)__builtin_ia32_pternlogq128_maskz( \
(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm256_ternarylogic_epi64(A, B, C, imm) \
((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(__v4di)(__m256i)(C), (int)(imm), \
(__mmask8)-1))
#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \
((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(__v4di)(__m256i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \
((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(__v4di)(__m256i)(C), (int)(imm), \
(__mmask8)(U)))
#define _mm256_ternarylogic_epi64(A, B, C, imm) \
((__m256i)__builtin_ia32_pternlogq256_mask( \
(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
(unsigned char)(imm), (__mmask8)-1))
#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \
((__m256i)__builtin_ia32_pternlogq256_mask( \
(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \
((__m256i)__builtin_ia32_pternlogq256_maskz( \
(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
(unsigned char)(imm), (__mmask8)(U)))
#define _mm256_shuffle_f32x4(A, B, imm) \
((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \


@ -25,7 +25,7 @@
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
@ -34,7 +34,7 @@
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
@ -45,7 +45,7 @@
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
@ -54,7 +54,7 @@
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
@ -65,14 +65,14 @@
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
@ -83,14 +83,14 @@
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
@ -101,7 +101,7 @@
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
@ -110,7 +110,7 @@
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
/// \endcode
#define _mm_dpbusd_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
@ -121,7 +121,7 @@
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
@ -130,7 +130,7 @@
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
/// \endcode
#define _mm_dpbusds_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
@ -141,14 +141,14 @@
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
/// \endcode
#define _mm_dpwssd_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
@ -159,14 +159,14 @@
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
/// \endcode
#define _mm_dpwssds_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
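A hedged usage sketch of the unsigned-byte by signed-byte dot-product accumulate described by the pseudocode above (illustrative only, not from the header), assuming the matching VNNI target feature is enabled at build time (for example `-mavx512vnni -mavx512vl` or `-mavxvnni`, depending on which header this section belongs to):

```c
#include <immintrin.h>

/* Each 32-bit lane of acc gains the sum of four unsigned-byte x signed-byte
   products from the corresponding lane of a_u8 and b_s8, as in the
   pseudocode above. */
__m128i dpbusd_accumulate(__m128i acc, __m128i a_u8, __m128i b_s8) {
    return _mm_dpbusd_epi32(acc, a_u8, b_s8);
}
```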


@ -1504,7 +1504,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
/// <c>[b6, b4, b2, b0]</c>.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
@ -1953,12 +1956,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi32(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
/// instruction.
///
/// \param __a
/// \param X
/// A 256-bit vector of [8 x i32].
/// \param __imm
/// \param N
/// An immediate integer operand with bits [2:0] determining which vector
/// element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 32 bits of extended
@ -1971,12 +1978,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi16(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
/// instruction.
///
/// \param __a
/// \param X
/// A 256-bit integer vector of [16 x i16].
/// \param __imm
/// \param N
/// An immediate integer operand with bits [3:0] determining which vector
/// element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
@ -1990,12 +2001,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm256_extract_epi8(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
/// instruction.
///
/// \param __a
/// \param X
/// A 256-bit integer vector of [32 x i8].
/// \param __imm
/// \param N
/// An immediate integer operand with bits [4:0] determining which vector
/// element is extracted and returned.
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
@ -2010,12 +2025,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm256_extract_epi64(__m256i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
/// instruction.
///
/// \param __a
/// \param X
/// A 256-bit integer vector of [4 x i64].
/// \param __imm
/// \param N
/// An immediate integer operand with bits [1:0] determining which vector
/// element is extracted and returned.
/// \returns A 64-bit integer containing the extracted 64 bits of extended
@ -2030,18 +2049,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
/// instruction.
///
/// \param __a
/// \param X
/// A vector of [8 x i32] to be used by the insert operation.
/// \param __b
/// \param I
/// An integer value. The replacement value for the insert operation.
/// \param __imm
/// \param N
/// An immediate integer specifying the index of the vector element to be
/// replaced.
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
/// \returns A copy of vector \a X, after replacing its element indexed by
/// \a N with \a I.
#define _mm256_insert_epi32(X, I, N) \
((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
(int)(I), (int)(N)))
@ -2053,18 +2076,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
/// instruction.
///
/// \param __a
/// \param X
/// A vector of [16 x i16] to be used by the insert operation.
/// \param __b
/// \param I
/// An i16 integer value. The replacement value for the insert operation.
/// \param __imm
/// \param N
/// An immediate integer specifying the index of the vector element to be
/// replaced.
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
/// \returns A copy of vector \a X, after replacing its element indexed by
/// \a N with \a I.
#define _mm256_insert_epi16(X, I, N) \
((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
(int)(I), (int)(N)))
@ -2075,18 +2102,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
/// instruction.
///
/// \param __a
/// \param X
/// A vector of [32 x i8] to be used by the insert operation.
/// \param __b
/// \param I
/// An i8 integer value. The replacement value for the insert operation.
/// \param __imm
/// \param N
/// An immediate integer specifying the index of the vector element to be
/// replaced.
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
/// \returns A copy of vector \a X, after replacing its element indexed by
/// \a N with \a I.
#define _mm256_insert_epi8(X, I, N) \
((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
(int)(I), (int)(N)))
@ -2098,18 +2129,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
/// instruction.
///
/// \param __a
/// \param X
/// A vector of [4 x i64] to be used by the insert operation.
/// \param __b
/// \param I
/// A 64-bit integer value. The replacement value for the insert operation.
/// \param __imm
/// \param N
/// An immediate integer specifying the index of the vector element to be
/// replaced.
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
/// \returns A copy of vector \a X, after replacing its element indexed by
/// \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
(long long)(I), (int)(N)))
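A short usage sketch for the insert/extract forms documented above (assumes AVX, e.g. -mavx; the value 42 and index 2 are arbitrary):

  #include <immintrin.h>

  /* Replace element 2 of an [8 x i32] vector, then read it back. */
  static int insert_extract_demo(void) {
    __m256i v = _mm256_set1_epi32(7);
    v = _mm256_insert_epi32(v, 42, 2); /* element 2 becomes 42 */
    return _mm256_extract_epi32(v, 2); /* yields 42 */
  }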
@ -3177,7 +3212,7 @@ _mm256_loadu_si256(__m256i_u const *__p)
/// A pointer to a 256-bit integer vector containing integer values.
/// \returns A 256-bit integer vector containing the moved values.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_lddqu_si256(__m256i const *__p)
_mm256_lddqu_si256(__m256i_u const *__p)
{
return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
}
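The switch to the unaligned __m256i_u pointer type above matters for callers that load from arbitrary byte offsets; a minimal sketch:

  #include <immintrin.h>
  #include <stddef.h>

  /* Load 32 bytes from a possibly unaligned offset into a byte buffer. */
  static __m256i load_any_offset(const unsigned char *buf, size_t off) {
    return _mm256_lddqu_si256((const __m256i_u *)(buf + off));
  }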

View file

@ -50,7 +50,7 @@
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
@ -59,7 +59,7 @@
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
@ -73,7 +73,7 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
@ -82,7 +82,7 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
@ -96,14 +96,14 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
@ -117,14 +117,14 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
@ -138,7 +138,7 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
@ -147,7 +147,7 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
@ -161,7 +161,7 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
@ -170,7 +170,7 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
@ -184,14 +184,14 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
@ -205,14 +205,14 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{

View file

@ -47,6 +47,7 @@ __tzcnt_u16(unsigned short __X)
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see _mm_tzcnt_32
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
@ -63,10 +64,11 @@ __tzcnt_u32(unsigned int __X)
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns A 32-bit integer containing the number of trailing zero bits in
/// the operand.
/// \see __tzcnt_u32
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
return __builtin_ia32_tzcnt_u32(__X);
return (int)__builtin_ia32_tzcnt_u32(__X);
}
#define _tzcnt_u32(a) (__tzcnt_u32((a)))
@ -83,6 +85,7 @@ _mm_tzcnt_32(unsigned int __X)
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see _mm_tzcnt_64
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
@ -99,10 +102,11 @@ __tzcnt_u64(unsigned long long __X)
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns A 64-bit integer containing the number of trailing zero bits in
/// the operand.
/// \see __tzcnt_u64
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
return __builtin_ia32_tzcnt_u64(__X);
return (long long)__builtin_ia32_tzcnt_u64(__X);
}
#define _tzcnt_u64(a) (__tzcnt_u64((a)))
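The cross-references added above pair the unsigned (__tzcnt_u32/__tzcnt_u64) and signed-return (_mm_tzcnt_32/_mm_tzcnt_64) forms; both count trailing zero bits, for example:

  #include <immintrin.h>

  /* 0x48 is 0b1001000, so both forms report three trailing zeros. */
  static void tzcnt_demo(void) {
    unsigned int u = __tzcnt_u32(0x48u);  /* 3 */
    int          s = _mm_tzcnt_32(0x48u); /* 3 */
    (void)u; (void)s;
  }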

View file

@ -19,7 +19,7 @@
__attribute__((__always_inline__, __nodebug__, __target__("shstk")))
static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
__builtin_ia32_incsspd(__a);
__builtin_ia32_incsspd((unsigned int)__a);
}
#ifdef __x86_64__
@ -34,7 +34,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
}
#else /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
__builtin_ia32_incsspd((int)__a);
__builtin_ia32_incsspd(__a);
}
#endif /* __x86_64__ */
@ -42,9 +42,12 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() {
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32(void) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
unsigned int t;
return __builtin_ia32_rdsspd(t);
#pragma clang diagnostic pop
}
#ifdef __x86_64__
@ -52,9 +55,12 @@ static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long lo
return __builtin_ia32_rdsspq(__a);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() {
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64(void) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
unsigned long long t;
return __builtin_ia32_rdsspq(t);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */
@ -68,7 +74,7 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp(void) {
__builtin_ia32_saveprevssp();
}
@ -96,7 +102,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void *
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
static __inline__ void __DEFAULT_FN_ATTRS _setssbsy(void) {
__builtin_ia32_setssbsy();
}

1318
lib/include/emmintrin.h vendored

File diff suppressed because it is too large.

View file

@ -65,9 +65,9 @@ _cvtsh_ss(unsigned short __a)
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) \
((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]))
#define _cvtss_sh(a, imm) __extension__ ({ \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]); })
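Given the rounding-mode table above, a round trip through half precision looks like this (a sketch; assumes F16C support, e.g. -mf16c):

  #include <immintrin.h>

  /* Convert a float to a 16-bit half and back, rounding to nearest. */
  static float half_round_trip(float x) {
    unsigned short h = _cvtss_sh(x, _MM_FROUND_TO_NEAREST_INT);
    return _cvtsh_ss(h);
  }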
/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.

15
lib/include/hlsl.h vendored Normal file
View file

@ -0,0 +1,15 @@
//===----- hlsl.h - HLSL definitions --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HLSL_H_
#define _HLSL_H_
#include "hlsl/hlsl_basic_types.h"
#include "hlsl/hlsl_intrinsics.h"
#endif //_HLSL_H_

64
lib/include/hlsl_basic_types.h vendored Normal file
View file

@ -0,0 +1,64 @@
//===----- hlsl_basic_types.h - HLSL definitions for basic types ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HLSL_HLSL_BASIC_TYPES_H_
#define _HLSL_HLSL_BASIC_TYPES_H_
// built-in scalar data types:
#ifdef __HLSL_ENABLE_16_BIT
// 16-bit integer.
typedef unsigned short uint16_t;
typedef short int16_t;
#endif
// unsigned 32-bit integer.
typedef unsigned int uint;
// 64-bit integer.
typedef unsigned long uint64_t;
typedef long int64_t;
// built-in vector data types:
#ifdef __HLSL_ENABLE_16_BIT
typedef vector<int16_t, 2> int16_t2;
typedef vector<int16_t, 3> int16_t3;
typedef vector<int16_t, 4> int16_t4;
typedef vector<uint16_t, 2> uint16_t2;
typedef vector<uint16_t, 3> uint16_t3;
typedef vector<uint16_t, 4> uint16_t4;
#endif
typedef vector<int, 2> int2;
typedef vector<int, 3> int3;
typedef vector<int, 4> int4;
typedef vector<uint, 2> uint2;
typedef vector<uint, 3> uint3;
typedef vector<uint, 4> uint4;
typedef vector<int64_t, 2> int64_t2;
typedef vector<int64_t, 3> int64_t3;
typedef vector<int64_t, 4> int64_t4;
typedef vector<uint64_t, 2> uint64_t2;
typedef vector<uint64_t, 3> uint64_t3;
typedef vector<uint64_t, 4> uint64_t4;
#ifdef __HLSL_ENABLE_16_BIT
typedef vector<half, 2> half2;
typedef vector<half, 3> half3;
typedef vector<half, 4> half4;
#endif
typedef vector<float, 2> float2;
typedef vector<float, 3> float3;
typedef vector<float, 4> float4;
typedef vector<double, 2> double2;
typedef vector<double, 3> double3;
typedef vector<double, 4> double4;
#endif //_HLSL_HLSL_BASIC_TYPES_H_

15
lib/include/hlsl_intrinsics.h vendored Normal file
View file

@ -0,0 +1,15 @@
//===----- hlsl_intrinsics.h - HLSL definitions for intrinsics ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HLSL_HLSL_INTRINSICS_H_
#define _HLSL_HLSL_INTRINSICS_H_
__attribute__((clang_builtin_alias(__builtin_hlsl_wave_active_count_bits))) uint
WaveActiveCountBits(bool bBit);
#endif //_HLSL_HLSL_INTRINSICS_H_

View file

@ -25,7 +25,7 @@
///
/// This intrinsic corresponds to the <c> HRESET </c> instruction.
///
/// \operation
/// \code{.operation}
/// IF __eax == 0
/// // nop
/// ELSE
@ -35,7 +35,7 @@
/// FI
/// ENDFOR
/// FI
/// \endoperation
/// \endcode
static __inline void __DEFAULT_FN_ATTRS
_hreset(int __eax)
{

View file

@ -40,7 +40,7 @@
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfd(int __A) {
return __builtin_ctz(__A);
return __builtin_ctz((unsigned int)__A);
}
/** Find the first set bit starting from the msb. Result is undefined if
@ -57,7 +57,7 @@ __bsfd(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrd(int __A) {
return 31 - __builtin_clz(__A);
return 31 - __builtin_clz((unsigned int)__A);
}
/** Swaps the bytes in the input. Converting little endian to big endian or
@ -73,12 +73,12 @@ __bsrd(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapd(int __A) {
return __builtin_bswap32(__A);
return (int)__builtin_bswap32((unsigned int)__A);
}
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
_bswap(int __A) {
return __builtin_bswap32(__A);
return (int)__builtin_bswap32((unsigned int)__A);
}
#define _bit_scan_forward(A) __bsfd((A))
@ -99,7 +99,7 @@ _bswap(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfq(long long __A) {
return __builtin_ctzll(__A);
return (long long)__builtin_ctzll((unsigned long long)__A);
}
/** Find the first set bit starting from the msb. Result is undefined if
@ -116,7 +116,7 @@ __bsfq(long long __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrq(long long __A) {
return 63 - __builtin_clzll(__A);
return 63 - __builtin_clzll((unsigned long long)__A);
}
/** Swaps the bytes in the input. Converting little endian to big endian or
@ -132,7 +132,7 @@ __bsrq(long long __A) {
*/
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapq(long long __A) {
return __builtin_bswap64(__A);
return (long long)__builtin_bswap64((unsigned long long)__A);
}
#define _bswap64(A) __bswapq((A))
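A few spot checks for the byte-swap and bit-scan helpers touched above (values arbitrary):

  #include <x86intrin.h>

  static void swap_scan_demo(void) {
    int swapped = __bswapd(0x11223344); /* 0x44332211 */
    int lsb     = __bsfd(0x00000100);   /* 8: index of lowest set bit  */
    int msb     = __bsrd(0x00000100);   /* 8: index of highest set bit */
    (void)swapped; (void)lsb; (void)msb;
  }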
@ -395,23 +395,23 @@ __rorw(unsigned short __X, int __C) {
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rold(unsigned int __X, int __C) {
return __builtin_rotateleft32(__X, __C);
return __builtin_rotateleft32(__X, (unsigned int)__C);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rord(unsigned int __X, int __C) {
return __builtin_rotateright32(__X, __C);
return __builtin_rotateright32(__X, (unsigned int)__C);
}
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
__rolq(unsigned long long __X, int __C) {
return __builtin_rotateleft64(__X, __C);
return __builtin_rotateleft64(__X, (unsigned long long)__C);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
__rorq(unsigned long long __X, int __C) {
return __builtin_rotateright64(__X, __C);
return __builtin_rotateright64(__X, (unsigned long long)__C);
}
#endif /* __x86_64__ */
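The rotate helpers above wrap bits around the word; for instance:

  #include <x86intrin.h>

  /* Bit 31 wraps around to bit 3 and bit 0 moves to bit 4: result is 0x18. */
  static unsigned int rotate_demo(void) {
    return __rold(0x80000001u, 4);
  }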

View file

@ -276,20 +276,20 @@ _rdpid_u32(void) {
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
{
return __builtin_ia32_rdrand16_step(__p);
return (int)__builtin_ia32_rdrand16_step(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand32_step(unsigned int *__p)
{
return __builtin_ia32_rdrand32_step(__p);
return (int)__builtin_ia32_rdrand32_step(__p);
}
#ifdef __x86_64__
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
return __builtin_ia32_rdrand64_step(__p);
return (int)__builtin_ia32_rdrand64_step(__p);
}
#endif
#endif /* __RDRND__ */
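_rdrand16_step/_rdrand32_step/_rdrand64_step return 1 only when a random value was delivered, so callers usually retry; a hedged sketch (assumes RDRAND support, e.g. -mrdrnd):

  #include <immintrin.h>

  /* Retry a few times, since the DRNG can transiently fail. */
  static int get_random_u32(unsigned int *out) {
    for (int i = 0; i < 10; ++i)
      if (_rdrand32_step(out))
        return 1; /* success */
    return 0;     /* give up */
  }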
@ -360,50 +360,50 @@ _writegsbase_u64(unsigned long long __V)
static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i16(void const * __P) {
struct __loadu_i16 {
short __v;
unsigned short __v;
} __attribute__((__packed__, __may_alias__));
return __builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i16(void * __P, short __D) {
struct __storeu_i16 {
short __v;
unsigned short __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D);
((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i32(void const * __P) {
struct __loadu_i32 {
int __v;
unsigned int __v;
} __attribute__((__packed__, __may_alias__));
return __builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i32(void * __P, int __D) {
struct __storeu_i32 {
int __v;
unsigned int __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D);
((struct __storeu_i32*)__P)->__v = __builtin_bswap32((unsigned int)__D);
}
#ifdef __x86_64__
static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i64(void const * __P) {
struct __loadu_i64 {
long long __v;
unsigned long long __v;
} __attribute__((__packed__, __may_alias__));
return __builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i64(void * __P, long long __D) {
struct __storeu_i64 {
long long __v;
unsigned long long __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D);
((struct __storeu_i64*)__P)->__v = __builtin_bswap64((unsigned long long)__D);
}
#endif
#endif /* __MOVBE */
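The _loadbe/_storebe helpers above read and write big-endian scalars through possibly unaligned pointers; a typical use (assumes MOVBE support, e.g. -mmovbe):

  #include <immintrin.h>

  /* Parse a 32-bit big-endian length field from a network buffer. */
  static int read_be_length(const unsigned char *pkt) {
    return _loadbe_i32(pkt);
  }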

33
lib/include/intrin.h vendored
View file

@ -534,27 +534,6 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
|* Misc
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
#if defined(__i386__)
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" \
: "=a"(__eax), "=b"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#else
/* x86-64 uses %rbx as the base register, so preserve it. */
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("xchg{q} {%%rbx, %q1|%q1, rbx}\n" \
"cpuid\n" \
"xchg{q} {%%rbx, %q1|%q1, rbx}" \
: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#endif
static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
__cpuid_count(__level, 0, __info[0], __info[1], __info[2], __info[3]);
}
static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
int __ecx) {
__cpuid_count(__level, __ecx, __info[0], __info[1], __info[2], __info[3]);
}
static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
__asm__ volatile("hlt");
}
@ -581,6 +560,18 @@ unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val);
__int64 __mulh(__int64 __a, __int64 __b);
unsigned __int64 __umulh(unsigned __int64 __a, unsigned __int64 __b);
void __break(int);
void __writex18byte(unsigned long offset, unsigned char data);
void __writex18word(unsigned long offset, unsigned short data);
void __writex18dword(unsigned long offset, unsigned long data);
void __writex18qword(unsigned long offset, unsigned __int64 data);
unsigned char __readx18byte(unsigned long offset);
unsigned short __readx18word(unsigned long offset);
unsigned long __readx18dword(unsigned long offset);
unsigned __int64 __readx18qword(unsigned long offset);
#endif
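The newly declared __readx18*/__writex18* intrinsics access memory at a byte offset from the AArch64 x18 platform register (on Windows this register holds the TEB pointer). A purely illustrative sketch; the offset is hypothetical:

  #include <intrin.h>

  /* Hypothetical: read one byte at offset 0x10 from the x18-relative block. */
  static unsigned char peek_x18_byte(void) {
    return __readx18byte(0x10);
  }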
/*----------------------------------------------------------------------------*\

View file

@ -46,7 +46,7 @@
///
/// This intrinsic corresponds to the <c> LOADIWKEY </c> instructions.
///
/// \operation
/// \code{.operation}
/// IF CPL > 0 // LOADKWKEY only allowed at ring 0 (supervisor mode)
/// GP (0)
/// FI
@ -91,7 +91,7 @@
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
__m128i __enkey_lo, __m128i __enkey_hi) {
@ -106,7 +106,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
///
/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
///
/// \operation
/// \code{.operation}
/// InputKey[127:0] := __key[127:0]
/// KeyMetadata[2:0] := __htype[2:0]
/// KeyMetadata[23:3] := 0 // Reserved for future usage
@ -126,7 +126,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
@ -141,7 +141,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
///
/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
///
/// \operation
/// \code{.operation}
/// InputKey[127:0] := __key_lo[127:0]
/// InputKey[255:128] := __key_hi[255:128]
/// KeyMetadata[2:0] := __htype[2:0]
@ -163,7 +163,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
void *__h) {
@ -179,7 +179,7 @@ _mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
///
/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
///
/// \operation
/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@ -202,7 +202,7 @@ _mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@ -216,7 +216,7 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
///
/// \operation
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@ -241,7 +241,7 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@ -255,7 +255,7 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
///
/// \operation
/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@ -280,7 +280,7 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@ -294,7 +294,7 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
///
/// \operation
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@ -319,7 +319,7 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
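Putting the pieces above together: a key is first wrapped into a 384-bit handle with _mm_encodekey128_u32, and that handle is then passed to the AESENC128KL/AESDEC128KL intrinsics. A minimal sketch, assuming Key Locker support (e.g. -mkl) and omitting error handling:

  #include <immintrin.h>

  /* Wrap an AES-128 key into a handle, then encrypt one block with it. */
  static unsigned char encrypt_one_block(__m128i key, __m128i block,
                                         __m128i *out) {
    unsigned char handle[48];             /* Handle[383:0], as documented above */
    _mm_encodekey128_u32(0, key, handle); /* htype 0: no usage restrictions */
    /* Returns ZF: zero on success, nonzero if the handle was rejected. */
    return _mm_aesenc128kl_u8(out, block, handle);
  }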
@ -346,7 +346,7 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
///
/// \operation
/// \code{.operation}
/// Handle := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@ -377,7 +377,7 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
@ -392,7 +392,7 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
///
/// \operation
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@ -423,7 +423,7 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
@ -438,7 +438,7 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
///
/// \operation
/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@ -469,7 +469,7 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
@ -484,7 +484,7 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
///
/// \operation
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@ -515,7 +515,7 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,

View file

@ -28,9 +28,9 @@ extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size
#if !(defined(_WIN32) && defined(_mm_malloc))
static __inline__ void *__attribute__((__always_inline__, __nodebug__,
__malloc__))
_mm_malloc(size_t __size, size_t __align)
{
__malloc__, __alloc_size__(1),
__alloc_align__(2)))
_mm_malloc(size_t __size, size_t __align) {
if (__align == 1) {
return malloc(__size);
}
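The added __alloc_size__/__alloc_align__ attributes describe the existing contract: the first argument is the byte count and the second the alignment. Typical use (a sketch):

  #include <mm_malloc.h>

  /* Allocate a 32-byte-aligned float buffer; release it with _mm_free(). */
  static float *alloc_aligned_floats(size_t n) {
    return (float *)_mm_malloc(n * sizeof(float), 32);
  }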

View file

@ -21,6 +21,7 @@
#define cl_khr_subgroup_shuffle 1
#define cl_khr_subgroup_shuffle_relative 1
#define cl_khr_subgroup_clustered_reduce 1
#define cl_khr_subgroup_rotate 1
#define cl_khr_extended_bit_ops 1
#define cl_khr_integer_dot_product 1
#define __opencl_c_integer_dot_product_input_4x8bit 1
@ -67,6 +68,7 @@
#if (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
// For the SPIR and SPIR-V target all features are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
#define __opencl_c_work_group_collective_functions 1
#define __opencl_c_atomic_order_seq_cst 1
#define __opencl_c_atomic_scope_device 1
#define __opencl_c_atomic_scope_all_devices 1
@ -80,6 +82,11 @@
#define __opencl_c_named_address_space_builtins 1
#endif // !defined(__opencl_c_generic_address_space)
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
// Internal feature macro to provide subgroup builtins.
#define __opencl_subgroup_builtins 1
#endif
// built-in scalar data types:
/**
@ -197,6 +204,9 @@ typedef double double8 __attribute__((ext_vector_type(8)));
typedef double double16 __attribute__((ext_vector_type(16)));
#endif
// An internal alias for half, for use by OpenCLBuiltins.td.
#define __half half
#if defined(__OPENCL_CPP_VERSION__)
#define NULL nullptr
#elif defined(__OPENCL_C_VERSION__)

13494
lib/include/opencl-c.h vendored

File diff suppressed because it is too large.

View file

@ -35,7 +35,7 @@
/// A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i const *__p)
_mm_lddqu_si128(__m128i_u const *__p)
{
return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}

134
lib/include/ppc_wrappers/bmi2intrin.h vendored Normal file
View file

@ -0,0 +1,134 @@
/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined X86GPRINTRIN_H_
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef BMI2INTRIN_H_
#define BMI2INTRIN_H_
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32(unsigned int __X, unsigned int __Y) {
return ((__X << (32 - __Y)) >> (32 - __Y));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
unsigned long long __res = (unsigned long long)__X * __Y;
*__P = (unsigned int)(__res >> 32);
return (unsigned int)__res;
}
#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
return ((__X << (64 - __Y)) >> (64 - __Y));
}
/* __int128 requires base 64-bit. */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64(unsigned long long __X, unsigned long long __Y,
unsigned long long *__P) {
unsigned __int128 __res = (unsigned __int128)__X * __Y;
*__P = (unsigned long long)(__res >> 64);
return (unsigned long long)__res;
}
#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum. */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64(unsigned long long __X, unsigned long long __M) {
unsigned long __result = 0x0UL;
const unsigned long __mask = 0x8000000000000000UL;
unsigned long __m = __M;
unsigned long __c, __t;
unsigned long __p;
/* The pop-count of the mask gives the number of bits from the
source to process. This is also needed to shift bits from the
source into the correct position for the result. */
__p = 64 - __builtin_popcountl(__M);
/* The loop is for the number of '1' bits in the mask and clearing
each mask bit as it is processed. */
while (__m != 0) {
__c = __builtin_clzl(__m);
__t = __X << (__p - __c);
__m ^= (__mask >> __c);
__result |= (__t & (__mask >> __c));
__p++;
}
return __result;
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64(unsigned long long __X, unsigned long long __M) {
unsigned long __p = 0x4040404040404040UL; // initial bit permute control
const unsigned long __mask = 0x8000000000000000UL;
unsigned long __m = __M;
unsigned long __c;
unsigned long __result;
/* if the mask is constant and selects 8 bits or less we can use
the Power8 Bit permute instruction. */
if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
/* Also if the pext mask is constant, then the popcount is
constant, we can evaluate the following loop at compile
time and use a constant bit permute vector. */
long __i;
for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
__c = __builtin_clzl(__m);
__p = (__p << 8) | __c;
__m ^= (__mask >> __c);
}
__result = __builtin_bpermd(__p, __X);
} else {
__p = 64 - __builtin_popcountl(__M);
__result = 0;
/* We could use a for loop here, but that combined with
-funroll-loops can expand to a lot of code. The while
loop avoids unrolling and the compiler commons the xor
from clearing the mask bit with the (m != 0) test. The
result is a more compact loop setup and body. */
while (__m != 0) {
unsigned long __t;
__c = __builtin_clzl(__m);
__t = (__X & (__mask >> __c)) >> (__p - __c);
__m ^= (__mask >> __c);
__result |= (__t);
__p++;
}
}
return __result;
}
/* these 32-bit implementations depend on 64-bit pdep/pext
which depend on _ARCH_PWR7. */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32(unsigned int __X, unsigned int __Y) {
return _pdep_u64(__X, __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32(unsigned int __X, unsigned int __Y) {
return _pext_u64(__X, __Y);
}
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */
#endif /* BMI2INTRIN_H_ */
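The loop-based _pdep_u64/_pext_u64 emulations above follow the usual BMI2 semantics, which small constants make easy to check (a sketch; include <x86gprintrin.h> as the guard requires, with _ARCH_PWR7 here or -mbmi2 on x86):

  #include <x86gprintrin.h>

  static void pdep_pext_demo(void) {
    /* Deposit source bits 0101 into the set-bit positions of the mask. */
    unsigned long long d = _pdep_u64(0x5ULL, 0xF0F0ULL);  /* 0x50 */
    /* Extract the masked bits and pack them back down to the bottom. */
    unsigned long long e = _pext_u64(0x50ULL, 0xF0F0ULL); /* 0x5 */
    (void)d; (void)e;
  }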

165
lib/include/ppc_wrappers/bmiintrin.h vendored Normal file
View file

@ -0,0 +1,165 @@
/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined X86GPRINTRIN_H_
#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef BMIINTRIN_H_
#define BMIINTRIN_H_
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u16(unsigned short __X) {
return __builtin_ctz(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u32(unsigned int __X, unsigned int __Y) {
return (~__X & __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
return ((__X << (32 - (__L + __P))) >> (32 - __L));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u32(unsigned int __X, unsigned int __Y) {
unsigned int __P, __L;
__P = __Y & 0xFF;
__L = (__Y >> 8) & 0xFF;
return (_bextr_u32(__X, __P, __L));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u32(unsigned int __X) {
return (__X & -__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u32(unsigned int __X) {
return __blsi_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u32(unsigned int __X) {
return (__X ^ (__X - 1));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u32(unsigned int __X) {
return __blsmsk_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u32(unsigned int __X) {
return (__X & (__X - 1));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u32(unsigned int __X) {
return __blsr_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u32(unsigned int __X) {
return __builtin_ctz(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u32(unsigned int __X) {
return __builtin_ctz(__X);
}
/* use the 64-bit shift, rotate, and count leading zeros instructions
for long long. */
#ifdef __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u64(unsigned long long __X, unsigned long long __Y) {
return (~__X & __Y);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
return ((__X << (64 - (__L + __P))) >> (64 - __L));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u64(unsigned long long __X, unsigned long long __Y) {
unsigned int __P, __L;
__P = __Y & 0xFF;
__L = (__Y & 0xFF00) >> 8;
return (_bextr_u64(__X, __P, __L));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u64(unsigned long long __X) {
return __X & -__X;
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u64(unsigned long long __X) {
return __blsi_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u64(unsigned long long __X) {
return (__X ^ (__X - 1));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u64(unsigned long long __X) {
return __blsmsk_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u64(unsigned long long __X) {
return (__X & (__X - 1));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u64(unsigned long long __X) {
return __blsr_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u64(unsigned long long __X) {
return __builtin_ctzll(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u64(unsigned long long __X) {
return __builtin_ctzll(__X);
}
#endif /* __PPC64__ */
#endif /* BMIINTRIN_H_ */
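A few spot checks for the BMI1 helpers above (0x30 keeps the bit patterns easy to read):

  #include <x86gprintrin.h>

  static void bmi1_demo(void) {
    unsigned int x   = 0x30u;           /* ...0011 0000                    */
    unsigned int lo  = __blsi_u32(x);   /* 0x10: isolate lowest set bit    */
    unsigned int msk = __blsmsk_u32(x); /* 0x1F: mask up to lowest set bit */
    unsigned int clr = __blsr_u32(x);   /* 0x20: clear lowest set bit      */
    (void)lo; (void)msk; (void)clr;
  }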

File diff suppressed because it is too large.

27
lib/include/ppc_wrappers/immintrin.h vendored Normal file
View file

@ -0,0 +1,27 @@
/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef IMMINTRIN_H_
#define IMMINTRIN_H_
#include <x86gprintrin.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#endif /* IMMINTRIN_H_ */

View file

@ -10,38 +10,33 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#if defined(__ppc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <stdlib.h>
/* We can't depend on <stdlib.h> since the prototype of posix_memalign
may not be visible. */
#ifndef __cplusplus
extern int posix_memalign (void **, size_t, size_t);
extern int posix_memalign(void **, size_t, size_t);
#else
extern "C" int posix_memalign (void **, size_t, size_t);
extern "C" int posix_memalign(void **, size_t, size_t);
#endif
static __inline void *
_mm_malloc (size_t size, size_t alignment)
{
static __inline void *_mm_malloc(size_t __size, size_t __alignment) {
/* PowerPC64 ELF V2 ABI requires quadword alignment. */
size_t vec_align = sizeof (__vector float);
void *ptr;
size_t __vec_align = sizeof(__vector float);
void *__ptr;
if (alignment < vec_align)
alignment = vec_align;
if (posix_memalign (&ptr, alignment, size) == 0)
return ptr;
if (__alignment < __vec_align)
__alignment = __vec_align;
if (posix_memalign(&__ptr, __alignment, __size) == 0)
return __ptr;
else
return NULL;
}
static __inline void
_mm_free (void * ptr)
{
free (ptr);
}
static __inline void _mm_free(void *__ptr) { free(__ptr); }
#else
#include_next <mm_malloc.h>

File diff suppressed because it is too large.

View file

@ -32,120 +32,114 @@
In the specific case of the monitor and mwait instructions there is
no direct equivalent in the PowerISA at this time. So those
intrinsics are not implemented. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#error \
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#if defined(__ppc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
/* We need definitions from the SSE2 and SSE header files*/
#include <emmintrin.h>
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
__v4sf even_neg_Y = vec_xor(__Y, even_n0);
return (__m128) vec_add (__X, even_neg_Y);
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps(__m128 __X, __m128 __Y) {
const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0};
__v4sf __even_neg_Y = vec_xor(__Y, __even_n0);
return (__m128)vec_add(__X, __even_neg_Y);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd (__m128d __X, __m128d __Y)
{
const __v2df even_n0 = {-0.0, 0.0};
__v2df even_neg_Y = vec_xor(__Y, even_n0);
return (__m128d) vec_add (__X, even_neg_Y);
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd(__m128d __X, __m128d __Y) {
const __v2df __even_n0 = {-0.0, 0.0};
__v2df __even_neg_Y = vec_xor(__Y, __even_n0);
return (__m128d)vec_add(__X, __even_neg_Y);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
__vector unsigned char xform2 = {
0x00, 0x01, 0x02, 0x03,
0x08, 0x09, 0x0A, 0x0B,
0x10, 0x11, 0x12, 0x13,
0x18, 0x19, 0x1A, 0x1B
};
__vector unsigned char xform1 = {
0x04, 0x05, 0x06, 0x07,
0x0C, 0x0D, 0x0E, 0x0F,
0x14, 0x15, 0x16, 0x17,
0x1C, 0x1D, 0x1E, 0x1F
};
return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps(__m128 __X, __m128 __Y) {
__vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
0x18, 0x19, 0x1A, 0x1B};
__vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
0x1C, 0x1D, 0x1E, 0x1F};
return (__m128)vec_add(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
__vector unsigned char xform2 = {
0x00, 0x01, 0x02, 0x03,
0x08, 0x09, 0x0A, 0x0B,
0x10, 0x11, 0x12, 0x13,
0x18, 0x19, 0x1A, 0x1B
};
__vector unsigned char xform1 = {
0x04, 0x05, 0x06, 0x07,
0x0C, 0x0D, 0x0E, 0x0F,
0x14, 0x15, 0x16, 0x17,
0x1C, 0x1D, 0x1E, 0x1F
};
return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps(__m128 __X, __m128 __Y) {
__vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
0x18, 0x19, 0x1A, 0x1B};
__vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
0x1C, 0x1D, 0x1E, 0x1F};
return (__m128)vec_sub(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd (__m128d __X, __m128d __Y)
{
return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
vec_mergel ((__v2df) __X, (__v2df)__Y));
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd(__m128d __X, __m128d __Y) {
return (__m128d)vec_add(vec_mergeh((__v2df)__X, (__v2df)__Y),
vec_mergel((__v2df)__X, (__v2df)__Y));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd (__m128d __X, __m128d __Y)
{
return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
vec_mergel ((__v2df) __X, (__v2df)__Y));
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd(__m128d __X, __m128d __Y) {
return (__m128d)vec_sub(vec_mergeh((__v2df)__X, (__v2df)__Y),
vec_mergel((__v2df)__X, (__v2df)__Y));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
#ifdef _ARCH_PWR8
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps(__m128 __X) {
return (__m128)vec_mergeo((__v4su)__X, (__v4su)__X);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps(__m128 __X) {
return (__m128)vec_mergee((__v4su)__X, (__v4su)__X);
}
#endif
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd(double const *__P) {
return (__m128d)vec_splats(*__P);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd(__m128d __X) {
return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
{
return (__m128d) vec_splats (*__P);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd (__m128d __X)
{
return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128 (__m128i const *__P)
{
return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128(__m128i const *__P) {
return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}
/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */
#else
#include_next <pmmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* defined(__ppc64__) &&
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* PMMINTRIN_H_ */
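The vec_perm patterns above implement the SSE3 horizontal operations: adjacent pairs within each operand are combined, with the first operand filling the low half of the result. For example:

  #include <pmmintrin.h>

  static __m128 hadd_demo(void) {
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* (1,2,3,4) low to high */
    __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); /* (5,6,7,8) low to high */
    return _mm_hadd_ps(a, b); /* (1+2, 3+4, 5+6, 7+8) = (3, 7, 11, 15) */
  }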

View file

@ -29,11 +29,254 @@
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#if defined(__ppc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
#include <tmmintrin.h>
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_pd(__m128d __A, int __rounding) {
__v2df __r;
union {
double __fr;
long long __fpscr;
} __enables_save, __fpscr_save;
if (__rounding & _MM_FROUND_NO_EXC) {
/* Save enabled exceptions, disable all exceptions,
and preserve the rounding mode. */
#ifdef _ARCH_PWR9
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
__fpscr_save.__fr = __builtin_mffs();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : "+wa"(__A));
}
switch (__rounding) {
case _MM_FROUND_TO_NEAREST_INT:
__fpscr_save.__fr = __builtin_mffsl();
__attribute__((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
__builtin_set_fpscr_rn(0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : "+wa"(__A));
__r = vec_rint((__v2df)__A);
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : : "wa"(__r));
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
__r = vec_floor((__v2df)__A);
break;
case _MM_FROUND_TO_POS_INF:
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
__r = vec_ceil((__v2df)__A);
break;
case _MM_FROUND_TO_ZERO:
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
__r = vec_trunc((__v2df)__A);
break;
case _MM_FROUND_CUR_DIRECTION:
__r = vec_rint((__v2df)__A);
break;
}
if (__rounding & _MM_FROUND_NO_EXC) {
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : : "wa"(__r));
/* Restore enabled exceptions. */
__fpscr_save.__fr = __builtin_mffsl();
__fpscr_save.__fpscr |= __enables_save.__fpscr;
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
}
return (__m128d)__r;
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
__B = _mm_round_pd(__B, __rounding);
__v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
return (__m128d)__r;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ps(__m128 __A, int __rounding) {
__v4sf __r;
union {
double __fr;
long long __fpscr;
} __enables_save, __fpscr_save;
if (__rounding & _MM_FROUND_NO_EXC) {
/* Save enabled exceptions, disable all exceptions,
and preserve the rounding mode. */
#ifdef _ARCH_PWR9
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
__fpscr_save.__fr = __builtin_mffs();
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
__fpscr_save.__fpscr &= ~0xf8;
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : "+wa"(__A));
}
switch (__rounding) {
case _MM_FROUND_TO_NEAREST_INT:
__fpscr_save.__fr = __builtin_mffsl();
__attribute__((fallthrough));
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
__builtin_set_fpscr_rn(0b00);
/* Insert an artificial "read/write" reference to the variable
read below, to ensure the compiler does not schedule
a read/use of the variable before the FPSCR is modified, above.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : "+wa"(__A));
__r = vec_rint((__v4sf)__A);
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : : "wa"(__r));
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
break;
case _MM_FROUND_TO_NEG_INF:
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
__r = vec_floor((__v4sf)__A);
break;
case _MM_FROUND_TO_POS_INF:
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
__r = vec_ceil((__v4sf)__A);
break;
case _MM_FROUND_TO_ZERO:
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
__r = vec_trunc((__v4sf)__A);
break;
case _MM_FROUND_CUR_DIRECTION:
__r = vec_rint((__v4sf)__A);
break;
}
if (__rounding & _MM_FROUND_NO_EXC) {
/* Insert an artificial "read" reference to the variable written
above, to ensure the compiler does not schedule the computation
of the value after the manipulation of the FPSCR, below.
This can be removed if and when GCC PR102783 is fixed.
*/
__asm__("" : : "wa"(__r));
/* Restore enabled exceptions. */
__fpscr_save.__fr = __builtin_mffsl();
__fpscr_save.__fpscr |= __enables_save.__fpscr;
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
}
return (__m128)__r;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
__B = _mm_round_ps(__B, __rounding);
__v4sf __r = (__v4sf)__A;
__r[0] = ((__v4sf)__B)[0];
return (__m128)__r;
}
#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)
#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
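As an illustrative aside (not part of the vendored header), a minimal sketch of calling these rounding entry points; it assumes an SSE4.1-capable x86 build (-msse4.1) or this PowerPC wrapper built with -DNO_WARN_X86_INTRINSICS:

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d v = _mm_set_pd(2.5, -1.5);
  /* Round to nearest (ties to even) without raising inexact exceptions. */
  __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  double out[2];
  _mm_storeu_pd(out, r);
  printf("%g %g\n", out[0], out[1]); /* prints -2 2 */
  return 0;
}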
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
__v16qi __result = (__v16qi)__A;
__result[__N & 0xf] = __D;
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
__v4si __result = (__v4si)__A;
__result[__N & 3] = __D;
return (__m128i)__result;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
__v2di __result = (__v2di)__A;
__result[__N & 1] = __D;
return (__m128i)__result;
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8(__m128i __X, const int __N) {
@ -58,6 +301,7 @@ extern __inline int
return ((__v4si)__X)[__N & 3];
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
@ -69,42 +313,351 @@ extern __inline __m128i
#endif
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
const __v16qu __seven = vec_splats((unsigned char)0x07);
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
__v16qu __pcv[] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
{16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
{0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
{16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
{0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
{16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
{0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
};
__v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
return (__m128)__r;
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
const __v4si __zero = {0};
const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
__v16qu __pcv[] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
__v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
return (__m128d)__r;
}
#ifdef _ARCH_PWR8
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
const __v2di __zero = {0};
const __vector __bool long long __boolmask =
vec_cmplt((__v2di)__mask, __zero);
return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_si128(__m128i __A, __m128i __B) {
/* Note: This implementation does NOT set "zero" or "carry" flags. */
const __v16qu __zero = {0};
return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_si128(__m128i __A, __m128i __B) {
/* Note: This implementation does NOT set "zero" or "carry" flags. */
const __v16qu __zero = {0};
const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_si128(__m128i __A, __m128i __B) {
/* Note: This implementation does NOT set "zero" or "carry" flags. */
return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
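A short sketch of what the ptest-style helpers report (illustrative only, same build assumptions as the sketch above):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set_epi32(0, 0, 0, 0x0F);
  __m128i b = _mm_set_epi32(0, 0, 0, 0xF0);
  /* Prints 1 because a & b has no bits set; _mm_test_all_zeros(a, b) expands to the same call. */
  printf("%d\n", _mm_testz_si128(a, b));
  return 0;
}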
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8(__m128i __X, __m128i __Y) {
return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
__v16qi result = (__v16qi)__A;
result[__N & 0xf] = __D;
return (__m128i)result;
_mm_min_epu16(__m128i __X, __m128i __Y) {
return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
__v4si result = (__v4si)__A;
result[__N & 3] = __D;
return (__m128i)result;
_mm_min_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
__v2di result = (__v2di)__A;
result[__N & 1] = __D;
return (__m128i)result;
_mm_min_epu32(__m128i __X, __m128i __Y) {
return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8(__m128i __X, __m128i __Y) {
return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16(__m128i __X, __m128i __Y) {
return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32(__m128i __X, __m128i __Y) {
return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16(__m128i __A) {
return (__m128i)vec_unpackh((__v16qi)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32(__m128i __A) {
__A = (__m128i)vec_unpackh((__v16qi)__A);
return (__m128i)vec_unpackh((__v8hi)__A);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64(__m128i __A) {
__A = (__m128i)vec_unpackh((__v16qi)__A);
__A = (__m128i)vec_unpackh((__v8hi)__A);
return (__m128i)vec_unpackh((__v4si)__A);
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32(__m128i __A) {
return (__m128i)vec_unpackh((__v8hi)__A);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64(__m128i __A) {
__A = (__m128i)vec_unpackh((__v8hi)__A);
return (__m128i)vec_unpackh((__v4si)__A);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64(__m128i __A) {
return (__m128i)vec_unpackh((__v4si)__A);
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16(__m128i __A) {
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32(__m128i __A) {
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64(__m128i __A) {
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32(__m128i __A) {
const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64(__m128i __A) {
const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64(__m128i __A) {
const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__. */
__A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
return __A;
}
/* Return horizontal packed word minimum and its index in bits [15:0]
and bits [18:16] respectively. */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16(__m128i __A) {
union __u {
__m128i __m;
__v8hu __uh;
};
union __u __u = {.__m = __A}, __r = {.__m = {0}};
unsigned short __ridx = 0;
unsigned short __rmin = __u.__uh[__ridx];
unsigned long __i;
for (__i = 1; __i < 8; __i++) {
if (__u.__uh[__i] < __rmin) {
__rmin = __u.__uh[__i];
__ridx = __i;
}
}
__r.__uh[0] = __rmin;
__r.__uh[1] = __ridx;
return __r.__m;
}
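An illustrative sketch of the result layout described in the comment above (not part of the header):

#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi16(9, 4, 7, 4, 1000, 12, 8, 5);
  __m128i r = _mm_minpos_epu16(v);
  /* Minimum value lands in lane 0, the index of its first occurrence in lane 1. */
  printf("min=%d idx=%d\n", _mm_extract_epi16(r, 0), _mm_extract_epi16(r, 1)); /* min=4 idx=1 */
  return 0;
}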
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif
#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* defined(__ppc64__) &&
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* _SMMINTRIN_H_ */
#endif /* SMMINTRIN_H_ */

View file

@ -25,7 +25,8 @@
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#if defined(__ppc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@ -33,464 +34,420 @@
#include <pmmintrin.h>
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
return (__m128i) vec_abs ((__v8hi) __A);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16(__m128i __A) {
return (__m128i)vec_abs((__v8hi)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
return (__m128i) vec_abs ((__v4si) __A);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32(__m128i __A) {
return (__m128i)vec_abs((__v4si)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
return (__m128i) vec_abs ((__v16qi) __A);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8(__m128i __A) {
return (__m128i)vec_abs((__v16qi)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
__v8hi __B = (__v8hi) (__v2du) { __A, __A };
return (__m64) ((__v2du) vec_abs (__B))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16(__m64 __A) {
__v8hi __B = (__v8hi)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
__v4si __B = (__v4si) (__v2du) { __A, __A };
return (__m64) ((__v2du) vec_abs (__B))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32(__m64 __A) {
__v4si __B = (__v4si)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
__v16qi __B = (__v16qi) (__v2du) { __A, __A };
return (__m64) ((__v2du) vec_abs (__B))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8(__m64 __A) {
__v16qi __B = (__v16qi)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
if (__builtin_constant_p (__count) && __count < 16)
{
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
if (__builtin_constant_p(__count) && __count < 16) {
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_reve ((__v16qu) __A);
__B = (__m128i) vec_reve ((__v16qu) __B);
__A = (__m128i)vec_reve((__v16qu)__A);
__B = (__m128i)vec_reve((__v16qu)__B);
#endif
__A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
__A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_reve ((__v16qu) __A);
__A = (__m128i)vec_reve((__v16qu)__A);
#endif
return __A;
}
return __A;
}
if (__count == 0)
return __B;
if (__count >= 16)
{
if (__count >= 32)
{
const __v16qu zero = { 0 };
return (__m128i) zero;
}
else
{
const __v16qu __shift =
vec_splats ((unsigned char) ((__count - 16) * 8));
if (__count >= 16) {
if (__count >= 32) {
const __v16qu __zero = {0};
return (__m128i)__zero;
} else {
const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
return (__m128i) vec_sro ((__v16qu) __A, __shift);
return (__m128i)vec_sro((__v16qu)__A, __shift);
#else
return (__m128i) vec_slo ((__v16qu) __A, __shift);
return (__m128i)vec_slo((__v16qu)__A, __shift);
#endif
}
}
else
{
const __v16qu __shiftA =
vec_splats ((unsigned char) ((16 - __count) * 8));
const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
} else {
const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
#ifdef __LITTLE_ENDIAN__
__A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
__B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
__A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
__B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
#else
__A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
__B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
__A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
__B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
#endif
return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
}
return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
}
}
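A minimal usage sketch of the byte-alignment semantics implemented above (illustrative only; assumes an SSSE3 x86 build or the wrapper path with -DNO_WARN_X86_INTRINSICS):

#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i lo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  __m128i hi = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
  unsigned char out[16];
  /* Concatenate hi:lo and take 16 bytes starting at byte 4 of lo -> 4, 5, ..., 19. */
  _mm_storeu_si128((__m128i *)out, _mm_alignr_epi8(hi, lo, 4));
  for (int i = 0; i < 16; i++)
    printf("%d ", out[i]);
  printf("\n");
  return 0;
}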
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
if (__count < 16)
{
__v2du __C = { __B, __A };
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
if (__count < 16) {
__v2du __C = {__B, __A};
#ifdef __LITTLE_ENDIAN__
const __v4su __shift = { __count << 3, 0, 0, 0 };
__C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
const __v4su __shift = {__count << 3, 0, 0, 0};
__C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
#else
const __v4su __shift = { 0, 0, 0, __count << 3 };
__C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
const __v4su __shift = {0, 0, 0, __count << 3};
__C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
#endif
return (__m64) __C[0];
}
else
{
const __m64 __zero = { 0 };
return __zero;
}
return (__m64)__C[0];
} else {
const __m64 __zero = {0};
return __zero;
}
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
__v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
return (__m128i) vec_add (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_add(__C, __D);
}
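A sketch of the horizontal-add lane layout (illustrative only): adjacent pairs of A fill the low four lanes, adjacent pairs of B the high four.

#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i b = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
  short out[8];
  _mm_storeu_si128((__m128i *)out, _mm_hadd_epi16(a, b));
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]); /* 3 7 11 15 30 70 110 150 */
  printf("\n");
  return 0;
}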
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
const __v16qu __Q =
{ 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
__v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
__v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
return (__m128i) vec_add (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
16, 17, 18, 19, 24, 25, 26, 27};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
20, 21, 22, 23, 28, 29, 30, 31};
__v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
__v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
return (__m128i)vec_add(__C, __D);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
__v8hi __D = vec_perm (__C, __C, __Q);
__C = vec_perm (__C, __C, __P);
__C = vec_add (__C, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16(__m64 __A, __m64 __B) {
__v8hi __C = (__v8hi)(__v2du){__A, __B};
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_add(__C, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
__v4si __C = (__v4si) (__v2du) { __A, __B };
const __v16qu __P =
{ 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
const __v16qu __Q =
{ 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
__v4si __D = vec_perm (__C, __C, __Q);
__C = vec_perm (__C, __C, __P);
__C = vec_add (__C, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32(__m64 __A, __m64 __B) {
__v4si __C = (__v4si)(__v2du){__A, __B};
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
__v4si __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_add(__C, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
__v4si __C = { 0 }, __D = { 0 };
__C = vec_sum4s ((__v8hi) __A, __C);
__D = vec_sum4s ((__v8hi) __B, __D);
__C = (__v4si) vec_packs (__C, __D);
return (__m128i) __C;
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16(__m128i __A, __m128i __B) {
__v4si __C = {0}, __D = {0};
__C = vec_sum4s((__v8hi)__A, __C);
__D = vec_sum4s((__v8hi)__B, __D);
__C = (__v4si)vec_packs(__C, __D);
return (__m128i)__C;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
const __v4si __zero = { 0 };
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
__v4si __D = vec_sum4s (__C, __zero);
__C = vec_packs (__D, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16(__m64 __A, __m64 __B) {
const __v4si __zero = {0};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v4si __D = vec_sum4s(__C, __zero);
__C = vec_packs(__D, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
__v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
return (__m128i) vec_sub (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_sub(__C, __D);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
const __v16qu __Q =
{ 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
__v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
__v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
return (__m128i) vec_sub (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
16, 17, 18, 19, 24, 25, 26, 27};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
20, 21, 22, 23, 28, 29, 30, 31};
__v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
__v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
return (__m128i)vec_sub(__C, __D);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
__v8hi __D = vec_perm (__C, __C, __Q);
__C = vec_perm (__C, __C, __P);
__C = vec_sub (__C, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v8hi __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_sub(__C, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
const __v16qu __P =
{ 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
const __v16qu __Q =
{ 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
__v4si __C = (__v4si) (__v2du) { __A, __B };
__v4si __D = vec_perm (__C, __C, __Q);
__C = vec_perm (__C, __C, __P);
__C = vec_sub (__C, __D);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
__v4si __C = (__v4si)(__v2du){__A, __B};
__v4si __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_sub(__C, __D);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
__v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
return (__m128i) vec_subs (__C, __D);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_subs(__C, __D);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
const __v16qu __P =
{ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
const __v16qu __Q =
{ 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
__v8hi __D = vec_perm (__C, __C, __P);
__v8hi __E = vec_perm (__C, __C, __Q);
__C = vec_subs (__D, __E);
return (__m64) ((__v2du) __C)[1];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v8hi __D = vec_perm(__C, __C, __P);
__v8hi __E = vec_perm(__C, __C, __Q);
__C = vec_subs(__D, __E);
return (__m64)((__v2du)__C)[1];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
const __v16qi __zero = { 0 };
__vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
__v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
return (__m128i) vec_sel (__C, __zero, __select);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8(__m128i __A, __m128i __B) {
const __v16qi __zero = {0};
__vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
__v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
return (__m128i)vec_sel(__C, __zero, __select);
}
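An illustrative sketch of the shuffle-control semantics handled above: each control byte selects a byte of A, and a control byte with its sign bit set zeroes the lane.

#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a   = _mm_setr_epi8(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25);
  __m128i ctl = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, -1);
  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(a, ctl));
  /* Bytes of a in reverse order, except the last lane, which the -1 control byte zeroes. */
  for (int i = 0; i < 16; i++)
    printf("%d ", out[i]);
  printf("\n");
  return 0;
}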
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
const __v16qi __zero = { 0 };
__v16qi __C = (__v16qi) (__v2du) { __A, __A };
__v16qi __D = (__v16qi) (__v2du) { __B, __B };
__vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
__C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
__C = vec_sel (__C, __zero, __select);
return (__m64) ((__v2du) (__C))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8(__m64 __A, __m64 __B) {
const __v16qi __zero = {0};
__v16qi __C = (__v16qi)(__v2du){__A, __A};
__v16qi __D = (__v16qi)(__v2du){__B, __B};
__vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
__C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
__C = vec_sel(__C, __zero, __select);
return (__m64)((__v2du)(__C))[0];
}
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
const __v16qi __zero = { 0 };
__v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8(__m128i __A, __m128i __B) {
const __v16qi __zero = {0};
__v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
__v16qi __selectpos =
(__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
__v16qi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
(__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
__v16qi __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16(__m128i __A, __m128i __B) {
const __v8hi __zero = {0};
__v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
__v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
__v8hi __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32(__m128i __A, __m128i __B) {
const __v4si __zero = {0};
__v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
__v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
__v4si __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8(__m64 __A, __m64 __B) {
const __v16qi __zero = {0};
__v16qi __C = (__v16qi)(__v2du){__A, __A};
__v16qi __D = (__v16qi)(__v2du){__B, __B};
__C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16(__m64 __A, __m64 __B) {
const __v8hi __zero = {0};
__v8hi __C = (__v8hi)(__v2du){__A, __A};
__v8hi __D = (__v8hi)(__v2du){__B, __B};
__C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32(__m64 __A, __m64 __B) {
const __v4si __zero = {0};
__v4si __C = (__v4si)(__v2du){__A, __A};
__v4si __D = (__v4si)(__v2du){__B, __B};
__C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16(__m128i __A, __m128i __B) {
__v8hi __unsigned = vec_splats((signed short)0x00ff);
__v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
__v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
__v8hi __E = vec_unpackh((__v16qi)__B);
__v8hi __F = vec_unpackl((__v16qi)__B);
__C = vec_mul(__C, __E);
__D = vec_mul(__D, __F);
const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__E = vec_perm(__C, __D, __odds);
__F = vec_perm(__C, __D, __evens);
return (__m128i)vec_adds(__E, __F);
}
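A small sketch of the mixed-signedness multiply-add above (illustrative only): unsigned bytes of A times signed bytes of B, adjacent products summed with signed saturation.

#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set1_epi8(2); /* interpreted as unsigned bytes */
  __m128i b = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); /* signed bytes */
  short out[8];
  _mm_storeu_si128((__m128i *)out, _mm_maddubs_epi16(a, b));
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]); /* 6 14 22 30 38 46 54 62, i.e. 2*1+2*2, 2*3+2*4, ... */
  printf("\n");
  return 0;
}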
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16(__m64 __A, __m64 __B) {
__v8hi __C = (__v8hi)(__v2du){__A, __A};
__C = vec_unpackl((__v16qi)__C);
const __v8hi __unsigned = vec_splats((signed short)0x00ff);
__C = vec_and(__C, __unsigned);
__v8hi __D = (__v8hi)(__v2du){__B, __B};
__D = vec_unpackl((__v16qi)__D);
__D = vec_mul(__C, __D);
const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__C = vec_perm(__D, __D, __odds);
__D = vec_perm(__D, __D, __evens);
__C = vec_adds(__C, __D);
return (__m64)((__v2du)(__C))[0];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
const __v8hi __zero = { 0 };
__v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
__v8hi __selectpos =
(__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
__v8hi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
const __v4si __zero = { 0 };
__v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
__v4si __selectpos =
(__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
__v4si __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16(__m128i __A, __m128i __B) {
__v4si __C = vec_unpackh((__v8hi)__A);
__v4si __D = vec_unpackh((__v8hi)__B);
__C = vec_mul(__C, __D);
__D = vec_unpackl((__v8hi)__A);
__v4si __E = vec_unpackl((__v8hi)__B);
__D = vec_mul(__D, __E);
const __v4su __shift = vec_splats((unsigned int)14);
__C = vec_sr(__C, __shift);
__D = vec_sr(__D, __shift);
const __v4si __ones = vec_splats((signed int)1);
__C = vec_add(__C, __ones);
__C = vec_sr(__C, (__v4su)__ones);
__D = vec_add(__D, __ones);
__D = vec_sr(__D, (__v4su)__ones);
return (__m128i)vec_pack(__C, __D);
}
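An illustrative sketch of the rounded Q15 multiply implemented above, i.e. (a*b + 0x4000) >> 15 per lane:

#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set1_epi16(0x4000); /* 0.5 in Q15 */
  __m128i b = _mm_set1_epi16(0x2000); /* 0.25 in Q15 */
  short out[8];
  _mm_storeu_si128((__m128i *)out, _mm_mulhrs_epi16(a, b));
  printf("%d\n", out[0]); /* 4096 == 0x1000, i.e. 0.125 in Q15 */
  return 0;
}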
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
const __v16qi __zero = { 0 };
__v16qi __C = (__v16qi) (__v2du) { __A, __A };
__v16qi __D = (__v16qi) (__v2du) { __B, __B };
__C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
const __v8hi __zero = { 0 };
__v8hi __C = (__v8hi) (__v2du) { __A, __A };
__v8hi __D = (__v8hi) (__v2du) { __B, __B };
__C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
const __v4si __zero = { 0 };
__v4si __C = (__v4si) (__v2du) { __A, __A };
__v4si __D = (__v4si) (__v2du) { __B, __B };
__C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
__v8hi __unsigned = vec_splats ((signed short) 0x00ff);
__v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
__v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
__v8hi __E = vec_unpackh ((__v16qi) __B);
__v8hi __F = vec_unpackl ((__v16qi) __B);
__C = vec_mul (__C, __E);
__D = vec_mul (__D, __F);
const __v16qu __odds =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __evens =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__E = vec_perm (__C, __D, __odds);
__F = vec_perm (__C, __D, __evens);
return (__m128i) vec_adds (__E, __F);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
__v8hi __C = (__v8hi) (__v2du) { __A, __A };
__C = vec_unpackl ((__v16qi) __C);
const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
__C = vec_and (__C, __unsigned);
__v8hi __D = (__v8hi) (__v2du) { __B, __B };
__D = vec_unpackl ((__v16qi) __D);
__D = vec_mul (__C, __D);
const __v16qu __odds =
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
const __v16qu __evens =
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
__C = vec_perm (__D, __D, __odds);
__D = vec_perm (__D, __D, __evens);
__C = vec_adds (__C, __D);
return (__m64) ((__v2du) (__C))[0];
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
__v4si __C = vec_unpackh ((__v8hi) __A);
__v4si __D = vec_unpackh ((__v8hi) __B);
__C = vec_mul (__C, __D);
__D = vec_unpackl ((__v8hi) __A);
__v4si __E = vec_unpackl ((__v8hi) __B);
__D = vec_mul (__D, __E);
const __v4su __shift = vec_splats ((unsigned int) 14);
__C = vec_sr (__C, __shift);
__D = vec_sr (__D, __shift);
const __v4si __ones = vec_splats ((signed int) 1);
__C = vec_add (__C, __ones);
__C = vec_sr (__C, (__v4su) __ones);
__D = vec_add (__D, __ones);
__D = vec_sr (__D, (__v4su) __ones);
return (__m128i) vec_pack (__C, __D);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
__v4si __C = (__v4si) (__v2du) { __A, __A };
__C = vec_unpackh ((__v8hi) __C);
__v4si __D = (__v4si) (__v2du) { __B, __B };
__D = vec_unpackh ((__v8hi) __D);
__C = vec_mul (__C, __D);
const __v4su __shift = vec_splats ((unsigned int) 14);
__C = vec_sr (__C, __shift);
const __v4si __ones = vec_splats ((signed int) 1);
__C = vec_add (__C, __ones);
__C = vec_sr (__C, (__v4su) __ones);
__v8hi __E = vec_pack (__C, __D);
return (__m64) ((__v2du) (__E))[0];
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16(__m64 __A, __m64 __B) {
__v4si __C = (__v4si)(__v2du){__A, __A};
__C = vec_unpackh((__v8hi)__C);
__v4si __D = (__v4si)(__v2du){__B, __B};
__D = vec_unpackh((__v8hi)__D);
__C = vec_mul(__C, __D);
const __v4su __shift = vec_splats((unsigned int)14);
__C = vec_sr(__C, __shift);
const __v4si __ones = vec_splats((signed int)1);
__C = vec_add(__C, __ones);
__C = vec_sr(__C, (__v4su)__ones);
__v8hi __E = vec_pack(__C, __D);
return (__m64)((__v2du)(__E))[0];
}
#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* defined(__ppc64__) &&
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* TMMINTRIN_H_ */

lib/include/ppc_wrappers/x86gprintrin.h (new vendored file)

View file

@ -0,0 +1,17 @@
/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef X86GPRINTRIN_H_
#define X86GPRINTRIN_H_
#include <bmiintrin.h>
#include <bmi2intrin.h>
#endif /* X86GPRINTRIN_H_ */

lib/include/ppc_wrappers/x86intrin.h (new vendored file)
View file

@ -0,0 +1,28 @@
/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef X86INTRIN_H_
#define X86INTRIN_H_
#ifdef __ALTIVEC__
#include <immintrin.h>
#endif /* __ALTIVEC__ */
#endif /* X86INTRIN_H_ */
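A minimal sketch of how a translation unit opts in to this wrapper on powerpc64le; the macro can equivalently be passed on the command line as -DNO_WARN_X86_INTRINSICS (illustrative only, assumes the wrapper chain pulls in the SSE headers):

#define NO_WARN_X86_INTRINSICS 1 /* acknowledge the porting caveat above */
#include <x86intrin.h>

__m128 add4(__m128 a, __m128 b) {
  return _mm_add_ps(a, b); /* implemented on top of Altivec/VSX by the wrappers */
}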

File diff suppressed because it is too large.

lib/include/rdpruintrin.h (new vendored file)
View file

@ -0,0 +1,57 @@
/*===---- rdpruintrin.h - RDPRU intrinsics ---------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H
#error "Never use <rdpruintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __RDPRUINTRIN_H
#define __RDPRUINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("rdpru")))
/// Reads the content of a processor register.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> RDPRU </c> instruction.
///
/// \param reg_id
/// A processor register identifier.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdpru (int reg_id)
{
return __builtin_ia32_rdpru(reg_id);
}
#define __RDPRU_MPERF 0
#define __RDPRU_APERF 1
/// Reads the content of processor register MPERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register MPERF.
#define __mperf() __builtin_ia32_rdpru(__RDPRU_MPERF)
/// Reads the content of processor register APERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register APERF.
#define __aperf() __builtin_ia32_rdpru(__RDPRU_APERF)
#undef __DEFAULT_FN_ATTRS
#endif /* __RDPRUINTRIN_H */
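An illustrative usage sketch (not part of the header); it assumes an RDPRU-capable CPU (recent AMD parts) and a build with -mrdpru:

#include <x86intrin.h>
#include <stdio.h>

int main(void) {
  /* The APERF/MPERF ratio approximates the average effective clock ratio. */
  unsigned long long mperf = __rdpru(__RDPRU_MPERF); /* same as __mperf() */
  unsigned long long aperf = __rdpru(__RDPRU_APERF); /* same as __aperf() */
  printf("MPERF=%llu APERF=%llu\n", mperf, aperf);
  return 0;
}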

View file

@ -20,20 +20,20 @@
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed16_step(unsigned short *__p)
{
return __builtin_ia32_rdseed16_step(__p);
return (int) __builtin_ia32_rdseed16_step(__p);
}
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed32_step(unsigned int *__p)
{
return __builtin_ia32_rdseed32_step(__p);
return (int) __builtin_ia32_rdseed32_step(__p);
}
#ifdef __x86_64__
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed64_step(unsigned long long *__p)
{
return __builtin_ia32_rdseed64_step(__p);
return (int) __builtin_ia32_rdseed64_step(__p);
}
#endif
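An illustrative retry loop for the seed generators above (a sketch, assuming an x86-64 build with -mrdseed); the *_step functions return 0 when the entropy source is momentarily exhausted:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned long long seed;
  while (!_rdseed64_step(&seed))
    ; /* retry until the hardware delivers a seed */
  printf("%llu\n", seed);
  return 0;
}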

File diff suppressed because it is too large.

View file

@ -29,7 +29,7 @@
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_xbegin(void)
{
return __builtin_ia32_xbegin();
return (unsigned int)__builtin_ia32_xbegin();
}
static __inline__ void __DEFAULT_FN_ATTRS

View file

@ -17,23 +17,25 @@
#include <tmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
__min_vector_width__(128)))
/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
/// Rounds up each element of the 128-bit vector of [4 x float] to an
@ -51,7 +53,7 @@
/// \param X
/// A 128-bit vector of [4 x float] values to be rounded up.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
/// Rounds up each element of the 128-bit vector of [2 x double] to an
/// integer and returns the rounded values in a 128-bit vector of
@ -68,7 +70,7 @@
/// \param X
/// A 128-bit vector of [2 x double] values to be rounded up.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
@ -93,7 +95,7 @@
/// of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
/// values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
@ -118,7 +120,7 @@
/// of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
/// values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
/// Rounds down each element of the 128-bit vector of [4 x float] to an
/// integer and returns the rounded values in a 128-bit vector of
@ -135,7 +137,7 @@
/// \param X
/// A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
/// Rounds down each element of the 128-bit vector of [2 x double] to an
/// integer and returns the rounded values in a 128-bit vector of
@ -152,7 +154,7 @@
/// \param X
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
@ -177,7 +179,7 @@
/// of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
/// values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
@ -202,7 +204,7 @@
/// of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
/// values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
/// Rounds each element of the 128-bit vector of [4 x float] to an
/// integer value according to the rounding control specified by the second
@ -234,7 +236,7 @@
/// 10: Upward (toward positive infinity) \n
/// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
#define _mm_round_ps(X, M) \
((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
/// Copies three upper elements of the first 128-bit vector operand to
@ -275,9 +277,9 @@
/// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
/// values.
#define _mm_round_ss(X, Y, M) \
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)))
#define _mm_round_ss(X, Y, M) \
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(M)))
/// Rounds each element of the 128-bit vector of [2 x double] to an
/// integer value according to the rounding control specified by the second
@ -309,7 +311,7 @@
/// 10: Upward (toward positive infinity) \n
/// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
#define _mm_round_pd(X, M) \
((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
/// Copies the upper element of the first 128-bit vector operand to the
@ -350,9 +352,9 @@
/// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
/// values.
#define _mm_round_sd(X, Y, M) \
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)))
#define _mm_round_sd(X, Y, M) \
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
(M)))
/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
@ -379,9 +381,9 @@
/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M)))
#define _mm_blend_pd(V1, V2, M) \
((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M)))
/// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third
@ -407,9 +409,9 @@
/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
(__v4sf)(__m128)(V2), (int)(M)))
#define _mm_blend_ps(V1, V2, M) \
((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
(int)(M)))
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
@ -431,11 +433,11 @@
/// position in the result. When a mask bit is 1, the corresponding 64-bit
/// element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
{
return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
(__v2df)__M);
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
__m128d __V2,
__m128d __M) {
return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
(__v2df)__M);
}
/// Returns a 128-bit vector of [4 x float] where the values are
@ -458,11 +460,11 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
/// position in the result. When a mask bit is 1, the corresponding 32-bit
/// element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
{
return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
(__v4sf)__M);
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
__m128 __V2,
__m128 __M) {
return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
(__v4sf)__M);
}
/// Returns a 128-bit vector of [16 x i8] where the values are selected
@ -485,11 +487,11 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
/// position in the result. When a mask bit is 1, the corresponding 8-bit
/// element in operand \a __V2 is copied to the same position in the result.
/// \returns A 128-bit vector of [16 x i8] containing the copied values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
{
return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
(__v16qi)__M);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
__m128i __V2,
__m128i __M) {
return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
(__v16qi)__M);
}
/// Returns a 128-bit vector of [8 x i16] where the values are selected
@ -516,9 +518,9 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M)))
#define _mm_blend_epi16(V1, V2, M) \
((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M)))
/* SSE4 Dword Multiply Instructions. */
/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
@ -534,10 +536,9 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
/// \param __V2
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the products of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
__m128i __V2) {
return (__m128i)((__v4su)__V1 * (__v4su)__V2);
}
/// Multiplies corresponding even-indexed elements of two 128-bit
@ -554,10 +555,9 @@ _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
/// A 128-bit vector of [4 x i32].
/// \returns A 128-bit vector of [2 x i64] containing the products of both
/// operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
}
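A sketch contrasting the two multiplies above: _mm_mullo_epi32 keeps the low 32 bits of all four products, while _mm_mul_epi32 keeps the full 64-bit products of the even-indexed lanes (hypothetical helper for illustration; assumes -msse4.1):

#include <smmintrin.h>

static void mul_examples(void) {
  __m128i a = _mm_set_epi32(0, 100000, 0, 3);   /* lanes 0..3: 3, 0, 100000, 0 */
  __m128i b = _mm_set_epi32(0, 100000, 0, -2);  /* lanes 0..3: -2, 0, 100000, 0 */
  __m128i lo = _mm_mullo_epi32(a, b);   /* lanes: -6, 0, 1410065408 (10^10 mod 2^32), 0 */
  __m128i wide = _mm_mul_epi32(a, b);   /* 64-bit lanes from input lanes 0 and 2: -6, 10000000000 */
  (void)lo; (void)wide;
}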
/* SSE4 Floating Point Dot Product Instructions. */
@ -593,9 +593,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// each [4 x float] subvector. If a bit is set, the dot product is returned
/// in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)))
#define _mm_dp_ps(X, Y, M) \
((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
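A common way to use the mask above: 0xF1 sums all four products and stores the result only in lane 0 (hypothetical helper for illustration; assumes -msse4.1):

#include <smmintrin.h>

static float dot4(__m128 x, __m128 y) {
  /* bits [7:4] = 0xF: include all four products; bits [3:0] = 0x1: write lane 0 only */
  return _mm_cvtss_f32(_mm_dp_ps(x, y, 0xF1));
}
/* dot4(_mm_set_ps(4, 3, 2, 1), _mm_set1_ps(1.0f)) == 10.0f */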
/// Computes the dot product of the two 128-bit vectors of [2 x double]
/// and returns it in the elements of the 128-bit result vector of
@ -628,9 +627,9 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// to the lowest element and bit [1] corresponding to the highest element of
/// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) \
((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)))
#define _mm_dp_pd(X, Y, M) \
((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
(M)))
/* SSE4 Streaming Load Hint Instruction. */
/// Loads integer values from a 128-bit aligned memory location to a
@ -645,10 +644,9 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// values.
/// \returns A 128-bit integer vector containing the data stored at the
/// specified memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_stream_load_si128 (__m128i const *__V)
{
return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_stream_load_si128(__m128i const *__V) {
return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
}
/* SSE4 Packed Integer Min/Max Instructions. */
@ -665,10 +663,9 @@ _mm_stream_load_si128 (__m128i const *__V)
/// \param __V2
/// A 128-bit vector of [16 x i8]
/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -684,10 +681,9 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
/// \param __V2
/// A 128-bit vector of [16 x i8].
/// \returns A 128-bit vector of [16 x i8] containing the greater values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -703,10 +699,9 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
/// \param __V2
/// A 128-bit vector of [8 x u16].
/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -722,10 +717,9 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
/// \param __V2
/// A 128-bit vector of [8 x u16].
/// \returns A 128-bit vector of [8 x u16] containing the greater values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -741,10 +735,9 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
/// \param __V2
/// A 128-bit vector of [4 x i32].
/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -760,10 +753,9 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
/// \param __V2
/// A 128-bit vector of [4 x i32].
/// \returns A 128-bit vector of [4 x i32] containing the greater values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -779,10 +771,9 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
/// \param __V2
/// A 128-bit vector of [4 x u32].
/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -798,10 +789,9 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
/// \param __V2
/// A 128-bit vector of [4 x u32].
/// \returns A 128-bit vector of [4 x u32] containing the greater values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
}
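A quick sketch of the signed 32-bit min/max pair above (hypothetical helper for illustration; assumes -msse4.1):

#include <smmintrin.h>

static void minmax_example(void) {
  __m128i a = _mm_set_epi32(10, -5, 3, 7);  /* lanes 0..3: 7, 3, -5, 10 */
  __m128i b = _mm_set_epi32(2, 8, -4, 7);   /* lanes 0..3: 7, -4, 8, 2 */
  __m128i lo = _mm_min_epi32(a, b);         /* 7, -4, -5, 2 */
  __m128i hi = _mm_max_epi32(a, b);         /* 7, 3, 8, 10 */
  (void)lo; (void)hi;
}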
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
@ -869,21 +859,24 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 10: Bits [95:64] of parameter \a X are returned. \n
/// 11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) \
__builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
#define _mm_extract_ps(X, N) \
__builtin_bit_cast( \
int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
#define _MM_EXTRACT_FLOAT(D, X, N) \
do { \
(D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
} while (0)
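The difference between the two extraction forms above, sketched (hypothetical helper for illustration; assumes -msse4.1): _mm_extract_ps returns the raw bit pattern as an int, while _MM_EXTRACT_FLOAT assigns the element as a float.

#include <smmintrin.h>

static void extract_float_example(void) {
  __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* lanes 0..3: 1, 2, 3, 4 */
  int bits = _mm_extract_ps(v, 2);                /* bit pattern of 3.0f, as an int */
  float f;
  _MM_EXTRACT_FLOAT(f, v, 2);                     /* f == 3.0f */
  (void)bits; (void)f;
}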
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
an index suitable for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
/* Extract a float from X at index N into the first index of the return. */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
#define _MM_PICK_OUT_PS(X, N) \
_mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
/* Insert int into packed integer array at index. */
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
@ -926,9 +919,9 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1110: Bits [119:112] of the result are used for insertion. \n
/// 1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
(int)(I), (int)(N)))
#define _mm_insert_epi8(X, I, N) \
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I), \
(int)(N)))
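A round-trip sketch with the byte insert/extract pair (hypothetical helper for illustration; assumes -msse4.1; _mm_extract_epi8 is defined further below in this header):

#include <smmintrin.h>

static int insert_extract_epi8(void) {
  __m128i v = _mm_setzero_si128();
  v = _mm_insert_epi8(v, 0x7f, 5);  /* write 0x7f into byte lane 5 */
  return _mm_extract_epi8(v, 5);    /* 0x7f */
}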
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
/// the 128-bit integer vector parameter, and then inserting the 32-bit
@ -958,9 +951,9 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 10: Bits [95:64] of the result are used for insertion. \n
/// 11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
(int)(I), (int)(N)))
#define _mm_insert_epi32(X, I, N) \
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I), \
(int)(N)))
#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
@ -989,9 +982,9 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 0: Bits [63:0] of the result are used for insertion. \n
/// 1: Bits [127:64] of the result are used for insertion. \n
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
(long long)(I), (int)(N)))
#define _mm_insert_epi64(X, I, N) \
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I), \
(int)(N)))
#endif /* __x86_64__ */
/* Extract int from packed integer array at index. This returns the element
@ -1032,8 +1025,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// \returns An unsigned integer, whose lower 8 bits are selected from the
/// 128-bit integer vector parameter and the remaining bits are assigned
/// zeros.
#define _mm_extract_epi8(X, N) \
((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
#define _mm_extract_epi8(X, N) \
((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N)))
/// Extracts a 32-bit element from the 128-bit integer vector of
@ -1058,10 +1051,9 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 11: Bits [127:96] of the parameter \a X are extracted.
/// \returns An integer, whose lower 32 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N) \
#define _mm_extract_epi32(X, N) \
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
#ifdef __x86_64__
/// Extracts a 64-bit element from the 128-bit integer vector of
/// [2 x i64], using the immediate value parameter \a N as a selector.
///
@ -1071,7 +1063,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
/// in 64-bit mode.
///
/// \param X
/// A 128-bit integer vector.
@ -1081,9 +1074,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 0: Bits [63:0] are returned. \n
/// 1: Bits [127:64] are returned. \n
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
#define _mm_extract_epi64(X, N) \
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif /* __x86_64 */
/* SSE4 128-bit Packed Integer Comparisons. */
/// Tests whether the specified bits in a 128-bit integer vector are all
@ -1098,9 +1090,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// \param __V
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_testz_si128(__m128i __M, __m128i __V)
{
static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
__m128i __V) {
return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
}
@ -1116,9 +1107,8 @@ _mm_testz_si128(__m128i __M, __m128i __V)
/// \param __V
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_testc_si128(__m128i __M, __m128i __V)
{
static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
__m128i __V) {
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
}
@ -1135,9 +1125,8 @@ _mm_testc_si128(__m128i __M, __m128i __V)
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
/// FALSE otherwise.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_testnzc_si128(__m128i __M, __m128i __V)
{
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
__m128i __V) {
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
}
@ -1193,7 +1182,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
/// \param V
/// A 128-bit integer vector selecting which bits to test in operand \a M.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
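A sketch of the convenience macro above (hypothetical helper for illustration; assumes -msse4.1):

#include <smmintrin.h>

static int masks_disjoint(__m128i mask, __m128i value) {
  /* returns 1 when (mask & value) has no bits set; e.g. 1 for
     mask = _mm_set1_epi32(0x0f0f0f0f), value = _mm_set1_epi32((int)0xf0f0f0f0) */
  return _mm_test_all_zeros(mask, value);
}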
/* SSE4 64-bit Packed Integer Comparisons. */
/// Compares each of the corresponding 64-bit values of the 128-bit
@ -1208,9 +1197,8 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
/// \param __V2
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
__m128i __V2) {
return (__m128i)((__v2di)__V1 == (__v2di)__V2);
}
@ -1225,15 +1213,16 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
///
/// \param __V
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
/// extended to 16-bit values.
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
/// sign-extended to 16-bit values.
/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi16(__m128i __V)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
/* This function always performs a signed extension, but __v16qi is a char
which may be signed or unsigned, so use __v16qs. */
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
7),
__v8hi);
}
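A sketch showing sign extension next to its zero-extending counterpart defined further below (hypothetical helper for illustration; assumes -msse4.1):

#include <smmintrin.h>

static void widen_example(void) {
  __m128i bytes = _mm_set1_epi8(-3);     /* every byte is 0xFD */
  __m128i s = _mm_cvtepi8_epi16(bytes);  /* eight 16-bit lanes of -3 (sign-extended) */
  __m128i u = _mm_cvtepu8_epi16(bytes);  /* eight 16-bit lanes of 253 (zero-extended) */
  (void)s; (void)u;
}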
/// Sign-extends each of the lower four 8-bit integer elements of a
@ -1249,12 +1238,11 @@ _mm_cvtepi8_epi16(__m128i __V)
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
/// sign-extended to 32-bit values.
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi32(__m128i __V)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
/* This function always performs a signed extension, but __v16qi is a char
which may be signed or unsigned, so use __v16qs. */
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
}
/// Sign-extends each of the lower two 8-bit integer elements of a
@ -1270,12 +1258,11 @@ _mm_cvtepi8_epi32(__m128i __V)
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
/// sign-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi64(__m128i __V)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
/* This function always performs a signed extension, but __v16qi is a char
which may be signed or unsigned, so use __v16qs. */
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
}
/// Sign-extends each of the lower four 16-bit integer elements of a
@ -1291,10 +1278,9 @@ _mm_cvtepi8_epi64(__m128i __V)
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
/// sign-extended to 32-bit values.
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi32(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
}
/// Sign-extends each of the lower two 16-bit integer elements of a
@ -1310,10 +1296,9 @@ _mm_cvtepi16_epi32(__m128i __V)
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
/// sign-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi64(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
}
/// Sign-extends each of the lower two 32-bit integer elements of a
@ -1329,10 +1314,9 @@ _mm_cvtepi16_epi64(__m128i __V)
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
/// sign-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi32_epi64(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
}
/* SSE4 Packed Integer Zero-Extension. */
@ -1349,10 +1333,11 @@ _mm_cvtepi32_epi64(__m128i __V)
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
/// zero-extended to 16-bit values.
/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi16(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
7),
__v8hi);
}
/// Zero-extends each of the lower four 8-bit integer elements of a
@ -1368,10 +1353,9 @@ _mm_cvtepu8_epi16(__m128i __V)
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
/// zero-extended to 32-bit values.
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi32(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
}
/// Zero-extends each of the lower two 8-bit integer elements of a
@ -1387,10 +1371,9 @@ _mm_cvtepu8_epi32(__m128i __V)
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
/// zero-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi64(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
}
/// Zero-extends each of the lower four 16-bit integer elements of a
@ -1406,10 +1389,9 @@ _mm_cvtepu8_epi64(__m128i __V)
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
/// zero-extended to 32-bit values.
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi32(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
}
/// Zero-extends each of the lower two 16-bit integer elements of a
@ -1425,10 +1407,9 @@ _mm_cvtepu16_epi32(__m128i __V)
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
/// zero-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi64(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
}
/// Zero-extends each of the lower two 32-bit integer elements of a
@ -1444,10 +1425,9 @@ _mm_cvtepu16_epi64(__m128i __V)
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
/// zero-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu32_epi64(__m128i __V)
{
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
return (__m128i) __builtin_convertvector(
__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
}
/* SSE4 Pack with Unsigned Saturation. */
@ -1473,10 +1453,9 @@ _mm_cvtepu32_epi64(__m128i __V)
/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
/// are written to the higher 64 bits of the result.
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi32(__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
__m128i __V2) {
return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}
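A sketch of the saturating pack above (hypothetical helper for illustration; assumes -msse4.1):

#include <smmintrin.h>

static __m128i pack_example(void) {
  __m128i a = _mm_set_epi32(70000, -1, 40000, 100);  /* lanes 0..3: 100, 40000, -1, 70000 */
  /* low four u16 lanes of the result: 100, 40000, 0 (clamped), 65535 (clamped) */
  return _mm_packus_epi32(a, a);
}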
/* SSE4 Multiple Packed Sums of Absolute Difference. */
@ -1515,9 +1494,9 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/// \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
/// absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)))
#define _mm_mpsadbw_epu8(X, Y, M) \
((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)))
/// Finds the minimum unsigned 16-bit element in the input 128-bit
/// vector of [8 x u16] and returns it along with its index.
@ -1532,10 +1511,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/// \returns A 128-bit value where bits [15:0] contain the minimum value found
/// in parameter \a __V, bits [18:16] contain the index of the minimum value
/// and the remaining bits are set to 0.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_minpos_epu16(__m128i __V)
{
return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
}
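A sketch of reading both fields of the result above (hypothetical helper for illustration; assumes -msse4.1):

#include <smmintrin.h>

static void minpos_example(void) {
  __m128i v = _mm_set_epi16(9, 8, 7, 3, 7, 6, 5, 4);  /* lanes 0..7: 4, 5, 6, 7, 3, 7, 8, 9 */
  __m128i r = _mm_minpos_epu16(v);
  int min = _mm_extract_epi16(r, 0);  /* 3, the minimum value */
  int idx = _mm_extract_epi16(r, 1);  /* 4, the index held in bits [18:16] */
  (void)min; (void)idx;
}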
/* Handle the sse4.2 definitions here. */
@ -1544,33 +1521,34 @@ _mm_minpos_epu16(__m128i __V)
so we'll do the same. */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
/* These specify the type of data that we're comparing. */
#define _SIDD_UBYTE_OPS 0x00
#define _SIDD_UWORD_OPS 0x01
#define _SIDD_SBYTE_OPS 0x02
#define _SIDD_SWORD_OPS 0x03
#define _SIDD_UBYTE_OPS 0x00
#define _SIDD_UWORD_OPS 0x01
#define _SIDD_SBYTE_OPS 0x02
#define _SIDD_SWORD_OPS 0x03
/* These specify the type of comparison operation. */
#define _SIDD_CMP_EQUAL_ANY 0x00
#define _SIDD_CMP_RANGES 0x04
#define _SIDD_CMP_EQUAL_EACH 0x08
#define _SIDD_CMP_EQUAL_ORDERED 0x0c
#define _SIDD_CMP_EQUAL_ANY 0x00
#define _SIDD_CMP_RANGES 0x04
#define _SIDD_CMP_EQUAL_EACH 0x08
#define _SIDD_CMP_EQUAL_ORDERED 0x0c
/* These macros specify the polarity of the operation. */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
/* These macros are used in _mm_cmpXstri() to specify the return. */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40
/* These macros are used in _mm_cmpXstri() to specify the return. */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
/* SSE4.2 Packed Comparison Intrinsics. */
/// Uses the immediate operand \a M to perform a comparison of string
@ -1625,8 +1603,8 @@ _mm_minpos_epu16(__m128i __V)
/// repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
/// the comparison.
#define _mm_cmpistrm(A, B, M) \
((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
#define _mm_cmpistrm(A, B, M) \
((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -1679,9 +1657,9 @@ _mm_minpos_epu16(__m128i __V)
/// 0: The index of the least significant set bit. \n
/// 1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
#define _mm_cmpistri(A, B, M) \
((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
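A hedged sketch of an equal-any search with the macro above, where the first operand supplies the character set and the returned index refers to the second operand (hypothetical helper for illustration; assumes -msse4.2):

#include <smmintrin.h>

static int find_first_five(void) {
  const char hay[16] = "llvm-15 headers";  /* implicit length 15 (trailing NUL) */
  const char set[16] = "5";                /* character set of length 1 */
  return _mm_cmpistri(_mm_loadu_si128((const __m128i *)set),
                      _mm_loadu_si128((const __m128i *)hay),
                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                          _SIDD_LEAST_SIGNIFICANT);  /* 6: offset of '5' in hay */
}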
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -1739,9 +1717,9 @@ _mm_minpos_epu16(__m128i __V)
/// repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
/// the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
#define _mm_cmpestrm(A, LA, B, LB, M) \
((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -1798,9 +1776,9 @@ _mm_minpos_epu16(__m128i __V)
/// 0: The index of the least significant set bit. \n
/// 1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
#define _mm_cmpestri(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
@ -1850,8 +1828,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
/// \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
#define _mm_cmpistra(A, B, M) \
((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -1899,8 +1877,8 @@ _mm_minpos_epu16(__m128i __V)
/// 11: Negate the bit mask only for bits with an index less than or equal
/// to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
#define _mm_cmpistrc(A, B, M) \
((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -1947,8 +1925,8 @@ _mm_minpos_epu16(__m128i __V)
/// 11: Negate the bit mask only for bits with an index less than or equal
/// to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
#define _mm_cmpistro(A, B, M) \
((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -1997,8 +1975,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
#define _mm_cmpistrs(A, B, M) \
((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -2047,8 +2025,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
#define _mm_cmpistrz(A, B, M) \
((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -2101,9 +2079,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
/// \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
#define _mm_cmpestra(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -2155,9 +2133,9 @@ _mm_minpos_epu16(__m128i __V)
/// 11: Negate the bit mask only for bits with an index less than or equal
/// to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
#define _mm_cmpestrc(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -2208,9 +2186,9 @@ _mm_minpos_epu16(__m128i __V)
/// 11: Negate the bit mask only for bits with an index less than or equal
/// to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
#define _mm_cmpestro(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -2263,9 +2241,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
#define _mm_cmpestrs(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
@ -2317,9 +2295,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
#define _mm_cmpestrz(A, LA, B, LB, M) \
((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M)))
/* SSE4.2 Compare Packed Data -- Greater Than. */
@ -2336,9 +2314,8 @@ _mm_minpos_epu16(__m128i __V)
/// \param __V2
/// A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
__m128i __V2) {
return (__m128i)((__v2di)__V1 > (__v2di)__V2);
}


@ -17,7 +17,8 @@
* explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
* to the clang resource header until that is fully supported.
*/
#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>) && !defined(_MSC_VER)
#if __STDC_HOSTED__ && \
__has_include_next(<stdatomic.h>) && !(defined(_MSC_VER) && !defined(__cplusplus))
# include_next <stdatomic.h>
#else
@ -158,10 +159,6 @@ typedef _Atomic(uintmax_t) atomic_uintmax_t;
typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
#define ATOMIC_FLAG_INIT { 0 }
#if __cplusplus >= 202002L && !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_FLAG_INIT was deprecated in C++20 but is not deprecated in C. */
#pragma clang deprecated(ATOMIC_FLAG_INIT)
#endif
/* These should be provided by the libc implementation. */
#ifdef __cplusplus

lib/include/stdbool.h vendored

@ -10,8 +10,13 @@
#ifndef __STDBOOL_H
#define __STDBOOL_H
/* Don't define bool, true, and false in C++, except as a GNU extension. */
#ifndef __cplusplus
#define __bool_true_false_are_defined 1
#if __STDC_VERSION__ > 201710L
/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
* to system headers which include this header file unconditionally.
*/
#elif !defined(__cplusplus)
#define bool _Bool
#define true 1
#define false 0
@ -20,12 +25,10 @@
#define _Bool bool
#if __cplusplus < 201103L
/* For C++98, define bool, false, true as a GNU extension. */
#define bool bool
#define bool bool
#define false false
#define true true
#define true true
#endif
#endif
#define __bool_true_false_are_defined 1
#endif /* __STDBOOL_H */


@ -62,7 +62,7 @@ typedef __SIZE_TYPE__ rsize_t;
#endif /* defined(__need_STDDEF_H_misc) */
#if defined(__need_wchar_t)
#ifndef __cplusplus
#if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)
/* Always define wchar_t when modules are available. */
#if !defined(_WCHAR_T) || __has_feature(modules)
#if !__has_feature(modules)


@ -13,4 +13,17 @@
#define noreturn _Noreturn
#define __noreturn_is_defined 1
#if __STDC_VERSION__ > 201710L && \
!defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* The noreturn macro is deprecated in C2x. We do not mark it as such because
including the header file in C2x is also deprecated and we do not want to
issue a confusing diagnostic for code which includes <stdnoreturn.h>
followed by code that writes [[noreturn]]. The issue with such code is not
with the attribute, or the use of 'noreturn', but the inclusion of the
header. */
/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
* to system headers which include this header file unconditionally.
*/
#endif
#endif /* __STDNORETURN_H */


@ -39,9 +39,9 @@ struct __uintr_frame
///
/// This intrinsic corresponds to the <c> CLUI </c> instruction.
///
/// \operation
/// \code{.operation}
/// UIF := 0
/// \endoperation
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_clui (void)
{
@ -60,9 +60,9 @@ _clui (void)
///
/// This intrinsic corresponds to the <c> STUI </c> instruction.
///
/// \operation
/// \code{.operation}
/// UIF := 1
/// \endoperation
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_stui (void)
{
@ -81,7 +81,7 @@ _stui (void)
///
/// \returns The current value of the user interrupt flag (UIF).
///
/// \operation
/// \code{.operation}
/// CF := UIF
/// ZF := 0
/// AF := 0
@ -89,7 +89,7 @@ _stui (void)
/// PF := 0
/// SF := 0
/// dst := CF
/// \endoperation
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_testui (void)
{
@ -110,7 +110,7 @@ _testui (void)
/// Index of user-interrupt target table entry in user-interrupt target
/// table.
///
/// \operation
/// \code{.operation}
/// IF __a > UITTSZ
/// GP (0)
/// FI
@ -143,7 +143,7 @@ _testui (void)
/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
/// FI
/// FI
/// \endoperation
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_senduipi (unsigned long long __a)
{

lib/include/unwind.h vendored

@ -62,7 +62,8 @@ typedef intptr_t _sleb128_t;
typedef uintptr_t _uleb128_t;
struct _Unwind_Context;
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
defined(__ARM_DWARF_EH__) || defined(__SEH__))
struct _Unwind_Control_Block;
typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
#else
@ -72,7 +73,7 @@ typedef struct _Unwind_Exception _Unwind_Exception;
typedef enum {
_URC_NO_REASON = 0,
#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
!defined(__ARM_DWARF_EH__)
!defined(__ARM_DWARF_EH__) && !defined(__SEH__)
_URC_OK = 0, /* used by ARM EHABI */
#endif
_URC_FOREIGN_EXCEPTION_CAUGHT = 1,
@ -86,7 +87,7 @@ typedef enum {
_URC_INSTALL_CONTEXT = 7,
_URC_CONTINUE_UNWIND = 8,
#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
!defined(__ARM_DWARF_EH__)
!defined(__ARM_DWARF_EH__) && !defined(__SEH__)
_URC_FAILURE = 9 /* used by ARM EHABI */
#endif
} _Unwind_Reason_Code;
@ -103,7 +104,8 @@ typedef enum {
typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
_Unwind_Exception *);
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
defined(__ARM_DWARF_EH__) || defined(__SEH__))
typedef struct _Unwind_Control_Block _Unwind_Control_Block;
typedef uint32_t _Unwind_EHT_Header;
@ -167,7 +169,8 @@ typedef _Unwind_Personality_Fn __personality_routine;
typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
void *);
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
defined(__ARM_DWARF_EH__) || defined(__SEH__))
typedef enum {
_UVRSC_CORE = 0, /* integer register */
_UVRSC_VFP = 1, /* vfp */

lib/include/velintrin.h vendored Normal file

@ -0,0 +1,71 @@
/*===---- velintrin.h - VEL intrinsics for VE ------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __VEL_INTRIN_H__
#define __VEL_INTRIN_H__
// Vector registers
typedef double __vr __attribute__((__vector_size__(2048)));
// Vector mask registers
#if __STDC_VERSION__ >= 199901L
// For C99
typedef _Bool __vm __attribute__((ext_vector_type(256)));
typedef _Bool __vm256 __attribute__((ext_vector_type(256)));
typedef _Bool __vm512 __attribute__((ext_vector_type(512)));
#else
#ifdef __cplusplus
// For C++
typedef bool __vm __attribute__((ext_vector_type(256)));
typedef bool __vm256 __attribute__((ext_vector_type(256)));
typedef bool __vm512 __attribute__((ext_vector_type(512)));
#else
#error need C++ or C99 to use vector intrinsics for VE
#endif
#endif
enum VShuffleCodes {
VE_VSHUFFLE_YUYU = 0,
VE_VSHUFFLE_YUYL = 1,
VE_VSHUFFLE_YUZU = 2,
VE_VSHUFFLE_YUZL = 3,
VE_VSHUFFLE_YLYU = 4,
VE_VSHUFFLE_YLYL = 5,
VE_VSHUFFLE_YLZU = 6,
VE_VSHUFFLE_YLZL = 7,
VE_VSHUFFLE_ZUYU = 8,
VE_VSHUFFLE_ZUYL = 9,
VE_VSHUFFLE_ZUZU = 10,
VE_VSHUFFLE_ZUZL = 11,
VE_VSHUFFLE_ZLYU = 12,
VE_VSHUFFLE_ZLYL = 13,
VE_VSHUFFLE_ZLZU = 14,
VE_VSHUFFLE_ZLZL = 15,
};
// Use generated intrinsic name definitions
#include <velintrin_gen.h>
// Use helper functions
#include <velintrin_approx.h>
// pack
#define _vel_pack_f32p __builtin_ve_vl_pack_f32p
#define _vel_pack_f32a __builtin_ve_vl_pack_f32a
static inline unsigned long int _vel_pack_i32(unsigned int a, unsigned int b) {
return (((unsigned long int)a) << 32) | b;
}
#define _vel_extract_vm512u(vm) __builtin_ve_vl_extract_vm512u(vm)
#define _vel_extract_vm512l(vm) __builtin_ve_vl_extract_vm512l(vm)
#define _vel_insert_vm512u(vm512, vm) __builtin_ve_vl_insert_vm512u(vm512, vm)
#define _vel_insert_vm512l(vm512, vm) __builtin_ve_vl_insert_vm512l(vm512, vm)
#endif
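A tiny usage sketch of the packing helper above (hypothetical helper for illustration; assumes a clang build targeting the NEC SX-Aurora VE, e.g. --target=ve-linux):

#include <velintrin.h>

static unsigned long int make_packed_constant(void) {
  /* _vel_pack_i32 concatenates two 32-bit values, first argument in the high half */
  return _vel_pack_i32(0xAABBCCDDu, 0x11223344u);  /* 0xAABBCCDD11223344 */
}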

lib/include/velintrin_approx.h vendored Normal file

@ -0,0 +1,120 @@
/*===---- velintrin_approx.h - VEL intrinsics helper for VE ----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __VEL_INTRIN_APPROX_H__
#define __VEL_INTRIN_APPROX_H__
static inline __vr _vel_approx_vfdivs_vvvl(__vr v0, __vr v1, int l) {
float s0;
__vr v2, v3, v4, v5;
v5 = _vel_vrcps_vvl(v1, l);
s0 = 1.0;
v4 = _vel_vfnmsbs_vsvvl(s0, v1, v5, l);
v3 = _vel_vfmads_vvvvl(v5, v5, v4, l);
v2 = _vel_vfmuls_vvvl(v0, v3, l);
v4 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
v2 = _vel_vfmads_vvvvl(v2, v5, v4, l);
v0 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
v0 = _vel_vfmads_vvvvl(v2, v3, v0, l);
return v0;
}
static inline __vr _vel_approx_pvfdiv_vvvl(__vr v0, __vr v1, int l) {
float s0;
__vr v2, v3, v4, v5;
v5 = _vel_pvrcp_vvl(v1, l);
s0 = 1.0;
v4 = _vel_pvfnmsb_vsvvl(s0, v1, v5, l);
v3 = _vel_pvfmad_vvvvl(v5, v5, v4, l);
v2 = _vel_pvfmul_vvvl(v0, v3, l);
v4 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
v2 = _vel_pvfmad_vvvvl(v2, v5, v4, l);
v0 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
v0 = _vel_pvfmad_vvvvl(v2, v3, v0, l);
return v0;
}
static inline __vr _vel_approx_vfdivs_vsvl(float s0, __vr v0, int l) {
float s1;
__vr v1, v2, v3, v4;
v4 = _vel_vrcps_vvl(v0, l);
s1 = 1.0;
v2 = _vel_vfnmsbs_vsvvl(s1, v0, v4, l);
v2 = _vel_vfmads_vvvvl(v4, v4, v2, l);
v1 = _vel_vfmuls_vsvl(s0, v2, l);
v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
v1 = _vel_vfmads_vvvvl(v1, v4, v3, l);
v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
v0 = _vel_vfmads_vvvvl(v1, v2, v3, l);
return v0;
}
static inline __vr _vel_approx_vfdivs_vvsl(__vr v0, float s0, int l) {
float s1;
__vr v1, v2;
s1 = 1.0f / s0;
v1 = _vel_vfmuls_vsvl(s1, v0, l);
v2 = _vel_vfnmsbs_vvsvl(v0, s0, v1, l);
v0 = _vel_vfmads_vvsvl(v1, s1, v2, l);
return v0;
}
static inline __vr _vel_approx_vfdivd_vsvl(double s0, __vr v0, int l) {
__vr v1, v2, v3;
v2 = _vel_vrcpd_vvl(v0, l);
double s1 = 1.0;
v3 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
v1 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
v1 = _vel_vfmadd_vvvvl(v2, v2, v1, l);
v1 = _vel_vaddul_vsvl(1, v1, l);
v3 = _vel_vfnmsbd_vsvvl(s1, v0, v1, l);
v3 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
v1 = _vel_vfmuld_vsvl(s0, v3, l);
v0 = _vel_vfnmsbd_vsvvl(s0, v1, v0, l);
v0 = _vel_vfmadd_vvvvl(v1, v3, v0, l);
return v0;
}
static inline __vr _vel_approx_vfsqrtd_vvl(__vr v0, int l) {
double s0, s1;
__vr v1, v2, v3;
v2 = _vel_vrsqrtdnex_vvl(v0, l);
v1 = _vel_vfmuld_vvvl(v0, v2, l);
s0 = 1.0;
s1 = 0.5;
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
v3 = _vel_vfmuld_vsvl(s1, v3, l);
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
v1 = _vel_vfmuld_vvvl(v0, v2, l);
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
v3 = _vel_vfmuld_vsvl(s1, v3, l);
v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
return v0;
}
static inline __vr _vel_approx_vfsqrts_vvl(__vr v0, int l) {
float s0, s1;
__vr v1, v2, v3;
v0 = _vel_vcvtds_vvl(v0, l);
v2 = _vel_vrsqrtdnex_vvl(v0, l);
v1 = _vel_vfmuld_vvvl(v0, v2, l);
s0 = 1.0;
s1 = 0.5;
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
v3 = _vel_vfmuld_vsvl(s1, v3, l);
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
v1 = _vel_vfmuld_vvvl(v0, v2, l);
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
v3 = _vel_vfmuld_vsvl(s1, v3, l);
v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
v0 = _vel_vcvtsd_vvl(v0, l);
return v0;
}
#endif
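For reference, the helpers above all follow the same pattern: start from a hardware reciprocal (or reciprocal square root) estimate and refine it with Newton-Raphson style fused multiply-add steps. A scalar sketch of that idea, not VE code:

static double approx_div_sketch(double a, double b, double r /* initial estimate of 1/b */) {
  r = r * (2.0 - b * r);  /* each refinement step roughly doubles the accurate bits */
  r = r * (2.0 - b * r);
  return a * r;           /* a/b computed as a * (refined 1/b) */
}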

lib/include/velintrin_gen.h vendored Normal file

File diff suppressed because it is too large


@ -1405,12 +1405,12 @@ wasm_f64x2_convert_low_u32x4(v128_t __a) {
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i32x4_trunc_sat_f64x2_zero(v128_t __a) {
return (v128_t)__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4((__f64x2)__a);
return (v128_t)__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4((__f64x2)__a);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u32x4_trunc_sat_f64x2_zero(v128_t __a) {
return (v128_t)__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4((__f64x2)__a);
return (v128_t)__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4((__f64x2)__a);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS


@ -25,11 +25,29 @@
#include <crc32intrin.h>
#endif
#define __SSC_MARK(Tag) \
__asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \
"mov {%0, %%ebx|ebx, %0}; " \
#if defined(__i386__)
#define __FULLBX "ebx"
#define __TMPGPR "eax"
#else
// On a 64-bit target, a 32-bit operand produces a 32-bit result that is
// zero-extended into the destination general-purpose register. This means
// "mov x, %ebx" clobbers the upper 32 bits of rbx, so we must preserve
// the full 64-bit register rbx.
#define __FULLBX "rbx"
#define __TMPGPR "rax"
#endif
#define __MOVEGPR(__r1, __r2) "mov {%%"__r1 ", %%"__r2 "|"__r2 ", "__r1"};"
#define __SAVE_GPRBX __MOVEGPR(__FULLBX, __TMPGPR)
#define __RESTORE_GPRBX __MOVEGPR(__TMPGPR, __FULLBX)
#define __SSC_MARK(__Tag) \
__asm__ __volatile__( __SAVE_GPRBX \
"mov {%0, %%ebx|ebx, %0}; " \
".byte 0x64, 0x67, 0x90; " \
"mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \
: "%eax");
__RESTORE_GPRBX \
::"i"(__Tag) \
: __TMPGPR );
#endif /* __X86GPRINTRIN_H */
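A hedged usage sketch of the marker macro above; the encoded instruction is an architecturally inert NOP, and the specific tag values are conventions used by some Intel analysis tools rather than anything defined by this header:

#include <x86gprintrin.h>

static void marked_region(void) {
  __SSC_MARK(0x111);  /* example start-of-region tag (tool-specific convention) */
  /* ... code under study ... */
  __SSC_MARK(0x222);  /* example end-of-region tag (tool-specific convention) */
}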


@ -59,5 +59,9 @@
#include <clzerointrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__RDPRU__)
#include <rdpruintrin.h>
#endif
#endif /* __X86INTRIN_H */


@ -2086,7 +2086,7 @@ _mm_storer_ps(float *__p, __m128 __a)
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
@ -2360,7 +2360,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
/// 00: assigned from bits [15:0] of \a a. \n
/// 01: assigned from bits [31:16] of \a a. \n
/// 10: assigned from bits [47:32] of \a a. \n
/// 11: assigned from bits [63:48] of \a a.
/// 11: assigned from bits [63:48] of \a a. \n
/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
/// <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
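To make the _MM_SHUFFLE note concrete, a small sketch using the _mm_shuffle_ps macro documented further below (hypothetical helper for illustration only):

#include <xmmintrin.h>

static __m128 reverse_lanes(__m128 v) {
  /* _MM_SHUFFLE(0, 1, 2, 3) == 0x1B selects lanes 3, 2, 1, 0, reversing the vector */
  return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
}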
@ -2602,7 +2605,10 @@ void _mm_setcsr(unsigned int __i);
/// 00: Bits [31:0] copied from the specified operand. \n
/// 01: Bits [63:32] copied from the specified operand. \n
/// 10: Bits [95:64] copied from the specified operand. \n
/// 11: Bits [127:96] copied from the specified operand.
/// 11: Bits [127:96] copied from the specified operand. \n
/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
/// <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \