mirror of
https://codeberg.org/ziglang/zig.git
synced 2025-12-06 05:44:20 +00:00
update C lib/include/ files from clang 10 to 11rc1
This commit is contained in:
parent
c6e0df6213
commit
16513fee6c
46 changed files with 51827 additions and 17644 deletions
41
lib/include/__clang_cuda_cmath.h
vendored
41
lib/include/__clang_cuda_cmath.h
vendored
|
|
@ -12,7 +12,9 @@
|
|||
#error "This file is for CUDA compilation only."
|
||||
#endif
|
||||
|
||||
#ifndef __OPENMP_NVPTX__
|
||||
#include <limits>
|
||||
#endif
|
||||
|
||||
// CUDA lets us use various std math functions on the device side. This file
|
||||
// works in concert with __clang_cuda_math_forward_declares.h to make this work.
|
||||
|
|
@ -30,32 +32,16 @@
|
|||
// implementation. Declaring in the global namespace and pulling into namespace
|
||||
// std covers all of the known knowns.
|
||||
|
||||
#ifdef _OPENMP
|
||||
#define __DEVICE__ static __attribute__((always_inline))
|
||||
#ifdef __OPENMP_NVPTX__
|
||||
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
|
||||
#else
|
||||
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
// For C++ 17 we need to include noexcept attribute to be compatible
|
||||
// with the header-defined version. This may be removed once
|
||||
// variant is supported.
|
||||
#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
|
||||
#define __NOEXCEPT noexcept
|
||||
#else
|
||||
#define __NOEXCEPT
|
||||
#endif
|
||||
|
||||
#if !(defined(_OPENMP) && defined(__cplusplus))
|
||||
__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
|
||||
__DEVICE__ long abs(long __n) { return ::labs(__n); }
|
||||
__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
|
||||
__DEVICE__ double abs(double __x) { return ::fabs(__x); }
|
||||
#endif
|
||||
// TODO: remove once variat is supported.
|
||||
#if defined(_OPENMP) && defined(__cplusplus)
|
||||
__DEVICE__ const float abs(const float __x) { return ::fabsf((float)__x); }
|
||||
__DEVICE__ const double abs(const double __x) { return ::fabs((double)__x); }
|
||||
#endif
|
||||
__DEVICE__ float acos(float __x) { return ::acosf(__x); }
|
||||
__DEVICE__ float asin(float __x) { return ::asinf(__x); }
|
||||
__DEVICE__ float atan(float __x) { return ::atanf(__x); }
|
||||
|
|
@ -64,11 +50,9 @@ __DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
|
|||
__DEVICE__ float cos(float __x) { return ::cosf(__x); }
|
||||
__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
|
||||
__DEVICE__ float exp(float __x) { return ::expf(__x); }
|
||||
__DEVICE__ float fabs(float __x) __NOEXCEPT { return ::fabsf(__x); }
|
||||
__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
|
||||
__DEVICE__ float floor(float __x) { return ::floorf(__x); }
|
||||
__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
|
||||
// TODO: remove when variant is supported
|
||||
#ifndef _OPENMP
|
||||
__DEVICE__ int fpclassify(float __x) {
|
||||
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
|
||||
FP_ZERO, __x);
|
||||
|
|
@ -77,14 +61,15 @@ __DEVICE__ int fpclassify(double __x) {
|
|||
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
|
||||
FP_ZERO, __x);
|
||||
}
|
||||
#endif
|
||||
__DEVICE__ float frexp(float __arg, int *__exp) {
|
||||
return ::frexpf(__arg, __exp);
|
||||
}
|
||||
|
||||
// For inscrutable reasons, the CUDA headers define these functions for us on
|
||||
// Windows.
|
||||
#ifndef _MSC_VER
|
||||
// Windows. For OpenMP we omit these as some old system headers have
|
||||
// non-conforming `isinf(float)` and `isnan(float)` implementations that return
|
||||
// an `int`. The system versions of these functions should be fine anyway.
|
||||
#if !defined(_MSC_VER) && !defined(__OPENMP_NVPTX__)
|
||||
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
|
||||
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
|
||||
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
|
||||
|
|
@ -161,6 +146,8 @@ __DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
|
|||
// libdevice doesn't provide an implementation, and we don't want to be in the
|
||||
// business of implementing tricky libm functions in this header.
|
||||
|
||||
#ifndef __OPENMP_NVPTX__
|
||||
|
||||
// Now we've defined everything we promised we'd define in
|
||||
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
|
||||
// fix up our math functions.
|
||||
|
|
@ -457,10 +444,7 @@ using ::remainderf;
|
|||
using ::remquof;
|
||||
using ::rintf;
|
||||
using ::roundf;
|
||||
// TODO: remove once variant is supported
|
||||
#ifndef _OPENMP
|
||||
using ::scalblnf;
|
||||
#endif
|
||||
using ::scalbnf;
|
||||
using ::sinf;
|
||||
using ::sinhf;
|
||||
|
|
@ -479,7 +463,8 @@ _GLIBCXX_END_NAMESPACE_VERSION
|
|||
} // namespace std
|
||||
#endif
|
||||
|
||||
#undef __NOEXCEPT
|
||||
#endif // __OPENMP_NVPTX__
|
||||
|
||||
#undef __DEVICE__
|
||||
|
||||
#endif
|
||||
|
|
|
|||
268
lib/include/__clang_cuda_complex_builtins.h
vendored
268
lib/include/__clang_cuda_complex_builtins.h
vendored
|
|
@ -13,10 +13,57 @@
|
|||
// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
|
||||
// libgcc functions that clang assumes are available when compiling c99 complex
|
||||
// operations. (These implementations come from libc++, and have been modified
|
||||
// to work with CUDA.)
|
||||
// to work with CUDA and OpenMP target offloading [in C and C++ mode].)
|
||||
|
||||
extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
|
||||
double __c, double __d) {
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#ifdef _OPENMP
|
||||
#pragma omp declare target
|
||||
#define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
|
||||
#else
|
||||
#define __DEVICE__ __device__ inline
|
||||
#endif
|
||||
|
||||
// To make the algorithms available for C and C++ in CUDA and OpenMP we select
|
||||
// different but equivalent function versions. TODO: For OpenMP we currently
|
||||
// select the native builtins as the overload support for templates is lacking.
|
||||
#if !defined(_OPENMP)
|
||||
#define _ISNANd std::isnan
|
||||
#define _ISNANf std::isnan
|
||||
#define _ISINFd std::isinf
|
||||
#define _ISINFf std::isinf
|
||||
#define _ISFINITEd std::isfinite
|
||||
#define _ISFINITEf std::isfinite
|
||||
#define _COPYSIGNd std::copysign
|
||||
#define _COPYSIGNf std::copysign
|
||||
#define _SCALBNd std::scalbn
|
||||
#define _SCALBNf std::scalbn
|
||||
#define _ABSd std::abs
|
||||
#define _ABSf std::abs
|
||||
#define _LOGBd std::logb
|
||||
#define _LOGBf std::logb
|
||||
#else
|
||||
#define _ISNANd __nv_isnand
|
||||
#define _ISNANf __nv_isnanf
|
||||
#define _ISINFd __nv_isinfd
|
||||
#define _ISINFf __nv_isinff
|
||||
#define _ISFINITEd __nv_isfinited
|
||||
#define _ISFINITEf __nv_finitef
|
||||
#define _COPYSIGNd __nv_copysign
|
||||
#define _COPYSIGNf __nv_copysignf
|
||||
#define _SCALBNd __nv_scalbn
|
||||
#define _SCALBNf __nv_scalbnf
|
||||
#define _ABSd __nv_fabs
|
||||
#define _ABSf __nv_fabsf
|
||||
#define _LOGBd __nv_logb
|
||||
#define _LOGBf __nv_logbf
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
|
||||
double __d) {
|
||||
double __ac = __a * __c;
|
||||
double __bd = __b * __d;
|
||||
double __ad = __a * __d;
|
||||
|
|
@ -24,50 +71,49 @@ extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
|
|||
double _Complex z;
|
||||
__real__(z) = __ac - __bd;
|
||||
__imag__(z) = __ad + __bc;
|
||||
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
|
||||
if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
|
||||
int __recalc = 0;
|
||||
if (std::isinf(__a) || std::isinf(__b)) {
|
||||
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
|
||||
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
|
||||
if (std::isnan(__c))
|
||||
__c = std::copysign(0, __c);
|
||||
if (std::isnan(__d))
|
||||
__d = std::copysign(0, __d);
|
||||
if (_ISINFd(__a) || _ISINFd(__b)) {
|
||||
__a = _COPYSIGNd(_ISINFd(__a) ? 1 : 0, __a);
|
||||
__b = _COPYSIGNd(_ISINFd(__b) ? 1 : 0, __b);
|
||||
if (_ISNANd(__c))
|
||||
__c = _COPYSIGNd(0, __c);
|
||||
if (_ISNANd(__d))
|
||||
__d = _COPYSIGNd(0, __d);
|
||||
__recalc = 1;
|
||||
}
|
||||
if (std::isinf(__c) || std::isinf(__d)) {
|
||||
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
|
||||
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
|
||||
if (std::isnan(__a))
|
||||
__a = std::copysign(0, __a);
|
||||
if (std::isnan(__b))
|
||||
__b = std::copysign(0, __b);
|
||||
if (_ISINFd(__c) || _ISINFd(__d)) {
|
||||
__c = _COPYSIGNd(_ISINFd(__c) ? 1 : 0, __c);
|
||||
__d = _COPYSIGNd(_ISINFd(__d) ? 1 : 0, __d);
|
||||
if (_ISNANd(__a))
|
||||
__a = _COPYSIGNd(0, __a);
|
||||
if (_ISNANd(__b))
|
||||
__b = _COPYSIGNd(0, __b);
|
||||
__recalc = 1;
|
||||
}
|
||||
if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
|
||||
std::isinf(__ad) || std::isinf(__bc))) {
|
||||
if (std::isnan(__a))
|
||||
__a = std::copysign(0, __a);
|
||||
if (std::isnan(__b))
|
||||
__b = std::copysign(0, __b);
|
||||
if (std::isnan(__c))
|
||||
__c = std::copysign(0, __c);
|
||||
if (std::isnan(__d))
|
||||
__d = std::copysign(0, __d);
|
||||
if (!__recalc &&
|
||||
(_ISINFd(__ac) || _ISINFd(__bd) || _ISINFd(__ad) || _ISINFd(__bc))) {
|
||||
if (_ISNANd(__a))
|
||||
__a = _COPYSIGNd(0, __a);
|
||||
if (_ISNANd(__b))
|
||||
__b = _COPYSIGNd(0, __b);
|
||||
if (_ISNANd(__c))
|
||||
__c = _COPYSIGNd(0, __c);
|
||||
if (_ISNANd(__d))
|
||||
__d = _COPYSIGNd(0, __d);
|
||||
__recalc = 1;
|
||||
}
|
||||
if (__recalc) {
|
||||
// Can't use std::numeric_limits<double>::infinity() -- that doesn't have
|
||||
// a device overload (and isn't constexpr before C++11, naturally).
|
||||
__real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
|
||||
__imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
|
||||
__real__(z) = __builtin_huge_val() * (__a * __c - __b * __d);
|
||||
__imag__(z) = __builtin_huge_val() * (__a * __d + __b * __c);
|
||||
}
|
||||
}
|
||||
return z;
|
||||
}
|
||||
|
||||
extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
|
||||
float __c, float __d) {
|
||||
__DEVICE__ float _Complex __mulsc3(float __a, float __b, float __c, float __d) {
|
||||
float __ac = __a * __c;
|
||||
float __bd = __b * __d;
|
||||
float __ad = __a * __d;
|
||||
|
|
@ -75,36 +121,36 @@ extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
|
|||
float _Complex z;
|
||||
__real__(z) = __ac - __bd;
|
||||
__imag__(z) = __ad + __bc;
|
||||
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
|
||||
if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
|
||||
int __recalc = 0;
|
||||
if (std::isinf(__a) || std::isinf(__b)) {
|
||||
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
|
||||
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
|
||||
if (std::isnan(__c))
|
||||
__c = std::copysign(0, __c);
|
||||
if (std::isnan(__d))
|
||||
__d = std::copysign(0, __d);
|
||||
if (_ISINFf(__a) || _ISINFf(__b)) {
|
||||
__a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
|
||||
__b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
|
||||
if (_ISNANf(__c))
|
||||
__c = _COPYSIGNf(0, __c);
|
||||
if (_ISNANf(__d))
|
||||
__d = _COPYSIGNf(0, __d);
|
||||
__recalc = 1;
|
||||
}
|
||||
if (std::isinf(__c) || std::isinf(__d)) {
|
||||
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
|
||||
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
|
||||
if (std::isnan(__a))
|
||||
__a = std::copysign(0, __a);
|
||||
if (std::isnan(__b))
|
||||
__b = std::copysign(0, __b);
|
||||
if (_ISINFf(__c) || _ISINFf(__d)) {
|
||||
__c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
|
||||
__d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
|
||||
if (_ISNANf(__a))
|
||||
__a = _COPYSIGNf(0, __a);
|
||||
if (_ISNANf(__b))
|
||||
__b = _COPYSIGNf(0, __b);
|
||||
__recalc = 1;
|
||||
}
|
||||
if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
|
||||
std::isinf(__ad) || std::isinf(__bc))) {
|
||||
if (std::isnan(__a))
|
||||
__a = std::copysign(0, __a);
|
||||
if (std::isnan(__b))
|
||||
__b = std::copysign(0, __b);
|
||||
if (std::isnan(__c))
|
||||
__c = std::copysign(0, __c);
|
||||
if (std::isnan(__d))
|
||||
__d = std::copysign(0, __d);
|
||||
if (!__recalc &&
|
||||
(_ISINFf(__ac) || _ISINFf(__bd) || _ISINFf(__ad) || _ISINFf(__bc))) {
|
||||
if (_ISNANf(__a))
|
||||
__a = _COPYSIGNf(0, __a);
|
||||
if (_ISNANf(__b))
|
||||
__b = _COPYSIGNf(0, __b);
|
||||
if (_ISNANf(__c))
|
||||
__c = _COPYSIGNf(0, __c);
|
||||
if (_ISNANf(__d))
|
||||
__d = _COPYSIGNf(0, __d);
|
||||
__recalc = 1;
|
||||
}
|
||||
if (__recalc) {
|
||||
|
|
@ -115,36 +161,36 @@ extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
|
|||
return z;
|
||||
}
|
||||
|
||||
extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
|
||||
double __c, double __d) {
|
||||
__DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
|
||||
double __d) {
|
||||
int __ilogbw = 0;
|
||||
// Can't use std::max, because that's defined in <algorithm>, and we don't
|
||||
// want to pull that in for every compile. The CUDA headers define
|
||||
// ::max(float, float) and ::max(double, double), which is sufficient for us.
|
||||
double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
|
||||
if (std::isfinite(__logbw)) {
|
||||
double __logbw = _LOGBd(max(_ABSd(__c), _ABSd(__d)));
|
||||
if (_ISFINITEd(__logbw)) {
|
||||
__ilogbw = (int)__logbw;
|
||||
__c = std::scalbn(__c, -__ilogbw);
|
||||
__d = std::scalbn(__d, -__ilogbw);
|
||||
__c = _SCALBNd(__c, -__ilogbw);
|
||||
__d = _SCALBNd(__d, -__ilogbw);
|
||||
}
|
||||
double __denom = __c * __c + __d * __d;
|
||||
double _Complex z;
|
||||
__real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
|
||||
__imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
|
||||
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
|
||||
if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
|
||||
__real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
|
||||
__imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
|
||||
} else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
|
||||
std::isfinite(__d)) {
|
||||
__a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
|
||||
__b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
|
||||
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
|
||||
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
|
||||
} else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
|
||||
std::isfinite(__b)) {
|
||||
__c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
|
||||
__d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
|
||||
__real__(z) = _SCALBNd((__a * __c + __b * __d) / __denom, -__ilogbw);
|
||||
__imag__(z) = _SCALBNd((__b * __c - __a * __d) / __denom, -__ilogbw);
|
||||
if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
|
||||
if ((__denom == 0.0) && (!_ISNANd(__a) || !_ISNANd(__b))) {
|
||||
__real__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __a;
|
||||
__imag__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __b;
|
||||
} else if ((_ISINFd(__a) || _ISINFd(__b)) && _ISFINITEd(__c) &&
|
||||
_ISFINITEd(__d)) {
|
||||
__a = _COPYSIGNd(_ISINFd(__a) ? 1.0 : 0.0, __a);
|
||||
__b = _COPYSIGNd(_ISINFd(__b) ? 1.0 : 0.0, __b);
|
||||
__real__(z) = __builtin_huge_val() * (__a * __c + __b * __d);
|
||||
__imag__(z) = __builtin_huge_val() * (__b * __c - __a * __d);
|
||||
} else if (_ISINFd(__logbw) && __logbw > 0.0 && _ISFINITEd(__a) &&
|
||||
_ISFINITEd(__b)) {
|
||||
__c = _COPYSIGNd(_ISINFd(__c) ? 1.0 : 0.0, __c);
|
||||
__d = _COPYSIGNd(_ISINFd(__d) ? 1.0 : 0.0, __d);
|
||||
__real__(z) = 0.0 * (__a * __c + __b * __d);
|
||||
__imag__(z) = 0.0 * (__b * __c - __a * __d);
|
||||
}
|
||||
|
|
@ -152,33 +198,32 @@ extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
|
|||
return z;
|
||||
}
|
||||
|
||||
extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
|
||||
float __c, float __d) {
|
||||
__DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
|
||||
int __ilogbw = 0;
|
||||
float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
|
||||
if (std::isfinite(__logbw)) {
|
||||
float __logbw = _LOGBf(max(_ABSf(__c), _ABSf(__d)));
|
||||
if (_ISFINITEf(__logbw)) {
|
||||
__ilogbw = (int)__logbw;
|
||||
__c = std::scalbn(__c, -__ilogbw);
|
||||
__d = std::scalbn(__d, -__ilogbw);
|
||||
__c = _SCALBNf(__c, -__ilogbw);
|
||||
__d = _SCALBNf(__d, -__ilogbw);
|
||||
}
|
||||
float __denom = __c * __c + __d * __d;
|
||||
float _Complex z;
|
||||
__real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
|
||||
__imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
|
||||
if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
|
||||
if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
|
||||
__real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
|
||||
__imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
|
||||
} else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
|
||||
std::isfinite(__d)) {
|
||||
__a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
|
||||
__b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
|
||||
__real__(z) = _SCALBNf((__a * __c + __b * __d) / __denom, -__ilogbw);
|
||||
__imag__(z) = _SCALBNf((__b * __c - __a * __d) / __denom, -__ilogbw);
|
||||
if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
|
||||
if ((__denom == 0) && (!_ISNANf(__a) || !_ISNANf(__b))) {
|
||||
__real__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __a;
|
||||
__imag__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __b;
|
||||
} else if ((_ISINFf(__a) || _ISINFf(__b)) && _ISFINITEf(__c) &&
|
||||
_ISFINITEf(__d)) {
|
||||
__a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
|
||||
__b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
|
||||
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
|
||||
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
|
||||
} else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
|
||||
std::isfinite(__b)) {
|
||||
__c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
|
||||
__d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
|
||||
} else if (_ISINFf(__logbw) && __logbw > 0 && _ISFINITEf(__a) &&
|
||||
_ISFINITEf(__b)) {
|
||||
__c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
|
||||
__d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
|
||||
__real__(z) = 0 * (__a * __c + __b * __d);
|
||||
__imag__(z) = 0 * (__b * __c - __a * __d);
|
||||
}
|
||||
|
|
@ -186,4 +231,29 @@ extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
|
|||
return z;
|
||||
}
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#undef _ISNANd
|
||||
#undef _ISNANf
|
||||
#undef _ISINFd
|
||||
#undef _ISINFf
|
||||
#undef _COPYSIGNd
|
||||
#undef _COPYSIGNf
|
||||
#undef _ISFINITEd
|
||||
#undef _ISFINITEf
|
||||
#undef _SCALBNd
|
||||
#undef _SCALBNf
|
||||
#undef _ABSd
|
||||
#undef _ABSf
|
||||
#undef _LOGBd
|
||||
#undef _LOGBf
|
||||
|
||||
#ifdef _OPENMP
|
||||
#pragma omp end declare target
|
||||
#endif
|
||||
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
|
||||
#endif // __CLANG_CUDA_COMPLEX_BUILTINS
|
||||
|
|
|
|||
333
lib/include/__clang_cuda_device_functions.h
vendored
333
lib/include/__clang_cuda_device_functions.h
vendored
|
|
@ -10,7 +10,7 @@
|
|||
#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
|
||||
#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__
|
||||
|
||||
#ifndef _OPENMP
|
||||
#ifndef __OPENMP_NVPTX__
|
||||
#if CUDA_VERSION < 9000
|
||||
#error This file is intended to be used with CUDA-9+ only.
|
||||
#endif
|
||||
|
|
@ -20,32 +20,12 @@
|
|||
// we implement in this file. We need static in order to avoid emitting unused
|
||||
// functions and __forceinline__ helps inlining these wrappers at -O1.
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#ifdef _OPENMP
|
||||
#define __DEVICE__ static __attribute__((always_inline))
|
||||
#ifdef __OPENMP_NVPTX__
|
||||
#define __DEVICE__ static __attribute__((always_inline, nothrow))
|
||||
#else
|
||||
#define __DEVICE__ static __device__ __forceinline__
|
||||
#endif
|
||||
|
||||
// libdevice provides fast low precision and slow full-recision implementations
|
||||
// for some functions. Which one gets selected depends on
|
||||
// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
|
||||
// -ffast-math or -fcuda-approx-transcendentals are in effect.
|
||||
#pragma push_macro("__FAST_OR_SLOW")
|
||||
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
|
||||
#define __FAST_OR_SLOW(fast, slow) fast
|
||||
#else
|
||||
#define __FAST_OR_SLOW(fast, slow) slow
|
||||
#endif
|
||||
|
||||
// For C++ 17 we need to include noexcept attribute to be compatible
|
||||
// with the header-defined version. This may be removed once
|
||||
// variant is supported.
|
||||
#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
|
||||
#define __NOEXCEPT noexcept
|
||||
#else
|
||||
#define __NOEXCEPT
|
||||
#endif
|
||||
|
||||
__DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); }
|
||||
__DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); }
|
||||
__DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); }
|
||||
|
|
@ -359,10 +339,10 @@ __DEVICE__ int __iAtomicAdd(int *__p, int __v) {
|
|||
return __nvvm_atom_add_gen_i(__p, __v);
|
||||
}
|
||||
__DEVICE__ int __iAtomicAdd_block(int *__p, int __v) {
|
||||
__nvvm_atom_cta_add_gen_i(__p, __v);
|
||||
return __nvvm_atom_cta_add_gen_i(__p, __v);
|
||||
}
|
||||
__DEVICE__ int __iAtomicAdd_system(int *__p, int __v) {
|
||||
__nvvm_atom_sys_add_gen_i(__p, __v);
|
||||
return __nvvm_atom_sys_add_gen_i(__p, __v);
|
||||
}
|
||||
__DEVICE__ int __iAtomicAnd(int *__p, int __v) {
|
||||
return __nvvm_atom_and_gen_i(__p, __v);
|
||||
|
|
@ -1483,152 +1463,17 @@ __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
|
|||
return r;
|
||||
}
|
||||
#endif // CUDA_VERSION >= 9020
|
||||
__DEVICE__ int abs(int __a) __NOEXCEPT { return __nv_abs(__a); }
|
||||
__DEVICE__ double fabs(double __a) __NOEXCEPT { return __nv_fabs(__a); }
|
||||
__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
|
||||
__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
|
||||
__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
|
||||
__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
|
||||
__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
|
||||
__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
|
||||
__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
|
||||
__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
|
||||
__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
|
||||
__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
|
||||
__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
|
||||
__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
|
||||
__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
|
||||
__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
|
||||
__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
|
||||
__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
|
||||
__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
|
||||
__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
|
||||
#ifndef _OPENMP
|
||||
__DEVICE__ int clock() { return __nvvm_read_ptx_sreg_clock(); }
|
||||
|
||||
// For OpenMP we require the user to include <time.h> as we need to know what
|
||||
// clock_t is on the system.
|
||||
#ifndef __OPENMP_NVPTX__
|
||||
__DEVICE__ /* clock_t= */ int clock() { return __nvvm_read_ptx_sreg_clock(); }
|
||||
#endif
|
||||
__DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); }
|
||||
#endif
|
||||
__DEVICE__ double copysign(double __a, double __b) {
|
||||
return __nv_copysign(__a, __b);
|
||||
}
|
||||
__DEVICE__ float copysignf(float __a, float __b) {
|
||||
return __nv_copysignf(__a, __b);
|
||||
}
|
||||
__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
|
||||
__DEVICE__ float cosf(float __a) {
|
||||
return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
|
||||
}
|
||||
__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
|
||||
__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
|
||||
__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
|
||||
__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
|
||||
__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
|
||||
__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
|
||||
__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
|
||||
__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
|
||||
__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
|
||||
__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
|
||||
__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
|
||||
__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
|
||||
__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
|
||||
__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
|
||||
__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
|
||||
__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
|
||||
__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
|
||||
__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
|
||||
__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
|
||||
__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
|
||||
__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
|
||||
__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
|
||||
__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
|
||||
__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
|
||||
__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
|
||||
__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
|
||||
__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
|
||||
__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
|
||||
__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
|
||||
__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
|
||||
__DEVICE__ float fdividef(float __a, float __b) {
|
||||
#if __FAST_MATH__ && !__CUDA_PREC_DIV
|
||||
return __nv_fast_fdividef(__a, __b);
|
||||
#else
|
||||
return __a / __b;
|
||||
#endif
|
||||
}
|
||||
__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
|
||||
__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
|
||||
__DEVICE__ double fma(double __a, double __b, double __c) {
|
||||
return __nv_fma(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ float fmaf(float __a, float __b, float __c) {
|
||||
return __nv_fmaf(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
|
||||
__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
|
||||
__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
|
||||
__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
|
||||
__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
|
||||
__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
|
||||
__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
|
||||
__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
|
||||
__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
|
||||
__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
|
||||
__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
|
||||
__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
|
||||
__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
|
||||
__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
|
||||
__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
|
||||
__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
|
||||
__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
|
||||
__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
|
||||
#if defined(__LP64__) || defined(_WIN64)
|
||||
__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_llabs(__a); };
|
||||
#else
|
||||
__DEVICE__ long labs(long __a) __NOEXCEPT { return __nv_abs(__a); };
|
||||
#endif
|
||||
__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
|
||||
__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
|
||||
__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
|
||||
__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
|
||||
__DEVICE__ long long llabs(long long __a) __NOEXCEPT { return __nv_llabs(__a); }
|
||||
__DEVICE__ long long llmax(long long __a, long long __b) {
|
||||
return __nv_llmax(__a, __b);
|
||||
}
|
||||
__DEVICE__ long long llmin(long long __a, long long __b) {
|
||||
return __nv_llmin(__a, __b);
|
||||
}
|
||||
__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
|
||||
__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
|
||||
__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
|
||||
__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
|
||||
__DEVICE__ double log(double __a) { return __nv_log(__a); }
|
||||
__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
|
||||
__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
|
||||
__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
|
||||
__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
|
||||
__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
|
||||
__DEVICE__ float log2f(float __a) {
|
||||
return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
|
||||
}
|
||||
__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
|
||||
__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
|
||||
__DEVICE__ float logf(float __a) {
|
||||
return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
|
||||
}
|
||||
#if defined(__LP64__) || defined(_WIN64)
|
||||
__DEVICE__ long lrint(double __a) { return llrint(__a); }
|
||||
__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
|
||||
__DEVICE__ long lround(double __a) { return llround(__a); }
|
||||
__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
|
||||
#else
|
||||
__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
|
||||
__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
|
||||
__DEVICE__ long lround(double __a) { return round(__a); }
|
||||
__DEVICE__ long lroundf(float __a) { return roundf(__a); }
|
||||
#endif
|
||||
__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
|
||||
|
||||
// These functions shouldn't be declared when including this header
|
||||
// for math function resolution purposes.
|
||||
#ifndef _OPENMP
|
||||
#ifndef __OPENMP_NVPTX__
|
||||
__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
|
||||
return __builtin_memcpy(__a, __b, __c);
|
||||
}
|
||||
|
|
@ -1636,158 +1481,6 @@ __DEVICE__ void *memset(void *__a, int __b, size_t __c) {
|
|||
return __builtin_memset(__a, __b, __c);
|
||||
}
|
||||
#endif
|
||||
__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
|
||||
__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
|
||||
__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
|
||||
__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); }
|
||||
__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); }
|
||||
__DEVICE__ double nextafter(double __a, double __b) {
|
||||
return __nv_nextafter(__a, __b);
|
||||
}
|
||||
__DEVICE__ float nextafterf(float __a, float __b) {
|
||||
return __nv_nextafterf(__a, __b);
|
||||
}
|
||||
__DEVICE__ double norm(int __dim, const double *__t) {
|
||||
return __nv_norm(__dim, __t);
|
||||
}
|
||||
__DEVICE__ double norm3d(double __a, double __b, double __c) {
|
||||
return __nv_norm3d(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ float norm3df(float __a, float __b, float __c) {
|
||||
return __nv_norm3df(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
|
||||
return __nv_norm4d(__a, __b, __c, __d);
|
||||
}
|
||||
__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
|
||||
return __nv_norm4df(__a, __b, __c, __d);
|
||||
}
|
||||
__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
|
||||
__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
|
||||
__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
|
||||
__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
|
||||
__DEVICE__ float normf(int __dim, const float *__t) {
|
||||
return __nv_normf(__dim, __t);
|
||||
}
|
||||
__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
|
||||
__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
|
||||
__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
|
||||
__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
|
||||
__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
|
||||
__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
|
||||
__DEVICE__ double remainder(double __a, double __b) {
|
||||
return __nv_remainder(__a, __b);
|
||||
}
|
||||
__DEVICE__ float remainderf(float __a, float __b) {
|
||||
return __nv_remainderf(__a, __b);
|
||||
}
|
||||
__DEVICE__ double remquo(double __a, double __b, int *__c) {
|
||||
return __nv_remquo(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ float remquof(float __a, float __b, int *__c) {
|
||||
return __nv_remquof(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ double rhypot(double __a, double __b) {
|
||||
return __nv_rhypot(__a, __b);
|
||||
}
|
||||
__DEVICE__ float rhypotf(float __a, float __b) {
|
||||
return __nv_rhypotf(__a, __b);
|
||||
}
|
||||
__DEVICE__ double rint(double __a) { return __nv_rint(__a); }
|
||||
__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); }
|
||||
__DEVICE__ double rnorm(int __a, const double *__b) {
|
||||
return __nv_rnorm(__a, __b);
|
||||
}
|
||||
__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
|
||||
return __nv_rnorm3d(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
|
||||
return __nv_rnorm3df(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
|
||||
return __nv_rnorm4d(__a, __b, __c, __d);
|
||||
}
|
||||
__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
|
||||
return __nv_rnorm4df(__a, __b, __c, __d);
|
||||
}
|
||||
__DEVICE__ float rnormf(int __dim, const float *__t) {
|
||||
return __nv_rnormf(__dim, __t);
|
||||
}
|
||||
__DEVICE__ double round(double __a) { return __nv_round(__a); }
|
||||
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
|
||||
__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
|
||||
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
|
||||
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
|
||||
__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
|
||||
// TODO: remove once variant is supported
|
||||
#ifndef _OPENMP
|
||||
// Scales __a by 2 raised to the power __b, where the exponent is a long.
//
// The exponent is clamped into the int range and handed to scalbn, which then
// produces the overflow/underflow result itself. Clamping, rather than
// synthesizing +/-HUGE_VAL or +/-0.0 from the test `__a > 0`, keeps special
// operands intact: a zero stays zero, an infinity stays infinite and a NaN
// stays NaN even when __b is outside the int range.
// NOTE(review): relies on scalbn handling the full int exponent range.
__DEVICE__ double scalbln(double __a, long __b) {
  return scalbn(__a, __b > INT_MAX   ? INT_MAX
                     : __b < INT_MIN ? INT_MIN
                                     : (int)__b);
}
|
||||
// Scales __a by 2 raised to the power __b, where the exponent is a long
// (single-precision variant).
//
// The exponent is clamped into the int range and handed to scalbnf, which then
// produces the overflow/underflow result itself. Clamping, rather than
// synthesizing +/-HUGE_VALF or +/-0.f from the test `__a > 0`, keeps special
// operands intact: a zero stays zero, an infinity stays infinite and a NaN
// stays NaN even when __b is outside the int range.
// NOTE(review): relies on scalbnf handling the full int exponent range.
__DEVICE__ float scalblnf(float __a, long __b) {
  return scalbnf(__a, __b > INT_MAX   ? INT_MAX
                      : __b < INT_MIN ? INT_MIN
                                      : (int)__b);
}
|
||||
#endif
|
||||
__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
|
||||
__DEVICE__ void sincos(double __a, double *__s, double *__c) {
|
||||
return __nv_sincos(__a, __s, __c);
|
||||
}
|
||||
__DEVICE__ void sincosf(float __a, float *__s, float *__c) {
|
||||
return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
|
||||
}
|
||||
__DEVICE__ void sincospi(double __a, double *__s, double *__c) {
|
||||
return __nv_sincospi(__a, __s, __c);
|
||||
}
|
||||
__DEVICE__ void sincospif(float __a, float *__s, float *__c) {
|
||||
return __nv_sincospif(__a, __s, __c);
|
||||
}
|
||||
__DEVICE__ float sinf(float __a) {
|
||||
return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
|
||||
}
|
||||
__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
|
||||
__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
|
||||
__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
|
||||
__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
|
||||
__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
|
||||
__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
|
||||
__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
|
||||
__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
|
||||
__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
|
||||
__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
|
||||
__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
|
||||
__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
|
||||
__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
|
||||
__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
|
||||
__DEVICE__ unsigned long long ullmax(unsigned long long __a,
|
||||
unsigned long long __b) {
|
||||
return __nv_ullmax(__a, __b);
|
||||
}
|
||||
__DEVICE__ unsigned long long ullmin(unsigned long long __a,
|
||||
unsigned long long __b) {
|
||||
return __nv_ullmin(__a, __b);
|
||||
}
|
||||
__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
|
||||
return __nv_umax(__a, __b);
|
||||
}
|
||||
__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
|
||||
return __nv_umin(__a, __b);
|
||||
}
|
||||
__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
|
||||
__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
|
||||
__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
|
||||
__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
|
||||
__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
|
||||
__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
|
||||
|
||||
#undef __NOEXCEPT
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
#pragma pop_macro("__FAST_OR_SLOW")
|
||||
#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#if defined(__OPENMP_NVPTX__)
|
||||
#define __DEVICE__
|
||||
#elif defined(__CUDA__)
|
||||
#define __DEVICE__ __device__
|
||||
|
|
|
|||
347
lib/include/__clang_cuda_math.h
vendored
Normal file
347
lib/include/__clang_cuda_math.h
vendored
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __CLANG_CUDA_MATH_H__
|
||||
#define __CLANG_CUDA_MATH_H__
|
||||
#ifndef __CUDA__
|
||||
#error "This file is for CUDA compilation only."
|
||||
#endif
|
||||
|
||||
#ifndef __OPENMP_NVPTX__
|
||||
#if CUDA_VERSION < 9000
|
||||
#error This file is intended to be used with CUDA-9+ only.
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __DEVICE__ is a helper macro with common set of attributes for the wrappers
|
||||
// we implement in this file. We need static in order to avoid emitting unused
|
||||
// functions and __forceinline__ helps inlining these wrappers at -O1.
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#ifdef __OPENMP_NVPTX__
|
||||
#if defined(__cplusplus)
|
||||
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
|
||||
#else
|
||||
#define __DEVICE__ static __attribute__((always_inline, nothrow))
|
||||
#endif
|
||||
#else
|
||||
#define __DEVICE__ static __device__ __forceinline__
|
||||
#endif
|
||||
|
||||
// Specialized version of __DEVICE__ for functions with void return type. Needed
// because the OpenMP overlay requires constexpr functions here but prior to
// c++14 void return functions could not be constexpr.
#pragma push_macro("__DEVICE_VOID__")
// This has to be `#if defined(...)`: `#ifdef` accepts a single identifier only,
// so with `#ifdef` the language and C++ version tests would never be evaluated.
#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L
#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
#else
#define __DEVICE_VOID__ __DEVICE__
#endif
|
||||
|
||||
// libdevice provides fast low precision and slow full-precision implementations
|
||||
// for some functions. Which one gets selected depends on
|
||||
// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
|
||||
// -ffast-math or -fcuda-approx-transcendentals are in effect.
|
||||
#pragma push_macro("__FAST_OR_SLOW")
|
||||
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
|
||||
#define __FAST_OR_SLOW(fast, slow) fast
|
||||
#else
|
||||
#define __FAST_OR_SLOW(fast, slow) slow
|
||||
#endif
|
||||
|
||||
__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
|
||||
__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
|
||||
__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
|
||||
__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
|
||||
__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
|
||||
__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
|
||||
__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
|
||||
__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
|
||||
__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
|
||||
__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
|
||||
__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
|
||||
__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
|
||||
__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
|
||||
__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
|
||||
__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
|
||||
__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
|
||||
__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
|
||||
__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
|
||||
__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
|
||||
__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
|
||||
__DEVICE__ double copysign(double __a, double __b) {
|
||||
return __nv_copysign(__a, __b);
|
||||
}
|
||||
__DEVICE__ float copysignf(float __a, float __b) {
|
||||
return __nv_copysignf(__a, __b);
|
||||
}
|
||||
__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
|
||||
__DEVICE__ float cosf(float __a) {
|
||||
return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
|
||||
}
|
||||
__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
|
||||
__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
|
||||
__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
|
||||
__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
|
||||
__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
|
||||
__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
|
||||
__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
|
||||
__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
|
||||
__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
|
||||
__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
|
||||
__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
|
||||
__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
|
||||
__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
|
||||
__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
|
||||
__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
|
||||
__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
|
||||
__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
|
||||
__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
|
||||
__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
|
||||
__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
|
||||
__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
|
||||
__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
|
||||
__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
|
||||
__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
|
||||
__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
|
||||
__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
|
||||
__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
|
||||
__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
|
||||
__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
|
||||
__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
|
||||
__DEVICE__ float fdividef(float __a, float __b) {
|
||||
#if __FAST_MATH__ && !__CUDA_PREC_DIV
|
||||
return __nv_fast_fdividef(__a, __b);
|
||||
#else
|
||||
return __a / __b;
|
||||
#endif
|
||||
}
|
||||
__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
|
||||
__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
|
||||
__DEVICE__ double fma(double __a, double __b, double __c) {
|
||||
return __nv_fma(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ float fmaf(float __a, float __b, float __c) {
|
||||
return __nv_fmaf(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
|
||||
__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
|
||||
__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
|
||||
__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
|
||||
__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
|
||||
__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
|
||||
__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
|
||||
__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
|
||||
__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
|
||||
__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
|
||||
__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
|
||||
__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
|
||||
__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
|
||||
__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
|
||||
__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
|
||||
__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
|
||||
__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
|
||||
__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
|
||||
#if defined(__LP64__) || defined(_WIN64)
|
||||
__DEVICE__ long labs(long __a) { return __nv_llabs(__a); };
|
||||
#else
|
||||
__DEVICE__ long labs(long __a) { return __nv_abs(__a); };
|
||||
#endif
|
||||
__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
|
||||
__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
|
||||
__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
|
||||
__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
|
||||
__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
|
||||
__DEVICE__ long long llmax(long long __a, long long __b) {
|
||||
return __nv_llmax(__a, __b);
|
||||
}
|
||||
__DEVICE__ long long llmin(long long __a, long long __b) {
|
||||
return __nv_llmin(__a, __b);
|
||||
}
|
||||
__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
|
||||
__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
|
||||
__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
|
||||
__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
|
||||
__DEVICE__ double log(double __a) { return __nv_log(__a); }
|
||||
__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
|
||||
__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
|
||||
__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
|
||||
__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
|
||||
__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
|
||||
__DEVICE__ float log2f(float __a) {
|
||||
return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
|
||||
}
|
||||
__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
|
||||
__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
|
||||
__DEVICE__ float logf(float __a) {
|
||||
return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
|
||||
}
|
||||
#if defined(__LP64__) || defined(_WIN64)
|
||||
__DEVICE__ long lrint(double __a) { return llrint(__a); }
|
||||
__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
|
||||
__DEVICE__ long lround(double __a) { return llround(__a); }
|
||||
__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
|
||||
#else
|
||||
__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
|
||||
__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
|
||||
__DEVICE__ long lround(double __a) { return round(__a); }
|
||||
__DEVICE__ long lroundf(float __a) { return roundf(__a); }
|
||||
#endif
|
||||
__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
|
||||
__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
|
||||
__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
|
||||
__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
|
||||
__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); }
|
||||
__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); }
|
||||
__DEVICE__ double nextafter(double __a, double __b) {
|
||||
return __nv_nextafter(__a, __b);
|
||||
}
|
||||
__DEVICE__ float nextafterf(float __a, float __b) {
|
||||
return __nv_nextafterf(__a, __b);
|
||||
}
|
||||
__DEVICE__ double norm(int __dim, const double *__t) {
|
||||
return __nv_norm(__dim, __t);
|
||||
}
|
||||
__DEVICE__ double norm3d(double __a, double __b, double __c) {
|
||||
return __nv_norm3d(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ float norm3df(float __a, float __b, float __c) {
|
||||
return __nv_norm3df(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
|
||||
return __nv_norm4d(__a, __b, __c, __d);
|
||||
}
|
||||
__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
|
||||
return __nv_norm4df(__a, __b, __c, __d);
|
||||
}
|
||||
__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
|
||||
__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
|
||||
__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
|
||||
__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
|
||||
__DEVICE__ float normf(int __dim, const float *__t) {
|
||||
return __nv_normf(__dim, __t);
|
||||
}
|
||||
__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
|
||||
__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
|
||||
__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
|
||||
__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
|
||||
__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
|
||||
__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
|
||||
__DEVICE__ double remainder(double __a, double __b) {
|
||||
return __nv_remainder(__a, __b);
|
||||
}
|
||||
__DEVICE__ float remainderf(float __a, float __b) {
|
||||
return __nv_remainderf(__a, __b);
|
||||
}
|
||||
__DEVICE__ double remquo(double __a, double __b, int *__c) {
|
||||
return __nv_remquo(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ float remquof(float __a, float __b, int *__c) {
|
||||
return __nv_remquof(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ double rhypot(double __a, double __b) {
|
||||
return __nv_rhypot(__a, __b);
|
||||
}
|
||||
__DEVICE__ float rhypotf(float __a, float __b) {
|
||||
return __nv_rhypotf(__a, __b);
|
||||
}
|
||||
__DEVICE__ double rint(double __a) { return __nv_rint(__a); }
|
||||
__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); }
|
||||
__DEVICE__ double rnorm(int __a, const double *__b) {
|
||||
return __nv_rnorm(__a, __b);
|
||||
}
|
||||
__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
|
||||
return __nv_rnorm3d(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
|
||||
return __nv_rnorm3df(__a, __b, __c);
|
||||
}
|
||||
__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
|
||||
return __nv_rnorm4d(__a, __b, __c, __d);
|
||||
}
|
||||
__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
|
||||
return __nv_rnorm4df(__a, __b, __c, __d);
|
||||
}
|
||||
__DEVICE__ float rnormf(int __dim, const float *__t) {
|
||||
return __nv_rnormf(__dim, __t);
|
||||
}
|
||||
__DEVICE__ double round(double __a) { return __nv_round(__a); }
|
||||
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
|
||||
__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
|
||||
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
|
||||
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
|
||||
__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
|
||||
// Scales __a by 2 raised to the power __b, where the exponent is a long.
//
// The exponent is clamped into the int range and handed to scalbn, which then
// produces the overflow/underflow result itself. Clamping, rather than
// synthesizing +/-HUGE_VAL or +/-0.0 from the test `__a > 0`, keeps special
// operands intact: a zero stays zero, an infinity stays infinite and a NaN
// stays NaN even when __b is outside the int range.
// NOTE(review): relies on scalbn handling the full int exponent range.
__DEVICE__ double scalbln(double __a, long __b) {
  return scalbn(__a, __b > INT_MAX   ? INT_MAX
                     : __b < INT_MIN ? INT_MIN
                                     : (int)__b);
}
|
||||
// Scales __a by 2 raised to the power __b, where the exponent is a long
// (single-precision variant).
//
// The exponent is clamped into the int range and handed to scalbnf, which then
// produces the overflow/underflow result itself. Clamping, rather than
// synthesizing +/-HUGE_VALF or +/-0.f from the test `__a > 0`, keeps special
// operands intact: a zero stays zero, an infinity stays infinite and a NaN
// stays NaN even when __b is outside the int range.
// NOTE(review): relies on scalbnf handling the full int exponent range.
__DEVICE__ float scalblnf(float __a, long __b) {
  return scalbnf(__a, __b > INT_MAX   ? INT_MAX
                      : __b < INT_MIN ? INT_MIN
                                      : (int)__b);
}
|
||||
__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
|
||||
// Forwards to libdevice's __nv_sincos; the results are written through __s and
// __c (presumably sine and cosine respectively, going by the parameter names).
// The call is a plain statement rather than `return expr;`: returning a void
// expression is not valid ISO C, and this header has a non-__cplusplus
// configuration.
__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) {
  __nv_sincos(__a, __s, __c);
}
|
||||
// Forwards to libdevice's __nv_fast_sincosf or __nv_sincosf, as selected by
// __FAST_OR_SLOW; the results are written through __s and __c (presumably sine
// and cosine respectively, going by the parameter names).
// The call is a plain statement rather than `return expr;`: returning a void
// expression is not valid ISO C, and this header has a non-__cplusplus
// configuration.
__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) {
  __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
}
|
||||
// Forwards to libdevice's __nv_sincospi; the results are written through __s
// and __c (presumably sine and cosine respectively, going by the parameter
// names).
// The call is a plain statement rather than `return expr;`: returning a void
// expression is not valid ISO C, and this header has a non-__cplusplus
// configuration.
__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) {
  __nv_sincospi(__a, __s, __c);
}
|
||||
// Forwards to libdevice's __nv_sincospif; the results are written through __s
// and __c (presumably sine and cosine respectively, going by the parameter
// names).
// The call is a plain statement rather than `return expr;`: returning a void
// expression is not valid ISO C, and this header has a non-__cplusplus
// configuration.
__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) {
  __nv_sincospif(__a, __s, __c);
}
|
||||
__DEVICE__ float sinf(float __a) {
|
||||
return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
|
||||
}
|
||||
__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
|
||||
__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
|
||||
__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
|
||||
__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
|
||||
__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
|
||||
__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
|
||||
__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
|
||||
__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
|
||||
__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
|
||||
__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
|
||||
__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
|
||||
__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
|
||||
__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
|
||||
__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
|
||||
__DEVICE__ unsigned long long ullmax(unsigned long long __a,
|
||||
unsigned long long __b) {
|
||||
return __nv_ullmax(__a, __b);
|
||||
}
|
||||
__DEVICE__ unsigned long long ullmin(unsigned long long __a,
|
||||
unsigned long long __b) {
|
||||
return __nv_ullmin(__a, __b);
|
||||
}
|
||||
__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
|
||||
return __nv_umax(__a, __b);
|
||||
}
|
||||
__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
|
||||
return __nv_umin(__a, __b);
|
||||
}
|
||||
__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
|
||||
__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
|
||||
__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
|
||||
__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
|
||||
__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
|
||||
__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
|
||||
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
#pragma pop_macro("__DEVICE_VOID__")
|
||||
#pragma pop_macro("__FAST_OR_SLOW")
|
||||
|
||||
#endif // __CLANG_CUDA_MATH_H__
|
||||
41
lib/include/__clang_cuda_math_forward_declares.h
vendored
41
lib/include/__clang_cuda_math_forward_declares.h
vendored
|
|
@ -8,8 +8,8 @@
|
|||
*/
|
||||
#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
|
||||
#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
|
||||
#ifndef __CUDA__
|
||||
#error "This file is for CUDA compilation only."
|
||||
#if !defined(__CUDA__) && !__HIP__
|
||||
#error "This file is for CUDA/HIP compilation only."
|
||||
#endif
|
||||
|
||||
// This file forward-declares some math functions we (or the CUDA headers)
|
||||
|
|
@ -20,37 +20,14 @@
|
|||
// would preclude the use of our own __device__ overloads for these functions.
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#ifdef _OPENMP
|
||||
#define __DEVICE__ static __inline__ __attribute__((always_inline))
|
||||
#else
|
||||
#define __DEVICE__ \
|
||||
static __inline__ __attribute__((always_inline)) __attribute__((device))
|
||||
#endif
|
||||
|
||||
// For C++ 17 we need to include noexcept attribute to be compatible
|
||||
// with the header-defined version. This may be removed once
|
||||
// variant is supported.
|
||||
#if defined(_OPENMP) && defined(__cplusplus) && __cplusplus >= 201703L
|
||||
#define __NOEXCEPT noexcept
|
||||
#else
|
||||
#define __NOEXCEPT
|
||||
#endif
|
||||
|
||||
#if !(defined(_OPENMP) && defined(__cplusplus))
|
||||
__DEVICE__ long abs(long);
|
||||
__DEVICE__ long long abs(long long);
|
||||
__DEVICE__ double abs(double);
|
||||
__DEVICE__ float abs(float);
|
||||
#endif
|
||||
// While providing the CUDA declarations and definitions for math functions,
|
||||
// we may manually define additional functions.
|
||||
// TODO: Once variant is supported the additional functions will have
|
||||
// to be removed.
|
||||
#if defined(_OPENMP) && defined(__cplusplus)
|
||||
__DEVICE__ const double abs(const double);
|
||||
__DEVICE__ const float abs(const float);
|
||||
#endif
|
||||
__DEVICE__ int abs(int) __NOEXCEPT;
|
||||
__DEVICE__ int abs(int);
|
||||
__DEVICE__ double acos(double);
|
||||
__DEVICE__ float acos(float);
|
||||
__DEVICE__ double acosh(double);
|
||||
|
|
@ -85,8 +62,8 @@ __DEVICE__ double exp(double);
|
|||
__DEVICE__ float exp(float);
|
||||
__DEVICE__ double expm1(double);
|
||||
__DEVICE__ float expm1(float);
|
||||
__DEVICE__ double fabs(double) __NOEXCEPT;
|
||||
__DEVICE__ float fabs(float) __NOEXCEPT;
|
||||
__DEVICE__ double fabs(double);
|
||||
__DEVICE__ float fabs(float);
|
||||
__DEVICE__ double fdim(double, double);
|
||||
__DEVICE__ float fdim(float, float);
|
||||
__DEVICE__ double floor(double);
|
||||
|
|
@ -136,12 +113,12 @@ __DEVICE__ bool isnormal(double);
|
|||
__DEVICE__ bool isnormal(float);
|
||||
__DEVICE__ bool isunordered(double, double);
|
||||
__DEVICE__ bool isunordered(float, float);
|
||||
__DEVICE__ long labs(long) __NOEXCEPT;
|
||||
__DEVICE__ long labs(long);
|
||||
__DEVICE__ double ldexp(double, int);
|
||||
__DEVICE__ float ldexp(float, int);
|
||||
__DEVICE__ double lgamma(double);
|
||||
__DEVICE__ float lgamma(float);
|
||||
__DEVICE__ long long llabs(long long) __NOEXCEPT;
|
||||
__DEVICE__ long long llabs(long long);
|
||||
__DEVICE__ long long llrint(double);
|
||||
__DEVICE__ long long llrint(float);
|
||||
__DEVICE__ double log10(double);
|
||||
|
|
@ -152,9 +129,6 @@ __DEVICE__ double log2(double);
|
|||
__DEVICE__ float log2(float);
|
||||
__DEVICE__ double logb(double);
|
||||
__DEVICE__ float logb(float);
|
||||
#if defined(_OPENMP) && defined(__cplusplus)
|
||||
__DEVICE__ long double log(long double);
|
||||
#endif
|
||||
__DEVICE__ double log(double);
|
||||
__DEVICE__ float log(float);
|
||||
__DEVICE__ long lrint(double);
|
||||
|
|
@ -302,7 +276,6 @@ _GLIBCXX_END_NAMESPACE_VERSION
|
|||
} // namespace std
|
||||
#endif
|
||||
|
||||
#undef __NOEXCEPT
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
|
||||
#endif
|
||||
|
|
|
|||
17
lib/include/__clang_cuda_runtime_wrapper.h
vendored
17
lib/include/__clang_cuda_runtime_wrapper.h
vendored
|
|
@ -31,11 +31,17 @@
|
|||
// Include some forward declares that must come before cmath.
|
||||
#include <__clang_cuda_math_forward_declares.h>
|
||||
|
||||
// Define __CUDACC__ early as libstdc++ standard headers with GNU extensions
|
||||
// enabled depend on it to avoid using __float128, which is unsupported in
|
||||
// CUDA.
|
||||
#define __CUDACC__
|
||||
|
||||
// Include some standard headers to avoid CUDA headers including them
|
||||
// while some required macros (like __THROW) are in a weird state.
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#undef __CUDACC__
|
||||
|
||||
// Preserve common macros that will be changed below by us or by CUDA
|
||||
// headers.
|
||||
|
|
@ -83,13 +89,15 @@
|
|||
#if CUDA_VERSION < 9000
|
||||
#define __CUDABE__
|
||||
#else
|
||||
#define __CUDACC__
|
||||
#define __CUDA_LIBDEVICE__
|
||||
#endif
|
||||
// Disables definitions of device-side runtime support stubs in
|
||||
// cuda_device_runtime_api.h
|
||||
#include "host_defines.h"
|
||||
#undef __CUDACC__
|
||||
#include "driver_types.h"
|
||||
#include "host_config.h"
|
||||
#include "host_defines.h"
|
||||
|
||||
// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
|
||||
// cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
|
||||
|
|
@ -141,11 +149,12 @@ inline __host__ double __signbitd(double x) {
|
|||
// to provide our own.
|
||||
#include <__clang_cuda_libdevice_declares.h>
|
||||
|
||||
// Wrappers for many device-side standard library functions became compiler
|
||||
// builtins in CUDA-9 and have been removed from the CUDA headers. Clang now
|
||||
// provides its own implementation of the wrappers.
|
||||
// Wrappers for many device-side standard library functions, incl. math
|
||||
// functions, became compiler builtins in CUDA-9 and have been removed from the
|
||||
// CUDA headers. Clang now provides its own implementation of the wrappers.
|
||||
#if CUDA_VERSION >= 9000
|
||||
#include <__clang_cuda_device_functions.h>
|
||||
#include <__clang_cuda_math.h>
|
||||
#endif
|
||||
|
||||
// __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
|
||||
|
|
|
|||
326
lib/include/__clang_hip_libdevice_declares.h
vendored
Normal file
326
lib/include/__clang_hip_libdevice_declares.h
vendored
Normal file
|
|
@ -0,0 +1,326 @@
|
|||
/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
|
||||
#define __CLANG_HIP_LIBDEVICE_DECLARES_H__
|
||||
|
||||
extern "C" {
|
||||
|
||||
// BEGIN FLOAT
|
||||
__device__ __attribute__((const)) float __ocml_acos_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_asin_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_atan_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_ceil_f32(float);
|
||||
// OCML copysign for float. Declared with a single __device__ qualifier, like
// the other declarations in this header (the qualifier was duplicated).
__device__ __attribute__((const)) float __ocml_copysign_f32(float, float);
|
||||
__device__ float __ocml_cos_f32(float);
|
||||
__device__ float __ocml_native_cos_f32(float);
|
||||
// Single __device__ qualifier, matching the sibling declarations (the
// qualifier was duplicated here).
__device__ __attribute__((pure)) float __ocml_cosh_f32(float);
|
||||
__device__ float __ocml_cospi_f32(float);
|
||||
__device__ float __ocml_i0_f32(float);
|
||||
__device__ float __ocml_i1_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erf_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_exp_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fabs_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_floor_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
|
||||
// OCML fmod for float. Declared with a single __device__ qualifier, like the
// other declarations in this header (the qualifier was duplicated).
__device__ __attribute__((const)) float __ocml_fmod_f32(float, float);
|
||||
__device__ float __ocml_frexp_f32(float,
|
||||
__attribute__((address_space(5))) int *);
|
||||
__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
|
||||
__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
|
||||
__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
|
||||
__device__ __attribute__((const)) int __ocml_isinf_f32(float);
|
||||
__device__ __attribute__((const)) int __ocml_isnan_f32(float);
|
||||
__device__ float __ocml_j0_f32(float);
|
||||
__device__ float __ocml_j1_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
|
||||
__device__ float __ocml_lgamma_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_log10_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_log2_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_logb_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_log_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
|
||||
__device__ float __ocml_modf_f32(float,
|
||||
__attribute__((address_space(5))) float *);
|
||||
__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
|
||||
float);
|
||||
__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
|
||||
__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
|
||||
__device__ float __ocml_remquo_f32(float, float,
|
||||
__attribute__((address_space(5))) int *);
|
||||
__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_rint_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
|
||||
float);
|
||||
__device__ __attribute__((const)) float __ocml_round_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
|
||||
__device__ __attribute__((const)) int __ocml_signbit_f32(float);
|
||||
__device__ float __ocml_sincos_f32(float,
|
||||
__attribute__((address_space(5))) float *);
|
||||
__device__ float __ocml_sincospi_f32(float,
|
||||
__attribute__((address_space(5))) float *);
|
||||
__device__ float __ocml_sin_f32(float);
|
||||
__device__ float __ocml_native_sin_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
|
||||
__device__ float __ocml_sinpi_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
|
||||
__device__ float __ocml_tan_f32(float);
|
||||
__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
|
||||
__device__ float __ocml_tgamma_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_trunc_f32(float);
|
||||
__device__ float __ocml_y0_f32(float);
|
||||
__device__ float __ocml_y1_f32(float);
|
||||
|
||||
// BEGIN INTRINSICS
|
||||
__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
|
||||
__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
|
||||
// Explicitly rounded square roots (rte/rtn/rtp/rtz variants). Square root is
// a unary operation, so each variant takes exactly one operand; the previous
// two-operand prototypes mirrored the binary add/sub/mul/div group above.
// NOTE(review): confirm the arity against the OCML library in use.
__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
|
||||
|
||||
__device__ __attribute__((const)) float
|
||||
__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
|
||||
__device__ __attribute__((const)) float
|
||||
__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
|
||||
__device__ __attribute__((const)) float
|
||||
__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
|
||||
__device__ __attribute__((const)) float
|
||||
__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
|
||||
// END INTRINSICS
|
||||
// END FLOAT
|
||||
|
||||
// BEGIN DOUBLE
|
||||
__device__ __attribute__((const)) double __ocml_acos_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_asin_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_atan_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_ceil_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
|
||||
__device__ double __ocml_cos_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
|
||||
__device__ double __ocml_cospi_f64(double);
|
||||
__device__ double __ocml_i0_f64(double);
|
||||
__device__ double __ocml_i1_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erf_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_exp_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fabs_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_floor_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
|
||||
__device__ double __ocml_frexp_f64(double,
|
||||
__attribute__((address_space(5))) int *);
|
||||
__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
|
||||
__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
|
||||
__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
|
||||
__device__ __attribute__((const)) int __ocml_isinf_f64(double);
|
||||
__device__ __attribute__((const)) int __ocml_isnan_f64(double);
|
||||
__device__ double __ocml_j0_f64(double);
|
||||
__device__ double __ocml_j1_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
|
||||
__device__ double __ocml_lgamma_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_log10_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_log2_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_logb_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_log_f64(double);
|
||||
__device__ double __ocml_modf_f64(double,
|
||||
__attribute__((address_space(5))) double *);
|
||||
__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_len3_f64(double, double,
|
||||
double);
|
||||
__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
|
||||
double);
|
||||
__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
|
||||
__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
|
||||
__device__ double __ocml_remquo_f64(double, double,
|
||||
__attribute__((address_space(5))) int *);
|
||||
__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_rint_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double,
|
||||
double);
|
||||
__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double,
|
||||
double, double);
|
||||
__device__ __attribute__((const)) double __ocml_round_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
|
||||
__device__ __attribute__((const)) int __ocml_signbit_f64(double);
|
||||
__device__ double __ocml_sincos_f64(double,
|
||||
__attribute__((address_space(5))) double *);
|
||||
__device__ double
|
||||
__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
|
||||
__device__ double __ocml_sin_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
|
||||
__device__ double __ocml_sinpi_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
|
||||
__device__ double __ocml_tan_f64(double);
|
||||
__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
|
||||
__device__ double __ocml_tgamma_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_trunc_f64(double);
|
||||
__device__ double __ocml_y0_f64(double);
|
||||
__device__ double __ocml_y1_f64(double);
|
||||
|
||||
// BEGIN INTRINSICS
|
||||
__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
|
||||
__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
|
||||
// Explicitly rounded square roots (rte/rtn/rtp/rtz variants) for double.
// Square root is a unary operation, so each variant takes exactly one
// operand; the previous two-operand prototypes mirrored the binary
// add/sub/mul/div group above.
// NOTE(review): confirm the arity against the OCML library in use.
__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
|
||||
double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
|
||||
double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
|
||||
double);
|
||||
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
|
||||
double);
|
||||
|
||||
__device__ __attribute__((const)) double
|
||||
__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
|
||||
__device__ __attribute__((const)) double
|
||||
__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
|
||||
|
||||
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
|
||||
__device__ _Float16 __ocml_cos_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
|
||||
_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
|
||||
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
|
||||
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
|
||||
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
|
||||
__device__ _Float16 __ocml_sin_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
|
||||
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
|
||||
|
||||
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
|
||||
typedef short __2i16 __attribute__((ext_vector_type(2)));
|
||||
|
||||
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
|
||||
float c, bool s);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
|
||||
__device__ __2f16 __ocml_cos_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
|
||||
__device__ __attribute__((const))
|
||||
__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
|
||||
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
|
||||
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
|
||||
// Two-lane wrapper: applies __llvm_amdgcn_rcp_f16 to the .x lane and then to
// the .y lane of the argument and packs the two results into a __2f16.
// Not currently exposed by ROCDL.
__device__ inline __2f16 __llvm_amdgcn_rcp_2f16(__2f16 __x) {
  const _Float16 __lane0 = __llvm_amdgcn_rcp_f16(__x.x);
  const _Float16 __lane1 = __llvm_amdgcn_rcp_f16(__x.y);
  return __2f16{__lane0, __lane1};
}
|
||||
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
|
||||
__device__ __2f16 __ocml_sin_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
|
||||
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
|
||||
|
||||
} // extern "C"
|
||||
|
||||
#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__
|
||||
1185
lib/include/__clang_hip_math.h
vendored
Normal file
1185
lib/include/__clang_hip_math.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
64
lib/include/__clang_hip_runtime_wrapper.h
vendored
Normal file
64
lib/include/__clang_hip_runtime_wrapper.h
vendored
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
/*===---- __clang_hip_runtime_wrapper.h - HIP runtime support ---------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
/*
|
||||
* WARNING: This header is intended to be directly -include'd by
|
||||
* the compiler and is not supposed to be included by users.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_HIP_RUNTIME_WRAPPER_H__
|
||||
#define __CLANG_HIP_RUNTIME_WRAPPER_H__
|
||||
|
||||
#if __HIP__
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define __host__ __attribute__((host))
|
||||
#define __device__ __attribute__((device))
|
||||
#define __global__ __attribute__((global))
|
||||
#define __shared__ __attribute__((shared))
|
||||
#define __constant__ __attribute__((constant))
|
||||
|
||||
#if __HIP_ENABLE_DEVICE_MALLOC__
|
||||
extern "C" __device__ void *__hip_malloc(size_t __size);
|
||||
extern "C" __device__ void *__hip_free(void *__ptr);
|
||||
static inline __device__ void *malloc(size_t __size) {
|
||||
return __hip_malloc(__size);
|
||||
}
|
||||
static inline __device__ void *free(void *__ptr) { return __hip_free(__ptr); }
|
||||
#else
|
||||
static inline __device__ void *malloc(size_t __size) {
|
||||
__builtin_trap();
|
||||
return nullptr;
|
||||
}
|
||||
static inline __device__ void *free(void *__ptr) {
|
||||
__builtin_trap();
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
#include <__clang_hip_libdevice_declares.h>
|
||||
#include <__clang_hip_math.h>
|
||||
|
||||
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
|
||||
#include <__clang_cuda_math_forward_declares.h>
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <complex>
|
||||
#include <new>
|
||||
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
|
||||
|
||||
#define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
|
||||
|
||||
#endif // __HIP__
|
||||
#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
|
||||
402
lib/include/altivec.h
vendored
402
lib/include/altivec.h
vendored
|
|
@ -16761,6 +16761,408 @@ static vector signed short __ATTRS_o_ai vec_nabs(vector signed short __a) {
|
|||
static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) {
|
||||
return __builtin_altivec_vminsb(__a, -__a);
|
||||
}
|
||||
|
||||
#ifdef __POWER10_VECTOR__
|
||||
/* vec_pdep */
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_pdep(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return __builtin_altivec_vpdepd(__a, __b);
|
||||
}
|
||||
|
||||
/* vec_pext */
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_pext(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return __builtin_altivec_vpextd(__a, __b);
|
||||
}
|
||||
|
||||
/* vec_cfuge */
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_cfuge(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return __builtin_altivec_vcfuged(__a, __b);
|
||||
}
|
||||
|
||||
/* vec_gnb */
|
||||
|
||||
#define vec_gnb(__a, __b) __builtin_altivec_vgnb(__a, __b)
|
||||
|
||||
/* vec_ternarylogic */
|
||||
#ifdef __VSX__
|
||||
#define vec_ternarylogic(__a, __b, __c, __imm) \
|
||||
_Generic((__a), vector unsigned char \
|
||||
: __builtin_vsx_xxeval((vector unsigned long long)(__a), \
|
||||
(vector unsigned long long)(__b), \
|
||||
(vector unsigned long long)(__c), (__imm)), \
|
||||
vector unsigned short \
|
||||
: __builtin_vsx_xxeval((vector unsigned long long)(__a), \
|
||||
(vector unsigned long long)(__b), \
|
||||
(vector unsigned long long)(__c), (__imm)), \
|
||||
vector unsigned int \
|
||||
: __builtin_vsx_xxeval((vector unsigned long long)(__a), \
|
||||
(vector unsigned long long)(__b), \
|
||||
(vector unsigned long long)(__c), (__imm)), \
|
||||
vector unsigned long long \
|
||||
: __builtin_vsx_xxeval((vector unsigned long long)(__a), \
|
||||
(vector unsigned long long)(__b), \
|
||||
(vector unsigned long long)(__c), (__imm)), \
|
||||
vector unsigned __int128 \
|
||||
: __builtin_vsx_xxeval((vector unsigned long long)(__a), \
|
||||
(vector unsigned long long)(__b), \
|
||||
(vector unsigned long long)(__c), (__imm)))
|
||||
#endif /* __VSX__ */
|
||||
|
||||
/* vec_genpcvm */
|
||||
|
||||
#ifdef __VSX__
|
||||
#define vec_genpcvm(__a, __imm) \
|
||||
_Generic((__a), vector unsigned char \
|
||||
: __builtin_vsx_xxgenpcvbm((__a), (int)(__imm)), \
|
||||
vector unsigned short \
|
||||
: __builtin_vsx_xxgenpcvhm((__a), (int)(__imm)), \
|
||||
vector unsigned int \
|
||||
: __builtin_vsx_xxgenpcvwm((__a), (int)(__imm)), \
|
||||
vector unsigned long long \
|
||||
: __builtin_vsx_xxgenpcvdm((__a), (int)(__imm)))
|
||||
#endif /* __VSX__ */
|
||||
|
||||
/* vec_clrl */
|
||||
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
vec_clrl(vector signed char __a, unsigned int __n) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vclrrb(__a, __n);
|
||||
#else
|
||||
return __builtin_altivec_vclrlb( __a, __n);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_clrl(vector unsigned char __a, unsigned int __n) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vclrrb((vector signed char)__a, __n);
|
||||
#else
|
||||
return __builtin_altivec_vclrlb((vector signed char)__a, __n);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* vec_clrr */
|
||||
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
vec_clrr(vector signed char __a, unsigned int __n) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vclrlb(__a, __n);
|
||||
#else
|
||||
return __builtin_altivec_vclrrb( __a, __n);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_clrr(vector unsigned char __a, unsigned int __n) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vclrlb((vector signed char)__a, __n);
|
||||
#else
|
||||
return __builtin_altivec_vclrrb((vector signed char)__a, __n);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* vec_cntlzm */
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_cntlzm(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return __builtin_altivec_vclzdm(__a, __b);
|
||||
}
|
||||
|
||||
/* vec_cnttzm */
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_cnttzm(vector unsigned long long __a, vector unsigned long long __b) {
|
||||
return __builtin_altivec_vctzdm(__a, __b);
|
||||
}
|
||||
|
||||
/* vec_sldb */
|
||||
|
||||
/* Forwards to __builtin_altivec_vsldbi, passing only the low three bits of
   __c. Every macro argument is parenthesized so that an expression argument
   (e.g. `x | y`) is masked as a whole instead of being re-associated by
   operator precedence. */
#define vec_sldb(__a, __b, __c)                                                \
  __builtin_altivec_vsldbi((__a), (__b), ((__c) & 0x7))
|
||||
|
||||
/* vec_srdb */
|
||||
|
||||
/* Forwards to __builtin_altivec_vsrdbi, passing only the low three bits of
   __c. Every macro argument is parenthesized so that an expression argument
   (e.g. `x | y`) is masked as a whole instead of being re-associated by
   operator precedence. */
#define vec_srdb(__a, __b, __c)                                                \
  __builtin_altivec_vsrdbi((__a), (__b), ((__c) & 0x7))
|
||||
|
||||
/* vec_insertl */
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_insertl(unsigned char __a, vector unsigned char __b, unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinsbrx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinsblx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_insertl(unsigned short __a, vector unsigned short __b, unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinshrx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinshlx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_insertl(unsigned int __a, vector unsigned int __b, unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinswrx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinswlx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_insertl(unsigned long long __a, vector unsigned long long __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinsdrx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinsdlx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_insertl(vector unsigned char __a, vector unsigned char __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinsbvrx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinsbvlx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_insertl(vector unsigned short __a, vector unsigned short __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinshvrx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinshvlx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_insertl(vector unsigned int __a, vector unsigned int __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinswvrx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinswvlx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* vec_inserth */
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_inserth(unsigned char __a, vector unsigned char __b, unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinsblx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinsbrx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_inserth(unsigned short __a, vector unsigned short __b, unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinshlx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinshrx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_inserth(unsigned int __a, vector unsigned int __b, unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinswlx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinswrx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_inserth(unsigned long long __a, vector unsigned long long __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinsdlx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinsdrx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_inserth(vector unsigned char __a, vector unsigned char __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinsbvlx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinsbvrx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_inserth(vector unsigned short __a, vector unsigned short __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinshvlx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinshvrx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_inserth(vector unsigned int __a, vector unsigned int __b,
|
||||
unsigned int __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
return __builtin_altivec_vinswvlx(__b, __c, __a);
|
||||
#else
|
||||
return __builtin_altivec_vinswvrx(__b, __c, __a);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef __VSX__
|
||||
|
||||
/* vec_permx */
|
||||
|
||||
/* Expands to a call of the xxpermx builtin; each of the four macro arguments
 * is parenthesized and passed through in order. */
#define vec_permx(__a, __b, __c, __d) \
  __builtin_vsx_xxpermx((__a), (__b), (__c), (__d))
|
||||
|
||||
/* vec_blendv */
|
||||
|
||||
static __inline__ vector signed char __ATTRS_o_ai
|
||||
vec_blendv(vector signed char __a, vector signed char __b,
|
||||
vector unsigned char __c) {
|
||||
return __builtin_vsx_xxblendvb(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||
vec_blendv(vector unsigned char __a, vector unsigned char __b,
|
||||
vector unsigned char __c) {
|
||||
return __builtin_vsx_xxblendvb(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector signed short __ATTRS_o_ai
|
||||
vec_blendv(vector signed short __a, vector signed short __b,
|
||||
vector unsigned short __c) {
|
||||
return __builtin_vsx_xxblendvh(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||
vec_blendv(vector unsigned short __a, vector unsigned short __b,
|
||||
vector unsigned short __c) {
|
||||
return __builtin_vsx_xxblendvh(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector signed int __ATTRS_o_ai
|
||||
vec_blendv(vector signed int __a, vector signed int __b,
|
||||
vector unsigned int __c) {
|
||||
return __builtin_vsx_xxblendvw(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||
vec_blendv(vector unsigned int __a, vector unsigned int __b,
|
||||
vector unsigned int __c) {
|
||||
return __builtin_vsx_xxblendvw(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector signed long long __ATTRS_o_ai
|
||||
vec_blendv(vector signed long long __a, vector signed long long __b,
|
||||
vector unsigned long long __c) {
|
||||
return __builtin_vsx_xxblendvd(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||
vec_blendv(vector unsigned long long __a, vector unsigned long long __b,
|
||||
vector unsigned long long __c) {
|
||||
return __builtin_vsx_xxblendvd(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector float __ATTRS_o_ai
|
||||
vec_blendv(vector float __a, vector float __b, vector unsigned int __c) {
|
||||
return __builtin_vsx_xxblendvw(__a, __b, __c);
|
||||
}
|
||||
|
||||
static __inline__ vector double __ATTRS_o_ai
|
||||
vec_blendv(vector double __a, vector double __b,
|
||||
vector unsigned long long __c) {
|
||||
return __builtin_vsx_xxblendvd(__a, __b, __c);
|
||||
}
|
||||
|
||||
/* vec_splati */
|
||||
|
||||
/* Type-dispatching cast: depending on the static type of __a (signed int,
 * unsigned int or float) this expands to a cast of __a to vector signed int,
 * vector unsigned int or vector float respectively.  Any other argument type
 * has no matching association and fails to compile.
 *
 * The argument is parenthesized inside every cast so that an expression
 * argument such as `x + 1` is converted as a whole; without the parentheses
 * the cast would bind to the first operand only. */
#define vec_splati(__a) \
  _Generic((__a), \
      signed int: ((vector signed int)(__a)), \
      unsigned int: ((vector unsigned int)(__a)), \
      float: ((vector float)(__a)))
|
||||
|
||||
/* vec_splatid */
|
||||
|
||||
/* Converts the float argument to double, then casts that scalar to
 * vector double. */
static __inline__ vector double __ATTRS_o_ai vec_splatid(const float __a) {
  const double __widened = (double)__a;
  return ((vector double)(__widened));
}
|
||||
|
||||
/* vec_splati_ins */
|
||||
|
||||
/* vec_splati_ins, vector signed int overload.
 *
 * Stores the scalar __c into two elements of __a and returns the modified
 * copy.  With i = (__b & 1), the elements written are (1 - i) and (3 - i)
 * on little-endian targets, and i and (2 + i) otherwise.
 *
 * Only the low bit of __b is used.  __b is unsigned, so without the mask a
 * value above 1 would make (1 - __b) / (3 - __b) wrap around and index far
 * outside the vector; results for __b == 0 and __b == 1 are unchanged. */
static __inline__ vector signed int __ATTRS_o_ai vec_splati_ins(
    vector signed int __a, const unsigned int __b, const signed int __c) {
  const unsigned int __d = __b & 0x01;
#ifdef __LITTLE_ENDIAN__
  __a[1 - __d] = __c;
  __a[3 - __d] = __c;
#else
  __a[__d] = __c;
  __a[2 + __d] = __c;
#endif
  return __a;
}
|
||||
|
||||
/* vec_splati_ins, vector unsigned int overload.
 *
 * Stores the scalar __c into two elements of __a and returns the modified
 * copy.  With i = (__b & 1), the elements written are (1 - i) and (3 - i)
 * on little-endian targets, and i and (2 + i) otherwise.
 *
 * Only the low bit of __b is used.  __b is unsigned, so without the mask a
 * value above 1 would make (1 - __b) / (3 - __b) wrap around and index far
 * outside the vector; results for __b == 0 and __b == 1 are unchanged. */
static __inline__ vector unsigned int __ATTRS_o_ai vec_splati_ins(
    vector unsigned int __a, const unsigned int __b, const unsigned int __c) {
  const unsigned int __d = __b & 0x01;
#ifdef __LITTLE_ENDIAN__
  __a[1 - __d] = __c;
  __a[3 - __d] = __c;
#else
  __a[__d] = __c;
  __a[2 + __d] = __c;
#endif
  return __a;
}
|
||||
|
||||
static __inline__ vector float __ATTRS_o_ai
|
||||
vec_splati_ins(vector float __a, const unsigned int __b, const float __c) {
|
||||
#ifdef __LITTLE_ENDIAN__
|
||||
__a[1 - __b] = __c;
|
||||
__a[3 - __b] = __c;
|
||||
#else
|
||||
__a[__b] = __c;
|
||||
__a[2 + __b] = __c;
|
||||
#endif
|
||||
return __a;
|
||||
}
|
||||
|
||||
/* vec_test_lsbb_all_ones */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai
|
||||
vec_test_lsbb_all_ones(vector unsigned char __a) {
|
||||
return __builtin_vsx_xvtlsbb(__a, 1);
|
||||
}
|
||||
|
||||
/* vec_test_lsbb_all_zeros */
|
||||
|
||||
static __inline__ int __ATTRS_o_ai
|
||||
vec_test_lsbb_all_zeros(vector unsigned char __a) {
|
||||
return __builtin_vsx_xvtlsbb(__a, 0);
|
||||
}
|
||||
#endif /* __VSX__ */
|
||||
#endif /* __POWER10_VECTOR__ */
|
||||
|
||||
#undef __ATTRS_o_ai
|
||||
|
||||
#endif /* __ALTIVEC_H */
|
||||
|
|
|
|||
225
lib/include/amxintrin.h
vendored
Normal file
225
lib/include/amxintrin.h
vendored
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===------------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif /* __IMMINTRIN_H */
|
||||
|
||||
#ifndef __AMXINTRIN_H
|
||||
#define __AMXINTRIN_H
|
||||
#ifdef __x86_64__
|
||||
|
||||
/* Attribute list applied to the inline functions defined in this header:
 * target("amx-tile"), always_inline and nodebug.  It is #undef'd again at
 * the end of the header. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__target__("amx-tile"), __always_inline__, __nodebug__))
|
||||
|
||||
/// Load tile configuration from a 64-byte memory location specified by
|
||||
/// "__config". The tile configuration includes the tile type palette, the
|
||||
/// number of bytes per row, and the number of rows. If the specified
|
||||
/// palette_id is zero, that signifies the init state for both the tile
|
||||
/// config and the tile data, and the tiles are zeroed. Any invalid
|
||||
/// configurations will result in #GP fault.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
|
||||
///
|
||||
/// \param __config
|
||||
/// A pointer to 512-bits configuration
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_tile_loadconfig(const void *__config)
|
||||
{
|
||||
__builtin_ia32_tile_loadconfig(__config);
|
||||
}
|
||||
|
||||
/// Stores the current tile configuration to a 64-byte memory location
|
||||
/// specified by "__config". The tile configuration includes the tile type
|
||||
/// palette, the number of bytes per row, and the number of rows. If tiles
|
||||
/// are not configured, all zeroes will be stored to memory.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
|
||||
///
|
||||
/// \param __config
|
||||
/// A pointer to 512-bits configuration
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_tile_storeconfig(void *__config)
|
||||
{
|
||||
__builtin_ia32_tile_storeconfig(__config);
|
||||
}
|
||||
|
||||
/// Release the tile configuration to return to the init state, which
|
||||
/// releases all storage it currently holds.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_tile_release(void)
|
||||
{
|
||||
__builtin_ia32_tilerelease();
|
||||
}
|
||||
|
||||
/// Load tile rows from memory specified by "base" address and "stride" into
|
||||
/// destination tile "dst" using the tile configuration previously configured
|
||||
/// via "_tile_loadconfig".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// A destination tile. Max size is 1024 Bytes.
|
||||
/// \param base
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be loaded in memory.
|
||||
#define _tile_loadd(dst, base, stride) \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
                             (__SIZE_TYPE__)(stride))
|
||||
|
||||
/// Load tile rows from memory specified by "base" address and "stride" into
|
||||
/// destination tile "dst" using the tile configuration previously configured
|
||||
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
|
||||
/// that the data will likely not be reused in the near future and the data
|
||||
/// caching can be optimized accordingly.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// A destination tile. Max size is 1024 Bytes.
|
||||
/// \param base
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be loaded in memory.
|
||||
#define _tile_stream_loadd(dst, base, stride) \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
                               (__SIZE_TYPE__)(stride))
|
||||
|
||||
/// Store the tile specified by "dst" to memory specified by "base" address and
|
||||
/// "stride" using the tile configuration previously configured via
|
||||
/// "_tile_loadconfig".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The tile to be stored to memory. Max size is 1024 Bytes.
|
||||
/// \param base
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be stored in memory.
|
||||
#define _tile_stored(dst, base, stride) \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), \
                              (__SIZE_TYPE__)(stride))
|
||||
|
||||
/// Zero the tile specified by "tile".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
|
||||
///
|
||||
/// \param tile
|
||||
/// The destination tile to be zero. Max size is 1024 Bytes.
|
||||
#define _tile_zero(tile) \
  __builtin_ia32_tilezero((tile))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||
/// and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbssd(dst, src0, src1) \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in "dst", and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbsud(dst, src0, src1) \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||
/// and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbusd(dst, src0, src1) \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
|
||||
/// "dst", and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbuud(dst, src0, src1) \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
|
||||
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
|
||||
/// elements with elements in "dst", and store the 32-bit result back to tile
|
||||
/// "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbf16ps(dst, src0, src1) \
  __builtin_ia32_tdpbf16ps((dst), \
                           (src0), \
                           (src1))
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __x86_64__ */
|
||||
#endif /* __AMXINTRIN_H */
|
||||
16
lib/include/arm_acle.h
vendored
16
lib/include/arm_acle.h
vendored
|
|
@ -22,31 +22,43 @@ extern "C" {
|
|||
|
||||
/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
|
||||
/* 8.3 Memory barriers */
|
||||
#if !defined(_MSC_VER)
|
||||
#if !__has_builtin(__dmb)
|
||||
#define __dmb(i) __builtin_arm_dmb(i)
|
||||
#endif
|
||||
#if !__has_builtin(__dsb)
|
||||
#define __dsb(i) __builtin_arm_dsb(i)
|
||||
#endif
|
||||
#if !__has_builtin(__isb)
|
||||
#define __isb(i) __builtin_arm_isb(i)
|
||||
#endif
|
||||
|
||||
/* 8.4 Hints */
|
||||
|
||||
#if !defined(_MSC_VER)
|
||||
#if !__has_builtin(__wfi)
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
|
||||
__builtin_arm_wfi();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !__has_builtin(__wfe)
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
|
||||
__builtin_arm_wfe();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !__has_builtin(__sev)
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
|
||||
__builtin_arm_sev();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !__has_builtin(__sevl)
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
|
||||
__builtin_arm_sevl();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !__has_builtin(__yield)
|
||||
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
|
||||
__builtin_arm_yield();
|
||||
}
|
||||
|
|
|
|||
20
lib/include/arm_bf16.h
vendored
Normal file
20
lib/include/arm_bf16.h
vendored
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
|
||||
*
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __ARM_BF16_H
|
||||
#define __ARM_BF16_H
|
||||
|
||||
typedef __bf16 bfloat16_t;
|
||||
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
|
||||
|
||||
|
||||
#undef __ai
|
||||
|
||||
#endif
|
||||
410
lib/include/arm_cde.h
vendored
Normal file
410
lib/include/arm_cde.h
vendored
Normal file
|
|
@ -0,0 +1,410 @@
|
|||
/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
|
||||
*
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __ARM_CDE_H
|
||||
#define __ARM_CDE_H
|
||||
|
||||
#if !__ARM_FEATURE_CDE
|
||||
#error "CDE support not enabled"
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
|
||||
uint32_t __arm_cx1(int, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
|
||||
uint32_t __arm_cx1a(int, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
|
||||
uint64_t __arm_cx1d(int, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
|
||||
uint64_t __arm_cx1da(int, uint64_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
|
||||
uint32_t __arm_cx2(int, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
|
||||
uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
|
||||
uint64_t __arm_cx2d(int, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
|
||||
uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
|
||||
uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
|
||||
uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
|
||||
uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
|
||||
uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
|
||||
uint32_t __arm_vcx1_u32(int, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
|
||||
uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
|
||||
uint64_t __arm_vcx1d_u64(int, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
|
||||
uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
|
||||
uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
|
||||
uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
|
||||
uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
|
||||
uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
|
||||
uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
|
||||
uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
|
||||
uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
|
||||
uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);
|
||||
|
||||
#if __ARM_FEATURE_MVE
|
||||
|
||||
typedef uint16_t mve_pred16_t;
|
||||
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
|
||||
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
|
||||
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
|
||||
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
|
||||
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
|
||||
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
|
||||
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
|
||||
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;
|
||||
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
|
||||
int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
|
||||
int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
|
||||
int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
|
||||
int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
|
||||
uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
|
||||
uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
|
||||
uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
|
||||
uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
|
||||
uint8x16_t __arm_vcx1q_u8(int, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
|
||||
int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
|
||||
int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
|
||||
int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
|
||||
int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
|
||||
uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
|
||||
uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
|
||||
uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
|
||||
uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
|
||||
int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
|
||||
int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
|
||||
int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
|
||||
int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
|
||||
uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
|
||||
uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
|
||||
uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
|
||||
uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
|
||||
int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
|
||||
int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
|
||||
int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
|
||||
int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
|
||||
uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
|
||||
uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
|
||||
uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
|
||||
uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
|
||||
int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
|
||||
int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
|
||||
int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
|
||||
int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
|
||||
uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
|
||||
uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
|
||||
uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
|
||||
uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
|
||||
int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
|
||||
int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
|
||||
int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
|
||||
int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
|
||||
uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
|
||||
uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
|
||||
uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
|
||||
uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
|
||||
int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
|
||||
int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
|
||||
int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
|
||||
int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
|
||||
uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
|
||||
uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
|
||||
uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
|
||||
uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
|
||||
int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
|
||||
int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
|
||||
int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
|
||||
int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
|
||||
uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
|
||||
uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
|
||||
uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
|
||||
uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
|
||||
int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
|
||||
int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
|
||||
int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
|
||||
int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
|
||||
uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
|
||||
uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
|
||||
uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
|
||||
uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
|
||||
int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
|
||||
int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
|
||||
int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
|
||||
int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
|
||||
uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
|
||||
uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
|
||||
uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
|
||||
uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
|
||||
int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
|
||||
int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
|
||||
int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
|
||||
int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
|
||||
uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
|
||||
uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
|
||||
uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
|
||||
uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
|
||||
int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
|
||||
int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
|
||||
int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
|
||||
int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
|
||||
uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
|
||||
uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
|
||||
uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
|
||||
#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
|
||||
#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
|
||||
#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
|
||||
#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
||||
#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
||||
#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
||||
#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
|
||||
#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
||||
|
||||
#endif /* __ARM_FEATURE_MVE */
|
||||
|
||||
#if __ARM_FEATURE_MVE & 2
|
||||
|
||||
typedef __fp16 float16_t;
|
||||
typedef float float32_t;
|
||||
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
|
||||
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
|
||||
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
|
||||
float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
|
||||
float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
|
||||
float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
|
||||
float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
|
||||
float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
|
||||
float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
|
||||
float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
|
||||
float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
|
||||
float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
|
||||
float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
|
||||
float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
|
||||
float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
|
||||
float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
|
||||
float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
|
||||
float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
|
||||
float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
|
||||
float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
|
||||
float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
|
||||
float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
|
||||
float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
|
||||
float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
|
||||
float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
|
||||
float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
|
||||
float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
|
||||
|
||||
#endif /* __ARM_FEATURE_MVE & 2 */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* __ARM_CDE_H */
|
||||
19136
lib/include/arm_mve.h
vendored
19136
lib/include/arm_mve.h
vendored
File diff suppressed because it is too large
Load diff
16716
lib/include/arm_neon.h
vendored
16716
lib/include/arm_neon.h
vendored
File diff suppressed because it is too large
Load diff
18148
lib/include/arm_sve.h
vendored
Normal file
18148
lib/include/arm_sve.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
2
lib/include/avx2intrin.h
vendored
2
lib/include/avx2intrin.h
vendored
|
|
@ -740,6 +740,8 @@ _mm256_broadcastsi128_si256(__m128i __X)
|
|||
return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
|
||||
}
|
||||
|
||||
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
|
||||
|
||||
#define _mm_blend_epi32(V1, V2, M) \
|
||||
(__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
|
||||
(__v4si)(__m128i)(V2), (int)(M))
|
||||
|
|
|
|||
19
lib/include/avx512bwintrin.h
vendored
19
lib/include/avx512bwintrin.h
vendored
|
|
@ -1504,13 +1504,14 @@ _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_slli_epi16(__m512i __A, int __B)
|
||||
_mm512_slli_epi16(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
|
||||
_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
|
||||
(__v32hi)_mm512_slli_epi16(__A, __B),
|
||||
|
|
@ -1518,7 +1519,7 @@ _mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
|
||||
_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
|
||||
(__v32hi)_mm512_slli_epi16(__A, __B),
|
||||
|
|
@ -1595,13 +1596,14 @@ _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_srai_epi16(__m512i __A, int __B)
|
||||
_mm512_srai_epi16(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
|
||||
_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
|
||||
(__v32hi)_mm512_srai_epi16(__A, __B),
|
||||
|
|
@ -1609,7 +1611,7 @@ _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B)
|
||||
_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
|
||||
(__v32hi)_mm512_srai_epi16(__A, __B),
|
||||
|
|
@ -1639,13 +1641,14 @@ _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_srli_epi16(__m512i __A, int __B)
|
||||
_mm512_srli_epi16(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
|
||||
_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
|
||||
(__v32hi)_mm512_srli_epi16(__A, __B),
|
||||
|
|
|
|||
42
lib/include/avx512fintrin.h
vendored
42
lib/include/avx512fintrin.h
vendored
|
|
@ -5111,13 +5111,14 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
|
|||
(__v8di)_mm512_setzero_si512())
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_slli_epi32(__m512i __A, int __B)
|
||||
_mm512_slli_epi32(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
|
||||
_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||
(__v16si)_mm512_slli_epi32(__A, __B),
|
||||
|
|
@ -5125,20 +5126,20 @@ _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B) {
|
||||
_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||
(__v16si)_mm512_slli_epi32(__A, __B),
|
||||
(__v16si)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_slli_epi64(__m512i __A, int __B)
|
||||
_mm512_slli_epi64(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
|
||||
_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||
(__v8di)_mm512_slli_epi64(__A, __B),
|
||||
|
|
@ -5146,7 +5147,7 @@ _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
|
||||
_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||
(__v8di)_mm512_slli_epi64(__A, __B),
|
||||
|
|
@ -5154,13 +5155,14 @@ _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_srli_epi32(__m512i __A, int __B)
|
||||
_mm512_srli_epi32(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
|
||||
_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||
(__v16si)_mm512_srli_epi32(__A, __B),
|
||||
|
|
@ -5168,20 +5170,21 @@ _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) {
|
||||
_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||
(__v16si)_mm512_srli_epi32(__A, __B),
|
||||
(__v16si)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_srli_epi64(__m512i __A, int __B)
|
||||
_mm512_srli_epi64(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
|
||||
_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||
(__v8di)_mm512_srli_epi64(__A, __B),
|
||||
|
|
@ -5189,7 +5192,8 @@ _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
|
||||
_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||
(__v8di)_mm512_srli_epi64(__A, __B),
|
||||
|
|
@ -6593,13 +6597,14 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
|
|||
(int)(R))
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_srai_epi32(__m512i __A, int __B)
|
||||
_mm512_srai_epi32(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
|
||||
_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||
(__v16si)_mm512_srai_epi32(__A, __B),
|
||||
|
|
@ -6607,20 +6612,21 @@ _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
|
||||
_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
|
||||
unsigned int __B) {
|
||||
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||
(__v16si)_mm512_srai_epi32(__A, __B),
|
||||
(__v16si)_mm512_setzero_si512());
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_srai_epi64(__m512i __A, int __B)
|
||||
_mm512_srai_epi64(__m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
|
||||
_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||
(__v8di)_mm512_srai_epi64(__A, __B),
|
||||
|
|
@ -6628,7 +6634,7 @@ _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||
_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
|
||||
_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
|
||||
{
|
||||
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||
(__v8di)_mm512_srai_epi64(__A, __B),
|
||||
|
|
|
|||
18
lib/include/avx512vlbwintrin.h
vendored
18
lib/include/avx512vlbwintrin.h
vendored
|
|
@ -1939,7 +1939,7 @@ _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
||||
_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
|
||||
(__v8hi)_mm_slli_epi16(__A, __B),
|
||||
|
|
@ -1947,7 +1947,7 @@ _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
|
||||
_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
|
||||
(__v8hi)_mm_slli_epi16(__A, __B),
|
||||
|
|
@ -1955,7 +1955,8 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
|
||||
_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
|
||||
(__v16hi)_mm256_slli_epi16(__A, __B),
|
||||
|
|
@ -1963,7 +1964,7 @@ _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B)
|
||||
_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
|
||||
(__v16hi)_mm256_slli_epi16(__A, __B),
|
||||
|
|
@ -2091,7 +2092,7 @@ _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
||||
_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
|
||||
(__v8hi)_mm_srai_epi16(__A, __B),
|
||||
|
|
@ -2099,7 +2100,7 @@ _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
|
||||
_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
|
||||
(__v8hi)_mm_srai_epi16(__A, __B),
|
||||
|
|
@ -2107,7 +2108,8 @@ _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
|
||||
_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A,
|
||||
unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
|
||||
(__v16hi)_mm256_srai_epi16(__A, __B),
|
||||
|
|
@ -2115,7 +2117,7 @@ _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B)
|
||||
_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
|
||||
(__v16hi)_mm256_srai_epi16(__A, __B),
|
||||
|
|
|
|||
53
lib/include/avx512vlintrin.h
vendored
53
lib/include/avx512vlintrin.h
vendored
|
|
@ -4522,7 +4522,7 @@ _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
||||
_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_slli_epi32(__A, __B),
|
||||
|
|
@ -4530,7 +4530,7 @@ _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
|
||||
_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_slli_epi32(__A, __B),
|
||||
|
|
@ -4538,7 +4538,7 @@ _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_slli_epi32(__A, __B),
|
||||
|
|
@ -4546,7 +4546,7 @@ _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_slli_epi32(__A, __B),
|
||||
|
|
@ -4586,7 +4586,7 @@ _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
||||
_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
|
||||
(__v2di)_mm_slli_epi64(__A, __B),
|
||||
|
|
@ -4594,7 +4594,7 @@ _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
|
||||
_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
|
||||
(__v2di)_mm_slli_epi64(__A, __B),
|
||||
|
|
@ -4602,7 +4602,7 @@ _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
|
||||
(__v4di)_mm256_slli_epi64(__A, __B),
|
||||
|
|
@ -4610,7 +4610,7 @@ _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
|
||||
(__v4di)_mm256_slli_epi64(__A, __B),
|
||||
|
|
@ -4866,7 +4866,7 @@ _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
||||
_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_srli_epi32(__A, __B),
|
||||
|
|
@ -4874,7 +4874,7 @@ _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
|
||||
_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_srli_epi32(__A, __B),
|
||||
|
|
@ -4882,7 +4882,7 @@ _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_srli_epi32(__A, __B),
|
||||
|
|
@ -4890,7 +4890,7 @@ _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_srli_epi32(__A, __B),
|
||||
|
|
@ -4930,7 +4930,7 @@ _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
||||
_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
|
||||
(__v2di)_mm_srli_epi64(__A, __B),
|
||||
|
|
@ -4938,7 +4938,7 @@ _mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
|
||||
_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
|
||||
(__v2di)_mm_srli_epi64(__A, __B),
|
||||
|
|
@ -4946,7 +4946,7 @@ _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
|
||||
(__v4di)_mm256_srli_epi64(__A, __B),
|
||||
|
|
@ -4954,7 +4954,7 @@ _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
|
||||
(__v4di)_mm256_srli_epi64(__A, __B),
|
||||
|
|
@ -6405,7 +6405,7 @@ _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
||||
_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_srai_epi32(__A, __B),
|
||||
|
|
@ -6413,7 +6413,7 @@ _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
|
||||
_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_srai_epi32(__A, __B),
|
||||
|
|
@ -6421,7 +6421,7 @@ _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_srai_epi32(__A, __B),
|
||||
|
|
@ -6429,7 +6429,7 @@ _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B)
|
||||
_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_srai_epi32(__A, __B),
|
||||
|
|
@ -6481,13 +6481,13 @@ _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_srai_epi64(__m128i __A, int __imm)
|
||||
_mm_srai_epi64(__m128i __A, unsigned int __imm)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
|
||||
_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
|
||||
(__v2di)_mm_srai_epi64(__A, __imm), \
|
||||
|
|
@ -6495,7 +6495,7 @@ _mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm)
|
|||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
|
||||
_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \
|
||||
(__v2di)_mm_srai_epi64(__A, __imm), \
|
||||
|
|
@ -6503,13 +6503,14 @@ _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_srai_epi64(__m256i __A, int __imm)
|
||||
_mm256_srai_epi64(__m256i __A, unsigned int __imm)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
|
||||
_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A,
|
||||
unsigned int __imm)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
|
||||
(__v4di)_mm256_srai_epi64(__A, __imm), \
|
||||
|
|
@ -6517,7 +6518,7 @@ _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm)
|
|||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm)
|
||||
_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \
|
||||
(__v4di)_mm256_srai_epi64(__A, __imm), \
|
||||
|
|
|
|||
50
lib/include/bmiintrin.h
vendored
50
lib/include/bmiintrin.h
vendored
|
|
@ -111,7 +111,8 @@ _mm_tzcnt_64(unsigned long long __X)
|
|||
|
||||
#undef __RELAXED_FN_ATTRS
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__BMI__)
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
|
||||
|
|
@ -192,6 +193,28 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
|
|||
return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
|
||||
}
|
||||
|
||||
/* Intel-specified, single-leading-underscore version of BEXTR2 */
|
||||
/// Extracts the specified bits from the first operand and returns them
|
||||
/// in the least significant bits of the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned integer whose bits are to be extracted.
|
||||
/// \param __Y
|
||||
/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
|
||||
/// specify the index of the least significant bit. Bits [15:8] specify the
|
||||
/// number of bits to be extracted.
|
||||
/// \returns An unsigned integer whose least significant bits contain the
|
||||
/// extracted bits.
|
||||
/// \see __bextr_u32
|
||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS
|
||||
_bextr2_u32(unsigned int __X, unsigned int __Y) {
|
||||
/* __Y is forwarded to the builtin as-is: unlike _bextr_u32, no
   (start, length) packing into bits [7:0] / [15:8] is done here, so the
   caller must supply the already-encoded control value. */
return __builtin_ia32_bextr_u32(__X, __Y);
|
||||
}
|
||||
|
||||
/// Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
|
|
@ -321,6 +344,28 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
|
|||
return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
|
||||
}
|
||||
|
||||
/* Intel-specified, single-leading-underscore version of BEXTR2 */
|
||||
/// Extracts the specified bits from the first operand and returns them
|
||||
/// in the least significant bits of the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose bits are to be extracted.
|
||||
/// \param __Y
|
||||
/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
|
||||
/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
|
||||
/// the number of bits to be extracted.
|
||||
/// \returns An unsigned 64-bit integer whose least significant bits contain the
|
||||
/// extracted bits.
|
||||
/// \see __bextr_u64
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
_bextr2_u64(unsigned long long __X, unsigned long long __Y) {
|
||||
/* __Y is forwarded to the builtin as-is: unlike _bextr_u64, no
   (start, length) packing into bits [7:0] / [15:8] is done here, so the
   caller must supply the already-encoded control value. */
return __builtin_ia32_bextr_u64(__X, __Y);
|
||||
}
|
||||
|
||||
/// Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
|
|
@ -376,6 +421,7 @@ __blsr_u64(unsigned long long __X)
|
|||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) */
|
||||
#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
|
||||
|| defined(__BMI__) */
|
||||
|
||||
#endif /* __BMIINTRIN_H */
|
||||
|
|
|
|||
66
lib/include/cet.h
vendored
Normal file
66
lib/include/cet.h
vendored
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
/*===------ cet.h -Control-flow Enforcement Technology feature ------------===
|
||||
* Add x86 feature with IBT and/or SHSTK bits to ELF program property if they
|
||||
* are enabled. Otherwise, contents in this header file are unused. This file
|
||||
* is mainly designed for assembly source code that wants to enable CET.
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __CET_H
|
||||
#define __CET_H
|
||||
|
||||
#ifdef __ASSEMBLER__
|
||||
|
||||
#ifndef __CET__
|
||||
# define _CET_ENDBR
|
||||
#endif
|
||||
|
||||
#ifdef __CET__
|
||||
|
||||
# ifdef __LP64__
|
||||
# if __CET__ & 0x1
|
||||
# define _CET_ENDBR endbr64
|
||||
# else
|
||||
# define _CET_ENDBR
|
||||
# endif
|
||||
# else
|
||||
# if __CET__ & 0x1
|
||||
# define _CET_ENDBR endbr32
|
||||
# else
|
||||
# define _CET_ENDBR
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
||||
# ifdef __LP64__
|
||||
# define __PROPERTY_ALIGN 3
|
||||
# else
|
||||
# define __PROPERTY_ALIGN 2
|
||||
# endif
|
||||
|
||||
.pushsection ".note.gnu.property", "a"
|
||||
.p2align __PROPERTY_ALIGN
|
||||
.long 1f - 0f /* name length. */
|
||||
.long 4f - 1f /* data length. */
|
||||
/* NT_GNU_PROPERTY_TYPE_0. */
|
||||
.long 5 /* note type. */
|
||||
0:
|
||||
.asciz "GNU" /* vendor name. */
|
||||
1:
|
||||
.p2align __PROPERTY_ALIGN
|
||||
/* GNU_PROPERTY_X86_FEATURE_1_AND. */
|
||||
.long 0xc0000002 /* pr_type. */
|
||||
.long 3f - 2f /* pr_datasz. */
|
||||
2:
|
||||
/* GNU_PROPERTY_X86_FEATURE_1_XXX. */
|
||||
.long __CET__
|
||||
3:
|
||||
.p2align __PROPERTY_ALIGN
|
||||
4:
|
||||
.popsection
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
8
lib/include/cldemoteintrin.h
vendored
8
lib/include/cldemoteintrin.h
vendored
|
|
@ -18,11 +18,19 @@
|
|||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("cldemote")))
|
||||
|
||||
/// Hint to hardware that the cache line that contains \p __P should be demoted
|
||||
/// from the cache closest to the processor core to a level more distant from
|
||||
/// the processor core.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> CLDEMOTE </c> instruction.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS
|
||||
_cldemote(const void * __P) {
|
||||
/* Thin wrapper: hands the address straight to the compiler builtin and
   returns nothing. */
__builtin_ia32_cldemote(__P);
|
||||
}
|
||||
|
||||
#define _mm_cldemote(p) _cldemote(p)
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif
|
||||
|
|
|
|||
9
lib/include/cpuid.h
vendored
9
lib/include/cpuid.h
vendored
|
|
@ -24,6 +24,10 @@
|
|||
#define signature_CYRIX_ebx 0x69727943
|
||||
#define signature_CYRIX_edx 0x736e4978
|
||||
#define signature_CYRIX_ecx 0x64616574
|
||||
/* HYGON: "HygonGenuine" */
|
||||
#define signature_HYGON_ebx 0x6f677948
|
||||
#define signature_HYGON_edx 0x6e65476e
|
||||
#define signature_HYGON_ecx 0x656e6975
|
||||
/* INTEL: "GenuineIntel" */
|
||||
#define signature_INTEL_ebx 0x756e6547
|
||||
#define signature_INTEL_edx 0x49656e69
|
||||
|
|
@ -182,8 +186,13 @@
|
|||
/* Features in %edx for leaf 7 sub-leaf 0 */
|
||||
#define bit_AVX5124VNNIW 0x00000004
|
||||
#define bit_AVX5124FMAPS 0x00000008
|
||||
#define bit_SERIALIZE 0x00004000
|
||||
#define bit_TSXLDTRK 0x00010000
|
||||
#define bit_PCONFIG 0x00040000
|
||||
#define bit_IBT 0x00100000
|
||||
#define bit_AMXBF16 0x00400000
|
||||
#define bit_AMXTILE 0x01000000
|
||||
#define bit_AMXINT8 0x02000000
|
||||
|
||||
/* Features in %eax for leaf 7 sub-leaf 1 */
|
||||
#define bit_AVX512BF16 0x00000020
|
||||
|
|
|
|||
6
lib/include/emmintrin.h
vendored
6
lib/include/emmintrin.h
vendored
|
|
@ -4970,10 +4970,10 @@ void _mm_pause(void);
|
|||
|
||||
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
|
||||
|
||||
#define _MM_DENORMALS_ZERO_ON (0x0040)
|
||||
#define _MM_DENORMALS_ZERO_OFF (0x0000)
|
||||
#define _MM_DENORMALS_ZERO_ON (0x0040U)
|
||||
#define _MM_DENORMALS_ZERO_OFF (0x0000U)
|
||||
|
||||
#define _MM_DENORMALS_ZERO_MASK (0x0040)
|
||||
#define _MM_DENORMALS_ZERO_MASK (0x0040U)
|
||||
|
||||
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
|
||||
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
|
||||
|
|
|
|||
207
lib/include/immintrin.h
vendored
207
lib/include/immintrin.h
vendored
|
|
@ -10,198 +10,231 @@
|
|||
#ifndef __IMMINTRIN_H
|
||||
#define __IMMINTRIN_H
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__MMX__)
|
||||
#include <mmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SSE__)
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SSE3__)
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__SSE4_2__) || defined(__SSE4_1__))
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AES__) || defined(__PCLMUL__))
|
||||
#include <wmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__CLFLUSHOPT__)
|
||||
#include <clflushoptintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__CLWB__)
|
||||
#include <clwbintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX__)
|
||||
#include <avxintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX2__)
|
||||
#include <avx2intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__F16C__)
|
||||
#include <f16cintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__VPCLMULQDQ__)
|
||||
#include <vpclmulqdqintrin.h>
|
||||
#endif
|
||||
|
||||
/* No feature check desired due to internal checks */
|
||||
#include <bmiintrin.h>
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__BMI2__)
|
||||
#include <bmi2intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__LZCNT__)
|
||||
#include <lzcntintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__POPCNT__)
|
||||
#include <popcntintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__FMA__)
|
||||
#include <fmaintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512F__)
|
||||
#include <avx512fintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512VL__)
|
||||
#include <avx512vlintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512BW__)
|
||||
#include <avx512bwintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512BITALG__)
|
||||
#include <avx512bitalgintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512CD__)
|
||||
#include <avx512cdintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512VPOPCNTDQ__)
|
||||
#include <avx512vpopcntdqintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
|
||||
#include <avx512vpopcntdqvlintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512VNNI__)
|
||||
#include <avx512vnniintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512VNNI__))
|
||||
#include <avx512vlvnniintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512DQ__)
|
||||
#include <avx512dqintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512BITALG__))
|
||||
#include <avx512vlbitalgintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512BW__))
|
||||
#include <avx512vlbwintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512CD__))
|
||||
#include <avx512vlcdintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512DQ__))
|
||||
#include <avx512vldqintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512ER__)
|
||||
#include <avx512erintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512IFMA__)
|
||||
#include <avx512ifmaintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512IFMA__) && defined(__AVX512VL__))
|
||||
#include <avx512ifmavlintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512VBMI__)
|
||||
#include <avx512vbmiintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VBMI__) && defined(__AVX512VL__))
|
||||
#include <avx512vbmivlintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512VBMI2__)
|
||||
#include <avx512vbmi2intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VBMI2__) && defined(__AVX512VL__))
|
||||
#include <avx512vlvbmi2intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512PF__)
|
||||
#include <avx512pfintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BF16__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512BF16__)
|
||||
#include <avx512bf16intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512BF16__))
|
||||
#include <avx512vlbf16intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__PKU__)
|
||||
#include <pkuintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__VAES__)
|
||||
#include <vaesintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__GFNI__)
|
||||
#include <gfniintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDPID__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__RDPID__)
|
||||
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
|
|
@ -213,7 +246,8 @@ _rdpid_u32(void) {
|
|||
}
|
||||
#endif // __RDPID__
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__RDRND__)
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
|
||||
_rdrand16_step(unsigned short *__p)
|
||||
{
|
||||
|
|
@ -235,7 +269,8 @@ _rdrand64_step(unsigned long long *__p)
|
|||
#endif
|
||||
#endif /* __RDRND__ */
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__FSGSBASE__)
|
||||
#ifdef __x86_64__
|
||||
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
|
||||
_readfsbase_u32(void)
|
||||
|
|
@ -288,7 +323,8 @@ _writegsbase_u64(unsigned long long __V)
|
|||
#endif
|
||||
#endif /* __FSGSBASE__ */
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MOVBE__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__MOVBE__)
|
||||
|
||||
/* The structs used below are to force the load/store to be unaligned. This
|
||||
* is accomplished with the __packed__ attribute. The __may_alias__ prevents
|
||||
|
|
@ -347,35 +383,42 @@ _storebe_i64(void * __P, long long __D) {
|
|||
#endif
|
||||
#endif /* __MOVBE */
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__RTM__)
|
||||
#include <rtmintrin.h>
|
||||
#include <xtestintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SHA__)
|
||||
#include <shaintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__FXSR__)
|
||||
#include <fxsrintrin.h>
|
||||
#endif
|
||||
|
||||
/* No feature check desired due to internal MSC_VER checks */
|
||||
#include <xsaveintrin.h>
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__XSAVEOPT__)
|
||||
#include <xsaveoptintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__XSAVEC__)
|
||||
#include <xsavecintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__XSAVES__)
|
||||
#include <xsavesintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SHSTK__)
|
||||
#include <cetintrin.h>
|
||||
#endif
|
||||
|
||||
|
|
@ -383,57 +426,81 @@ _storebe_i64(void * __P, long long __D) {
|
|||
* whereas others are also available at all times. */
|
||||
#include <adxintrin.h>
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__RDSEED__)
|
||||
#include <rdseedintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WBNOINVD__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__WBNOINVD__)
|
||||
#include <wbnoinvdintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLDEMOTE__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__CLDEMOTE__)
|
||||
#include <cldemoteintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WAITPKG__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__WAITPKG__)
|
||||
#include <waitpkgintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
defined(__MOVDIRI__) || defined(__MOVDIR64B__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__MOVDIRI__) || defined(__MOVDIR64B__)
|
||||
#include <movdirintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PCONFIG__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__PCONFIG__)
|
||||
#include <pconfigintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SGX__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SGX__)
|
||||
#include <sgxintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PTWRITE__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__PTWRITE__)
|
||||
#include <ptwriteintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__INVPCID__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__INVPCID__)
|
||||
#include <invpcidintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
defined(__AVX512VP2INTERSECT__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
|
||||
#include <amxintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__AVX512VP2INTERSECT__)
|
||||
#include <avx512vp2intersectintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
(defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
|
||||
#include <avx512vlvp2intersectintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__ENQCMD__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__ENQCMD__)
|
||||
#include <enqcmdintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SERIALIZE__)
|
||||
#include <serializeintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__TSXLDTRK__)
|
||||
#include <tsxldtrkintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && __has_extension(gnu_asm)
|
||||
/* Define the default attributes for these intrinsics */
|
||||
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||
|
|
|
|||
3
lib/include/intrin.h
vendored
3
lib/include/intrin.h
vendored
|
|
@ -289,6 +289,9 @@ unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
|
|||
static __inline__
|
||||
unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
|
||||
static __inline__
|
||||
__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
|
||||
static __inline__
|
||||
|
|
|
|||
6
lib/include/module.modulemap
vendored
6
lib/include/module.modulemap
vendored
|
|
@ -27,6 +27,12 @@ module _Builtin_intrinsics [system] [extern_c] {
|
|||
header "arm_fp16.h"
|
||||
export *
|
||||
}
|
||||
|
||||
explicit module sve {
|
||||
requires sve
|
||||
header "arm_sve.h"
|
||||
export *
|
||||
}
|
||||
}
|
||||
|
||||
explicit module intel {
|
||||
|
|
|
|||
4
lib/include/msa.h
vendored
4
lib/include/msa.h
vendored
|
|
@ -212,10 +212,14 @@ typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
|
|||
#define __msa_ld_h __builtin_msa_ld_h
|
||||
#define __msa_ld_w __builtin_msa_ld_w
|
||||
#define __msa_ld_d __builtin_msa_ld_d
|
||||
#define __msa_ldr_d __builtin_msa_ldr_d
|
||||
#define __msa_ldr_w __builtin_msa_ldrq_w
|
||||
#define __msa_st_b __builtin_msa_st_b
|
||||
#define __msa_st_h __builtin_msa_st_h
|
||||
#define __msa_st_w __builtin_msa_st_w
|
||||
#define __msa_st_d __builtin_msa_st_d
|
||||
#define __msa_str_d __builtin_msa_str_d
|
||||
#define __msa_str_w __builtin_msa_strq_w
|
||||
#define __msa_sat_s_b __builtin_msa_sat_s_b
|
||||
#define __msa_sat_s_h __builtin_msa_sat_s_h
|
||||
#define __msa_sat_s_w __builtin_msa_sat_s_w
|
||||
|
|
|
|||
698
lib/include/opencl-c.h
vendored
698
lib/include/opencl-c.h
vendored
|
|
@ -13432,18 +13432,12 @@ int __ovld atomic_fetch_min_explicit(volatile atomic_int *object, int operand, m
|
|||
uint __ovld atomic_fetch_min(volatile atomic_uint *object, uint operand);
|
||||
uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order);
|
||||
uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
|
||||
uint __ovld atomic_fetch_min(volatile atomic_uint *object, int operand);
|
||||
uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order);
|
||||
uint __ovld atomic_fetch_min_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
|
||||
int __ovld atomic_fetch_max(volatile atomic_int *object, int operand);
|
||||
int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order);
|
||||
int __ovld atomic_fetch_max_explicit(volatile atomic_int *object, int operand, memory_order order, memory_scope scope);
|
||||
uint __ovld atomic_fetch_max(volatile atomic_uint *object, uint operand);
|
||||
uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order);
|
||||
uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, uint operand, memory_order order, memory_scope scope);
|
||||
uint __ovld atomic_fetch_max(volatile atomic_uint *object, int operand);
|
||||
uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order);
|
||||
uint __ovld atomic_fetch_max_explicit(volatile atomic_uint *object, int operand, memory_order order, memory_scope scope);
|
||||
|
||||
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
|
||||
long __ovld atomic_fetch_add(volatile atomic_long *object, long operand);
|
||||
|
|
@ -13482,18 +13476,12 @@ long __ovld atomic_fetch_min_explicit(volatile atomic_long *object, long operand
|
|||
ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, ulong operand);
|
||||
ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
|
||||
ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
|
||||
ulong __ovld atomic_fetch_min(volatile atomic_ulong *object, long operand);
|
||||
ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order);
|
||||
ulong __ovld atomic_fetch_min_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
|
||||
long __ovld atomic_fetch_max(volatile atomic_long *object, long operand);
|
||||
long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order);
|
||||
long __ovld atomic_fetch_max_explicit(volatile atomic_long *object, long operand, memory_order order, memory_scope scope);
|
||||
ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, ulong operand);
|
||||
ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order);
|
||||
ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, ulong operand, memory_order order, memory_scope scope);
|
||||
ulong __ovld atomic_fetch_max(volatile atomic_ulong *object, long operand);
|
||||
ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order);
|
||||
ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long operand, memory_order order, memory_scope scope);
|
||||
#endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
|
||||
|
||||
// OpenCL v2.0 s6.13.11.7.5:
|
||||
|
|
@ -14682,7 +14670,7 @@ void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, flo
|
|||
|
||||
// OpenCL Extension v2.0 s9.18 - Mipmaps
|
||||
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
#ifdef cl_khr_mipmap_image
|
||||
#if defined(cl_khr_mipmap_image_writes)
|
||||
void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color);
|
||||
void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color);
|
||||
void __ovld write_imageui(write_only image1d_t image, int coord, int lod, uint4 color);
|
||||
|
|
@ -14699,15 +14687,16 @@ void __ovld write_imagef(write_only image2d_array_t image_array, int4 coord, int
|
|||
void __ovld write_imagei(write_only image2d_array_t image_array, int4 coord, int lod, int4 color);
|
||||
void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, int lod, uint4 color);
|
||||
|
||||
void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
|
||||
void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
|
||||
void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float depth);
|
||||
void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float depth);
|
||||
|
||||
#ifdef cl_khr_3d_image_writes
|
||||
void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
|
||||
void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
|
||||
void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
|
||||
#endif
|
||||
#endif //cl_khr_mipmap_image
|
||||
#endif //cl_khr_3d_image_writes
|
||||
|
||||
#endif //defined(cl_khr_mipmap_image_writes)
|
||||
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
// Image write functions for half4 type
|
||||
|
|
@ -14756,7 +14745,7 @@ void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, flo
|
|||
#endif //cl_khr_depth_images
|
||||
|
||||
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
#ifdef cl_khr_mipmap_image
|
||||
#if defined(cl_khr_mipmap_image_writes)
|
||||
void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color);
|
||||
void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color);
|
||||
void __ovld write_imageui(read_write image1d_t image, int coord, int lod, uint4 color);
|
||||
|
|
@ -14780,8 +14769,9 @@ void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int
|
|||
void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
|
||||
void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
|
||||
void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
|
||||
#endif
|
||||
#endif //cl_khr_mipmap_image
|
||||
#endif //cl_khr_3d_image_writes
|
||||
|
||||
#endif //cl_khr_mipmap_image_writes
|
||||
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
// Image write functions for half4 type
|
||||
|
|
@ -15470,6 +15460,674 @@ double __ovld __conv sub_group_scan_inclusive_max(double x);
|
|||
|
||||
#endif //cl_khr_subgroups cl_intel_subgroups
|
||||
|
||||
#if defined(cl_khr_subgroup_extended_types)
|
||||
char __ovld __conv sub_group_broadcast( char value, uint index );
|
||||
char2 __ovld __conv sub_group_broadcast( char2 value, uint index );
|
||||
char3 __ovld __conv sub_group_broadcast( char3 value, uint index );
|
||||
char4 __ovld __conv sub_group_broadcast( char4 value, uint index );
|
||||
char8 __ovld __conv sub_group_broadcast( char8 value, uint index );
|
||||
char16 __ovld __conv sub_group_broadcast( char16 value, uint index );
|
||||
|
||||
uchar __ovld __conv sub_group_broadcast( uchar value, uint index );
|
||||
uchar2 __ovld __conv sub_group_broadcast( uchar2 value, uint index );
|
||||
uchar3 __ovld __conv sub_group_broadcast( uchar3 value, uint index );
|
||||
uchar4 __ovld __conv sub_group_broadcast( uchar4 value, uint index );
|
||||
uchar8 __ovld __conv sub_group_broadcast( uchar8 value, uint index );
|
||||
uchar16 __ovld __conv sub_group_broadcast( uchar16 value, uint index );
|
||||
|
||||
short __ovld __conv sub_group_broadcast( short value, uint index );
|
||||
short2 __ovld __conv sub_group_broadcast( short2 value, uint index );
|
||||
short3 __ovld __conv sub_group_broadcast( short3 value, uint index );
|
||||
short4 __ovld __conv sub_group_broadcast( short4 value, uint index );
|
||||
short8 __ovld __conv sub_group_broadcast( short8 value, uint index );
|
||||
short16 __ovld __conv sub_group_broadcast( short16 value, uint index );
|
||||
|
||||
ushort __ovld __conv sub_group_broadcast( ushort value, uint index );
|
||||
ushort2 __ovld __conv sub_group_broadcast( ushort2 value, uint index );
|
||||
ushort3 __ovld __conv sub_group_broadcast( ushort3 value, uint index );
|
||||
ushort4 __ovld __conv sub_group_broadcast( ushort4 value, uint index );
|
||||
ushort8 __ovld __conv sub_group_broadcast( ushort8 value, uint index );
|
||||
ushort16 __ovld __conv sub_group_broadcast( ushort16 value, uint index );
|
||||
|
||||
// scalar int broadcast is part of cl_khr_subgroups
|
||||
int2 __ovld __conv sub_group_broadcast( int2 value, uint index );
|
||||
int3 __ovld __conv sub_group_broadcast( int3 value, uint index );
|
||||
int4 __ovld __conv sub_group_broadcast( int4 value, uint index );
|
||||
int8 __ovld __conv sub_group_broadcast( int8 value, uint index );
|
||||
int16 __ovld __conv sub_group_broadcast( int16 value, uint index );
|
||||
|
||||
// scalar uint broadcast is part of cl_khr_subgroups
|
||||
uint2 __ovld __conv sub_group_broadcast( uint2 value, uint index );
|
||||
uint3 __ovld __conv sub_group_broadcast( uint3 value, uint index );
|
||||
uint4 __ovld __conv sub_group_broadcast( uint4 value, uint index );
|
||||
uint8 __ovld __conv sub_group_broadcast( uint8 value, uint index );
|
||||
uint16 __ovld __conv sub_group_broadcast( uint16 value, uint index );
|
||||
|
||||
// scalar long broadcast is part of cl_khr_subgroups
|
||||
long2 __ovld __conv sub_group_broadcast( long2 value, uint index );
|
||||
long3 __ovld __conv sub_group_broadcast( long3 value, uint index );
|
||||
long4 __ovld __conv sub_group_broadcast( long4 value, uint index );
|
||||
long8 __ovld __conv sub_group_broadcast( long8 value, uint index );
|
||||
long16 __ovld __conv sub_group_broadcast( long16 value, uint index );
|
||||
|
||||
// scalar ulong broadcast is part of cl_khr_subgroups
|
||||
ulong2 __ovld __conv sub_group_broadcast( ulong2 value, uint index );
|
||||
ulong3 __ovld __conv sub_group_broadcast( ulong3 value, uint index );
|
||||
ulong4 __ovld __conv sub_group_broadcast( ulong4 value, uint index );
|
||||
ulong8 __ovld __conv sub_group_broadcast( ulong8 value, uint index );
|
||||
ulong16 __ovld __conv sub_group_broadcast( ulong16 value, uint index );
|
||||
|
||||
// scalar float broadcast is part of cl_khr_subgroups
|
||||
float2 __ovld __conv sub_group_broadcast( float2 value, uint index );
|
||||
float3 __ovld __conv sub_group_broadcast( float3 value, uint index );
|
||||
float4 __ovld __conv sub_group_broadcast( float4 value, uint index );
|
||||
float8 __ovld __conv sub_group_broadcast( float8 value, uint index );
|
||||
float16 __ovld __conv sub_group_broadcast( float16 value, uint index );
|
||||
|
||||
char __ovld __conv sub_group_reduce_add( char value );
|
||||
uchar __ovld __conv sub_group_reduce_add( uchar value );
|
||||
short __ovld __conv sub_group_reduce_add( short value );
|
||||
ushort __ovld __conv sub_group_reduce_add( ushort value );
|
||||
|
||||
char __ovld __conv sub_group_reduce_min( char value );
|
||||
uchar __ovld __conv sub_group_reduce_min( uchar value );
|
||||
short __ovld __conv sub_group_reduce_min( short value );
|
||||
ushort __ovld __conv sub_group_reduce_min( ushort value );
|
||||
|
||||
char __ovld __conv sub_group_reduce_max( char value );
|
||||
uchar __ovld __conv sub_group_reduce_max( uchar value );
|
||||
short __ovld __conv sub_group_reduce_max( short value );
|
||||
ushort __ovld __conv sub_group_reduce_max( ushort value );
|
||||
|
||||
char __ovld __conv sub_group_scan_inclusive_add( char value );
|
||||
uchar __ovld __conv sub_group_scan_inclusive_add( uchar value );
|
||||
short __ovld __conv sub_group_scan_inclusive_add( short value );
|
||||
ushort __ovld __conv sub_group_scan_inclusive_add( ushort value );
|
||||
|
||||
char __ovld __conv sub_group_scan_inclusive_min( char value );
|
||||
uchar __ovld __conv sub_group_scan_inclusive_min( uchar value );
|
||||
short __ovld __conv sub_group_scan_inclusive_min( short value );
|
||||
ushort __ovld __conv sub_group_scan_inclusive_min( ushort value );
|
||||
|
||||
char __ovld __conv sub_group_scan_inclusive_max( char value );
|
||||
uchar __ovld __conv sub_group_scan_inclusive_max( uchar value );
|
||||
short __ovld __conv sub_group_scan_inclusive_max( short value );
|
||||
ushort __ovld __conv sub_group_scan_inclusive_max( ushort value );
|
||||
|
||||
char __ovld __conv sub_group_scan_exclusive_add( char value );
|
||||
uchar __ovld __conv sub_group_scan_exclusive_add( uchar value );
|
||||
short __ovld __conv sub_group_scan_exclusive_add( short value );
|
||||
ushort __ovld __conv sub_group_scan_exclusive_add( ushort value );
|
||||
|
||||
char __ovld __conv sub_group_scan_exclusive_min( char value );
|
||||
uchar __ovld __conv sub_group_scan_exclusive_min( uchar value );
|
||||
short __ovld __conv sub_group_scan_exclusive_min( short value );
|
||||
ushort __ovld __conv sub_group_scan_exclusive_min( ushort value );
|
||||
|
||||
char __ovld __conv sub_group_scan_exclusive_max( char value );
|
||||
uchar __ovld __conv sub_group_scan_exclusive_max( uchar value );
|
||||
short __ovld __conv sub_group_scan_exclusive_max( short value );
|
||||
ushort __ovld __conv sub_group_scan_exclusive_max( ushort value );
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
// scalar half broadcast is part of cl_khr_subgroups
|
||||
half2 __ovld __conv sub_group_broadcast( half2 value, uint index );
|
||||
half3 __ovld __conv sub_group_broadcast( half3 value, uint index );
|
||||
half4 __ovld __conv sub_group_broadcast( half4 value, uint index );
|
||||
half8 __ovld __conv sub_group_broadcast( half8 value, uint index );
|
||||
half16 __ovld __conv sub_group_broadcast( half16 value, uint index );
|
||||
#endif // cl_khr_fp16
|
||||
|
||||
#if defined(cl_khr_fp64)
|
||||
// scalar double broadcast is part of cl_khr_subgroups
|
||||
double2 __ovld __conv sub_group_broadcast( double2 value, uint index );
|
||||
double3 __ovld __conv sub_group_broadcast( double3 value, uint index );
|
||||
double4 __ovld __conv sub_group_broadcast( double4 value, uint index );
|
||||
double8 __ovld __conv sub_group_broadcast( double8 value, uint index );
|
||||
double16 __ovld __conv sub_group_broadcast( double16 value, uint index );
|
||||
#endif // cl_khr_fp64
|
||||
|
||||
#endif // cl_khr_subgroup_extended_types
|
||||
|
||||
#if defined(cl_khr_subgroup_non_uniform_vote)
|
||||
int __ovld sub_group_elect(void);
|
||||
int __ovld sub_group_non_uniform_all( int predicate );
|
||||
int __ovld sub_group_non_uniform_any( int predicate );
|
||||
|
||||
int __ovld sub_group_non_uniform_all_equal( char value );
|
||||
int __ovld sub_group_non_uniform_all_equal( uchar value );
|
||||
int __ovld sub_group_non_uniform_all_equal( short value );
|
||||
int __ovld sub_group_non_uniform_all_equal( ushort value );
|
||||
int __ovld sub_group_non_uniform_all_equal( int value );
|
||||
int __ovld sub_group_non_uniform_all_equal( uint value );
|
||||
int __ovld sub_group_non_uniform_all_equal( long value );
|
||||
int __ovld sub_group_non_uniform_all_equal( ulong value );
|
||||
int __ovld sub_group_non_uniform_all_equal( float value );
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
int __ovld sub_group_non_uniform_all_equal( half value );
|
||||
#endif // cl_khr_fp16
|
||||
|
||||
#if defined(cl_khr_fp64)
|
||||
int __ovld sub_group_non_uniform_all_equal( double value );
|
||||
#endif // cl_khr_fp64
|
||||
|
||||
#endif // cl_khr_subgroup_non_uniform_vote
|
||||
|
||||
#if defined(cl_khr_subgroup_ballot)
|
||||
char __ovld sub_group_non_uniform_broadcast( char value, uint index );
|
||||
char2 __ovld sub_group_non_uniform_broadcast( char2 value, uint index );
|
||||
char3 __ovld sub_group_non_uniform_broadcast( char3 value, uint index );
|
||||
char4 __ovld sub_group_non_uniform_broadcast( char4 value, uint index );
|
||||
char8 __ovld sub_group_non_uniform_broadcast( char8 value, uint index );
|
||||
char16 __ovld sub_group_non_uniform_broadcast( char16 value, uint index );
|
||||
|
||||
uchar __ovld sub_group_non_uniform_broadcast( uchar value, uint index );
|
||||
uchar2 __ovld sub_group_non_uniform_broadcast( uchar2 value, uint index );
|
||||
uchar3 __ovld sub_group_non_uniform_broadcast( uchar3 value, uint index );
|
||||
uchar4 __ovld sub_group_non_uniform_broadcast( uchar4 value, uint index );
|
||||
uchar8 __ovld sub_group_non_uniform_broadcast( uchar8 value, uint index );
|
||||
uchar16 __ovld sub_group_non_uniform_broadcast( uchar16 value, uint index );
|
||||
|
||||
short __ovld sub_group_non_uniform_broadcast( short value, uint index );
|
||||
short2 __ovld sub_group_non_uniform_broadcast( short2 value, uint index );
|
||||
short3 __ovld sub_group_non_uniform_broadcast( short3 value, uint index );
|
||||
short4 __ovld sub_group_non_uniform_broadcast( short4 value, uint index );
|
||||
short8 __ovld sub_group_non_uniform_broadcast( short8 value, uint index );
|
||||
short16 __ovld sub_group_non_uniform_broadcast( short16 value, uint index );
|
||||
|
||||
ushort __ovld sub_group_non_uniform_broadcast( ushort value, uint index );
|
||||
ushort2 __ovld sub_group_non_uniform_broadcast( ushort2 value, uint index );
|
||||
ushort3 __ovld sub_group_non_uniform_broadcast( ushort3 value, uint index );
|
||||
ushort4 __ovld sub_group_non_uniform_broadcast( ushort4 value, uint index );
|
||||
ushort8 __ovld sub_group_non_uniform_broadcast( ushort8 value, uint index );
|
||||
ushort16 __ovld sub_group_non_uniform_broadcast( ushort16 value, uint index );
|
||||
|
||||
int __ovld sub_group_non_uniform_broadcast( int value, uint index );
|
||||
int2 __ovld sub_group_non_uniform_broadcast( int2 value, uint index );
|
||||
int3 __ovld sub_group_non_uniform_broadcast( int3 value, uint index );
|
||||
int4 __ovld sub_group_non_uniform_broadcast( int4 value, uint index );
|
||||
int8 __ovld sub_group_non_uniform_broadcast( int8 value, uint index );
|
||||
int16 __ovld sub_group_non_uniform_broadcast( int16 value, uint index );
|
||||
|
||||
uint __ovld sub_group_non_uniform_broadcast( uint value, uint index );
|
||||
uint2 __ovld sub_group_non_uniform_broadcast( uint2 value, uint index );
|
||||
uint3 __ovld sub_group_non_uniform_broadcast( uint3 value, uint index );
|
||||
uint4 __ovld sub_group_non_uniform_broadcast( uint4 value, uint index );
|
||||
uint8 __ovld sub_group_non_uniform_broadcast( uint8 value, uint index );
|
||||
uint16 __ovld sub_group_non_uniform_broadcast( uint16 value, uint index );
|
||||
|
||||
long __ovld sub_group_non_uniform_broadcast( long value, uint index );
|
||||
long2 __ovld sub_group_non_uniform_broadcast( long2 value, uint index );
|
||||
long3 __ovld sub_group_non_uniform_broadcast( long3 value, uint index );
|
||||
long4 __ovld sub_group_non_uniform_broadcast( long4 value, uint index );
|
||||
long8 __ovld sub_group_non_uniform_broadcast( long8 value, uint index );
|
||||
long16 __ovld sub_group_non_uniform_broadcast( long16 value, uint index );
|
||||
|
||||
ulong __ovld sub_group_non_uniform_broadcast( ulong value, uint index );
|
||||
ulong2 __ovld sub_group_non_uniform_broadcast( ulong2 value, uint index );
|
||||
ulong3 __ovld sub_group_non_uniform_broadcast( ulong3 value, uint index );
|
||||
ulong4 __ovld sub_group_non_uniform_broadcast( ulong4 value, uint index );
|
||||
ulong8 __ovld sub_group_non_uniform_broadcast( ulong8 value, uint index );
|
||||
ulong16 __ovld sub_group_non_uniform_broadcast( ulong16 value, uint index );
|
||||
|
||||
float __ovld sub_group_non_uniform_broadcast( float value, uint index );
|
||||
float2 __ovld sub_group_non_uniform_broadcast( float2 value, uint index );
|
||||
float3 __ovld sub_group_non_uniform_broadcast( float3 value, uint index );
|
||||
float4 __ovld sub_group_non_uniform_broadcast( float4 value, uint index );
|
||||
float8 __ovld sub_group_non_uniform_broadcast( float8 value, uint index );
|
||||
float16 __ovld sub_group_non_uniform_broadcast( float16 value, uint index );
|
||||
|
||||
char __ovld sub_group_broadcast_first( char value );
|
||||
uchar __ovld sub_group_broadcast_first( uchar value );
|
||||
short __ovld sub_group_broadcast_first( short value );
|
||||
ushort __ovld sub_group_broadcast_first( ushort value );
|
||||
int __ovld sub_group_broadcast_first( int value );
|
||||
uint __ovld sub_group_broadcast_first( uint value );
|
||||
long __ovld sub_group_broadcast_first( long value );
|
||||
ulong __ovld sub_group_broadcast_first( ulong value );
|
||||
float __ovld sub_group_broadcast_first( float value );
|
||||
|
||||
uint4 __ovld sub_group_ballot( int predicate );
|
||||
int __ovld __cnfn sub_group_inverse_ballot( uint4 value );
|
||||
int __ovld __cnfn sub_group_ballot_bit_extract( uint4 value, uint index );
|
||||
uint __ovld __cnfn sub_group_ballot_bit_count( uint4 value );
|
||||
|
||||
uint __ovld sub_group_ballot_inclusive_scan( uint4 value );
|
||||
uint __ovld sub_group_ballot_exclusive_scan( uint4 value );
|
||||
uint __ovld sub_group_ballot_find_lsb( uint4 value );
|
||||
uint __ovld sub_group_ballot_find_msb( uint4 value );
|
||||
|
||||
uint4 __ovld __cnfn get_sub_group_eq_mask(void);
|
||||
uint4 __ovld __cnfn get_sub_group_ge_mask(void);
|
||||
uint4 __ovld __cnfn get_sub_group_gt_mask(void);
|
||||
uint4 __ovld __cnfn get_sub_group_le_mask(void);
|
||||
uint4 __ovld __cnfn get_sub_group_lt_mask(void);
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
half __ovld sub_group_non_uniform_broadcast( half value, uint index );
|
||||
half2 __ovld sub_group_non_uniform_broadcast( half2 value, uint index );
|
||||
half3 __ovld sub_group_non_uniform_broadcast( half3 value, uint index );
|
||||
half4 __ovld sub_group_non_uniform_broadcast( half4 value, uint index );
|
||||
half8 __ovld sub_group_non_uniform_broadcast( half8 value, uint index );
|
||||
half16 __ovld sub_group_non_uniform_broadcast( half16 value, uint index );
|
||||
|
||||
half __ovld sub_group_broadcast_first( half value );
|
||||
#endif // cl_khr_fp16
|
||||
|
||||
#if defined(cl_khr_fp64)
|
||||
double __ovld sub_group_non_uniform_broadcast( double value, uint index );
|
||||
double2 __ovld sub_group_non_uniform_broadcast( double2 value, uint index );
|
||||
double3 __ovld sub_group_non_uniform_broadcast( double3 value, uint index );
|
||||
double4 __ovld sub_group_non_uniform_broadcast( double4 value, uint index );
|
||||
double8 __ovld sub_group_non_uniform_broadcast( double8 value, uint index );
|
||||
double16 __ovld sub_group_non_uniform_broadcast( double16 value, uint index );
|
||||
|
||||
double __ovld sub_group_broadcast_first( double value );
|
||||
#endif // cl_khr_fp64
|
||||
|
||||
#endif // cl_khr_subgroup_ballot
|
||||
|
||||
#if defined(cl_khr_subgroup_non_uniform_arithmetic)
|
||||
char __ovld sub_group_non_uniform_reduce_add( char value );
|
||||
uchar __ovld sub_group_non_uniform_reduce_add( uchar value );
|
||||
short __ovld sub_group_non_uniform_reduce_add( short value );
|
||||
ushort __ovld sub_group_non_uniform_reduce_add( ushort value );
|
||||
int __ovld sub_group_non_uniform_reduce_add( int value );
|
||||
uint __ovld sub_group_non_uniform_reduce_add( uint value );
|
||||
long __ovld sub_group_non_uniform_reduce_add( long value );
|
||||
ulong __ovld sub_group_non_uniform_reduce_add( ulong value );
|
||||
float __ovld sub_group_non_uniform_reduce_add( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_reduce_mul( char value );
|
||||
uchar __ovld sub_group_non_uniform_reduce_mul( uchar value );
|
||||
short __ovld sub_group_non_uniform_reduce_mul( short value );
|
||||
ushort __ovld sub_group_non_uniform_reduce_mul( ushort value );
|
||||
int __ovld sub_group_non_uniform_reduce_mul( int value );
|
||||
uint __ovld sub_group_non_uniform_reduce_mul( uint value );
|
||||
long __ovld sub_group_non_uniform_reduce_mul( long value );
|
||||
ulong __ovld sub_group_non_uniform_reduce_mul( ulong value );
|
||||
float __ovld sub_group_non_uniform_reduce_mul( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_reduce_min( char value );
|
||||
uchar __ovld sub_group_non_uniform_reduce_min( uchar value );
|
||||
short __ovld sub_group_non_uniform_reduce_min( short value );
|
||||
ushort __ovld sub_group_non_uniform_reduce_min( ushort value );
|
||||
int __ovld sub_group_non_uniform_reduce_min( int value );
|
||||
uint __ovld sub_group_non_uniform_reduce_min( uint value );
|
||||
long __ovld sub_group_non_uniform_reduce_min( long value );
|
||||
ulong __ovld sub_group_non_uniform_reduce_min( ulong value );
|
||||
float __ovld sub_group_non_uniform_reduce_min( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_reduce_max( char value );
|
||||
uchar __ovld sub_group_non_uniform_reduce_max( uchar value );
|
||||
short __ovld sub_group_non_uniform_reduce_max( short value );
|
||||
ushort __ovld sub_group_non_uniform_reduce_max( ushort value );
|
||||
int __ovld sub_group_non_uniform_reduce_max( int value );
|
||||
uint __ovld sub_group_non_uniform_reduce_max( uint value );
|
||||
long __ovld sub_group_non_uniform_reduce_max( long value );
|
||||
ulong __ovld sub_group_non_uniform_reduce_max( ulong value );
|
||||
float __ovld sub_group_non_uniform_reduce_max( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_inclusive_add( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_inclusive_add( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_inclusive_add( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_inclusive_add( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_add( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_inclusive_add( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_inclusive_add( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_inclusive_add( ulong value );
|
||||
float __ovld sub_group_non_uniform_scan_inclusive_add( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_inclusive_mul( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_inclusive_mul( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_inclusive_mul( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_inclusive_mul( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_mul( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_inclusive_mul( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_inclusive_mul( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_inclusive_mul( ulong value );
|
||||
float __ovld sub_group_non_uniform_scan_inclusive_mul( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_inclusive_min( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_inclusive_min( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_inclusive_min( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_inclusive_min( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_min( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_inclusive_min( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_inclusive_min( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_inclusive_min( ulong value );
|
||||
float __ovld sub_group_non_uniform_scan_inclusive_min( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_inclusive_max( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_inclusive_max( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_inclusive_max( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_inclusive_max( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_max( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_inclusive_max( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_inclusive_max( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_inclusive_max( ulong value );
|
||||
float __ovld sub_group_non_uniform_scan_inclusive_max( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_exclusive_add( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_exclusive_add( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_exclusive_add( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_exclusive_add( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_add( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_exclusive_add( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_exclusive_add( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_exclusive_add( ulong value );
|
||||
float __ovld sub_group_non_uniform_scan_exclusive_add( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_exclusive_mul( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_exclusive_mul( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_exclusive_mul( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_exclusive_mul( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_mul( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_exclusive_mul( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_exclusive_mul( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_exclusive_mul( ulong value );
|
||||
float __ovld sub_group_non_uniform_scan_exclusive_mul( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_exclusive_min( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_exclusive_min( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_exclusive_min( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_exclusive_min( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_min( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_exclusive_min( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_exclusive_min( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_exclusive_min( ulong value );
|
||||
float __ovld sub_group_non_uniform_scan_exclusive_min( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_exclusive_max( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_exclusive_max( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_exclusive_max( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_exclusive_max( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_max( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_exclusive_max( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_exclusive_max( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_exclusive_max( ulong value );
|
||||
float __ovld sub_group_non_uniform_scan_exclusive_max( float value );
|
||||
|
||||
char __ovld sub_group_non_uniform_reduce_and( char value );
|
||||
uchar __ovld sub_group_non_uniform_reduce_and( uchar value );
|
||||
short __ovld sub_group_non_uniform_reduce_and( short value );
|
||||
ushort __ovld sub_group_non_uniform_reduce_and( ushort value );
|
||||
int __ovld sub_group_non_uniform_reduce_and( int value );
|
||||
uint __ovld sub_group_non_uniform_reduce_and( uint value );
|
||||
long __ovld sub_group_non_uniform_reduce_and( long value );
|
||||
ulong __ovld sub_group_non_uniform_reduce_and( ulong value );
|
||||
|
||||
char __ovld sub_group_non_uniform_reduce_or( char value );
|
||||
uchar __ovld sub_group_non_uniform_reduce_or( uchar value );
|
||||
short __ovld sub_group_non_uniform_reduce_or( short value );
|
||||
ushort __ovld sub_group_non_uniform_reduce_or( ushort value );
|
||||
int __ovld sub_group_non_uniform_reduce_or( int value );
|
||||
uint __ovld sub_group_non_uniform_reduce_or( uint value );
|
||||
long __ovld sub_group_non_uniform_reduce_or( long value );
|
||||
ulong __ovld sub_group_non_uniform_reduce_or( ulong value );
|
||||
|
||||
char __ovld sub_group_non_uniform_reduce_xor( char value );
|
||||
uchar __ovld sub_group_non_uniform_reduce_xor( uchar value );
|
||||
short __ovld sub_group_non_uniform_reduce_xor( short value );
|
||||
ushort __ovld sub_group_non_uniform_reduce_xor( ushort value );
|
||||
int __ovld sub_group_non_uniform_reduce_xor( int value );
|
||||
uint __ovld sub_group_non_uniform_reduce_xor( uint value );
|
||||
long __ovld sub_group_non_uniform_reduce_xor( long value );
|
||||
ulong __ovld sub_group_non_uniform_reduce_xor( ulong value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_inclusive_and( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_inclusive_and( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_inclusive_and( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_inclusive_and( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_and( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_inclusive_and( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_inclusive_and( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_inclusive_and( ulong value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_inclusive_or( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_inclusive_or( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_inclusive_or( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_inclusive_or( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_or( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_inclusive_or( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_inclusive_or( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_inclusive_or( ulong value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_inclusive_xor( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_inclusive_xor( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_inclusive_xor( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_inclusive_xor( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_xor( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_inclusive_xor( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_inclusive_xor( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_inclusive_xor( ulong value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_exclusive_and( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_exclusive_and( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_exclusive_and( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_exclusive_and( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_and( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_exclusive_and( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_exclusive_and( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_exclusive_and( ulong value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_exclusive_or( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_exclusive_or( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_exclusive_or( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_exclusive_or( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_or( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_exclusive_or( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_exclusive_or( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_exclusive_or( ulong value );
|
||||
|
||||
char __ovld sub_group_non_uniform_scan_exclusive_xor( char value );
|
||||
uchar __ovld sub_group_non_uniform_scan_exclusive_xor( uchar value );
|
||||
short __ovld sub_group_non_uniform_scan_exclusive_xor( short value );
|
||||
ushort __ovld sub_group_non_uniform_scan_exclusive_xor( ushort value );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_xor( int value );
|
||||
uint __ovld sub_group_non_uniform_scan_exclusive_xor( uint value );
|
||||
long __ovld sub_group_non_uniform_scan_exclusive_xor( long value );
|
||||
ulong __ovld sub_group_non_uniform_scan_exclusive_xor( ulong value );
|
||||
|
||||
int __ovld sub_group_non_uniform_reduce_logical_and( int predicate );
|
||||
int __ovld sub_group_non_uniform_reduce_logical_or( int predicate );
|
||||
int __ovld sub_group_non_uniform_reduce_logical_xor( int predicate );
|
||||
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_logical_and( int predicate );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_logical_or( int predicate );
|
||||
int __ovld sub_group_non_uniform_scan_inclusive_logical_xor( int predicate );
|
||||
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_logical_and( int predicate );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_logical_or( int predicate );
|
||||
int __ovld sub_group_non_uniform_scan_exclusive_logical_xor( int predicate );
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
half __ovld sub_group_non_uniform_reduce_add( half value );
|
||||
half __ovld sub_group_non_uniform_reduce_mul( half value );
|
||||
half __ovld sub_group_non_uniform_reduce_min( half value );
|
||||
half __ovld sub_group_non_uniform_reduce_max( half value );
|
||||
half __ovld sub_group_non_uniform_scan_inclusive_add( half value );
|
||||
half __ovld sub_group_non_uniform_scan_inclusive_mul( half value );
|
||||
half __ovld sub_group_non_uniform_scan_inclusive_min( half value );
|
||||
half __ovld sub_group_non_uniform_scan_inclusive_max( half value );
|
||||
half __ovld sub_group_non_uniform_scan_exclusive_add( half value );
|
||||
half __ovld sub_group_non_uniform_scan_exclusive_mul( half value );
|
||||
half __ovld sub_group_non_uniform_scan_exclusive_min( half value );
|
||||
half __ovld sub_group_non_uniform_scan_exclusive_max( half value );
|
||||
#endif // cl_khr_fp16
|
||||
|
||||
#if defined(cl_khr_fp64)
|
||||
double __ovld sub_group_non_uniform_reduce_add( double value );
|
||||
double __ovld sub_group_non_uniform_reduce_mul( double value );
|
||||
double __ovld sub_group_non_uniform_reduce_min( double value );
|
||||
double __ovld sub_group_non_uniform_reduce_max( double value );
|
||||
double __ovld sub_group_non_uniform_scan_inclusive_add( double value );
|
||||
double __ovld sub_group_non_uniform_scan_inclusive_mul( double value );
|
||||
double __ovld sub_group_non_uniform_scan_inclusive_min( double value );
|
||||
double __ovld sub_group_non_uniform_scan_inclusive_max( double value );
|
||||
double __ovld sub_group_non_uniform_scan_exclusive_add( double value );
|
||||
double __ovld sub_group_non_uniform_scan_exclusive_mul( double value );
|
||||
double __ovld sub_group_non_uniform_scan_exclusive_min( double value );
|
||||
double __ovld sub_group_non_uniform_scan_exclusive_max( double value );
|
||||
#endif // cl_khr_fp64
|
||||
|
||||
#endif // cl_khr_subgroup_non_uniform_arithmetic
|
||||
|
||||
#if defined(cl_khr_subgroup_shuffle)
|
||||
char __ovld sub_group_shuffle( char value, uint index );
|
||||
uchar __ovld sub_group_shuffle( uchar value, uint index );
|
||||
short __ovld sub_group_shuffle( short value, uint index );
|
||||
ushort __ovld sub_group_shuffle( ushort value, uint index );
|
||||
int __ovld sub_group_shuffle( int value, uint index );
|
||||
uint __ovld sub_group_shuffle( uint value, uint index );
|
||||
long __ovld sub_group_shuffle( long value, uint index );
|
||||
ulong __ovld sub_group_shuffle( ulong value, uint index );
|
||||
float __ovld sub_group_shuffle( float value, uint index );
|
||||
|
||||
char __ovld sub_group_shuffle_xor( char value, uint mask );
|
||||
uchar __ovld sub_group_shuffle_xor( uchar value, uint mask );
|
||||
short __ovld sub_group_shuffle_xor( short value, uint mask );
|
||||
ushort __ovld sub_group_shuffle_xor( ushort value, uint mask );
|
||||
int __ovld sub_group_shuffle_xor( int value, uint mask );
|
||||
uint __ovld sub_group_shuffle_xor( uint value, uint mask );
|
||||
long __ovld sub_group_shuffle_xor( long value, uint mask );
|
||||
ulong __ovld sub_group_shuffle_xor( ulong value, uint mask );
|
||||
float __ovld sub_group_shuffle_xor( float value, uint mask );
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
half __ovld sub_group_shuffle( half value, uint index );
|
||||
half __ovld sub_group_shuffle_xor( half value, uint mask );
|
||||
#endif // cl_khr_fp16
|
||||
|
||||
#if defined(cl_khr_fp64)
|
||||
double __ovld sub_group_shuffle( double value, uint index );
|
||||
double __ovld sub_group_shuffle_xor( double value, uint mask );
|
||||
#endif // cl_khr_fp64
|
||||
|
||||
#endif // cl_khr_subgroup_shuffle
|
||||
|
||||
#if defined(cl_khr_subgroup_shuffle_relative)
|
||||
char __ovld sub_group_shuffle_up( char value, uint delta );
|
||||
uchar __ovld sub_group_shuffle_up( uchar value, uint delta );
|
||||
short __ovld sub_group_shuffle_up( short value, uint delta );
|
||||
ushort __ovld sub_group_shuffle_up( ushort value, uint delta );
|
||||
int __ovld sub_group_shuffle_up( int value, uint delta );
|
||||
uint __ovld sub_group_shuffle_up( uint value, uint delta );
|
||||
long __ovld sub_group_shuffle_up( long value, uint delta );
|
||||
ulong __ovld sub_group_shuffle_up( ulong value, uint delta );
|
||||
float __ovld sub_group_shuffle_up( float value, uint delta );
|
||||
|
||||
char __ovld sub_group_shuffle_down( char value, uint delta );
|
||||
uchar __ovld sub_group_shuffle_down( uchar value, uint delta );
|
||||
short __ovld sub_group_shuffle_down( short value, uint delta );
|
||||
ushort __ovld sub_group_shuffle_down( ushort value, uint delta );
|
||||
int __ovld sub_group_shuffle_down( int value, uint delta );
|
||||
uint __ovld sub_group_shuffle_down( uint value, uint delta );
|
||||
long __ovld sub_group_shuffle_down( long value, uint delta );
|
||||
ulong __ovld sub_group_shuffle_down( ulong value, uint delta );
|
||||
float __ovld sub_group_shuffle_down( float value, uint delta );
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
half __ovld sub_group_shuffle_up( half value, uint delta );
|
||||
half __ovld sub_group_shuffle_down( half value, uint delta );
|
||||
#endif // cl_khr_fp16
|
||||
|
||||
#if defined(cl_khr_fp64)
|
||||
double __ovld sub_group_shuffle_up( double value, uint delta );
|
||||
double __ovld sub_group_shuffle_down( double value, uint delta );
|
||||
#endif // cl_khr_fp64
|
||||
|
||||
#endif // cl_khr_subgroup_shuffle_relative
|
||||
|
||||
#if defined(cl_khr_subgroup_clustered_reduce)
|
||||
char __ovld sub_group_clustered_reduce_add( char value, uint clustersize );
|
||||
uchar __ovld sub_group_clustered_reduce_add( uchar value, uint clustersize );
|
||||
short __ovld sub_group_clustered_reduce_add( short value, uint clustersize );
|
||||
ushort __ovld sub_group_clustered_reduce_add( ushort value, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_add( int value, uint clustersize );
|
||||
uint __ovld sub_group_clustered_reduce_add( uint value, uint clustersize );
|
||||
long __ovld sub_group_clustered_reduce_add( long value, uint clustersize );
|
||||
ulong __ovld sub_group_clustered_reduce_add( ulong value, uint clustersize );
|
||||
float __ovld sub_group_clustered_reduce_add( float value, uint clustersize );
|
||||
|
||||
char __ovld sub_group_clustered_reduce_mul( char value, uint clustersize );
|
||||
uchar __ovld sub_group_clustered_reduce_mul( uchar value, uint clustersize );
|
||||
short __ovld sub_group_clustered_reduce_mul( short value, uint clustersize );
|
||||
ushort __ovld sub_group_clustered_reduce_mul( ushort value, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_mul( int value, uint clustersize );
|
||||
uint __ovld sub_group_clustered_reduce_mul( uint value, uint clustersize );
|
||||
long __ovld sub_group_clustered_reduce_mul( long value, uint clustersize );
|
||||
ulong __ovld sub_group_clustered_reduce_mul( ulong value, uint clustersize );
|
||||
float __ovld sub_group_clustered_reduce_mul( float value, uint clustersize );
|
||||
|
||||
char __ovld sub_group_clustered_reduce_min( char value, uint clustersize );
|
||||
uchar __ovld sub_group_clustered_reduce_min( uchar value, uint clustersize );
|
||||
short __ovld sub_group_clustered_reduce_min( short value, uint clustersize );
|
||||
ushort __ovld sub_group_clustered_reduce_min( ushort value, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_min( int value, uint clustersize );
|
||||
uint __ovld sub_group_clustered_reduce_min( uint value, uint clustersize );
|
||||
long __ovld sub_group_clustered_reduce_min( long value, uint clustersize );
|
||||
ulong __ovld sub_group_clustered_reduce_min( ulong value, uint clustersize );
|
||||
float __ovld sub_group_clustered_reduce_min( float value, uint clustersize );
|
||||
|
||||
char __ovld sub_group_clustered_reduce_max( char value, uint clustersize );
|
||||
uchar __ovld sub_group_clustered_reduce_max( uchar value, uint clustersize );
|
||||
short __ovld sub_group_clustered_reduce_max( short value, uint clustersize );
|
||||
ushort __ovld sub_group_clustered_reduce_max( ushort value, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_max( int value, uint clustersize );
|
||||
uint __ovld sub_group_clustered_reduce_max( uint value, uint clustersize );
|
||||
long __ovld sub_group_clustered_reduce_max( long value, uint clustersize );
|
||||
ulong __ovld sub_group_clustered_reduce_max( ulong value, uint clustersize );
|
||||
float __ovld sub_group_clustered_reduce_max( float value, uint clustersize );
|
||||
|
||||
char __ovld sub_group_clustered_reduce_and( char value, uint clustersize );
|
||||
uchar __ovld sub_group_clustered_reduce_and( uchar value, uint clustersize );
|
||||
short __ovld sub_group_clustered_reduce_and( short value, uint clustersize );
|
||||
ushort __ovld sub_group_clustered_reduce_and( ushort value, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_and( int value, uint clustersize );
|
||||
uint __ovld sub_group_clustered_reduce_and( uint value, uint clustersize );
|
||||
long __ovld sub_group_clustered_reduce_and( long value, uint clustersize );
|
||||
ulong __ovld sub_group_clustered_reduce_and( ulong value, uint clustersize );
|
||||
|
||||
char __ovld sub_group_clustered_reduce_or( char value, uint clustersize );
|
||||
uchar __ovld sub_group_clustered_reduce_or( uchar value, uint clustersize );
|
||||
short __ovld sub_group_clustered_reduce_or( short value, uint clustersize );
|
||||
ushort __ovld sub_group_clustered_reduce_or( ushort value, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_or( int value, uint clustersize );
|
||||
uint __ovld sub_group_clustered_reduce_or( uint value, uint clustersize );
|
||||
long __ovld sub_group_clustered_reduce_or( long value, uint clustersize );
|
||||
ulong __ovld sub_group_clustered_reduce_or( ulong value, uint clustersize );
|
||||
|
||||
char __ovld sub_group_clustered_reduce_xor( char value, uint clustersize );
|
||||
uchar __ovld sub_group_clustered_reduce_xor( uchar value, uint clustersize );
|
||||
short __ovld sub_group_clustered_reduce_xor( short value, uint clustersize );
|
||||
ushort __ovld sub_group_clustered_reduce_xor( ushort value, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_xor( int value, uint clustersize );
|
||||
uint __ovld sub_group_clustered_reduce_xor( uint value, uint clustersize );
|
||||
long __ovld sub_group_clustered_reduce_xor( long value, uint clustersize );
|
||||
ulong __ovld sub_group_clustered_reduce_xor( ulong value, uint clustersize );
|
||||
|
||||
int __ovld sub_group_clustered_reduce_logical_and( int predicate, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_logical_or( int predicate, uint clustersize );
|
||||
int __ovld sub_group_clustered_reduce_logical_xor( int predicate, uint clustersize );
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
half __ovld sub_group_clustered_reduce_add( half value, uint clustersize );
|
||||
half __ovld sub_group_clustered_reduce_mul( half value, uint clustersize );
|
||||
half __ovld sub_group_clustered_reduce_min( half value, uint clustersize );
|
||||
half __ovld sub_group_clustered_reduce_max( half value, uint clustersize );
|
||||
#endif // cl_khr_fp16
|
||||
|
||||
#if defined(cl_khr_fp64)
|
||||
double __ovld sub_group_clustered_reduce_add( double value, uint clustersize );
|
||||
double __ovld sub_group_clustered_reduce_mul( double value, uint clustersize );
|
||||
double __ovld sub_group_clustered_reduce_min( double value, uint clustersize );
|
||||
double __ovld sub_group_clustered_reduce_max( double value, uint clustersize );
|
||||
#endif // cl_khr_fp64
|
||||
|
||||
#endif // cl_khr_subgroup_clustered_reduce
|
||||
|
||||
#if defined(cl_intel_subgroups)
|
||||
// Intel-Specific Sub Group Functions
|
||||
float __ovld __conv intel_sub_group_shuffle( float x, uint c );
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
/*===---- __clang_openmp_math_declares.h - OpenMP math declares ------------===
|
||||
/*===- __clang_openmp_device_functions.h - OpenMP device function declares -===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
|
|
@ -7,27 +7,36 @@
|
|||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_OPENMP_MATH_DECLARES_H__
|
||||
#define __CLANG_OPENMP_MATH_DECLARES_H__
|
||||
#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
|
||||
#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
|
||||
|
||||
#ifndef _OPENMP
|
||||
#error "This file is for OpenMP compilation only."
|
||||
#endif
|
||||
|
||||
#if defined(__NVPTX__) && defined(_OPENMP)
|
||||
#pragma omp begin declare variant match( \
|
||||
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define __CUDA__
|
||||
|
||||
#if defined(__cplusplus)
|
||||
#include <__clang_cuda_math_forward_declares.h>
|
||||
#endif
|
||||
#define __OPENMP_NVPTX__
|
||||
|
||||
/// Include declarations for libdevice functions.
|
||||
#include <__clang_cuda_libdevice_declares.h>
|
||||
|
||||
/// Provide definitions for these functions.
|
||||
#include <__clang_cuda_device_functions.h>
|
||||
|
||||
#undef __OPENMP_NVPTX__
|
||||
#undef __CUDA__
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#pragma omp end declare variant
|
||||
|
||||
#endif
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
/*===---- __clang_openmp_math.h - OpenMP target math support ---------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#if defined(__NVPTX__) && defined(_OPENMP)
|
||||
/// TODO:
|
||||
/// We are currently reusing the functionality of the Clang-CUDA code path
|
||||
/// as an alternative to the host declarations provided by math.h and cmath.
|
||||
/// This is suboptimal.
|
||||
///
|
||||
/// We should instead declare the device functions in a similar way, e.g.,
|
||||
/// through OpenMP 5.0 variants, and afterwards populate the module with the
|
||||
/// host declarations by unconditionally including the host math.h or cmath,
|
||||
/// respectively. This is actually what the Clang-CUDA code path does, using
|
||||
/// __device__ instead of variants to avoid redeclarations and get the desired
|
||||
/// overload resolution.
|
||||
|
||||
#define __CUDA__
|
||||
|
||||
#if defined(__cplusplus)
|
||||
#include <__clang_cuda_cmath.h>
|
||||
#endif
|
||||
|
||||
#undef __CUDA__
|
||||
|
||||
/// Magic macro for stopping the math.h/cmath host header from being included.
|
||||
#define __CLANG_NO_HOST_MATH__
|
||||
|
||||
#endif
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
/*===-------------- cmath - Alternative cmath header -----------------------===
|
||||
/*===---- openmp_wrapper/cmath --------- OpenMP cmath intercept ------- c++ -===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
|
|
@ -7,10 +7,69 @@
|
|||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#include <__clang_openmp_math.h>
|
||||
#ifndef __CLANG_OPENMP_CMATH_H__
|
||||
#define __CLANG_OPENMP_CMATH_H__
|
||||
|
||||
#ifndef _OPENMP
|
||||
#error "This file is for OpenMP compilation only."
|
||||
#endif
|
||||
|
||||
#include_next <cmath>
|
||||
|
||||
// Make sure we include our math.h overlay; it probably happened already but we
|
||||
// need to be sure.
|
||||
#include <math.h>
|
||||
|
||||
// We (might) need cstdlib because __clang_cuda_cmath.h below declares `abs`
|
||||
// which might live in cstdlib.
|
||||
#include <cstdlib>
|
||||
|
||||
#pragma omp begin declare variant match( \
|
||||
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
|
||||
|
||||
#define __CUDA__
|
||||
#define __OPENMP_NVPTX__
|
||||
#include <__clang_cuda_cmath.h>
|
||||
#undef __OPENMP_NVPTX__
|
||||
#undef __CUDA__
|
||||
|
||||
// Overloads not provided by the CUDA wrappers but by the CUDA system headers.
|
||||
// Since we do not include the latter we define them ourselves.
|
||||
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
|
||||
|
||||
// Float overloads of the <cmath> names below. Every overload simply forwards
// its argument(s) to the global function carrying the `f` suffix.

__DEVICE__ float acosh(float __x) {
  return ::acoshf(__x);
}

__DEVICE__ float asinh(float __x) {
  return ::asinhf(__x);
}

__DEVICE__ float atanh(float __x) {
  return ::atanhf(__x);
}

__DEVICE__ float cbrt(float __x) {
  return ::cbrtf(__x);
}

__DEVICE__ float erf(float __x) {
  return ::erff(__x);
}

__DEVICE__ float erfc(float __x) {
  return ::erfcf(__x);
}

__DEVICE__ float exp2(float __x) {
  return ::exp2f(__x);
}

__DEVICE__ float expm1(float __x) {
  return ::expm1f(__x);
}

__DEVICE__ float fdim(float __x, float __y) {
  return ::fdimf(__x, __y);
}

__DEVICE__ float hypot(float __x, float __y) {
  return ::hypotf(__x, __y);
}

__DEVICE__ int ilogb(float __x) {
  return ::ilogbf(__x);
}

__DEVICE__ float lgamma(float __x) {
  return ::lgammaf(__x);
}

__DEVICE__ long long int llrint(float __x) {
  return ::llrintf(__x);
}

__DEVICE__ long long int llround(float __x) {
  return ::llroundf(__x);
}

__DEVICE__ float log1p(float __x) {
  return ::log1pf(__x);
}

__DEVICE__ float log2(float __x) {
  return ::log2f(__x);
}

__DEVICE__ float logb(float __x) {
  return ::logbf(__x);
}

__DEVICE__ long int lrint(float __x) {
  return ::lrintf(__x);
}

__DEVICE__ long int lround(float __x) {
  return ::lroundf(__x);
}

__DEVICE__ float nextafter(float __x, float __y) {
  return ::nextafterf(__x, __y);
}

__DEVICE__ float remainder(float __x, float __y) {
  return ::remainderf(__x, __y);
}

__DEVICE__ float scalbln(float __x, long int __y) {
  return ::scalblnf(__x, __y);
}

__DEVICE__ float scalbn(float __x, int __y) {
  return ::scalbnf(__x, __y);
}

__DEVICE__ float tgamma(float __x) {
  return ::tgammaf(__x);
}
|
||||
|
||||
#undef __DEVICE__
|
||||
|
||||
#pragma omp end declare variant
|
||||
|
||||
#ifndef __CLANG_NO_HOST_MATH__
|
||||
#include_next <cmath>
|
||||
#else
|
||||
#undef __CLANG_NO_HOST_MATH__
|
||||
#endif
|
||||
|
|
|
|||
25
lib/include/openmp_wrappers/complex
Normal file
25
lib/include/openmp_wrappers/complex
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
/*===-- complex --- OpenMP complex wrapper for target regions --------- c++ -===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_OPENMP_COMPLEX__
|
||||
#define __CLANG_OPENMP_COMPLEX__
|
||||
|
||||
#ifndef _OPENMP
|
||||
#error "This file is for OpenMP compilation only."
|
||||
#endif
|
||||
|
||||
// We require the std:: math functions in the complex builtins below.
|
||||
#include <cmath>
|
||||
|
||||
#define __CUDA__
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
#endif
|
||||
|
||||
// Grab the host header too.
|
||||
#include_next <complex>
|
||||
25
lib/include/openmp_wrappers/complex.h
Normal file
25
lib/include/openmp_wrappers/complex.h
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
/*===-- complex.h - OpenMP complex wrapper for target regions --------- c++ -===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __CLANG_OPENMP_COMPLEX_H__
|
||||
#define __CLANG_OPENMP_COMPLEX_H__
|
||||
|
||||
#ifndef _OPENMP
|
||||
#error "This file is for OpenMP compilation only."
|
||||
#endif
|
||||
|
||||
// We require math functions in the complex builtins below.
|
||||
#include <math.h>
|
||||
|
||||
#define __CUDA__
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
#endif
|
||||
|
||||
// Grab the host header too.
|
||||
#include_next <complex.h>
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
/*===------------- math.h - Alternative math.h header ----------------------===
|
||||
/*===---- openmp_wrapper/math.h -------- OpenMP math.h intercept ------ c++ -===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
|
|
@ -7,11 +7,45 @@
|
|||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#include <__clang_openmp_math.h>
|
||||
|
||||
#ifndef __CLANG_NO_HOST_MATH__
|
||||
#include_next <math.h>
|
||||
#else
|
||||
#undef __CLANG_NO_HOST_MATH__
|
||||
// If we are in C++ mode and include <math.h> (not <cmath>) first, we still need
|
||||
// to make sure <cmath> is read first. The problem otherwise is that we haven't
|
||||
// seen the declarations of the math.h functions when the system math.h includes
|
||||
// our cmath overlay. However, our cmath overlay, or better the underlying
|
||||
// overlay, e.g. CUDA, uses the math.h functions. Since we haven't declared them
|
||||
// yet we get errors. CUDA avoids this by eagerly declaring all math functions
|
||||
// (in the __device__ space) but we cannot do this. Instead we break the
|
||||
// dependence by forcing cmath to go first. While our cmath will in turn include
|
||||
// this file, the cmath guards will prevent recursion.
|
||||
#ifdef __cplusplus
|
||||
#include <cmath>
|
||||
#endif
|
||||
|
||||
#ifndef __CLANG_OPENMP_MATH_H__
|
||||
#define __CLANG_OPENMP_MATH_H__
|
||||
|
||||
#ifndef _OPENMP
|
||||
#error "This file is for OpenMP compilation only."
|
||||
#endif
|
||||
|
||||
#include_next <math.h>
|
||||
|
||||
// We need limits.h for __clang_cuda_math.h below and, because it should not hurt,
|
||||
// we include it eagerly here.
|
||||
#include <limits.h>
|
||||
|
||||
// We need stdlib.h because (for now) __clang_cuda_math.h below declares `abs`
|
||||
// which should live in stdlib.h.
|
||||
#include <stdlib.h>
|
||||
|
||||
#pragma omp begin declare variant match( \
|
||||
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
|
||||
|
||||
#define __CUDA__
|
||||
#define __OPENMP_NVPTX__
|
||||
#include <__clang_cuda_math.h>
|
||||
#undef __OPENMP_NVPTX__
|
||||
#undef __CUDA__
|
||||
|
||||
#pragma omp end declare variant
|
||||
|
||||
#endif
|
||||
|
|
|
|||
70
lib/include/openmp_wrappers/new
Normal file
70
lib/include/openmp_wrappers/new
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
//===--------- new - OPENMP wrapper for <new> ------------------------------===
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===-----------------------------------------------------------------------===
|
||||
|
||||
#ifndef __CLANG_OPENMP_WRAPPERS_NEW
|
||||
#define __CLANG_OPENMP_WRAPPERS_NEW
|
||||
|
||||
#include_next <new>
|
||||
|
||||
#if defined(__NVPTX__) && defined(_OPENMP)
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#pragma push_macro("OPENMP_NOEXCEPT")
|
||||
#if __cplusplus >= 201103L
|
||||
#define OPENMP_NOEXCEPT noexcept
|
||||
#else
|
||||
#define OPENMP_NOEXCEPT
|
||||
#endif
|
||||
|
||||
// Device overrides for non-placement new and delete.
|
||||
// Scalar operator new backed by ::malloc. A zero-byte request is bumped to a
// one-byte request before allocating (presumably so the call still yields a
// usable, distinct pointer -- confirm against the target's malloc).
inline void *operator new(__SIZE_TYPE__ size) {
  const __SIZE_TYPE__ bytes = (size == 0) ? 1 : size;
  return ::malloc(bytes);
}
|
||||
// Nothrow-tagged scalar operator new. The tag argument is ignored and the
// request is forwarded to the plain ::operator new(size).
inline void *operator new(__SIZE_TYPE__ size,
                          const std::nothrow_t &) OPENMP_NOEXCEPT {
  void *mem = ::operator new(size);
  return mem;
}
|
||||
|
||||
// Array operator new: forwards the byte count unchanged to the scalar
// ::operator new.
inline void *operator new[](__SIZE_TYPE__ size) {
  void *mem = ::operator new(size);
  return mem;
}
|
||||
inline void *operator new[](__SIZE_TYPE__ size, const std::nothrow_t &) {
|
||||
return ::operator new(size);
|
||||
}
|
||||
|
||||
// Scalar operator delete backed by ::free. Null pointers are ignored; any
// other pointer is handed to ::free.
inline void operator delete(void *ptr) OPENMP_NOEXCEPT {
  if (!ptr)
    return;
  ::free(ptr);
}
|
||||
// Nothrow-tagged scalar operator delete. The tag argument is ignored and the
// pointer is forwarded to the plain ::operator delete.
inline void operator delete(void *ptr,
                            const std::nothrow_t &) OPENMP_NOEXCEPT {
  ::operator delete(ptr);
}
|
||||
|
||||
// Array operator delete: forwards the pointer to the scalar ::operator delete.
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT
{
  ::operator delete(ptr);
}
|
||||
// Nothrow-tagged array operator delete. The tag argument is ignored and the
// pointer is forwarded to the scalar ::operator delete.
inline void operator delete[](void *ptr, const std::nothrow_t &) OPENMP_NOEXCEPT
{
  ::operator delete(ptr);
}
|
||||
|
||||
// Sized delete, C++14 only.
|
||||
#if __cplusplus >= 201402L
|
||||
// Sized scalar operator delete. The size argument is not used; the pointer is
// forwarded to the unsized ::operator delete.
inline void operator delete(void *ptr,
                            __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
  ::operator delete(ptr);
}
|
||||
// Sized array operator delete. The size argument is not used; the pointer is
// forwarded to the unsized scalar ::operator delete.
inline void operator delete[](void *ptr,
                              __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
  ::operator delete(ptr);
}
|
||||
#endif
|
||||
|
||||
#pragma pop_macro("OPENMP_NOEXCEPT")
|
||||
#endif
|
||||
|
||||
#endif // include guard
|
||||
30
lib/include/serializeintrin.h
vendored
Normal file
30
lib/include/serializeintrin.h
vendored
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
/*===--------------- serializeintrin.h - serialize intrinsics --------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <serializeintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __SERIALIZEINTRIN_H
|
||||
#define __SERIALIZEINTRIN_H
|
||||
|
||||
/// Serialize instruction fetch and execution.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> SERIALIZE </c> instruction.
|
||||
///
|
||||
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("serialize")))
    _serialize(void) {
  /* Thin wrapper: the whole body is a single call to the compiler builtin. */
  __builtin_ia32_serialize();
}
|
||||
|
||||
#endif /* __SERIALIZEINTRIN_H */
|
||||
56
lib/include/tsxldtrkintrin.h
vendored
Normal file
56
lib/include/tsxldtrkintrin.h
vendored
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
/*===------------- tsxldtrkintrin.h - tsxldtrk intrinsics ------------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <tsxldtrkintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __TSXLDTRKINTRIN_H
|
||||
#define __TSXLDTRKINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file */
|
||||
#define _DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("tsxldtrk")))
|
||||
|
||||
/// Marks the start of a TSX (RTM) suspend load address tracking region. If
|
||||
/// this intrinsic is used inside a transactional region, subsequent loads
|
||||
/// are not added to the read set of the transaction. If it's used inside a
|
||||
/// suspend load address tracking region it will cause transaction abort.
|
||||
/// If it's used outside of a transactional region it behaves like a NOP.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c XSUSLDTRK instruction.
|
||||
///
|
||||
static __inline__ void _DEFAULT_FN_ATTRS
|
||||
_xsusldtrk (void)
|
||||
{
|
||||
__builtin_ia32_xsusldtrk();
|
||||
}
|
||||
|
||||
/// Marks the end of a TSX (RTM) suspend load address tracking region. If this
|
||||
/// intrinsic is used inside a suspend load address tracking region it will
|
||||
/// end the suspend region and all following load addresses will be added to
|
||||
/// the transaction read set. If it's used inside an active transaction but
|
||||
/// not in a suspend region it will cause transaction abort. If it's used
|
||||
/// outside of a transactional region it behaves like a NOP.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c XRESLDTRK instruction.
|
||||
///
|
||||
static __inline__ void _DEFAULT_FN_ATTRS
|
||||
_xresldtrk (void)
|
||||
{
|
||||
__builtin_ia32_xresldtrk();
|
||||
}
|
||||
|
||||
#undef _DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* __TSXLDTRKINTRIN_H */
|
||||
8994
lib/include/vecintrin.h
vendored
8994
lib/include/vecintrin.h
vendored
File diff suppressed because it is too large
Load diff
1133
lib/include/wasm_simd128.h
vendored
Normal file
1133
lib/include/wasm_simd128.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
27
lib/include/x86intrin.h
vendored
27
lib/include/x86intrin.h
vendored
|
|
@ -14,39 +14,48 @@
|
|||
|
||||
#include <immintrin.h>
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__3dNOW__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__3dNOW__)
|
||||
#include <mm3dnow.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__PRFCHW__)
|
||||
#include <prfchwintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE4A__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__SSE4A__)
|
||||
#include <ammintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA4__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__FMA4__)
|
||||
#include <fma4intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XOP__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__XOP__)
|
||||
#include <xopintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__TBM__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__TBM__)
|
||||
#include <tbmintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LWP__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__LWP__)
|
||||
#include <lwpintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__MWAITX__)
|
||||
#include <mwaitxintrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLZERO__)
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__CLZERO__)
|
||||
#include <clzerointrin.h>
|
||||
#endif
|
||||
|
||||
|
|
|
|||
44
lib/include/xmmintrin.h
vendored
44
lib/include/xmmintrin.h
vendored
|
|
@ -2931,31 +2931,31 @@ _mm_movemask_ps(__m128 __a)
|
|||
|
||||
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
|
||||
|
||||
#define _MM_EXCEPT_INVALID (0x0001)
|
||||
#define _MM_EXCEPT_DENORM (0x0002)
|
||||
#define _MM_EXCEPT_DIV_ZERO (0x0004)
|
||||
#define _MM_EXCEPT_OVERFLOW (0x0008)
|
||||
#define _MM_EXCEPT_UNDERFLOW (0x0010)
|
||||
#define _MM_EXCEPT_INEXACT (0x0020)
|
||||
#define _MM_EXCEPT_MASK (0x003f)
|
||||
#define _MM_EXCEPT_INVALID (0x0001U)
|
||||
#define _MM_EXCEPT_DENORM (0x0002U)
|
||||
#define _MM_EXCEPT_DIV_ZERO (0x0004U)
|
||||
#define _MM_EXCEPT_OVERFLOW (0x0008U)
|
||||
#define _MM_EXCEPT_UNDERFLOW (0x0010U)
|
||||
#define _MM_EXCEPT_INEXACT (0x0020U)
|
||||
#define _MM_EXCEPT_MASK (0x003fU)
|
||||
|
||||
#define _MM_MASK_INVALID (0x0080)
|
||||
#define _MM_MASK_DENORM (0x0100)
|
||||
#define _MM_MASK_DIV_ZERO (0x0200)
|
||||
#define _MM_MASK_OVERFLOW (0x0400)
|
||||
#define _MM_MASK_UNDERFLOW (0x0800)
|
||||
#define _MM_MASK_INEXACT (0x1000)
|
||||
#define _MM_MASK_MASK (0x1f80)
|
||||
#define _MM_MASK_INVALID (0x0080U)
|
||||
#define _MM_MASK_DENORM (0x0100U)
|
||||
#define _MM_MASK_DIV_ZERO (0x0200U)
|
||||
#define _MM_MASK_OVERFLOW (0x0400U)
|
||||
#define _MM_MASK_UNDERFLOW (0x0800U)
|
||||
#define _MM_MASK_INEXACT (0x1000U)
|
||||
#define _MM_MASK_MASK (0x1f80U)
|
||||
|
||||
#define _MM_ROUND_NEAREST (0x0000)
|
||||
#define _MM_ROUND_DOWN (0x2000)
|
||||
#define _MM_ROUND_UP (0x4000)
|
||||
#define _MM_ROUND_TOWARD_ZERO (0x6000)
|
||||
#define _MM_ROUND_MASK (0x6000)
|
||||
#define _MM_ROUND_NEAREST (0x0000U)
|
||||
#define _MM_ROUND_DOWN (0x2000U)
|
||||
#define _MM_ROUND_UP (0x4000U)
|
||||
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
|
||||
#define _MM_ROUND_MASK (0x6000U)
|
||||
|
||||
#define _MM_FLUSH_ZERO_MASK (0x8000)
|
||||
#define _MM_FLUSH_ZERO_ON (0x8000)
|
||||
#define _MM_FLUSH_ZERO_OFF (0x0000)
|
||||
#define _MM_FLUSH_ZERO_MASK (0x8000U)
|
||||
#define _MM_FLUSH_ZERO_ON (0x8000U)
|
||||
#define _MM_FLUSH_ZERO_OFF (0x0000U)
|
||||
|
||||
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
|
||||
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue