mirror of
https://codeberg.org/ziglang/zig.git
synced 2025-12-06 05:44:20 +00:00
update C header files to clang 13 rc1
This commit is contained in:
parent
78ff2a148a
commit
21606339af
34 changed files with 156298 additions and 7363 deletions
4
lib/include/__clang_cuda_math.h
vendored
4
lib/include/__clang_cuda_math.h
vendored
|
|
@ -166,6 +166,8 @@ __DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
|
|||
__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
|
||||
__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
|
||||
__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
|
||||
__DEVICE__ double round(double __a) { return __nv_round(__a); }
|
||||
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
|
||||
__DEVICE__ double log(double __a) { return __nv_log(__a); }
|
||||
__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
|
||||
__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
|
||||
|
|
@ -270,8 +272,6 @@ __DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
|
|||
__DEVICE__ float rnormf(int __dim, const float *__t) {
|
||||
return __nv_rnormf(__dim, __t);
|
||||
}
|
||||
__DEVICE__ double round(double __a) { return __nv_round(__a); }
|
||||
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
|
||||
__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
|
||||
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
|
||||
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
|
||||
|
|
|
|||
7
lib/include/__clang_cuda_runtime_wrapper.h
vendored
7
lib/include/__clang_cuda_runtime_wrapper.h
vendored
|
|
@ -349,9 +349,14 @@ extern "C" {
|
|||
__device__ int vprintf(const char *, const char *);
|
||||
__device__ void free(void *) __attribute((nothrow));
|
||||
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
|
||||
|
||||
// __assertfail() used to have a `noreturn` attribute. Unfortunately that
|
||||
// contributed to triggering the longstanding bug in ptxas when assert was used
|
||||
// in sufficiently convoluted code. See
|
||||
// https://bugs.llvm.org/show_bug.cgi?id=27738 for the details.
|
||||
__device__ void __assertfail(const char *__message, const char *__file,
|
||||
unsigned __line, const char *__function,
|
||||
size_t __charSize) __attribute__((noreturn));
|
||||
size_t __charSize);
|
||||
|
||||
// In order for standard assert() macro on linux to work we need to
|
||||
// provide device-side __assert_fail()
|
||||
|
|
|
|||
238
lib/include/__clang_hip_cmath.h
vendored
238
lib/include/__clang_hip_cmath.h
vendored
|
|
@ -14,6 +14,7 @@
|
|||
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
|
||||
#endif
|
||||
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#if defined(__cplusplus)
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
|
|
@ -21,6 +22,7 @@
|
|||
#endif
|
||||
#include <limits.h>
|
||||
#include <stdint.h>
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
|
||||
|
|
@ -34,6 +36,9 @@ __DEVICE__ long abs(long __n) { return ::labs(__n); }
|
|||
__DEVICE__ float fma(float __x, float __y, float __z) {
|
||||
return ::fmaf(__x, __y, __z);
|
||||
}
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
// The value returned by fpclassify is platform dependent, therefore it is not
|
||||
// supported by hipRTC.
|
||||
__DEVICE__ int fpclassify(float __x) {
|
||||
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
|
||||
FP_ZERO, __x);
|
||||
|
|
@ -42,11 +47,51 @@ __DEVICE__ int fpclassify(double __x) {
|
|||
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
|
||||
FP_ZERO, __x);
|
||||
}
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
__DEVICE__ float frexp(float __arg, int *__exp) {
|
||||
return ::frexpf(__arg, __exp);
|
||||
}
|
||||
|
||||
#if defined(__OPENMP_AMDGCN__)
|
||||
// For OpenMP we work around some old system headers that have non-conforming
|
||||
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
|
||||
// this by providing two versions of these functions, differing only in the
|
||||
// return type. To avoid conflicting definitions we disable implicit base
|
||||
// function generation. That means we will end up with two specializations, one
|
||||
// per type, but only one has a base function defined by the system header.
|
||||
#pragma omp begin declare variant match( \
|
||||
implementation = {extension(disable_implicit_base)})
|
||||
|
||||
// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
|
||||
// add a suffix. This means we would clash with the names of the variants
|
||||
// (note that we do not create implicit base functions here). To avoid
|
||||
// this clash we add a new trait to some of them that is always true
|
||||
// (this is LLVM after all ;)). It will only influence the mangled name
|
||||
// of the variants inside the inner region and avoid the clash.
|
||||
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
|
||||
|
||||
__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
|
||||
__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
|
||||
__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
|
||||
__DEVICE__ int isfinite(double __x) { return ::__finite(__x); }
|
||||
__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
|
||||
__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
|
||||
|
||||
#pragma omp end declare variant
|
||||
#endif // defined(__OPENMP_AMDGCN__)
|
||||
|
||||
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
|
||||
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
|
||||
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
|
||||
__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
|
||||
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
|
||||
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
|
||||
|
||||
#if defined(__OPENMP_AMDGCN__)
|
||||
#pragma omp end declare variant
|
||||
#endif // defined(__OPENMP_AMDGCN__)
|
||||
|
||||
__DEVICE__ bool isgreater(float __x, float __y) {
|
||||
return __builtin_isgreater(__x, __y);
|
||||
}
|
||||
|
|
@ -59,8 +104,6 @@ __DEVICE__ bool isgreaterequal(float __x, float __y) {
|
|||
__DEVICE__ bool isgreaterequal(double __x, double __y) {
|
||||
return __builtin_isgreaterequal(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
|
||||
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
|
||||
__DEVICE__ bool isless(float __x, float __y) {
|
||||
return __builtin_isless(__x, __y);
|
||||
}
|
||||
|
|
@ -79,8 +122,6 @@ __DEVICE__ bool islessgreater(float __x, float __y) {
|
|||
__DEVICE__ bool islessgreater(double __x, double __y) {
|
||||
return __builtin_islessgreater(__x, __y);
|
||||
}
|
||||
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
|
||||
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
|
||||
__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
|
||||
__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
|
||||
__DEVICE__ bool isunordered(float __x, float __y) {
|
||||
|
|
@ -207,11 +248,117 @@ template <bool __B, class __T = void> struct __hip_enable_if {};
|
|||
|
||||
template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
|
||||
|
||||
namespace __hip {
|
||||
template <class _Tp> struct is_integral {
|
||||
enum { value = 0 };
|
||||
};
|
||||
template <> struct is_integral<bool> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<char> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<signed char> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<unsigned char> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<wchar_t> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<short> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<unsigned short> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<int> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<unsigned int> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<long> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<unsigned long> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<long long> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_integral<unsigned long long> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
|
||||
// ToDo: specializes is_arithmetic<_Float16>
|
||||
template <class _Tp> struct is_arithmetic {
|
||||
enum { value = 0 };
|
||||
};
|
||||
template <> struct is_arithmetic<bool> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<char> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<signed char> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<unsigned char> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<wchar_t> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<short> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<unsigned short> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<int> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<unsigned int> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<long> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<unsigned long> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<long long> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<unsigned long long> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<float> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <> struct is_arithmetic<double> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
|
||||
struct true_type {
|
||||
static const __constant__ bool value = true;
|
||||
};
|
||||
struct false_type {
|
||||
static const __constant__ bool value = false;
|
||||
};
|
||||
|
||||
template <typename __T, typename __U> struct is_same : public false_type {};
|
||||
template <typename __T> struct is_same<__T, __T> : public true_type {};
|
||||
|
||||
template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
|
||||
|
||||
template <typename __T> typename add_rvalue_reference<__T>::type declval();
|
||||
|
||||
// decltype is only available in C++11 and above.
|
||||
#if __cplusplus >= 201103L
|
||||
// __hip_promote
|
||||
namespace __hip {
|
||||
|
||||
template <class _Tp> struct __numeric_type {
|
||||
static void __test(...);
|
||||
static _Float16 __test(_Float16);
|
||||
|
|
@ -227,8 +374,8 @@ template <class _Tp> struct __numeric_type {
|
|||
// No support for long double, use double instead.
|
||||
static double __test(long double);
|
||||
|
||||
typedef decltype(__test(std::declval<_Tp>())) type;
|
||||
static const bool value = !std::is_same<type, void>::value;
|
||||
typedef decltype(__test(declval<_Tp>())) type;
|
||||
static const bool value = !is_same<type, void>::value;
|
||||
};
|
||||
|
||||
template <> struct __numeric_type<void> { static const bool value = true; };
|
||||
|
|
@ -271,18 +418,17 @@ public:
|
|||
|
||||
template <class _A1, class _A2 = void, class _A3 = void>
|
||||
class __promote : public __promote_imp<_A1, _A2, _A3> {};
|
||||
|
||||
} // namespace __hip
|
||||
#endif //__cplusplus >= 201103L
|
||||
} // namespace __hip
|
||||
|
||||
// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to
|
||||
// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with
|
||||
// floor(double).
|
||||
#define __HIP_OVERLOAD1(__retty, __fn) \
|
||||
template <typename __T> \
|
||||
__DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer, \
|
||||
__retty>::type \
|
||||
__fn(__T __x) { \
|
||||
__DEVICE__ \
|
||||
typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type \
|
||||
__fn(__T __x) { \
|
||||
return ::__fn((double)__x); \
|
||||
}
|
||||
|
||||
|
|
@ -293,8 +439,7 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
|
|||
#define __HIP_OVERLOAD2(__retty, __fn) \
|
||||
template <typename __T1, typename __T2> \
|
||||
__DEVICE__ typename __hip_enable_if< \
|
||||
std::numeric_limits<__T1>::is_specialized && \
|
||||
std::numeric_limits<__T2>::is_specialized, \
|
||||
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, \
|
||||
typename __hip::__promote<__T1, __T2>::type>::type \
|
||||
__fn(__T1 __x, __T2 __y) { \
|
||||
typedef typename __hip::__promote<__T1, __T2>::type __result_type; \
|
||||
|
|
@ -303,16 +448,14 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
|
|||
#else
|
||||
#define __HIP_OVERLOAD2(__retty, __fn) \
|
||||
template <typename __T1, typename __T2> \
|
||||
__DEVICE__ \
|
||||
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized && \
|
||||
std::numeric_limits<__T2>::is_specialized, \
|
||||
__retty>::type \
|
||||
__fn(__T1 __x, __T2 __y) { \
|
||||
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && \
|
||||
__hip::is_arithmetic<__T2>::value, \
|
||||
__retty>::type \
|
||||
__fn(__T1 __x, __T2 __y) { \
|
||||
return __fn((double)__x, (double)__y); \
|
||||
}
|
||||
#endif
|
||||
|
||||
__HIP_OVERLOAD1(double, abs)
|
||||
__HIP_OVERLOAD1(double, acos)
|
||||
__HIP_OVERLOAD1(double, acosh)
|
||||
__HIP_OVERLOAD1(double, asin)
|
||||
|
|
@ -336,7 +479,9 @@ __HIP_OVERLOAD1(double, floor)
|
|||
__HIP_OVERLOAD2(double, fmax)
|
||||
__HIP_OVERLOAD2(double, fmin)
|
||||
__HIP_OVERLOAD2(double, fmod)
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
__HIP_OVERLOAD1(int, fpclassify)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
__HIP_OVERLOAD2(double, hypot)
|
||||
__HIP_OVERLOAD1(int, ilogb)
|
||||
__HIP_OVERLOAD1(bool, isfinite)
|
||||
|
|
@ -382,9 +527,8 @@ __HIP_OVERLOAD2(double, min)
|
|||
#if __cplusplus >= 201103L
|
||||
template <typename __T1, typename __T2, typename __T3>
|
||||
__DEVICE__ typename __hip_enable_if<
|
||||
std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized &&
|
||||
std::numeric_limits<__T3>::is_specialized,
|
||||
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
|
||||
__hip::is_arithmetic<__T3>::value,
|
||||
typename __hip::__promote<__T1, __T2, __T3>::type>::type
|
||||
fma(__T1 __x, __T2 __y, __T3 __z) {
|
||||
typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
|
||||
|
|
@ -392,33 +536,32 @@ fma(__T1 __x, __T2 __y, __T3 __z) {
|
|||
}
|
||||
#else
|
||||
template <typename __T1, typename __T2, typename __T3>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized &&
|
||||
std::numeric_limits<__T3>::is_specialized,
|
||||
double>::type
|
||||
fma(__T1 __x, __T2 __y, __T3 __z) {
|
||||
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
|
||||
__hip::is_arithmetic<__T2>::value &&
|
||||
__hip::is_arithmetic<__T3>::value,
|
||||
double>::type
|
||||
fma(__T1 __x, __T2 __y, __T3 __z) {
|
||||
return ::fma((double)__x, (double)__y, (double)__z);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
||||
frexp(__T __x, int *__exp) {
|
||||
return ::frexp((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
||||
ldexp(__T __x, int __exp) {
|
||||
return ::ldexp((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
||||
modf(__T __x, double *__exp) {
|
||||
return ::modf((double)__x, __exp);
|
||||
}
|
||||
|
|
@ -426,8 +569,8 @@ __DEVICE__
|
|||
#if __cplusplus >= 201103L
|
||||
template <typename __T1, typename __T2>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized,
|
||||
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
|
||||
__hip::is_arithmetic<__T2>::value,
|
||||
typename __hip::__promote<__T1, __T2>::type>::type
|
||||
remquo(__T1 __x, __T2 __y, int *__quo) {
|
||||
typedef typename __hip::__promote<__T1, __T2>::type __result_type;
|
||||
|
|
@ -435,25 +578,24 @@ __DEVICE__
|
|||
}
|
||||
#else
|
||||
template <typename __T1, typename __T2>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
|
||||
std::numeric_limits<__T2>::is_specialized,
|
||||
double>::type
|
||||
remquo(__T1 __x, __T2 __y, int *__quo) {
|
||||
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
|
||||
__hip::is_arithmetic<__T2>::value,
|
||||
double>::type
|
||||
remquo(__T1 __x, __T2 __y, int *__quo) {
|
||||
return ::remquo((double)__x, (double)__y, __quo);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
||||
scalbln(__T __x, long int __exp) {
|
||||
return ::scalbln((double)__x, __exp);
|
||||
}
|
||||
|
||||
template <typename __T>
|
||||
__DEVICE__
|
||||
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
|
||||
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
||||
scalbn(__T __x, int __exp) {
|
||||
return ::scalbn((double)__x, __exp);
|
||||
}
|
||||
|
|
@ -468,14 +610,15 @@ __DEVICE__
|
|||
#endif // defined(__cplusplus)
|
||||
|
||||
// Define these overloads inside the namespace our standard library uses.
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
|
||||
_LIBCPP_BEGIN_NAMESPACE_STD
|
||||
#else
|
||||
namespace std {
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
#endif
|
||||
#endif
|
||||
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
#endif // _LIBCPP_BEGIN_NAMESPACE_STD
|
||||
|
||||
// Pull the new overloads we defined above into namespace std.
|
||||
// using ::abs; - This may be considered for C++.
|
||||
|
|
@ -620,11 +763,13 @@ _LIBCPP_END_NAMESPACE_STD
|
|||
#else
|
||||
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_GLIBCXX_END_NAMESPACE_VERSION
|
||||
#endif
|
||||
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
} // namespace std
|
||||
#endif
|
||||
#endif // _LIBCPP_END_NAMESPACE_STD
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
// Define device-side math functions from <ymath.h> on MSVC.
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
// Before VS2019, `<ymath.h>` is also included in `<limits>` and other headers.
|
||||
|
|
@ -658,6 +803,7 @@ __DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) {
|
|||
}
|
||||
#endif // defined(__cplusplus)
|
||||
#endif // defined(_MSC_VER)
|
||||
#endif // !defined(__HIPCC_RTC__)
|
||||
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
|
||||
|
|
|
|||
36
lib/include/__clang_hip_libdevice_declares.h
vendored
36
lib/include/__clang_hip_libdevice_declares.h
vendored
|
|
@ -138,14 +138,22 @@ __device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
|
|||
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
|
||||
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
|
||||
|
||||
__device__ __attribute__((const)) float
|
||||
__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
|
||||
__device__ __attribute__((const)) float
|
||||
__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
|
||||
__device__ __attribute__((const)) float
|
||||
__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
|
||||
__device__ __attribute__((const)) float
|
||||
__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
|
||||
__device__ inline __attribute__((const)) float
|
||||
__llvm_amdgcn_cos_f32(float __x) {
|
||||
return __builtin_amdgcn_cosf(__x);
|
||||
}
|
||||
__device__ inline __attribute__((const)) float
|
||||
__llvm_amdgcn_rcp_f32(float __x) {
|
||||
return __builtin_amdgcn_rcpf(__x);
|
||||
}
|
||||
__device__ inline __attribute__((const)) float
|
||||
__llvm_amdgcn_rsq_f32(float __x) {
|
||||
return __builtin_amdgcn_rsqf(__x);
|
||||
}
|
||||
__device__ inline __attribute__((const)) float
|
||||
__llvm_amdgcn_sin_f32(float __x) {
|
||||
return __builtin_amdgcn_sinf(__x);
|
||||
}
|
||||
// END INTRINSICS
|
||||
// END FLOAT
|
||||
|
||||
|
|
@ -269,10 +277,14 @@ __device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
|
|||
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
|
||||
double);
|
||||
|
||||
__device__ __attribute__((const)) double
|
||||
__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
|
||||
__device__ __attribute__((const)) double
|
||||
__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
|
||||
__device__ inline __attribute__((const)) double
|
||||
__llvm_amdgcn_rcp_f64(double __x) {
|
||||
return __builtin_amdgcn_rcp(__x);
|
||||
}
|
||||
__device__ inline __attribute__((const)) double
|
||||
__llvm_amdgcn_rsq_f64(double __x) {
|
||||
return __builtin_amdgcn_rsq(__x);
|
||||
}
|
||||
|
||||
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
|
||||
__device__ _Float16 __ocml_cos_f16(_Float16);
|
||||
|
|
|
|||
6
lib/include/__clang_hip_math.h
vendored
6
lib/include/__clang_hip_math.h
vendored
|
|
@ -13,11 +13,13 @@
|
|||
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
|
||||
#endif
|
||||
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#if defined(__cplusplus)
|
||||
#include <algorithm>
|
||||
#endif
|
||||
#include <limits.h>
|
||||
#include <stdint.h>
|
||||
#endif // __HIPCC_RTC__
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
|
||||
|
|
@ -36,7 +38,7 @@ template<bool>
|
|||
struct __compare_result{};
|
||||
template<>
|
||||
struct __compare_result<true> {
|
||||
static const bool valid;
|
||||
static const __device__ bool valid;
|
||||
};
|
||||
|
||||
__DEVICE__
|
||||
|
|
@ -1260,6 +1262,7 @@ float min(float __x, float __y) { return fminf(__x, __y); }
|
|||
__DEVICE__
|
||||
double min(double __x, double __y) { return fmin(__x, __y); }
|
||||
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
__host__ inline static int min(int __arg1, int __arg2) {
|
||||
return std::min(__arg1, __arg2);
|
||||
}
|
||||
|
|
@ -1267,6 +1270,7 @@ __host__ inline static int min(int __arg1, int __arg2) {
|
|||
__host__ inline static int max(int __arg1, int __arg2) {
|
||||
return std::max(__arg1, __arg2);
|
||||
}
|
||||
#endif // __HIPCC_RTC__
|
||||
#endif
|
||||
|
||||
#pragma pop_macro("__DEVICE__")
|
||||
|
|
|
|||
91
lib/include/__clang_hip_runtime_wrapper.h
vendored
91
lib/include/__clang_hip_runtime_wrapper.h
vendored
|
|
@ -18,52 +18,107 @@
|
|||
|
||||
#if __HIP__
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define __host__ __attribute__((host))
|
||||
#define __device__ __attribute__((device))
|
||||
#define __global__ __attribute__((global))
|
||||
#define __shared__ __attribute__((shared))
|
||||
#define __constant__ __attribute__((constant))
|
||||
#define __managed__ __attribute__((managed))
|
||||
|
||||
#if !defined(__cplusplus) || __cplusplus < 201103L
|
||||
#define nullptr NULL;
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
__attribute__((__visibility__("default")))
|
||||
__attribute__((weak))
|
||||
__attribute__((noreturn))
|
||||
__device__ void __cxa_pure_virtual(void) {
|
||||
__builtin_trap();
|
||||
}
|
||||
__attribute__((__visibility__("default")))
|
||||
__attribute__((weak))
|
||||
__attribute__((noreturn))
|
||||
__device__ void __cxa_deleted_virtual(void) {
|
||||
__builtin_trap();
|
||||
}
|
||||
}
|
||||
#endif //__cplusplus
|
||||
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <stdlib.h>
|
||||
#else
|
||||
typedef __SIZE_TYPE__ size_t;
|
||||
// Define macros which are needed to declare HIP device API's without standard
|
||||
// C/C++ headers. This is for readability so that these API's can be written
|
||||
// the same way as non-hipRTC use case. These macros need to be popped so that
|
||||
// they do not pollute users' name space.
|
||||
#pragma push_macro("NULL")
|
||||
#pragma push_macro("uint32_t")
|
||||
#pragma push_macro("uint64_t")
|
||||
#pragma push_macro("CHAR_BIT")
|
||||
#pragma push_macro("INT_MAX")
|
||||
#define NULL (void *)0
|
||||
#define uint32_t __UINT32_TYPE__
|
||||
#define uint64_t __UINT64_TYPE__
|
||||
#define CHAR_BIT __CHAR_BIT__
|
||||
#define INT_MAX __INTMAX_MAX__
|
||||
#endif // __HIPCC_RTC__
|
||||
|
||||
typedef __SIZE_TYPE__ __hip_size_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif //__cplusplus
|
||||
|
||||
#if __HIP_ENABLE_DEVICE_MALLOC__
|
||||
extern "C" __device__ void *__hip_malloc(size_t __size);
|
||||
extern "C" __device__ void *__hip_free(void *__ptr);
|
||||
static inline __device__ void *malloc(size_t __size) {
|
||||
__device__ void *__hip_malloc(__hip_size_t __size);
|
||||
__device__ void *__hip_free(void *__ptr);
|
||||
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
|
||||
return __hip_malloc(__size);
|
||||
}
|
||||
static inline __device__ void *free(void *__ptr) { return __hip_free(__ptr); }
|
||||
#else
|
||||
static inline __device__ void *malloc(size_t __size) {
|
||||
__builtin_trap();
|
||||
return nullptr;
|
||||
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
|
||||
return __hip_free(__ptr);
|
||||
}
|
||||
static inline __device__ void *free(void *__ptr) {
|
||||
#else
|
||||
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
|
||||
__builtin_trap();
|
||||
return nullptr;
|
||||
return (void *)0;
|
||||
}
|
||||
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
|
||||
__builtin_trap();
|
||||
return (void *)0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif //__cplusplus
|
||||
|
||||
#include <__clang_hip_libdevice_declares.h>
|
||||
#include <__clang_hip_math.h>
|
||||
|
||||
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
|
||||
#if defined(__HIPCC_RTC__)
|
||||
#include <__clang_hip_cmath.h>
|
||||
#else
|
||||
#include <__clang_cuda_math_forward_declares.h>
|
||||
#include <__clang_hip_cmath.h>
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <complex>
|
||||
#include <new>
|
||||
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
|
||||
#endif // __HIPCC_RTC__
|
||||
|
||||
#define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
|
||||
|
||||
#if defined(__HIPCC_RTC__)
|
||||
#pragma pop_macro("NULL")
|
||||
#pragma pop_macro("uint32_t")
|
||||
#pragma pop_macro("uint64_t")
|
||||
#pragma pop_macro("CHAR_BIT")
|
||||
#pragma pop_macro("INT_MAX")
|
||||
#endif // __HIPCC_RTC__
|
||||
#endif // __HIP__
|
||||
#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
|
||||
|
|
|
|||
1584
lib/include/altivec.h
vendored
1584
lib/include/altivec.h
vendored
File diff suppressed because it is too large
Load diff
255
lib/include/amxintrin.h
vendored
255
lib/include/amxintrin.h
vendored
|
|
@ -15,8 +15,13 @@
|
|||
#define __AMXINTRIN_H
|
||||
#ifdef __x86_64__
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS_TILE \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
|
||||
#define __DEFAULT_FN_ATTRS_INT8 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
|
||||
#define __DEFAULT_FN_ATTRS_BF16 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
|
||||
|
||||
/// Load tile configuration from a 64-byte memory location specified by
|
||||
/// "mem_addr". The tile configuration includes the tile type palette, the
|
||||
|
|
@ -25,7 +30,7 @@
|
|||
/// config and the tile data, and the tiles are zeroed. Any invalid
|
||||
/// configurations will result in #GP fault.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
|
||||
///
|
||||
|
|
@ -41,7 +46,7 @@ _tile_loadconfig(const void *__config) {
|
|||
/// palette, the number of bytes per row, and the number of rows. If tiles
|
||||
/// are not configured, all zeroes will be stored to memory.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
|
||||
///
|
||||
|
|
@ -55,7 +60,7 @@ _tile_storeconfig(void *__config) {
|
|||
/// Release the tile configuration to return to the init state, which
|
||||
/// releases all storage it currently holds.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
||||
|
|
@ -66,7 +71,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
/// destination tile "dst" using the tile configuration previously configured
|
||||
/// via "_tile_loadconfig".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
|
||||
///
|
||||
|
|
@ -86,7 +91,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
/// that the data will likely not be reused in the near future and the data
|
||||
/// caching can be optimized accordingly.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
|
||||
///
|
||||
|
|
@ -104,7 +109,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
/// "stride" using the tile configuration previously configured via
|
||||
/// "_tile_loadconfig".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
|
||||
///
|
||||
|
|
@ -119,7 +124,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
|
||||
/// Zero the tile specified by "tdest".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
|
||||
///
|
||||
|
|
@ -133,7 +138,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||
/// and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
|
||||
///
|
||||
|
|
@ -152,7 +157,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in "dst", and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
|
||||
///
|
||||
|
|
@ -171,7 +176,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||
/// and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
|
||||
///
|
||||
|
|
@ -190,7 +195,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
|
||||
/// "dst", and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
|
||||
///
|
||||
|
|
@ -208,7 +213,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
/// elements with elements in "dst", and store the 32-bit result back to tile
|
||||
/// "dst".
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
|
||||
///
|
||||
|
|
@ -221,10 +226,12 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
|||
#define _tile_dpbf16ps(dst, src0, src1) \
|
||||
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
|
||||
|
||||
#define __DEFAULT_FN_ATTRS_INT8 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
|
||||
|
||||
/// AMX tile register size can be configured, the maximum size is 16x64=1024
|
||||
/// bytes. Since there is no 2D type in llvm IR, we use vector type to
|
||||
/// represent 2D tile and the fixed size is maximum amx tile register size.
|
||||
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
|
||||
|
||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
|
||||
__SIZE_TYPE__ stride) {
|
||||
|
|
@ -232,12 +239,43 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
|
|||
(__SIZE_TYPE__)(stride));
|
||||
}
|
||||
|
||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
|
||||
__SIZE_TYPE__ stride) {
|
||||
return __builtin_ia32_tileloaddt164_internal(m, n, base,
|
||||
(__SIZE_TYPE__)(stride));
|
||||
}
|
||||
|
||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
|
||||
}
|
||||
|
||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
|
||||
}
|
||||
|
||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||
return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
|
||||
}
|
||||
|
||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||
return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
|
||||
}
|
||||
|
||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||
static __inline__ void __DEFAULT_FN_ATTRS_INT8
|
||||
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
|
||||
__SIZE_TYPE__ stride, _tile1024i tile) {
|
||||
|
|
@ -245,34 +283,211 @@ _tile_stored_internal(unsigned short m, unsigned short n, void *base,
|
|||
(__SIZE_TYPE__)(stride), tile);
|
||||
}
|
||||
|
||||
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
|
||||
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
|
||||
}
|
||||
|
||||
/// This struct pack the shape and tile data together for user. We suggest
|
||||
/// initializing the struct as early as possible, because compiler depends
|
||||
/// on the shape information to do configure. The constant value is preferred
|
||||
/// for optimization by compiler.
|
||||
typedef struct __tile1024i_str {
|
||||
const unsigned short row;
|
||||
const unsigned short col;
|
||||
_tile1024i tile;
|
||||
} __tile1024i;
|
||||
|
||||
/// Load tile rows from memory specifieid by "base" address and "stride" into
|
||||
/// destination tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// A destination tile. Max size is 1024 Bytes.
|
||||
/// \param base
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be loaded in memory.
|
||||
__DEFAULT_FN_ATTRS_TILE
|
||||
static void __tile_loadd(__tile1024i *dst, const void *base,
|
||||
__SIZE_TYPE__ stride) {
|
||||
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
|
||||
}
|
||||
|
||||
__DEFAULT_FN_ATTRS_INT8
|
||||
static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,
|
||||
__tile1024i src2) {
|
||||
dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
|
||||
src1.tile, src2.tile);
|
||||
/// Load tile rows from memory specifieid by "base" address and "stride" into
|
||||
/// destination tile "dst". This intrinsic provides a hint to the implementation
|
||||
/// that the data will likely not be reused in the near future and the data
|
||||
/// caching can be optimized accordingly.
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// A destination tile. Max size is 1024 Bytes.
|
||||
/// \param base
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be loaded in memory.
|
||||
__DEFAULT_FN_ATTRS_TILE
|
||||
static void __tile_stream_loadd(__tile1024i *dst, const void *base,
|
||||
__SIZE_TYPE__ stride) {
|
||||
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
|
||||
}
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||
/// and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
__DEFAULT_FN_ATTRS_INT8
|
||||
static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
|
||||
__tile1024i src1) {
|
||||
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||
src0.tile, src1.tile);
|
||||
}
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in "dst", and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
__DEFAULT_FN_ATTRS_INT8
|
||||
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
|
||||
__tile1024i src1) {
|
||||
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||
src0.tile, src1.tile);
|
||||
}
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||
/// and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
__DEFAULT_FN_ATTRS_INT8
|
||||
static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
|
||||
__tile1024i src1) {
|
||||
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||
src0.tile, src1.tile);
|
||||
}
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
|
||||
/// "dst", and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
__DEFAULT_FN_ATTRS_INT8
|
||||
static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
|
||||
__tile1024i src1) {
|
||||
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||
src0.tile, src1.tile);
|
||||
}
|
||||
|
||||
/// Store the tile specified by "src" to memory specifieid by "base" address and
|
||||
/// "stride".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// A destination tile. Max size is 1024 Bytes.
|
||||
/// \param base
|
||||
/// A pointer to base address.
|
||||
/// \param stride
|
||||
/// The stride between the rows' data to be stored in memory.
|
||||
__DEFAULT_FN_ATTRS_TILE
|
||||
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
|
||||
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
|
||||
}
|
||||
|
||||
/// Zero the tile specified by "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile to be zero. Max size is 1024 Bytes.
|
||||
__DEFAULT_FN_ATTRS_TILE
|
||||
static void __tile_zero(__tile1024i *dst) {
|
||||
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
|
||||
}
|
||||
|
||||
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
|
||||
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
|
||||
/// elements with elements in "dst", and store the 32-bit result back to tile
|
||||
/// "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
__DEFAULT_FN_ATTRS_BF16
|
||||
static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
|
||||
__tile1024i src1) {
|
||||
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||
src0.tile, src1.tile);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS_TILE
|
||||
#undef __DEFAULT_FN_ATTRS_INT8
|
||||
#undef __DEFAULT_FN_ATTRS_BF16
|
||||
|
||||
#endif /* __x86_64__ */
|
||||
#endif /* __AMXINTRIN_H */
|
||||
|
|
|
|||
55
lib/include/arm_acle.h
vendored
55
lib/include/arm_acle.h
vendored
|
|
@ -639,6 +639,49 @@ __jcvt(double __a) {
|
|||
}
|
||||
#endif
|
||||
|
||||
/* Armv8.5-A FP rounding intrinsics */
|
||||
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_FRINT)
|
||||
static __inline__ float __attribute__((__always_inline__, __nodebug__))
|
||||
__frint32zf(float __a) {
|
||||
return __builtin_arm_frint32zf(__a);
|
||||
}
|
||||
|
||||
static __inline__ double __attribute__((__always_inline__, __nodebug__))
|
||||
__frint32z(double __a) {
|
||||
return __builtin_arm_frint32z(__a);
|
||||
}
|
||||
|
||||
static __inline__ float __attribute__((__always_inline__, __nodebug__))
|
||||
__frint64zf(float __a) {
|
||||
return __builtin_arm_frint64zf(__a);
|
||||
}
|
||||
|
||||
static __inline__ double __attribute__((__always_inline__, __nodebug__))
|
||||
__frint64z(double __a) {
|
||||
return __builtin_arm_frint64z(__a);
|
||||
}
|
||||
|
||||
static __inline__ float __attribute__((__always_inline__, __nodebug__))
|
||||
__frint32xf(float __a) {
|
||||
return __builtin_arm_frint32xf(__a);
|
||||
}
|
||||
|
||||
static __inline__ double __attribute__((__always_inline__, __nodebug__))
|
||||
__frint32x(double __a) {
|
||||
return __builtin_arm_frint32x(__a);
|
||||
}
|
||||
|
||||
static __inline__ float __attribute__((__always_inline__, __nodebug__))
|
||||
__frint64xf(float __a) {
|
||||
return __builtin_arm_frint64xf(__a);
|
||||
}
|
||||
|
||||
static __inline__ double __attribute__((__always_inline__, __nodebug__))
|
||||
__frint64x(double __a) {
|
||||
return __builtin_arm_frint64x(__a);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Armv8.7-A load/store 64-byte intrinsics */
|
||||
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
|
||||
typedef struct {
|
||||
|
|
@ -709,6 +752,18 @@ __arm_st64bv0(void *__addr, data512_t __value) {
|
|||
|
||||
#endif /* __ARM_FEATURE_TME */
|
||||
|
||||
/* Armv8.5-A Random number generation intrinsics */
|
||||
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
__rndr(uint64_t *__p) {
|
||||
return __builtin_arm_rndr(__p);
|
||||
}
|
||||
static __inline__ int __attribute__((__always_inline__, __nodebug__))
|
||||
__rndrrs(uint64_t *__p) {
|
||||
return __builtin_arm_rndrrs(__p);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
1242
lib/include/arm_neon.h
vendored
1242
lib/include/arm_neon.h
vendored
File diff suppressed because it is too large
Load diff
17687
lib/include/arm_sve.h
vendored
17687
lib/include/arm_sve.h
vendored
File diff suppressed because it is too large
Load diff
212
lib/include/avx512fintrin.h
vendored
212
lib/include/avx512fintrin.h
vendored
|
|
@ -9300,8 +9300,11 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
|
|||
* computations. In vector-reduction arithmetic, the evaluation order is
|
||||
* independent of the order of the input elements of V.
|
||||
|
||||
* For floating point types, we always assume the elements are reassociable even
|
||||
* if -fast-math is off.
|
||||
* For floating-point intrinsics:
|
||||
* 1. When using fadd/fmul intrinsics, the order of operations within the
|
||||
* vector is unspecified (associative math).
|
||||
* 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
|
||||
* produce unspecified results.
|
||||
|
||||
* Used bisection method. At each step, we partition the vector with previous
|
||||
* step in half, and the operation is performed on its two halves.
|
||||
|
|
@ -9524,75 +9527,49 @@ _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
|
|||
return __builtin_ia32_reduce_umin_d512((__v16si)__V);
|
||||
}
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
|
||||
__m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
|
||||
__m256d __t3 = _mm256_##op(__t1, __t2); \
|
||||
__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
|
||||
__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
|
||||
__m128d __t6 = _mm_##op(__t4, __t5); \
|
||||
__m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
|
||||
__m128d __t8 = _mm_##op(__t6, __t7); \
|
||||
return __t8[0]
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_max_pd(__m512d __V) {
|
||||
_mm512_mask_reduce_operator(max_pd);
|
||||
return __builtin_ia32_reduce_fmax_pd512(__V);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_min_pd(__m512d __V) {
|
||||
_mm512_mask_reduce_operator(min_pd);
|
||||
return __builtin_ia32_reduce_fmin_pd512(__V);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
|
||||
__V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
|
||||
_mm512_mask_reduce_operator(max_pd);
|
||||
return __builtin_ia32_reduce_fmax_pd512(__V);
|
||||
}
|
||||
|
||||
static __inline__ double __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
|
||||
__V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
|
||||
_mm512_mask_reduce_operator(min_pd);
|
||||
return __builtin_ia32_reduce_fmin_pd512(__V);
|
||||
}
|
||||
#undef _mm512_mask_reduce_operator
|
||||
|
||||
#define _mm512_mask_reduce_operator(op) \
|
||||
__m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
|
||||
__m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
|
||||
__m256 __t3 = _mm256_##op(__t1, __t2); \
|
||||
__m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
|
||||
__m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
|
||||
__m128 __t6 = _mm_##op(__t4, __t5); \
|
||||
__m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
|
||||
__m128 __t8 = _mm_##op(__t6, __t7); \
|
||||
__m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
|
||||
__m128 __t10 = _mm_##op(__t8, __t9); \
|
||||
return __t10[0]
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_max_ps(__m512 __V) {
|
||||
_mm512_mask_reduce_operator(max_ps);
|
||||
return __builtin_ia32_reduce_fmax_ps512(__V);
|
||||
}
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS512
|
||||
_mm512_reduce_min_ps(__m512 __V) {
|
||||
_mm512_mask_reduce_operator(min_ps);
|
||||
return __builtin_ia32_reduce_fmin_ps512(__V);
|
||||
}
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
|
||||
__V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
|
||||
_mm512_mask_reduce_operator(max_ps);
|
||||
return __builtin_ia32_reduce_fmax_ps512(__V);
|
||||
}
|
||||
|
||||
static __inline__ float __DEFAULT_FN_ATTRS512
|
||||
_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
|
||||
__V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
|
||||
_mm512_mask_reduce_operator(min_ps);
|
||||
return __builtin_ia32_reduce_fmin_ps512(__V);
|
||||
}
|
||||
#undef _mm512_mask_reduce_operator
|
||||
|
||||
/// Moves the least significant 32 bits of a vector of [16 x i32] to a
|
||||
/// 32-bit signed integer value.
|
||||
|
|
@ -9611,6 +9588,169 @@ _mm512_cvtsi512_si32(__m512i __A) {
|
|||
return __b[0];
|
||||
}
|
||||
|
||||
/// Loads 8 double-precision (64-bit) floating-point elements stored at memory
|
||||
/// locations starting at location \a base_addr at packed 32-bit integer indices
|
||||
/// stored in the lower half of \a vindex scaled by \a scale them in dst.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// i := j*64
|
||||
/// m := j*32
|
||||
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
|
||||
/// dst[i+63:i] := MEM[addr+63:addr]
|
||||
/// ENDFOR
|
||||
/// dst[MAX:512] := 0
|
||||
/// \endoperation
|
||||
#define _mm512_i32logather_pd(vindex, base_addr, scale) \
|
||||
_mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
|
||||
|
||||
/// Loads 8 double-precision (64-bit) floating-point elements from memory
|
||||
/// starting at location \a base_addr at packed 32-bit integer indices stored in
|
||||
/// the lower half of \a vindex scaled by \a scale into dst using writemask
|
||||
/// \a mask (elements are copied from \a src when the corresponding mask bit is
|
||||
/// not set).
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// i := j*64
|
||||
/// m := j*32
|
||||
/// IF mask[j]
|
||||
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
|
||||
/// dst[i+63:i] := MEM[addr+63:addr]
|
||||
/// ELSE
|
||||
/// dst[i+63:i] := src[i+63:i]
|
||||
/// FI
|
||||
/// ENDFOR
|
||||
/// dst[MAX:512] := 0
|
||||
/// \endoperation
|
||||
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
|
||||
_mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \
|
||||
(base_addr), (scale))
|
||||
|
||||
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
|
||||
/// at packed 32-bit integer indices stored in the lower half of \a vindex
|
||||
/// scaled by \a scale and stores them in dst.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// i := j*64
|
||||
/// m := j*32
|
||||
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
|
||||
/// dst[i+63:i] := MEM[addr+63:addr]
|
||||
/// ENDFOR
|
||||
/// dst[MAX:512] := 0
|
||||
/// \endoperation
|
||||
#define _mm512_i32logather_epi64(vindex, base_addr, scale) \
|
||||
_mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
|
||||
|
||||
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
|
||||
/// at packed 32-bit integer indices stored in the lower half of \a vindex
|
||||
/// scaled by \a scale and stores them in dst using writemask \a mask (elements
|
||||
/// are copied from \a src when the corresponding mask bit is not set).
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// i := j*64
|
||||
/// m := j*32
|
||||
/// IF mask[j]
|
||||
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
|
||||
/// dst[i+63:i] := MEM[addr+63:addr]
|
||||
/// ELSE
|
||||
/// dst[i+63:i] := src[i+63:i]
|
||||
/// FI
|
||||
/// ENDFOR
|
||||
/// dst[MAX:512] := 0
|
||||
/// \endoperation
|
||||
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
|
||||
_mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \
|
||||
(base_addr), (scale))
|
||||
|
||||
/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
|
||||
/// and to memory locations starting at location \a base_addr at packed 32-bit
|
||||
/// integer indices stored in \a vindex scaled by \a scale.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// i := j*64
|
||||
/// m := j*32
|
||||
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
|
||||
/// MEM[addr+63:addr] := v1[i+63:i]
|
||||
/// ENDFOR
|
||||
/// \endoperation
|
||||
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
|
||||
_mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
|
||||
|
||||
/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
|
||||
/// to memory locations starting at location \a base_addr at packed 32-bit
|
||||
/// integer indices stored in \a vindex scaled by \a scale. Only those elements
|
||||
/// whose corresponding mask bit is set in writemask \a mask are written to
|
||||
/// memory.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// i := j*64
|
||||
/// m := j*32
|
||||
/// IF mask[j]
|
||||
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
|
||||
/// MEM[addr+63:addr] := a[i+63:i]
|
||||
/// FI
|
||||
/// ENDFOR
|
||||
/// \endoperation
|
||||
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
|
||||
_mm512_mask_i32scatter_pd((base_addr), (mask), \
|
||||
_mm512_castsi512_si256(vindex), (v1), (scale))
|
||||
|
||||
/// Stores 8 packed 64-bit integer elements located in \a v1 and stores them in
|
||||
/// memory locations starting at location \a base_addr at packed 32-bit integer
|
||||
/// indices stored in \a vindex scaled by \a scale.
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// i := j*64
|
||||
/// m := j*32
|
||||
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
|
||||
/// MEM[addr+63:addr] := a[i+63:i]
|
||||
/// ENDFOR
|
||||
/// \endoperation
|
||||
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
|
||||
_mm512_i32scatter_epi64((base_addr), \
|
||||
_mm512_castsi512_si256(vindex), (v1), (scale))
|
||||
|
||||
/// Stores 8 packed 64-bit integer elements located in a and stores them in
|
||||
/// memory locations starting at location \a base_addr at packed 32-bit integer
|
||||
/// indices stored in \a vindex scaled by scale using writemask \a mask (elements
|
||||
/// whose corresponding mask bit is not set are not written to memory).
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
|
||||
///
|
||||
/// \operation
|
||||
/// FOR j := 0 to 7
|
||||
/// i := j*64
|
||||
/// m := j*32
|
||||
/// IF mask[j]
|
||||
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
|
||||
/// MEM[addr+63:addr] := a[i+63:i]
|
||||
/// FI
|
||||
/// ENDFOR
|
||||
/// \endoperation
|
||||
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
|
||||
_mm512_mask_i32scatter_epi64((base_addr), (mask), \
|
||||
_mm512_castsi512_si256(vindex), (v1), (scale))
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS512
|
||||
#undef __DEFAULT_FN_ATTRS128
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
|
|
|||
16
lib/include/builtins.h
vendored
Normal file
16
lib/include/builtins.h
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
/*===---- builtins.h - Standard header for extra builtins -----------------===*\
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
\*===----------------------------------------------------------------------===*/
|
||||
|
||||
/// Some legacy compilers have builtin definitions in a file named builtins.h.
|
||||
/// This header file has been added to allow compatibility with code that was
|
||||
/// written for those compilers. Code may have an include line for this file
|
||||
/// and to avoid an error an empty file with this name is provided.
|
||||
#ifndef __BUILTINS_H
|
||||
#define __BUILTINS_H
|
||||
|
||||
#endif /* __BUILTINS_H */
|
||||
8
lib/include/cuda_wrappers/complex
vendored
8
lib/include/cuda_wrappers/complex
vendored
|
|
@ -72,8 +72,16 @@
|
|||
#define _GLIBCXX_USE_C99_COMPLEX 0
|
||||
#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
|
||||
|
||||
// Work around a compatibility issue with libstdc++ 11.1.0
|
||||
// https://bugs.llvm.org/show_bug.cgi?id=50383
|
||||
#pragma push_macro("__failed_assertion")
|
||||
#if _GLIBCXX_RELEASE == 11
|
||||
#define __failed_assertion __cuda_failed_assertion
|
||||
#endif
|
||||
|
||||
#include_next <complex>
|
||||
|
||||
#pragma pop_macro("__failed_assertion")
|
||||
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
|
||||
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
|
||||
|
||||
|
|
|
|||
298
lib/include/hexagon_circ_brev_intrinsics.h
vendored
Normal file
298
lib/include/hexagon_circ_brev_intrinsics.h
vendored
Normal file
|
|
@ -0,0 +1,298 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_
|
||||
#define _HEXAGON_CIRC_BREV_INTRINSICS_H_ 1
|
||||
|
||||
#include <hexagon_protos.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* Circular Load */
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_load_update_D(Word64 dst, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_load_update_D(dest,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int64_t *) HEXAGON_circ_ldd (ptr, &(dest), ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_load_update_W(Word32 dst, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_load_update_W(dest,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int *) HEXAGON_circ_ldw (ptr, &(dest), (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_load_update_H(Word16 dst, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_load_update_H(dest,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int16_t *) HEXAGON_circ_ldh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_load_update_UH( UWord16 dst, UWord16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_load_update_UH(dest,ptr,incr,bufsize,K) \
|
||||
{ ptr = (uint16_t *) HEXAGON_circ_lduh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_load_update_B(Word8 dst, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_load_update_B(dest,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int8_t *) HEXAGON_circ_ldb (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_load_update_UB(dest,ptr,incr,bufsize,K) \
|
||||
{ ptr = (uint8_t *) HEXAGON_circ_ldub (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
|
||||
|
||||
/* Circular Store */
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_store_update_D(Word64 *src, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_store_update_D(src,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int64_t *) HEXAGON_circ_std (ptr, src, ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_store_update_W(Word32 *src, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_store_update_W(src,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int *) HEXAGON_circ_stw (ptr, src, (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_store_update_HL(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_store_update_HL(src,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int16_t *) HEXAGON_circ_sth (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_store_update_HH(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_store_update_HH(src,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int16_t *) HEXAGON_circ_sthhi (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_circ_store_update_B(Word8 *src, Word8 *ptr, UWord32 I4, UWord32 bufsize, UWord64 K)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_circ_store_update_B(src,ptr,incr,bufsize,K) \
|
||||
{ ptr = (int8_t *) HEXAGON_circ_stb (ptr, src, ((((K)-2)<<24)|(bufsize)), incr); }
|
||||
|
||||
|
||||
/* Bit Reverse Load */
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_load_update_D(Word64 dst, Word64 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_load_update_D(dest,ptr,log2bufsize) \
|
||||
{ ptr = (int64_t *) HEXAGON_brev_ldd (ptr, &(dest), (1<<(16-((log2bufsize) + 3)))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_load_update_W(Word32 dst, Word32 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_load_update_W(dest,ptr,log2bufsize) \
|
||||
{ ptr = (int *) HEXAGON_brev_ldw (ptr, &(dest), (1<<(16-((log2bufsize) + 2)))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_load_update_H(Word16 dst, Word16 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_load_update_H(dest,ptr,log2bufsize) \
|
||||
{ ptr = (int16_t *) HEXAGON_brev_ldh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_load_update_UH(UWord16 dst, UWord16 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_load_update_UH(dest,ptr,log2bufsize) \
|
||||
{ ptr = (uint16_t *) HEXAGON_brev_lduh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_load_update_B(Word8 dst, Word8 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_load_update_B(dest,ptr,log2bufsize) \
|
||||
{ ptr = (int8_t *) HEXAGON_brev_ldb (ptr, &(dest), (1<<(16-((log2bufsize))))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_load_update_UB(dest,ptr,log2bufsize) \
|
||||
{ ptr = (uint8_t *) HEXAGON_brev_ldub (ptr, &(dest), (1<<(16-((log2bufsize))))); }
|
||||
|
||||
/* Bit Reverse Store */
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_store_update_D(Word64 *src, Word64 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_store_update_D(src,ptr,log2bufsize) \
|
||||
{ ptr = (int64_t *) HEXAGON_brev_std (ptr, src, (1<<(16-((log2bufsize) + 3)))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_store_update_W(Word32 *src, Word32 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_store_update_W(src,ptr,log2bufsize) \
|
||||
{ ptr = (int *) HEXAGON_brev_stw (ptr, src, (1<<(16-((log2bufsize) + 2)))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_store_update_HL(Word16 *src, Word16 *ptr, Word32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_store_update_HL(src,ptr,log2bufsize) \
|
||||
{ ptr = (int16_t *) HEXAGON_brev_sth (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_store_update_HH(Word16 *src, Word16 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_store_update_HH(src,ptr,log2bufsize) \
|
||||
{ ptr = (int16_t *) HEXAGON_brev_sthhi (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: void Q6_bitrev_store_update_B(Word8 *src, Word8 *ptr, UWord32 Iu4)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#define Q6_bitrev_store_update_B(src,ptr,log2bufsize) \
|
||||
{ ptr = (int8_t *) HEXAGON_brev_stb (ptr, src, (1<<(16-((log2bufsize))))); }
|
||||
|
||||
|
||||
#define HEXAGON_circ_ldd __builtin_circ_ldd
|
||||
#define HEXAGON_circ_ldw __builtin_circ_ldw
|
||||
#define HEXAGON_circ_ldh __builtin_circ_ldh
|
||||
#define HEXAGON_circ_lduh __builtin_circ_lduh
|
||||
#define HEXAGON_circ_ldb __builtin_circ_ldb
|
||||
#define HEXAGON_circ_ldub __builtin_circ_ldub
|
||||
|
||||
|
||||
#define HEXAGON_circ_std __builtin_circ_std
|
||||
#define HEXAGON_circ_stw __builtin_circ_stw
|
||||
#define HEXAGON_circ_sth __builtin_circ_sth
|
||||
#define HEXAGON_circ_sthhi __builtin_circ_sthhi
|
||||
#define HEXAGON_circ_stb __builtin_circ_stb
|
||||
|
||||
|
||||
#define HEXAGON_brev_ldd __builtin_brev_ldd
|
||||
#define HEXAGON_brev_ldw __builtin_brev_ldw
|
||||
#define HEXAGON_brev_ldh __builtin_brev_ldh
|
||||
#define HEXAGON_brev_lduh __builtin_brev_lduh
|
||||
#define HEXAGON_brev_ldb __builtin_brev_ldb
|
||||
#define HEXAGON_brev_ldub __builtin_brev_ldub
|
||||
|
||||
#define HEXAGON_brev_std __builtin_brev_std
|
||||
#define HEXAGON_brev_stw __builtin_brev_stw
|
||||
#define HEXAGON_brev_sth __builtin_brev_sth
|
||||
#define HEXAGON_brev_sthhi __builtin_brev_sthhi
|
||||
#define HEXAGON_brev_stb __builtin_brev_stb
|
||||
|
||||
#ifdef __HVX__
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: if (Qt) vmem(Rt+#0) = Vs
|
||||
C Intrinsic Prototype: void Q6_vmaskedstoreq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
|
||||
Instruction Type: COPROC_VMEM
|
||||
Execution Slots: SLOT0
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_vmaskedstoreq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstoreq)
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: if (!Qt) vmem(Rt+#0) = Vs
|
||||
C Intrinsic Prototype: void Q6_vmaskedstorenq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
|
||||
Instruction Type: COPROC_VMEM
|
||||
Execution Slots: SLOT0
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_vmaskedstorenq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorenq)
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: if (Qt) vmem(Rt+#0):nt = Vs
|
||||
C Intrinsic Prototype: void Q6_vmaskedstorentq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
|
||||
Instruction Type: COPROC_VMEM
|
||||
Execution Slots: SLOT0
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_vmaskedstorentq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentq)
|
||||
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: if (!Qt) vmem(Rt+#0):nt = Vs
|
||||
C Intrinsic Prototype: void Q6_vmaskedstorentnq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
|
||||
Instruction Type: COPROC_VMEM
|
||||
Execution Slots: SLOT0
|
||||
========================================================================== */
|
||||
|
||||
#define Q6_vmaskedstorentnq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentnq)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* #ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_ */
|
||||
|
||||
#ifdef __NOT_DEFINED__
|
||||
/*** comment block template ***/
|
||||
/* ==========================================================================
|
||||
Assembly Syntax: Return=instruction()
|
||||
C Intrinsic Prototype: ReturnType Intrinsic(ParamType Rs, ParamType Rt)
|
||||
Instruction Type: InstructionType
|
||||
Execution Slots: SLOT0123
|
||||
========================================================================== */
|
||||
#endif /*** __NOT_DEFINED__ ***/
|
||||
8450
lib/include/hexagon_protos.h
vendored
Normal file
8450
lib/include/hexagon_protos.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
2653
lib/include/hexagon_types.h
vendored
Normal file
2653
lib/include/hexagon_types.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
4392
lib/include/hvx_hexagon_protos.h
vendored
Normal file
4392
lib/include/hvx_hexagon_protos.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
10
lib/include/immintrin.h
vendored
10
lib/include/immintrin.h
vendored
|
|
@ -72,11 +72,6 @@
|
|||
#include <f16cintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__VPCLMULQDQ__)
|
||||
#include <vpclmulqdqintrin.h>
|
||||
#endif
|
||||
|
||||
/* No feature check desired due to internal checks */
|
||||
#include <bmiintrin.h>
|
||||
|
||||
|
|
@ -230,6 +225,11 @@
|
|||
#include <pkuintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__VPCLMULQDQ__)
|
||||
#include <vpclmulqdqintrin.h>
|
||||
#endif
|
||||
|
||||
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
|
||||
defined(__VAES__)
|
||||
#include <vaesintrin.h>
|
||||
|
|
|
|||
49
lib/include/intrin.h
vendored
49
lib/include/intrin.h
vendored
|
|
@ -451,24 +451,47 @@ unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
|
|||
static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
|
||||
unsigned char const *__src,
|
||||
size_t __n) {
|
||||
__asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
: : "memory");
|
||||
#if defined(__x86_64__)
|
||||
__asm__ __volatile__("rep movsb"
|
||||
: "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
#else
|
||||
__asm__ __volatile__("xchg %%esi, %1\nrep movsb\nxchg %%esi, %1"
|
||||
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
|
||||
unsigned long const *__src,
|
||||
size_t __n) {
|
||||
#if defined(__x86_64__)
|
||||
__asm__ __volatile__("rep movsl"
|
||||
: "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
#else
|
||||
__asm__ __volatile__("xchg %%esi, %1\nrep movsl\nxchg %%esi, %1"
|
||||
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
|
||||
unsigned short const *__src,
|
||||
size_t __n) {
|
||||
#if defined(__x86_64__)
|
||||
__asm__ __volatile__("rep movsw"
|
||||
: "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
#else
|
||||
__asm__ __volatile__("xchg %%esi, %1\nrep movsw\nxchg %%esi, %1"
|
||||
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
||||
:
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
|
||||
unsigned long __x,
|
||||
|
|
@ -507,16 +530,26 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
|
|||
|* Misc
|
||||
\*----------------------------------------------------------------------------*/
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
#if defined(__i386__)
|
||||
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
|
||||
__asm("cpuid" \
|
||||
: "=a"(__eax), "=b"(__ebx), "=c"(__ecx), "=d"(__edx) \
|
||||
: "0"(__leaf), "2"(__count))
|
||||
#else
|
||||
/* x86-64 uses %rbx as the base register, so preserve it. */
|
||||
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
|
||||
__asm("xchgq %%rbx,%q1\n" \
|
||||
"cpuid\n" \
|
||||
"xchgq %%rbx,%q1" \
|
||||
: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
|
||||
: "0"(__leaf), "2"(__count))
|
||||
#endif
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
|
||||
__asm__("cpuid"
|
||||
: "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
|
||||
: "a"(__level), "c"(0));
|
||||
__cpuid_count(__level, 0, __info[0], __info[1], __info[2], __info[3]);
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
|
||||
int __ecx) {
|
||||
__asm__("cpuid"
|
||||
: "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
|
||||
: "a"(__level), "c"(__ecx));
|
||||
__cpuid_count(__level, __ecx, __info[0], __info[1], __info[2], __info[3]);
|
||||
}
|
||||
static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
|
||||
__asm__ volatile("hlt");
|
||||
|
|
|
|||
30
lib/include/keylockerintrin.h
vendored
30
lib/include/keylockerintrin.h
vendored
|
|
@ -230,10 +230,12 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
|||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
|
|
@ -267,10 +269,12 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
|||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
|
|
@ -304,10 +308,12 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
|||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
|
|
@ -354,10 +360,16 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
|||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
|
||||
|
|
@ -394,10 +406,16 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
|
|||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
|
||||
|
|
@ -434,10 +452,16 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
|
|||
/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
|
||||
|
|
@ -474,10 +498,16 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
|
|||
/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES512 )
|
||||
/// If (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
|
||||
|
|
|
|||
147
lib/include/opencl-c-base.h
vendored
147
lib/include/opencl-c-base.h
vendored
|
|
@ -21,9 +21,37 @@
|
|||
#define cl_khr_subgroup_shuffle 1
|
||||
#define cl_khr_subgroup_shuffle_relative 1
|
||||
#define cl_khr_subgroup_clustered_reduce 1
|
||||
#define cl_khr_extended_bit_ops 1
|
||||
#define cl_khr_integer_dot_product 1
|
||||
#define __opencl_c_integer_dot_product_input_4x8bit 1
|
||||
#define __opencl_c_integer_dot_product_input_4x8bit_packed 1
|
||||
|
||||
#endif // defined(__SPIR__)
|
||||
#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
|
||||
|
||||
// Define feature macros for OpenCL C 2.0
|
||||
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ == 200)
|
||||
#define __opencl_c_pipes 1
|
||||
#define __opencl_c_generic_address_space 1
|
||||
#define __opencl_c_work_group_collective_functions 1
|
||||
#define __opencl_c_atomic_order_acq_rel 1
|
||||
#define __opencl_c_atomic_order_seq_cst 1
|
||||
#define __opencl_c_atomic_scope_device 1
|
||||
#define __opencl_c_atomic_scope_all_devices 1
|
||||
#define __opencl_c_device_enqueue 1
|
||||
#define __opencl_c_read_write_images 1
|
||||
#define __opencl_c_program_scope_global_variables 1
|
||||
#define __opencl_c_images 1
|
||||
#endif
|
||||
|
||||
// Define header-only feature macros for OpenCL C 3.0.
|
||||
#if (__OPENCL_C_VERSION__ == 300)
|
||||
// For the SPIR target all features are supported.
|
||||
#if defined(__SPIR__)
|
||||
#define __opencl_c_atomic_scope_all_devices 1
|
||||
#endif // defined(__SPIR__)
|
||||
#endif // (__OPENCL_C_VERSION__ == 300)
|
||||
|
||||
// built-in scalar data types:
|
||||
|
||||
/**
|
||||
|
|
@ -141,7 +169,9 @@ typedef double double8 __attribute__((ext_vector_type(8)));
|
|||
typedef double double16 __attribute__((ext_vector_type(16)));
|
||||
#endif
|
||||
|
||||
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
#if defined(__OPENCL_CPP_VERSION__)
|
||||
#define NULL nullptr
|
||||
#elif defined(__OPENCL_C_VERSION__)
|
||||
#define NULL ((void*)0)
|
||||
#endif
|
||||
|
||||
|
|
@ -297,7 +327,12 @@ typedef enum memory_scope {
|
|||
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
|
||||
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
|
||||
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
|
||||
#if defined(__opencl_c_atomic_scope_all_devices)
|
||||
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
|
||||
#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0)
|
||||
memory_scope_all_devices = memory_scope_all_svm_devices,
|
||||
#endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
|
||||
#endif // defined(__opencl_c_atomic_scope_all_devices)
|
||||
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
|
||||
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
|
||||
#endif
|
||||
|
|
@ -322,7 +357,9 @@ typedef enum memory_order
|
|||
memory_order_acquire = __ATOMIC_ACQUIRE,
|
||||
memory_order_release = __ATOMIC_RELEASE,
|
||||
memory_order_acq_rel = __ATOMIC_ACQ_REL,
|
||||
#if defined(__opencl_c_atomic_order_seq_cst)
|
||||
memory_order_seq_cst = __ATOMIC_SEQ_CST
|
||||
#endif
|
||||
} memory_order;
|
||||
|
||||
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
|
@ -445,8 +482,113 @@ typedef struct {
|
|||
|
||||
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
/**
|
||||
* OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
|
||||
* Reinterprets a data type as another data type of the same size
|
||||
*/
|
||||
#define as_char(x) __builtin_astype((x), char)
|
||||
#define as_char2(x) __builtin_astype((x), char2)
|
||||
#define as_char3(x) __builtin_astype((x), char3)
|
||||
#define as_char4(x) __builtin_astype((x), char4)
|
||||
#define as_char8(x) __builtin_astype((x), char8)
|
||||
#define as_char16(x) __builtin_astype((x), char16)
|
||||
|
||||
#define as_uchar(x) __builtin_astype((x), uchar)
|
||||
#define as_uchar2(x) __builtin_astype((x), uchar2)
|
||||
#define as_uchar3(x) __builtin_astype((x), uchar3)
|
||||
#define as_uchar4(x) __builtin_astype((x), uchar4)
|
||||
#define as_uchar8(x) __builtin_astype((x), uchar8)
|
||||
#define as_uchar16(x) __builtin_astype((x), uchar16)
|
||||
|
||||
#define as_short(x) __builtin_astype((x), short)
|
||||
#define as_short2(x) __builtin_astype((x), short2)
|
||||
#define as_short3(x) __builtin_astype((x), short3)
|
||||
#define as_short4(x) __builtin_astype((x), short4)
|
||||
#define as_short8(x) __builtin_astype((x), short8)
|
||||
#define as_short16(x) __builtin_astype((x), short16)
|
||||
|
||||
#define as_ushort(x) __builtin_astype((x), ushort)
|
||||
#define as_ushort2(x) __builtin_astype((x), ushort2)
|
||||
#define as_ushort3(x) __builtin_astype((x), ushort3)
|
||||
#define as_ushort4(x) __builtin_astype((x), ushort4)
|
||||
#define as_ushort8(x) __builtin_astype((x), ushort8)
|
||||
#define as_ushort16(x) __builtin_astype((x), ushort16)
|
||||
|
||||
#define as_int(x) __builtin_astype((x), int)
|
||||
#define as_int2(x) __builtin_astype((x), int2)
|
||||
#define as_int3(x) __builtin_astype((x), int3)
|
||||
#define as_int4(x) __builtin_astype((x), int4)
|
||||
#define as_int8(x) __builtin_astype((x), int8)
|
||||
#define as_int16(x) __builtin_astype((x), int16)
|
||||
|
||||
#define as_uint(x) __builtin_astype((x), uint)
|
||||
#define as_uint2(x) __builtin_astype((x), uint2)
|
||||
#define as_uint3(x) __builtin_astype((x), uint3)
|
||||
#define as_uint4(x) __builtin_astype((x), uint4)
|
||||
#define as_uint8(x) __builtin_astype((x), uint8)
|
||||
#define as_uint16(x) __builtin_astype((x), uint16)
|
||||
|
||||
#define as_long(x) __builtin_astype((x), long)
|
||||
#define as_long2(x) __builtin_astype((x), long2)
|
||||
#define as_long3(x) __builtin_astype((x), long3)
|
||||
#define as_long4(x) __builtin_astype((x), long4)
|
||||
#define as_long8(x) __builtin_astype((x), long8)
|
||||
#define as_long16(x) __builtin_astype((x), long16)
|
||||
|
||||
#define as_ulong(x) __builtin_astype((x), ulong)
|
||||
#define as_ulong2(x) __builtin_astype((x), ulong2)
|
||||
#define as_ulong3(x) __builtin_astype((x), ulong3)
|
||||
#define as_ulong4(x) __builtin_astype((x), ulong4)
|
||||
#define as_ulong8(x) __builtin_astype((x), ulong8)
|
||||
#define as_ulong16(x) __builtin_astype((x), ulong16)
|
||||
|
||||
#define as_float(x) __builtin_astype((x), float)
|
||||
#define as_float2(x) __builtin_astype((x), float2)
|
||||
#define as_float3(x) __builtin_astype((x), float3)
|
||||
#define as_float4(x) __builtin_astype((x), float4)
|
||||
#define as_float8(x) __builtin_astype((x), float8)
|
||||
#define as_float16(x) __builtin_astype((x), float16)
|
||||
|
||||
#ifdef cl_khr_fp64
|
||||
#define as_double(x) __builtin_astype((x), double)
|
||||
#define as_double2(x) __builtin_astype((x), double2)
|
||||
#define as_double3(x) __builtin_astype((x), double3)
|
||||
#define as_double4(x) __builtin_astype((x), double4)
|
||||
#define as_double8(x) __builtin_astype((x), double8)
|
||||
#define as_double16(x) __builtin_astype((x), double16)
|
||||
#endif // cl_khr_fp64
|
||||
|
||||
#ifdef cl_khr_fp16
|
||||
#define as_half(x) __builtin_astype((x), half)
|
||||
#define as_half2(x) __builtin_astype((x), half2)
|
||||
#define as_half3(x) __builtin_astype((x), half3)
|
||||
#define as_half4(x) __builtin_astype((x), half4)
|
||||
#define as_half8(x) __builtin_astype((x), half8)
|
||||
#define as_half16(x) __builtin_astype((x), half16)
|
||||
#endif // cl_khr_fp16
|
||||
|
||||
#define as_size_t(x) __builtin_astype((x), size_t)
|
||||
#define as_ptrdiff_t(x) __builtin_astype((x), ptrdiff_t)
|
||||
#define as_intptr_t(x) __builtin_astype((x), intptr_t)
|
||||
#define as_uintptr_t(x) __builtin_astype((x), uintptr_t)
|
||||
|
||||
// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
|
||||
|
||||
#define __kernel_exec(X, typen) __kernel \
|
||||
__attribute__((work_group_size_hint(X, 1, 1))) \
|
||||
__attribute__((vec_type_hint(typen)))
|
||||
|
||||
#define kernel_exec(X, typen) __kernel \
|
||||
__attribute__((work_group_size_hint(X, 1, 1))) \
|
||||
__attribute__((vec_type_hint(typen)))
|
||||
|
||||
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
|
||||
// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
|
||||
|
||||
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_device_side_avc_motion_estimation
|
||||
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin
|
||||
|
||||
#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
|
||||
#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
|
||||
|
|
@ -580,7 +722,6 @@ typedef struct {
|
|||
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
|
||||
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
|
||||
|
||||
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
|
||||
#endif // cl_intel_device_side_avc_motion_estimation
|
||||
|
||||
// Disable any extensions we may have enabled previously.
|
||||
|
|
|
|||
864
lib/include/opencl-c.h
vendored
864
lib/include/opencl-c.h
vendored
File diff suppressed because it is too large
Load diff
|
|
@ -39,4 +39,46 @@ extern "C" {
|
|||
|
||||
#pragma omp end declare variant
|
||||
|
||||
// Ensure we make `_ZdlPv`, aka. `operator delete(void*)` available without the
|
||||
// need to `include <new>` in C++ mode.
|
||||
#ifdef __cplusplus
|
||||
|
||||
// We require malloc/free.
|
||||
#include <cstdlib>
|
||||
|
||||
#pragma push_macro("OPENMP_NOEXCEPT")
|
||||
#if __cplusplus >= 201103L
|
||||
#define OPENMP_NOEXCEPT noexcept
|
||||
#else
|
||||
#define OPENMP_NOEXCEPT
|
||||
#endif
|
||||
|
||||
// Device overrides for non-placement new and delete.
|
||||
inline void *operator new(__SIZE_TYPE__ size) {
|
||||
if (size == 0)
|
||||
size = 1;
|
||||
return ::malloc(size);
|
||||
}
|
||||
|
||||
inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
|
||||
|
||||
inline void operator delete(void *ptr)OPENMP_NOEXCEPT { ::free(ptr); }
|
||||
|
||||
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
|
||||
::operator delete(ptr);
|
||||
}
|
||||
|
||||
// Sized delete, C++14 only.
|
||||
#if __cplusplus >= 201402L
|
||||
inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
|
||||
::operator delete(ptr);
|
||||
}
|
||||
inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
|
||||
::operator delete(ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
#pragma pop_macro("OPENMP_NOEXCEPT")
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
|||
6
lib/include/openmp_wrappers/complex
vendored
6
lib/include/openmp_wrappers/complex
vendored
|
|
@ -17,7 +17,6 @@
|
|||
// We require std::math functions in the complex builtins below.
|
||||
#include <cmath>
|
||||
|
||||
#define __CUDA__
|
||||
#define __OPENMP_NVPTX__
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
#undef __OPENMP_NVPTX__
|
||||
|
|
@ -26,9 +25,6 @@
|
|||
// Grab the host header too.
|
||||
#include_next <complex>
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
// If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set
|
||||
// after including <cmath> above. Since the complex header we use is a
|
||||
// simplified version of the libc++, we don't need it in this case. If we
|
||||
|
|
@ -48,5 +44,3 @@
|
|||
#pragma omp end declare variant
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
|||
1
lib/include/openmp_wrappers/complex.h
vendored
1
lib/include/openmp_wrappers/complex.h
vendored
|
|
@ -17,7 +17,6 @@
|
|||
// We require math functions in the complex builtins below.
|
||||
#include <math.h>
|
||||
|
||||
#define __CUDA__
|
||||
#define __OPENMP_NVPTX__
|
||||
#include <__clang_cuda_complex_builtins.h>
|
||||
#undef __OPENMP_NVPTX__
|
||||
|
|
|
|||
26
lib/include/openmp_wrappers/new
vendored
26
lib/include/openmp_wrappers/new
vendored
|
|
@ -9,6 +9,8 @@
|
|||
#ifndef __CLANG_OPENMP_WRAPPERS_NEW
|
||||
#define __CLANG_OPENMP_WRAPPERS_NEW
|
||||
|
||||
// We need the system <new> for the std::nothrow_t. The new/delete operators
|
||||
// which do not use nothrow_t are provided without the <new> header.
|
||||
#include_next <new>
|
||||
|
||||
#if defined(__NVPTX__) && defined(_OPENMP)
|
||||
|
|
@ -22,48 +24,24 @@
|
|||
#define OPENMP_NOEXCEPT
|
||||
#endif
|
||||
|
||||
// Device overrides for non-placement new and delete.
|
||||
inline void *operator new(__SIZE_TYPE__ size) {
|
||||
if (size == 0)
|
||||
size = 1;
|
||||
return ::malloc(size);
|
||||
}
|
||||
inline void *operator new(__SIZE_TYPE__ size,
|
||||
const std::nothrow_t &) OPENMP_NOEXCEPT {
|
||||
return ::operator new(size);
|
||||
}
|
||||
|
||||
inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
|
||||
inline void *operator new[](__SIZE_TYPE__ size, const std::nothrow_t &) {
|
||||
return ::operator new(size);
|
||||
}
|
||||
|
||||
inline void operator delete(void *ptr)OPENMP_NOEXCEPT {
|
||||
if (ptr)
|
||||
::free(ptr);
|
||||
}
|
||||
inline void operator delete(void *ptr, const std::nothrow_t &)OPENMP_NOEXCEPT {
|
||||
::operator delete(ptr);
|
||||
}
|
||||
|
||||
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
|
||||
::operator delete(ptr);
|
||||
}
|
||||
inline void operator delete[](void *ptr,
|
||||
const std::nothrow_t &) OPENMP_NOEXCEPT {
|
||||
::operator delete(ptr);
|
||||
}
|
||||
|
||||
// Sized delete, C++14 only.
|
||||
#if __cplusplus >= 201402L
|
||||
inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
|
||||
::operator delete(ptr);
|
||||
}
|
||||
inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
|
||||
::operator delete(ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
#pragma pop_macro("OPENMP_NOEXCEPT")
|
||||
#endif
|
||||
|
||||
|
|
|
|||
9
lib/include/ppc_wrappers/xmmintrin.h
vendored
9
lib/include/ppc_wrappers/xmmintrin.h
vendored
|
|
@ -28,7 +28,7 @@
|
|||
Most SSE scalar float intrinsic operations can be performed more
|
||||
efficiently as C language float scalar operations or optimized to
|
||||
use vector SIMD operations. We recommend this for new applications. */
|
||||
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
|
||||
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
|
||||
#endif
|
||||
|
||||
#ifndef _XMMINTRIN_H_INCLUDED
|
||||
|
|
@ -62,14 +62,13 @@
|
|||
|
||||
/* The Intel API is flexible enough that we must allow aliasing with other
|
||||
vector types, and their scalar components. */
|
||||
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
|
||||
typedef vector float __m128 __attribute__((__may_alias__));
|
||||
|
||||
/* Unaligned version of the same type. */
|
||||
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
|
||||
__aligned__ (1)));
|
||||
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
|
||||
|
||||
/* Internal data types for implementing the intrinsics. */
|
||||
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
|
||||
typedef vector float __v4sf;
|
||||
|
||||
/* Create an undefined vector. */
|
||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
|
|
|||
123865
lib/include/riscv_vector.h
vendored
Normal file
123865
lib/include/riscv_vector.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
7
lib/include/uintrintrin.h
vendored
7
lib/include/uintrintrin.h
vendored
|
|
@ -20,6 +20,13 @@
|
|||
|
||||
#ifdef __x86_64__
|
||||
|
||||
struct __uintr_frame
|
||||
{
|
||||
unsigned long long rip;
|
||||
unsigned long long rflags;
|
||||
unsigned long long rsp;
|
||||
};
|
||||
|
||||
/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
|
||||
/// user interrupt cannot be delivered on the instruction boundary following
|
||||
/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in
|
||||
|
|
|
|||
45
lib/include/vaesintrin.h
vendored
45
lib/include/vaesintrin.h
vendored
|
|
@ -28,13 +28,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|||
(__v4di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_aesdec_epi128(__m256i __A, __m256i __B)
|
||||
{
|
||||
|
|
@ -42,13 +35,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|||
(__v4di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_aesenclast_epi128(__m256i __A, __m256i __B)
|
||||
{
|
||||
|
|
@ -56,13 +42,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|||
(__v4di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
||||
_mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
|
||||
{
|
||||
|
|
@ -70,13 +49,35 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|||
(__v4di) __B);
|
||||
}
|
||||
|
||||
#ifdef __AVX512FINTRIN_H
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
|
||||
_mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
|
||||
(__v8di) __B);
|
||||
}
|
||||
|
||||
#endif // __AVX512FINTRIN_H
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
#undef __DEFAULT_FN_ATTRS_F
|
||||
|
|
|
|||
283
lib/include/vecintrin.h
vendored
283
lib/include/vecintrin.h
vendored
|
|
@ -1016,64 +1016,84 @@ vec_scatter_element(__vector double __vec,
|
|||
|
||||
static inline __ATTRS_o_ai __vector signed char
|
||||
vec_xl(long __offset, const signed char *__ptr) {
|
||||
return *(const __vector signed char *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed char V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed char));
|
||||
return V;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai __vector unsigned char
|
||||
vec_xl(long __offset, const unsigned char *__ptr) {
|
||||
return *(const __vector unsigned char *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned char V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned char));
|
||||
return V;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai __vector signed short
|
||||
vec_xl(long __offset, const signed short *__ptr) {
|
||||
return *(const __vector signed short *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed short V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed short));
|
||||
return V;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai __vector unsigned short
|
||||
vec_xl(long __offset, const unsigned short *__ptr) {
|
||||
return *(const __vector unsigned short *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned short V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned short));
|
||||
return V;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai __vector signed int
|
||||
vec_xl(long __offset, const signed int *__ptr) {
|
||||
return *(const __vector signed int *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed int V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed int));
|
||||
return V;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai __vector unsigned int
|
||||
vec_xl(long __offset, const unsigned int *__ptr) {
|
||||
return *(const __vector unsigned int *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned int V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned int));
|
||||
return V;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai __vector signed long long
|
||||
vec_xl(long __offset, const signed long long *__ptr) {
|
||||
return *(const __vector signed long long *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed long long V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed long long));
|
||||
return V;
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai __vector unsigned long long
|
||||
vec_xl(long __offset, const unsigned long long *__ptr) {
|
||||
return *(const __vector unsigned long long *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned long long V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned long long));
|
||||
return V;
|
||||
}
|
||||
|
||||
#if __ARCH__ >= 12
|
||||
static inline __ATTRS_o_ai __vector float
|
||||
vec_xl(long __offset, const float *__ptr) {
|
||||
return *(const __vector float *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector float V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector float));
|
||||
return V;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline __ATTRS_o_ai __vector double
|
||||
vec_xl(long __offset, const double *__ptr) {
|
||||
return *(const __vector double *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector double V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector double));
|
||||
return V;
|
||||
}
|
||||
|
||||
/*-- vec_xld2 ---------------------------------------------------------------*/
|
||||
|
|
@ -1081,64 +1101,82 @@ vec_xl(long __offset, const double *__ptr) {
|
|||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector signed char
|
||||
vec_xld2(long __offset, const signed char *__ptr) {
|
||||
return *(const __vector signed char *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed char V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed char));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector unsigned char
|
||||
vec_xld2(long __offset, const unsigned char *__ptr) {
|
||||
return *(const __vector unsigned char *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned char V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned char));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector signed short
|
||||
vec_xld2(long __offset, const signed short *__ptr) {
|
||||
return *(const __vector signed short *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed short V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed short));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector unsigned short
|
||||
vec_xld2(long __offset, const unsigned short *__ptr) {
|
||||
return *(const __vector unsigned short *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned short V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned short));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector signed int
|
||||
vec_xld2(long __offset, const signed int *__ptr) {
|
||||
return *(const __vector signed int *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed int V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed int));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector unsigned int
|
||||
vec_xld2(long __offset, const unsigned int *__ptr) {
|
||||
return *(const __vector unsigned int *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned int V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned int));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector signed long long
|
||||
vec_xld2(long __offset, const signed long long *__ptr) {
|
||||
return *(const __vector signed long long *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed long long V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed long long));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector unsigned long long
|
||||
vec_xld2(long __offset, const unsigned long long *__ptr) {
|
||||
return *(const __vector unsigned long long *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned long long V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned long long));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector double
|
||||
vec_xld2(long __offset, const double *__ptr) {
|
||||
return *(const __vector double *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector double V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector double));
|
||||
return V;
|
||||
}
|
||||
|
||||
/*-- vec_xlw4 ---------------------------------------------------------------*/
|
||||
|
|
@ -1146,99 +1184,128 @@ vec_xld2(long __offset, const double *__ptr) {
|
|||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector signed char
|
||||
vec_xlw4(long __offset, const signed char *__ptr) {
|
||||
return *(const __vector signed char *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed char V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed char));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector unsigned char
|
||||
vec_xlw4(long __offset, const unsigned char *__ptr) {
|
||||
return *(const __vector unsigned char *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned char V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned char));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector signed short
|
||||
vec_xlw4(long __offset, const signed short *__ptr) {
|
||||
return *(const __vector signed short *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed short V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed short));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector unsigned short
|
||||
vec_xlw4(long __offset, const unsigned short *__ptr) {
|
||||
return *(const __vector unsigned short *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned short V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned short));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector signed int
|
||||
vec_xlw4(long __offset, const signed int *__ptr) {
|
||||
return *(const __vector signed int *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector signed int V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector signed int));
|
||||
return V;
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai __vector unsigned int
|
||||
vec_xlw4(long __offset, const unsigned int *__ptr) {
|
||||
return *(const __vector unsigned int *)
|
||||
((const char *)__ptr + __offset);
|
||||
__vector unsigned int V;
|
||||
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
|
||||
sizeof(__vector unsigned int));
|
||||
return V;
|
||||
}
|
||||
|
||||
/*-- vec_xst ----------------------------------------------------------------*/
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector signed char __vec, long __offset, signed char *__ptr) {
|
||||
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed char V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector signed char));
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
|
||||
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned char V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned char));
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector signed short __vec, long __offset, signed short *__ptr) {
|
||||
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed short V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector signed short));
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
|
||||
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned short V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned short));
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector signed int __vec, long __offset, signed int *__ptr) {
|
||||
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed int V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
|
||||
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned int V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned int));
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector signed long long __vec, long __offset,
|
||||
signed long long *__ptr) {
|
||||
*(__vector signed long long *)((char *)__ptr + __offset) = __vec;
|
||||
signed long long *__ptr) {
|
||||
__vector signed long long V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector signed long long));
|
||||
}
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector unsigned long long __vec, long __offset,
|
||||
unsigned long long *__ptr) {
|
||||
*(__vector unsigned long long *)((char *)__ptr + __offset) = __vec;
|
||||
unsigned long long *__ptr) {
|
||||
__vector unsigned long long V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned long long));
|
||||
}
|
||||
|
||||
#if __ARCH__ >= 12
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector float __vec, long __offset, float *__ptr) {
|
||||
*(__vector float *)((char *)__ptr + __offset) = __vec;
|
||||
__vector float V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector float));
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xst(__vector double __vec, long __offset, double *__ptr) {
|
||||
*(__vector double *)((char *)__ptr + __offset) = __vec;
|
||||
__vector double V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector double));
|
||||
}
|
||||
|
||||
/*-- vec_xstd2 --------------------------------------------------------------*/
|
||||
|
|
@ -1246,57 +1313,73 @@ vec_xst(__vector double __vec, long __offset, double *__ptr) {
|
|||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector signed char __vec, long __offset, signed char *__ptr) {
|
||||
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed char V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector signed char));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
|
||||
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned char V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned char));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector signed short __vec, long __offset, signed short *__ptr) {
|
||||
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed short V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector signed short));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
|
||||
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned short V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned short));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector signed int __vec, long __offset, signed int *__ptr) {
|
||||
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed int V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
|
||||
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned int V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned int));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector signed long long __vec, long __offset,
|
||||
signed long long *__ptr) {
|
||||
*(__vector signed long long *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed long long V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector signed long long));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector unsigned long long __vec, long __offset,
|
||||
unsigned long long *__ptr) {
|
||||
*(__vector unsigned long long *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned long long V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned long long));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstd2(__vector double __vec, long __offset, double *__ptr) {
|
||||
*(__vector double *)((char *)__ptr + __offset) = __vec;
|
||||
__vector double V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector double));
|
||||
}
|
||||
|
||||
/*-- vec_xstw4 --------------------------------------------------------------*/
|
||||
|
|
@ -1304,37 +1387,48 @@ vec_xstd2(__vector double __vec, long __offset, double *__ptr) {
|
|||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstw4(__vector signed char __vec, long __offset, signed char *__ptr) {
|
||||
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed char V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector signed char));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstw4(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
|
||||
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned char V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned char));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstw4(__vector signed short __vec, long __offset, signed short *__ptr) {
|
||||
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed short V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector signed short));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstw4(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
|
||||
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned short V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned short));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstw4(__vector signed int __vec, long __offset, signed int *__ptr) {
|
||||
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
|
||||
__vector signed int V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
|
||||
}
|
||||
|
||||
// This prototype is deprecated.
|
||||
static inline __ATTRS_o_ai void
|
||||
vec_xstw4(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
|
||||
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
|
||||
__vector unsigned int V = __vec;
|
||||
__builtin_memcpy(((char *)__ptr + __offset), &V,
|
||||
sizeof(__vector unsigned int));
|
||||
}
|
||||
|
||||
/*-- vec_load_bndry ---------------------------------------------------------*/
|
||||
|
|
@ -9259,6 +9353,41 @@ vec_fp_test_data_class(__vector double __a, int __b, int *__c)
|
|||
__VEC_CLASS_FP_ZERO | \
|
||||
__VEC_CLASS_FP_INFINITY)
|
||||
|
||||
/*-- vec_extend_to_fp32_hi --------------------------------------------------*/
|
||||
|
||||
#if __ARCH__ >= 14
|
||||
#define vec_extend_to_fp32_hi(X, W) \
|
||||
((__vector float)__builtin_s390_vclfnhs((X), (W)));
|
||||
#endif
|
||||
|
||||
/*-- vec_extend_to_fp32_hi --------------------------------------------------*/
|
||||
|
||||
#if __ARCH__ >= 14
|
||||
#define vec_extend_to_fp32_lo(X, W) \
|
||||
((__vector float)__builtin_s390_vclfnls((X), (W)));
|
||||
#endif
|
||||
|
||||
/*-- vec_round_from_fp32 ----------------------------------------------------*/
|
||||
|
||||
#if __ARCH__ >= 14
|
||||
#define vec_round_from_fp32(X, Y, W) \
|
||||
((__vector unsigned short)__builtin_s390_vcrnfs((X), (Y), (W)));
|
||||
#endif
|
||||
|
||||
/*-- vec_convert_to_fp16 ----------------------------------------------------*/
|
||||
|
||||
#if __ARCH__ >= 14
|
||||
#define vec_convert_to_fp16(X, W) \
|
||||
((__vector unsigned short)__builtin_s390_vcfn((X), (W)));
|
||||
#endif
|
||||
|
||||
/*-- vec_convert_from_fp16 --------------------------------------------------*/
|
||||
|
||||
#if __ARCH__ >= 14
|
||||
#define vec_convert_from_fp16(X, W) \
|
||||
((__vector unsigned short)__builtin_s390_vcnf((X), (W)));
|
||||
#endif
|
||||
|
||||
/*-- vec_cp_until_zero ------------------------------------------------------*/
|
||||
|
||||
static inline __ATTRS_o_ai __vector signed char
|
||||
|
|
|
|||
2
lib/include/vpclmulqdqintrin.h
vendored
2
lib/include/vpclmulqdqintrin.h
vendored
|
|
@ -19,10 +19,12 @@
|
|||
(__v4di)(__m256i)(B), \
|
||||
(char)(I))
|
||||
|
||||
#ifdef __AVX512FINTRIN_H
|
||||
#define _mm512_clmulepi64_epi128(A, B, I) \
|
||||
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
|
||||
(__v8di)(__m512i)(B), \
|
||||
(char)(I))
|
||||
#endif // __AVX512FINTRIN_H
|
||||
|
||||
#endif /* __VPCLMULQDQINTRIN_H */
|
||||
|
||||
|
|
|
|||
1041
lib/include/wasm_simd128.h
vendored
1041
lib/include/wasm_simd128.h
vendored
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue