From db4fea6689eb34959028cad3b63de45da65683d3 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Sat, 28 Aug 2021 13:11:47 -0700 Subject: [PATCH] update libcxx, libcxxabi, and C headers to release/13.x branch upstream commit 9c49fee5e7ac0ca8bc4ec1c3738ca0d83df65852 --- lib/include/__clang_cuda_device_functions.h | 276 +++++++++++------- lib/include/__clang_hip_cmath.h | 188 +++++++----- lib/include/__clang_hip_math.h | 50 +++- lib/include/intrin.h | 3 + .../__clang_openmp_device_functions.h | 32 +- lib/include/openmp_wrappers/cmath | 54 ++++ lib/include/openmp_wrappers/math.h | 10 + lib/libcxx/include/cwctype | 2 + lib/libcxx/include/string | 19 ++ lib/libcxx/include/vector | 20 ++ lib/libcxx/include/wctype.h | 10 + lib/libcxxabi/src/cxa_personality.cpp | 2 +- 12 files changed, 476 insertions(+), 190 deletions(-) diff --git a/lib/include/__clang_cuda_device_functions.h b/lib/include/__clang_cuda_device_functions.h index f801e5426a..cc4e1a4dd9 100644 --- a/lib/include/__clang_cuda_device_functions.h +++ b/lib/include/__clang_cuda_device_functions.h @@ -34,10 +34,12 @@ __DEVICE__ unsigned long long __brevll(unsigned long long __a) { return __nv_brevll(__a); } #if defined(__cplusplus) -__DEVICE__ void __brkpt() { asm volatile("brkpt;"); } +__DEVICE__ void __brkpt() { __asm__ __volatile__("brkpt;"); } __DEVICE__ void __brkpt(int __a) { __brkpt(); } #else -__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); } +__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { + __asm__ __volatile__("brkpt;"); +} __DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); } #endif __DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b, @@ -507,7 +509,7 @@ __DEVICE__ float __powf(float __a, float __b) { } // Parameter must have a known integer value. -#define __prof_trigger(__a) asm __volatile__("pmevent \t%0;" ::"i"(__a)) +#define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a)) __DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); } __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) { return __nv_sad(__a, __b, __c); @@ -526,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); } __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); } __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }; __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }; -__DEVICE__ void __trap(void) { asm volatile("trap;"); } +__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); } __DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) { return __nvvm_atom_add_gen_i((int *)__p, __v); } @@ -1051,122 +1053,136 @@ __DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) { } __DEVICE__ unsigned int __vabs2(unsigned int __a) { unsigned int r; - asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(0), "r"(0)); + __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); return r; } __DEVICE__ unsigned int __vabs4(unsigned int __a) { unsigned int r; - asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(0), "r"(0)); + __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); return r; } __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vabsss2(unsigned int __a) { unsigned int r; - asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(0), "r"(0)); + __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); return r; } __DEVICE__ unsigned int __vabsss4(unsigned int __a) { unsigned int r; - asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(0), "r"(0)); + __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); return r; } __DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.u32.u32.eq %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { @@ -1174,7 +1190,9 @@ __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.u32.u32.eq %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { @@ -1182,7 +1200,9 @@ __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.s32.s32.ge %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { @@ -1190,7 +1210,9 @@ __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.s32.s32.ge %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { @@ -1198,7 +1220,9 @@ __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.u32.u32.ge %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { @@ -1206,7 +1230,9 @@ __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.u32.u32.ge %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { @@ -1214,7 +1240,9 @@ __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.s32.s32.gt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { @@ -1222,7 +1250,9 @@ __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.s32.s32.gt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { @@ -1230,7 +1260,9 @@ __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.u32.u32.gt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { @@ -1238,7 +1270,9 @@ __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.u32.u32.gt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { @@ -1246,7 +1280,9 @@ __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.s32.s32.le %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { @@ -1254,7 +1290,9 @@ __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.s32.s32.le %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { @@ -1262,7 +1300,9 @@ __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.u32.u32.le %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { @@ -1270,7 +1310,9 @@ __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.u32.u32.le %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { @@ -1278,7 +1320,9 @@ __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.s32.s32.lt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { @@ -1286,7 +1330,9 @@ __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.s32.s32.lt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { @@ -1294,7 +1340,9 @@ __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.u32.u32.lt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { @@ -1302,7 +1350,9 @@ __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.u32.u32.lt %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { @@ -1310,7 +1360,9 @@ __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset2.u32.u32.ne %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { @@ -1318,7 +1370,9 @@ __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { } __DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vset4.u32.u32.ne %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { @@ -1345,94 +1399,112 @@ __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { unsigned mask = __vcmpgts2(__a, __b); r = (__a & mask) | (__b & ~mask); } else { - asm("vmax2.s32.s32.s32 %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); } return r; } __DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); } __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); } __DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vnegss2(unsigned int __a) { @@ -1440,9 +1512,9 @@ __DEVICE__ unsigned int __vnegss2(unsigned int __a) { } __DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vnegss4(unsigned int __a) { @@ -1450,16 +1522,16 @@ __DEVICE__ unsigned int __vnegss4(unsigned int __a) { } __DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { unsigned int r; - asm("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" - : "=r"(r) - : "r"(__a), "r"(__b), "r"(0)); + __asm__("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); return r; } #endif // CUDA_VERSION >= 9020 diff --git a/lib/include/__clang_hip_cmath.h b/lib/include/__clang_hip_cmath.h index 7342705434..d488db0a94 100644 --- a/lib/include/__clang_hip_cmath.h +++ b/lib/include/__clang_hip_cmath.h @@ -10,7 +10,7 @@ #ifndef __CLANG_HIP_CMATH_H__ #define __CLANG_HIP_CMATH_H__ -#if !defined(__HIP__) +#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__) #error "This file is for HIP and OpenMP AMDGCN device compilation only." #endif @@ -25,31 +25,43 @@ #endif // !defined(__HIPCC_RTC__) #pragma push_macro("__DEVICE__") +#pragma push_macro("__CONSTEXPR__") +#ifdef __OPENMP_AMDGCN__ +#define __DEVICE__ static __attribute__((always_inline, nothrow)) +#define __CONSTEXPR__ constexpr +#else #define __DEVICE__ static __device__ inline __attribute__((always_inline)) +#define __CONSTEXPR__ +#endif // __OPENMP_AMDGCN__ // Start with functions that cannot be defined by DEF macros below. #if defined(__cplusplus) -__DEVICE__ double abs(double __x) { return ::fabs(__x); } -__DEVICE__ float abs(float __x) { return ::fabsf(__x); } -__DEVICE__ long long abs(long long __n) { return ::llabs(__n); } -__DEVICE__ long abs(long __n) { return ::labs(__n); } -__DEVICE__ float fma(float __x, float __y, float __z) { +#if defined __OPENMP_AMDGCN__ +__DEVICE__ __CONSTEXPR__ float fabs(float __x) { return ::fabsf(__x); } +__DEVICE__ __CONSTEXPR__ float sin(float __x) { return ::sinf(__x); } +__DEVICE__ __CONSTEXPR__ float cos(float __x) { return ::cosf(__x); } +#endif +__DEVICE__ __CONSTEXPR__ double abs(double __x) { return ::fabs(__x); } +__DEVICE__ __CONSTEXPR__ float abs(float __x) { return ::fabsf(__x); } +__DEVICE__ __CONSTEXPR__ long long abs(long long __n) { return ::llabs(__n); } +__DEVICE__ __CONSTEXPR__ long abs(long __n) { return ::labs(__n); } +__DEVICE__ __CONSTEXPR__ float fma(float __x, float __y, float __z) { return ::fmaf(__x, __y, __z); } #if !defined(__HIPCC_RTC__) // The value returned by fpclassify is platform dependent, therefore it is not // supported by hipRTC. -__DEVICE__ int fpclassify(float __x) { +__DEVICE__ __CONSTEXPR__ int fpclassify(float __x) { return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x); } -__DEVICE__ int fpclassify(double __x) { +__DEVICE__ __CONSTEXPR__ int fpclassify(double __x) { return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, __x); } #endif // !defined(__HIPCC_RTC__) -__DEVICE__ float frexp(float __arg, int *__exp) { +__DEVICE__ __CONSTEXPR__ float frexp(float __arg, int *__exp) { return ::frexpf(__arg, __exp); } @@ -71,93 +83,101 @@ __DEVICE__ float frexp(float __arg, int *__exp) { // of the variants inside the inner region and avoid the clash. #pragma omp begin declare variant match(implementation = {vendor(llvm)}) -__DEVICE__ int isinf(float __x) { return ::__isinff(__x); } -__DEVICE__ int isinf(double __x) { return ::__isinf(__x); } -__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); } -__DEVICE__ int isfinite(double __x) { return ::__finite(__x); } -__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); } -__DEVICE__ int isnan(double __x) { return ::__isnan(__x); } +__DEVICE__ __CONSTEXPR__ int isinf(float __x) { return ::__isinff(__x); } +__DEVICE__ __CONSTEXPR__ int isinf(double __x) { return ::__isinf(__x); } +__DEVICE__ __CONSTEXPR__ int isfinite(float __x) { return ::__finitef(__x); } +__DEVICE__ __CONSTEXPR__ int isfinite(double __x) { return ::__finite(__x); } +__DEVICE__ __CONSTEXPR__ int isnan(float __x) { return ::__isnanf(__x); } +__DEVICE__ __CONSTEXPR__ int isnan(double __x) { return ::__isnan(__x); } #pragma omp end declare variant #endif // defined(__OPENMP_AMDGCN__) -__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); } -__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); } -__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); } -__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); } -__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); } -__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); } +__DEVICE__ __CONSTEXPR__ bool isinf(float __x) { return ::__isinff(__x); } +__DEVICE__ __CONSTEXPR__ bool isinf(double __x) { return ::__isinf(__x); } +__DEVICE__ __CONSTEXPR__ bool isfinite(float __x) { return ::__finitef(__x); } +__DEVICE__ __CONSTEXPR__ bool isfinite(double __x) { return ::__finite(__x); } +__DEVICE__ __CONSTEXPR__ bool isnan(float __x) { return ::__isnanf(__x); } +__DEVICE__ __CONSTEXPR__ bool isnan(double __x) { return ::__isnan(__x); } #if defined(__OPENMP_AMDGCN__) #pragma omp end declare variant #endif // defined(__OPENMP_AMDGCN__) -__DEVICE__ bool isgreater(float __x, float __y) { +__DEVICE__ __CONSTEXPR__ bool isgreater(float __x, float __y) { return __builtin_isgreater(__x, __y); } -__DEVICE__ bool isgreater(double __x, double __y) { +__DEVICE__ __CONSTEXPR__ bool isgreater(double __x, double __y) { return __builtin_isgreater(__x, __y); } -__DEVICE__ bool isgreaterequal(float __x, float __y) { +__DEVICE__ __CONSTEXPR__ bool isgreaterequal(float __x, float __y) { return __builtin_isgreaterequal(__x, __y); } -__DEVICE__ bool isgreaterequal(double __x, double __y) { +__DEVICE__ __CONSTEXPR__ bool isgreaterequal(double __x, double __y) { return __builtin_isgreaterequal(__x, __y); } -__DEVICE__ bool isless(float __x, float __y) { +__DEVICE__ __CONSTEXPR__ bool isless(float __x, float __y) { return __builtin_isless(__x, __y); } -__DEVICE__ bool isless(double __x, double __y) { +__DEVICE__ __CONSTEXPR__ bool isless(double __x, double __y) { return __builtin_isless(__x, __y); } -__DEVICE__ bool islessequal(float __x, float __y) { +__DEVICE__ __CONSTEXPR__ bool islessequal(float __x, float __y) { return __builtin_islessequal(__x, __y); } -__DEVICE__ bool islessequal(double __x, double __y) { +__DEVICE__ __CONSTEXPR__ bool islessequal(double __x, double __y) { return __builtin_islessequal(__x, __y); } -__DEVICE__ bool islessgreater(float __x, float __y) { +__DEVICE__ __CONSTEXPR__ bool islessgreater(float __x, float __y) { return __builtin_islessgreater(__x, __y); } -__DEVICE__ bool islessgreater(double __x, double __y) { +__DEVICE__ __CONSTEXPR__ bool islessgreater(double __x, double __y) { return __builtin_islessgreater(__x, __y); } -__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); } -__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); } -__DEVICE__ bool isunordered(float __x, float __y) { +__DEVICE__ __CONSTEXPR__ bool isnormal(float __x) { + return __builtin_isnormal(__x); +} +__DEVICE__ __CONSTEXPR__ bool isnormal(double __x) { + return __builtin_isnormal(__x); +} +__DEVICE__ __CONSTEXPR__ bool isunordered(float __x, float __y) { return __builtin_isunordered(__x, __y); } -__DEVICE__ bool isunordered(double __x, double __y) { +__DEVICE__ __CONSTEXPR__ bool isunordered(double __x, double __y) { return __builtin_isunordered(__x, __y); } -__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); } -__DEVICE__ float pow(float __base, int __iexp) { +__DEVICE__ __CONSTEXPR__ float modf(float __x, float *__iptr) { + return ::modff(__x, __iptr); +} +__DEVICE__ __CONSTEXPR__ float pow(float __base, int __iexp) { return ::powif(__base, __iexp); } -__DEVICE__ double pow(double __base, int __iexp) { +__DEVICE__ __CONSTEXPR__ double pow(double __base, int __iexp) { return ::powi(__base, __iexp); } -__DEVICE__ float remquo(float __x, float __y, int *__quo) { +__DEVICE__ __CONSTEXPR__ float remquo(float __x, float __y, int *__quo) { return ::remquof(__x, __y, __quo); } -__DEVICE__ float scalbln(float __x, long int __n) { +__DEVICE__ __CONSTEXPR__ float scalbln(float __x, long int __n) { return ::scalblnf(__x, __n); } -__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); } -__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); } +__DEVICE__ __CONSTEXPR__ bool signbit(float __x) { return ::__signbitf(__x); } +__DEVICE__ __CONSTEXPR__ bool signbit(double __x) { return ::__signbit(__x); } // Notably missing above is nexttoward. We omit it because // ocml doesn't provide an implementation, and we don't want to be in the // business of implementing tricky libm functions in this header. // Other functions. -__DEVICE__ _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) { +__DEVICE__ __CONSTEXPR__ _Float16 fma(_Float16 __x, _Float16 __y, + _Float16 __z) { return __ocml_fma_f16(__x, __y, __z); } -__DEVICE__ _Float16 pow(_Float16 __base, int __iexp) { +__DEVICE__ __CONSTEXPR__ _Float16 pow(_Float16 __base, int __iexp) { return __ocml_pown_f16(__base, __iexp); } +#ifndef __OPENMP_AMDGCN__ // BEGIN DEF_FUN and HIP_OVERLOAD // BEGIN DEF_FUN @@ -168,18 +188,19 @@ __DEVICE__ _Float16 pow(_Float16 __base, int __iexp) { // Define cmath functions with float argument and returns __retty. #define __DEF_FUN1(__retty, __func) \ - __DEVICE__ \ - __retty __func(float __x) { return __func##f(__x); } + __DEVICE__ __CONSTEXPR__ __retty __func(float __x) { return __func##f(__x); } // Define cmath functions with two float arguments and returns __retty. #define __DEF_FUN2(__retty, __func) \ - __DEVICE__ \ - __retty __func(float __x, float __y) { return __func##f(__x, __y); } + __DEVICE__ __CONSTEXPR__ __retty __func(float __x, float __y) { \ + return __func##f(__x, __y); \ + } // Define cmath functions with a float and an int argument and returns __retty. #define __DEF_FUN2_FI(__retty, __func) \ - __DEVICE__ \ - __retty __func(float __x, int __y) { return __func##f(__x, __y); } + __DEVICE__ __CONSTEXPR__ __retty __func(float __x, int __y) { \ + return __func##f(__x, __y); \ + } __DEF_FUN1(float, acos) __DEF_FUN1(float, acosh) @@ -426,7 +447,7 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {}; // floor(double). #define __HIP_OVERLOAD1(__retty, __fn) \ template \ - __DEVICE__ \ + __DEVICE__ __CONSTEXPR__ \ typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type \ __fn(__T __x) { \ return ::__fn((double)__x); \ @@ -438,7 +459,7 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {}; #if __cplusplus >= 201103L #define __HIP_OVERLOAD2(__retty, __fn) \ template \ - __DEVICE__ typename __hip_enable_if< \ + __DEVICE__ __CONSTEXPR__ typename __hip_enable_if< \ __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, \ typename __hip::__promote<__T1, __T2>::type>::type \ __fn(__T1 __x, __T2 __y) { \ @@ -448,10 +469,11 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {}; #else #define __HIP_OVERLOAD2(__retty, __fn) \ template \ - __DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && \ - __hip::is_arithmetic<__T2>::value, \ - __retty>::type \ - __fn(__T1 __x, __T2 __y) { \ + __DEVICE__ __CONSTEXPR__ \ + typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && \ + __hip::is_arithmetic<__T2>::value, \ + __retty>::type \ + __fn(__T1 __x, __T2 __y) { \ return __fn((double)__x, (double)__y); \ } #endif @@ -526,7 +548,7 @@ __HIP_OVERLOAD2(double, min) // Additional Overloads that don't quite match HIP_OVERLOAD. #if __cplusplus >= 201103L template -__DEVICE__ typename __hip_enable_if< +__DEVICE__ __CONSTEXPR__ typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value && __hip::is_arithmetic<__T3>::value, typename __hip::__promote<__T1, __T2, __T3>::type>::type @@ -536,31 +558,32 @@ fma(__T1 __x, __T2 __y, __T3 __z) { } #else template -__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && - __hip::is_arithmetic<__T2>::value && - __hip::is_arithmetic<__T3>::value, - double>::type -fma(__T1 __x, __T2 __y, __T3 __z) { +__DEVICE__ __CONSTEXPR__ + typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && + __hip::is_arithmetic<__T2>::value && + __hip::is_arithmetic<__T3>::value, + double>::type + fma(__T1 __x, __T2 __y, __T3 __z) { return ::fma((double)__x, (double)__y, (double)__z); } #endif template -__DEVICE__ +__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type frexp(__T __x, int *__exp) { return ::frexp((double)__x, __exp); } template -__DEVICE__ +__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type ldexp(__T __x, int __exp) { return ::ldexp((double)__x, __exp); } template -__DEVICE__ +__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type modf(__T __x, double *__exp) { return ::modf((double)__x, __exp); @@ -568,7 +591,7 @@ __DEVICE__ #if __cplusplus >= 201103L template -__DEVICE__ +__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type @@ -578,23 +601,24 @@ __DEVICE__ } #else template -__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && - __hip::is_arithmetic<__T2>::value, - double>::type -remquo(__T1 __x, __T2 __y, int *__quo) { +__DEVICE__ __CONSTEXPR__ + typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && + __hip::is_arithmetic<__T2>::value, + double>::type + remquo(__T1 __x, __T2 __y, int *__quo) { return ::remquo((double)__x, (double)__y, __quo); } #endif template -__DEVICE__ +__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type scalbln(__T __x, long int __exp) { return ::scalbln((double)__x, __exp); } template -__DEVICE__ +__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type scalbn(__T __x, int __exp) { return ::scalbn((double)__x, __exp); @@ -607,8 +631,10 @@ __DEVICE__ // END DEF_FUN and HIP_OVERLOAD +#endif // ifndef __OPENMP_AMDGCN__ #endif // defined(__cplusplus) +#ifndef __OPENMP_AMDGCN__ // Define these overloads inside the namespace our standard library uses. #if !defined(__HIPCC_RTC__) #ifdef _LIBCPP_BEGIN_NAMESPACE_STD @@ -781,22 +807,26 @@ _GLIBCXX_END_NAMESPACE_VERSION #if defined(__cplusplus) extern "C" { #endif // defined(__cplusplus) -__DEVICE__ __attribute__((overloadable)) double _Cosh(double x, double y) { +__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Cosh(double x, + double y) { return cosh(x) * y; } -__DEVICE__ __attribute__((overloadable)) float _FCosh(float x, float y) { +__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FCosh(float x, + float y) { return coshf(x) * y; } -__DEVICE__ __attribute__((overloadable)) short _Dtest(double *p) { +__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _Dtest(double *p) { return fpclassify(*p); } -__DEVICE__ __attribute__((overloadable)) short _FDtest(float *p) { +__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _FDtest(float *p) { return fpclassify(*p); } -__DEVICE__ __attribute__((overloadable)) double _Sinh(double x, double y) { +__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Sinh(double x, + double y) { return sinh(x) * y; } -__DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) { +__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FSinh(float x, + float y) { return sinhf(x) * y; } #if defined(__cplusplus) @@ -804,7 +834,9 @@ __DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) { #endif // defined(__cplusplus) #endif // defined(_MSC_VER) #endif // !defined(__HIPCC_RTC__) +#endif // ifndef __OPENMP_AMDGCN__ #pragma pop_macro("__DEVICE__") +#pragma pop_macro("__CONSTEXPR__") #endif // __CLANG_HIP_CMATH_H__ diff --git a/lib/include/__clang_hip_math.h b/lib/include/__clang_hip_math.h index 1f0982d92e..ef7e087b83 100644 --- a/lib/include/__clang_hip_math.h +++ b/lib/include/__clang_hip_math.h @@ -9,7 +9,7 @@ #ifndef __CLANG_HIP_MATH_H__ #define __CLANG_HIP_MATH_H__ -#if !defined(__HIP__) +#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__) #error "This file is for HIP and OpenMP AMDGCN device compilation only." #endif @@ -19,18 +19,30 @@ #endif #include #include -#endif // __HIPCC_RTC__ +#ifdef __OPENMP_AMDGCN__ +#include +#endif +#endif // !defined(__HIPCC_RTC__) #pragma push_macro("__DEVICE__") + +#ifdef __OPENMP_AMDGCN__ +#define __DEVICE__ static inline __attribute__((always_inline, nothrow)) +#else #define __DEVICE__ static __device__ inline __attribute__((always_inline)) +#endif // A few functions return bool type starting only in C++11. #pragma push_macro("__RETURN_TYPE") +#ifdef __OPENMP_AMDGCN__ +#define __RETURN_TYPE int +#else #if defined(__cplusplus) #define __RETURN_TYPE bool #else #define __RETURN_TYPE int #endif +#endif // __OPENMP_AMDGCN__ #if defined (__cplusplus) && __cplusplus < 201103L // emulate static_assert on type sizes @@ -249,6 +261,9 @@ float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } __DEVICE__ float frexpf(float __x, int *__nptr) { int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif float __r = __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp); *__nptr = __tmp; @@ -334,6 +349,9 @@ long int lroundf(float __x) { return __ocml_round_f32(__x); } __DEVICE__ float modff(float __x, float *__iptr) { float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif float __r = __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); *__iptr = __tmp; @@ -414,6 +432,9 @@ float remainderf(float __x, float __y) { __DEVICE__ float remquof(float __x, float __y, int *__quo) { int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif float __r = __ocml_remquo_f32( __x, __y, (__attribute__((address_space(5))) int *)&__tmp); *__quo = __tmp; @@ -470,6 +491,9 @@ __RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); } __DEVICE__ void sincosf(float __x, float *__sinptr, float *__cosptr) { float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif *__sinptr = __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); *__cosptr = __tmp; @@ -478,6 +502,9 @@ void sincosf(float __x, float *__sinptr, float *__cosptr) { __DEVICE__ void sincospif(float __x, float *__sinptr, float *__cosptr) { float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif *__sinptr = __ocml_sincospi_f32( __x, (__attribute__((address_space(5))) float *)&__tmp); *__cosptr = __tmp; @@ -790,6 +817,9 @@ double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } __DEVICE__ double frexp(double __x, int *__nptr) { int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif double __r = __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp); *__nptr = __tmp; @@ -874,6 +904,9 @@ long int lround(double __x) { return __ocml_round_f64(__x); } __DEVICE__ double modf(double __x, double *__iptr) { double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif double __r = __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); *__iptr = __tmp; @@ -962,6 +995,9 @@ double remainder(double __x, double __y) { __DEVICE__ double remquo(double __x, double __y, int *__quo) { int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif double __r = __ocml_remquo_f64( __x, __y, (__attribute__((address_space(5))) int *)&__tmp); *__quo = __tmp; @@ -1020,6 +1056,9 @@ double sin(double __x) { return __ocml_sin_f64(__x); } __DEVICE__ void sincos(double __x, double *__sinptr, double *__cosptr) { double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif *__sinptr = __ocml_sincos_f64( __x, (__attribute__((address_space(5))) double *)&__tmp); *__cosptr = __tmp; @@ -1028,6 +1067,9 @@ void sincos(double __x, double *__sinptr, double *__cosptr) { __DEVICE__ void sincospi(double __x, double *__sinptr, double *__cosptr) { double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif *__sinptr = __ocml_sincospi_f64( __x, (__attribute__((address_space(5))) double *)&__tmp); *__cosptr = __tmp; @@ -1262,7 +1304,7 @@ float min(float __x, float __y) { return fminf(__x, __y); } __DEVICE__ double min(double __x, double __y) { return fmin(__x, __y); } -#if !defined(__HIPCC_RTC__) +#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) __host__ inline static int min(int __arg1, int __arg2) { return std::min(__arg1, __arg2); } @@ -1270,7 +1312,7 @@ __host__ inline static int min(int __arg1, int __arg2) { __host__ inline static int max(int __arg1, int __arg2) { return std::max(__arg1, __arg2); } -#endif // __HIPCC_RTC__ +#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) #endif #pragma pop_macro("__DEVICE__") diff --git a/lib/include/intrin.h b/lib/include/intrin.h index ff8eb8fca2..34ec79d6ac 100644 --- a/lib/include/intrin.h +++ b/lib/include/intrin.h @@ -574,6 +574,9 @@ void _WriteStatusReg(int, __int64); unsigned short __cdecl _byteswap_ushort(unsigned short val); unsigned long __cdecl _byteswap_ulong (unsigned long val); unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val); + +__int64 __mulh(__int64 __a, __int64 __b); +unsigned __int64 __umulh(unsigned __int64 __a, unsigned __int64 __b); #endif /*----------------------------------------------------------------------------*\ diff --git a/lib/include/openmp_wrappers/__clang_openmp_device_functions.h b/lib/include/openmp_wrappers/__clang_openmp_device_functions.h index 953857badf..279fb26fba 100644 --- a/lib/include/openmp_wrappers/__clang_openmp_device_functions.h +++ b/lib/include/openmp_wrappers/__clang_openmp_device_functions.h @@ -14,13 +14,13 @@ #error "This file is for OpenMP compilation only." #endif -#pragma omp begin declare variant match( \ - device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) - #ifdef __cplusplus extern "C" { #endif +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + #define __CUDA__ #define __OPENMP_NVPTX__ @@ -33,12 +33,34 @@ extern "C" { #undef __OPENMP_NVPTX__ #undef __CUDA__ +#pragma omp end declare variant + +#ifdef __AMDGCN__ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +// Import types which will be used by __clang_hip_libdevice_declares.h +#ifndef __cplusplus +#include +#include +#endif + +#define __OPENMP_AMDGCN__ +#pragma push_macro("__device__") +#define __device__ + +/// Include declarations for libdevice functions. +#include <__clang_hip_libdevice_declares.h> + +#pragma pop_macro("__device__") +#undef __OPENMP_AMDGCN__ + +#pragma omp end declare variant +#endif + #ifdef __cplusplus } // extern "C" #endif -#pragma omp end declare variant - // Ensure we make `_ZdlPv`, aka. `operator delete(void*)` available without the // need to `include ` in C++ mode. #ifdef __cplusplus diff --git a/lib/include/openmp_wrappers/cmath b/lib/include/openmp_wrappers/cmath index 1aff66af7d..22a720aca9 100644 --- a/lib/include/openmp_wrappers/cmath +++ b/lib/include/openmp_wrappers/cmath @@ -75,4 +75,58 @@ __DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); } #pragma omp end declare variant +#ifdef __AMDGCN__ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +#pragma push_macro("__constant__") +#define __constant__ __attribute__((constant)) +#define __OPENMP_AMDGCN__ + +#include <__clang_hip_cmath.h> + +#pragma pop_macro("__constant__") +#undef __OPENMP_AMDGCN__ + +// Define overloads otherwise which are absent +#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow)) + +__DEVICE__ float acos(float __x) { return ::acosf(__x); } +__DEVICE__ float acosh(float __x) { return ::acoshf(__x); } +__DEVICE__ float asin(float __x) { return ::asinf(__x); } +__DEVICE__ float asinh(float __x) { return ::asinhf(__x); } +__DEVICE__ float atan(float __x) { return ::atanf(__x); } +__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); } +__DEVICE__ float atanh(float __x) { return ::atanhf(__x); } +__DEVICE__ float cbrt(float __x) { return ::cbrtf(__x); } +__DEVICE__ float cosh(float __x) { return ::coshf(__x); } +__DEVICE__ float erf(float __x) { return ::erff(__x); } +__DEVICE__ float erfc(float __x) { return ::erfcf(__x); } +__DEVICE__ float exp2(float __x) { return ::exp2f(__x); } +__DEVICE__ float expm1(float __x) { return ::expm1f(__x); } +__DEVICE__ float fdim(float __x, float __y) { return ::fdimf(__x, __y); } +__DEVICE__ float hypot(float __x, float __y) { return ::hypotf(__x, __y); } +__DEVICE__ int ilogb(float __x) { return ::ilogbf(__x); } +__DEVICE__ float ldexp(float __arg, int __exp) { + return ::ldexpf(__arg, __exp); +} +__DEVICE__ float lgamma(float __x) { return ::lgammaf(__x); } +__DEVICE__ float log1p(float __x) { return ::log1pf(__x); } +__DEVICE__ float logb(float __x) { return ::logbf(__x); } +__DEVICE__ float nextafter(float __x, float __y) { + return ::nextafterf(__x, __y); +} +__DEVICE__ float remainder(float __x, float __y) { + return ::remainderf(__x, __y); +} +__DEVICE__ float scalbn(float __x, int __y) { return ::scalbnf(__x, __y); } +__DEVICE__ float sinh(float __x) { return ::sinhf(__x); } +__DEVICE__ float tan(float __x) { return ::tanf(__x); } +__DEVICE__ float tanh(float __x) { return ::tanhf(__x); } +__DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); } + +#undef __DEVICE__ + +#pragma omp end declare variant +#endif // __AMDGCN__ + #endif diff --git a/lib/include/openmp_wrappers/math.h b/lib/include/openmp_wrappers/math.h index c64af8b13e..1e3c07cfdb 100644 --- a/lib/include/openmp_wrappers/math.h +++ b/lib/include/openmp_wrappers/math.h @@ -48,4 +48,14 @@ #pragma omp end declare variant +#ifdef __AMDGCN__ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +#define __OPENMP_AMDGCN__ +#include <__clang_hip_math.h> +#undef __OPENMP_AMDGCN__ + +#pragma omp end declare variant +#endif + #endif diff --git a/lib/libcxx/include/cwctype b/lib/libcxx/include/cwctype index 17c68d6d45..27eea2f157 100644 --- a/lib/libcxx/include/cwctype +++ b/lib/libcxx/include/cwctype @@ -59,6 +59,7 @@ wctrans_t wctrans(const char* property); _LIBCPP_BEGIN_NAMESPACE_STD +#if defined(_LIBCPP_INCLUDED_C_LIBRARY_WCTYPE_H) using ::wint_t _LIBCPP_USING_IF_EXISTS; using ::wctrans_t _LIBCPP_USING_IF_EXISTS; using ::wctype_t _LIBCPP_USING_IF_EXISTS; @@ -80,6 +81,7 @@ using ::towlower _LIBCPP_USING_IF_EXISTS; using ::towupper _LIBCPP_USING_IF_EXISTS; using ::towctrans _LIBCPP_USING_IF_EXISTS; using ::wctrans _LIBCPP_USING_IF_EXISTS; +#endif // _LIBCPP_INCLUDED_C_LIBRARY_WCTYPE_H _LIBCPP_END_NAMESPACE_STD diff --git a/lib/libcxx/include/string b/lib/libcxx/include/string index 4940021b0c..4159ea5803 100644 --- a/lib/libcxx/include/string +++ b/lib/libcxx/include/string @@ -522,6 +522,7 @@ basic_string operator "" s( const char32_t *str, size_t len ); // C++1 #include #include #include // EOF +#include #include #include #include @@ -1714,6 +1715,24 @@ private: return data() <= __p && __p <= data() + size(); } + _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI + void __throw_length_error() const { +#ifndef _LIBCPP_NO_EXCEPTIONS + __basic_string_common::__throw_length_error(); +#else + _VSTD::abort(); +#endif + } + + _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI + void __throw_out_of_range() const { +#ifndef _LIBCPP_NO_EXCEPTIONS + __basic_string_common::__throw_out_of_range(); +#else + _VSTD::abort(); +#endif + } + friend basic_string operator+<>(const basic_string&, const basic_string&); friend basic_string operator+<>(const value_type*, const basic_string&); friend basic_string operator+<>(value_type, const basic_string&); diff --git a/lib/libcxx/include/vector b/lib/libcxx/include/vector index 9189ed44a8..90d8b946f1 100644 --- a/lib/libcxx/include/vector +++ b/lib/libcxx/include/vector @@ -281,6 +281,7 @@ erase_if(vector& c, Predicate pred); // C++20 #include #include #include +#include #include #include #include // for forward declaration of vector @@ -390,6 +391,25 @@ protected: is_nothrow_move_assignable::value) {__move_assign_alloc(__c, integral_constant());} + + _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI + void __throw_length_error() const { +#ifndef _LIBCPP_NO_EXCEPTIONS + __vector_base_common::__throw_length_error(); +#else + _VSTD::abort(); +#endif + } + + _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI + void __throw_out_of_range() const { +#ifndef _LIBCPP_NO_EXCEPTIONS + __vector_base_common::__throw_out_of_range(); +#else + _VSTD::abort(); +#endif + } + private: _LIBCPP_INLINE_VISIBILITY void __copy_assign_alloc(const __vector_base& __c, true_type) diff --git a/lib/libcxx/include/wctype.h b/lib/libcxx/include/wctype.h index 1b4b146149..3b614759ac 100644 --- a/lib/libcxx/include/wctype.h +++ b/lib/libcxx/include/wctype.h @@ -50,8 +50,18 @@ wctrans_t wctrans(const char* property); #pragma GCC system_header #endif +// TODO: +// In the future, we should unconditionally include_next here and instead +// have a mode under which the library does not need libc++'s or +// at all (i.e. a mode without wchar_t). As it stands, we need to do that to completely +// bypass the using declarations in when we did not include . +// Otherwise, a using declaration like `using ::wint_t` in will refer to +// nothing (with using_if_exists), and if we include another header that defines one +// of these declarations (e.g. ), the second `using ::wint_t` with using_if_exists +// will fail because it does not refer to the same declaration. #if __has_include_next() # include_next +# define _LIBCPP_INCLUDED_C_LIBRARY_WCTYPE_H #endif #ifdef __cplusplus diff --git a/lib/libcxxabi/src/cxa_personality.cpp b/lib/libcxxabi/src/cxa_personality.cpp index a4f81d7473..91b584eb8c 100644 --- a/lib/libcxxabi/src/cxa_personality.cpp +++ b/lib/libcxxabi/src/cxa_personality.cpp @@ -702,10 +702,10 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions, return; } landingPad = (uintptr_t)lpStart + landingPad; - results.landingPad = landingPad; #else // __USING_SJLJ_EXCEPTIONS__ ++landingPad; #endif // __USING_SJLJ_EXCEPTIONS__ + results.landingPad = landingPad; if (actionEntry == 0) { // Found a cleanup