#ifndef HPP_CUDA_UTILS_H
#define HPP_CUDA_UTILS_H

#include <cuda_runtime.h>
#include <cuComplex.h>

// Portable alignment attribute: nvcc provides __align__, host-only GCC/Clang
// builds use the attribute syntax instead.
#ifdef __CUDACC__
#define ALIGN(x) __align__(x)
#else
#define ALIGN(x) __attribute__ ((aligned (x)))
#endif

// Software double-precision atomicAdd for devices without native support
// (compute capability < 6.0). Canonical compare-and-swap loop from the CUDA C
// Programming Guide: retry until the value we based our sum on ('assumed') is
// still the value in memory ('old').
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__inline__ __device__ double atomicAdd(double* address, double val)
{
  unsigned long long int* address_as_ull =
      (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val +
                        __longlong_as_double(assumed)));
    // Comparison is on the 64-bit integer bit patterns, so the loop
    // terminates even for NaN payloads (NaN != NaN would not).
  } while (assumed != old);
  return __longlong_as_double(old);  // previous value, as atomicAdd requires
}
#endif
// Check the result of a CUDA runtime call, reporting the call site.
#define CUDA_CHK(ans) {cudaCheck((ans), __FILE__, __LINE__); }

// Print a diagnostic to stderr and throw std::runtime_error on any CUDA error.
// NOTE(review): the 'abort' flag is currently ignored — the function always
// throws when code != cudaSuccess; confirm whether a non-throwing mode was
// intended before wiring it up.
inline void cudaCheck(cudaError_t code, const char *file, int line,
                      bool abort=false){
  (void)abort;  // see NOTE above
  if (code != cudaSuccess){
    fprintf(stderr,
            "CUDA error at %s:%d -> %s\n", file, line, cudaGetErrorString(code));
    throw std::runtime_error("CUDA error. See previous message.");
  }
}
// shared_ptr deleter that releases device memory with cudaFree.
// NOTE(review): CUDA_CHK throws; throwing from a deleter running during stack
// unwinding calls std::terminate. A failing cudaFree usually indicates an
// unrecoverable context error, so this is tolerated here.
struct CudaFreeDelete {
  void operator()(void* x) {
    CUDA_CHK(cudaFree(x));
  }
};
// Wrap a raw device pointer in a shared_ptr that cudaFree's it on release.
template <typename T>
std::shared_ptr<T> cudaSharedPtr(T *devPtr) {
  return std::shared_ptr<T>(devPtr, CudaFreeDelete());
}
// Allocate device storage for a single T. Throws via CUDA_CHK on failure;
// caller owns the returned pointer (pair with cudaFree or cudaSharedPtr).
template <typename T>
T* allocDeviceMemory() {
  T *devPtr;
  CUDA_CHK(cudaMalloc((void**)&devPtr, sizeof(T)));
  return devPtr;
}
// Allocate device storage for a single T, owned by a self-freeing shared_ptr.
template <typename T>
std::shared_ptr<T> allocDeviceMemorySharedPtr() {
  return cudaSharedPtr(allocDeviceMemory<T>());
}
// Allocate device storage for an array of n elements of type T.
// Caller owns the returned pointer.
template <typename T>
T* allocDeviceMemory(size_t n) {
  T *devPtr;
  CUDA_CHK(cudaMalloc((void**)&devPtr, n*sizeof(T)));
  return devPtr;
}
// Allocate a device array of n elements, owned by a self-freeing shared_ptr.
// NOTE: the shared_ptr does not remember n; callers must track the length.
template <typename T>
std::shared_ptr<T> allocDeviceMemorySharedPtr(size_t n) {
  return cudaSharedPtr(allocDeviceMemory<T>(n));
}
// Allocate device storage for one T and copy hostVal into it (synchronous).
// Caller owns the returned device pointer.
template <typename T>
T *makeDeviceCopy(const T& hostVal) {
  T *devPtr = allocDeviceMemory<T>();
  CUDA_CHK(cudaMemcpy(devPtr, &hostVal, sizeof(T), cudaMemcpyHostToDevice));
  return devPtr;
}
// Allocate device storage for one T, copy hostVal into it, and return a
// self-freeing shared_ptr to the device copy.
template <typename T>
std::shared_ptr<T> makeDeviceCopySharedPtr(const T& hostVal) {
  std::shared_ptr<T> devPtr = allocDeviceMemorySharedPtr<T>();
  CUDA_CHK(cudaMemcpy(devPtr.get(), &hostVal, sizeof(T), cudaMemcpyHostToDevice));
  return devPtr;
}
// Same as makeDeviceCopySharedPtr, but the host value is given by pointer.
// hostPtr must point to a valid T on the host.
template <typename T>
std::shared_ptr<T> makeDeviceCopySharedPtrFromPtr(const T* hostPtr) {
  std::shared_ptr<T> devPtr = allocDeviceMemorySharedPtr<T>();
  CUDA_CHK(cudaMemcpy(devPtr.get(), hostPtr, sizeof(T), cudaMemcpyHostToDevice));
  return devPtr;
}
// Copy a single T back from the device and return it by value (synchronous).
template <typename T>
T getHostValue(const std::shared_ptr<T>& devPtr) {
  T hostVal;
  CUDA_CHK(cudaMemcpy((void*)&hostVal, devPtr.get(),
                      sizeof(T), cudaMemcpyDeviceToHost));
  return hostVal;
}
// Copy a single T back from a raw device pointer and return it by value.
template <typename T>
T getHostValue(T *devPtr) {
  T hostVal;
  CUDA_CHK(cudaMemcpy((void*)&hostVal, devPtr,
                      sizeof(T), cudaMemcpyDeviceToHost));
  return hostVal;
}
// Copy a single T from the device into caller-provided host storage.
// hostPtr must point to writable storage for one T.
template <typename T>
void copyToHost(const std::shared_ptr<T>& devPtr, T *hostPtr) {
  CUDA_CHK(cudaMemcpy((void*)hostPtr, devPtr.get(),
                      sizeof(T), cudaMemcpyDeviceToHost));
}
// Allocate a device array sized to vec and copy its contents (synchronous).
// Caller owns the returned pointer and must remember vec.size().
// A is the vector's allocator (supports aligned-allocator vectors).
template<typename T, typename A>
T *makeDeviceCopyVec(const std::vector<T,A>& vec) {
  size_t size = vec.size();
  size_t memSize = size*sizeof(T);
  T *devPtr;
  CUDA_CHK(cudaMalloc((void**)&devPtr, memSize));
  CUDA_CHK(cudaMemcpy(devPtr, vec.data(), memSize, cudaMemcpyHostToDevice));
  return devPtr;
}
// Copy vec into an already-allocated device buffer held by devPtr.
// Precondition: devPtr must point to at least vec.size() elements — this
// function does not (re)allocate.
template<typename T, typename A>
void copyVecToDeviceSharedPtr(const std::vector<T,A>& vec, std::shared_ptr<T>& devPtr) {
  size_t size = vec.size();
  size_t memSize = size*sizeof(T);
  CUDA_CHK(cudaMemcpy(devPtr.get(), vec.data(), memSize, cudaMemcpyHostToDevice));
}
// Device copy of vec, owned by a self-freeing shared_ptr (length not stored).
template<typename T, typename A>
std::shared_ptr<T> makeDeviceCopyVecSharedPtr(const std::vector<T,A>& vec) {
  return cudaSharedPtr(makeDeviceCopyVec(vec));
}
// Copy 'size' elements from the device buffer back into a new host vector.
// 'size' must not exceed the device allocation behind devPtr.
template <typename T>
std::vector<T> makeHostVecFromSharedPtr(std::shared_ptr<T>& devPtr, size_t size) {
  std::vector<T> vec(size);
  size_t memSize = size*sizeof(T);
  CUDA_CHK(cudaMemcpy(vec.data(), devPtr.get(), memSize, cudaMemcpyDeviceToHost));
  return vec;
}
// Duplicate an n-element device array with a device-to-device copy.
// Caller owns the returned pointer.
template <typename T>
T *makeDevCopyOfDevArray(T *devPtrIn, size_t n) {
  size_t memSize = n*sizeof(T);
  T *devPtrCopy;
  CUDA_CHK(cudaMalloc((void**)&devPtrCopy, memSize));
  CUDA_CHK(cudaMemcpy(devPtrCopy, devPtrIn, memSize, cudaMemcpyDeviceToDevice));
  return devPtrCopy;
}
199 struct CudaKernelConfig {
// Maximum number of warps that can be resident on one SM of the given device.
// Defined in the accompanying .cu/.cpp file.
unsigned int maxResidentWarps(const cudaDeviceProp& devProp);

// Pick a launch configuration for kernelPtr covering nThreads work items,
// chosen to maximise occupancy on the given device. Defined out-of-line.
CudaKernelConfig getKernelConfigMaxOccupancy(const cudaDeviceProp& devProp,
                                             const void *kernelPtr,
                                             unsigned int nThreads);

// Human-readable dump of a kernel configuration.
std::ostream& operator<<(std::ostream& out, const CudaKernelConfig& cfg);
// Fast device sine. NOTE(review): bodies were lost in extraction and are
// reconstructed — float presumably uses the hardware intrinsic __sinf, and
// double falls back to full-accuracy sin (no double intrinsic exists); confirm
// against the original source.
__inline__ __device__ float sinIntr(float x) {
  return __sinf(x);
}
__inline__ __device__ double sinIntr(double x) {
  return sin(x);
}
// Fast device cosine. NOTE(review): reconstructed bodies — float presumably
// __cosf, double full-accuracy cos; confirm against the original source.
__inline__ __device__ float cosIntr(float x) {
  return __cosf(x);
}
__inline__ __device__ double cosIntr(double x) {
  return cos(x);
}
// Simultaneous sine (*b) and cosine (*c) of a. NOTE(review): reconstructed —
// float presumably uses the fast intrinsic __sincosf, double the full-accuracy
// sincos; confirm against the original source.
__inline__ __device__ void sincosIntr(float a, float *b, float *c) {
  __sincosf(a, b, c);
}
__inline__ __device__ void sincosIntr(double a, double *b, double *c) {
  sincos(a, b, c);
}
// Full-accuracy simultaneous sine (*b) and cosine (*c) of a.
// NOTE(review): reconstructed — presumably sincosf/sincos; confirm.
__inline__ __device__ void sincosFull(float a, float *b, float *c) {
  sincosf(a, b, c);
}
__inline__ __device__ void sincosFull(double a, double *b, double *c) {
  sincos(a, b, c);
}
// Fast device power a^b. NOTE(review): reconstructed — float presumably the
// fast intrinsic __powf, double full-accuracy pow; confirm.
__inline__ __device__ float powIntr(float a, float b) {
  return __powf(a, b);
}
__inline__ __device__ double powIntr(double a, double b) {
  return pow(a, b);
}
// Full-accuracy device power a^b. NOTE(review): reconstructed — presumably
// powf/pow; confirm against the original source.
__inline__ __device__ float powFull(float a, float b) {
  return powf(a, b);
}
__inline__ __device__ double powFull(double a, double b) {
  return pow(a, b);
}
// Fused multiply-add x*y+z in a single instruction.
// NOTE(review): both overloads use the round-toward-minus-infinity variants
// (__fmaf_rd/__fma_rd). If round-to-nearest was intended, these should be
// __fmaf_rn/__fma_rn — preserved as written pending confirmation.
__inline__ __device__ float fmaIntr(float x, float y, float z) {
  return __fmaf_rd(x,y,z);
}
__inline__ __device__ double fmaIntr(double x, double y, double z) {
  return __fma_rd(x,y,z);
}
// Device square root. NOTE(review): reconstructed bodies — float presumably
// the single-precision intrinsic __fsqrt_rn, double full-accuracy sqrt;
// confirm against the original source.
__inline__ __device__ float sqrtIntr(float x) {
  return __fsqrt_rn(x);
}
__inline__ __device__ double sqrtIntr(double x) {
  return sqrt(x);
}
// Fast device exponential. NOTE(review): reconstructed — float presumably the
// fast intrinsic __expf, double full-accuracy exp; confirm.
__inline__ __device__ float expIntr(float x) {
  return __expf(x);
}
__inline__ __device__ double expIntr(double x) {
  return exp(x);
}
// Type trait mapping a real scalar type to its cuComplex counterpart:
// cuTypes<float>::complex == cuFloatComplex,
// cuTypes<double>::complex == cuDoubleComplex.
template <typename T>
class cuTypes;

template <>
class cuTypes<float> {
public:
  typedef cuFloatComplex complex;
};

template <>
class cuTypes<double> {
public:
  typedef cuDoubleComplex complex;
};
// Overloaded constructors so generic code can build a complex from (re, im)
// without naming the precision-specific factory.
__inline__ __host__ __device__ cuFloatComplex make_cuComplex(float x, float y) {
  return make_cuFloatComplex(x,y);
}
__inline__ __host__ __device__ cuDoubleComplex make_cuComplex(double x, double y) {
  return make_cuDoubleComplex(x,y);
}
// Single-precision overload of cuConj (cuComplex.h only names the float
// version cuConjf). NOTE(review): body reconstructed — presumably forwards to
// cuConjf; confirm against the original source.
__inline__ __host__ __device__ cuFloatComplex cuConj(cuFloatComplex z) {
  return cuConjf(z);
}
// Complex exponential: exp(x+iy) = e^x (cos y + i sin y), with the sin/cos
// pair computed by the sincosIntr overloads above.
__inline__ __device__ cuFloatComplex expIntr(cuFloatComplex z) {
  float expx = expf(z.x);
  float sy, cy;
  sincosIntr(z.y, &sy, &cy);
  return make_cuComplex(expx*cy, expx*sy);
}
__inline__ __device__ cuDoubleComplex expIntr(cuDoubleComplex z) {
  double expx = exp(z.x);
  double sy, cy;
  sincosIntr(z.y, &sy, &cy);
  return make_cuComplex(expx*cy, expx*sy);
}
// Arithmetic operators forwarding to the cuComplex.h helpers so complex
// expressions read naturally in templated kernels.
__inline__ __host__ __device__ cuFloatComplex
operator*(cuFloatComplex z, cuFloatComplex w) {
  return cuCmulf(z, w);
}
__inline__ __host__ __device__ cuDoubleComplex
operator*(cuDoubleComplex z, cuDoubleComplex w) {
  return cuCmul(z, w);
}
__inline__ __host__ __device__ cuFloatComplex
operator+(cuFloatComplex z, cuFloatComplex w) {
  return cuCaddf(z, w);
}
__inline__ __host__ __device__ cuDoubleComplex
operator+(cuDoubleComplex z, cuDoubleComplex w) {
  return cuCadd(z, w);
}
// Mixed complex/real-scalar operators; T selects the precision through the
// cuTypes trait, so these apply to both cuFloatComplex and cuDoubleComplex.
template <typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex
operator/(const typename cuTypes<T>::complex z, T a) {
  return make_cuComplex(z.x/a, z.y/a);
}

template <typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex
operator*(T a, const typename cuTypes<T>::complex z) {
  return make_cuComplex(a*z.x, a*z.y);
}

// NOTE(review): body reconstructed — presumably the mirror of the overload
// above; confirm against the original source.
template <typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex
operator*(const typename cuTypes<T>::complex z, T a) {
  return make_cuComplex(z.x*a, z.y*a);
}
// Subtract/add a real scalar: only the real part changes.
template <typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex
operator-(const typename cuTypes<T>::complex& z, T a) {
  return make_cuComplex(z.x-a, z.y);
}

template <typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex
operator+(const typename cuTypes<T>::complex& z, T a) {
  return make_cuComplex(z.x+a, z.y);
}
// Unary negation: negate both components.
__inline__ __host__ __device__ cuFloatComplex
operator-(const cuFloatComplex& z) {
  return make_cuComplex(-z.x, -z.y);
}
__inline__ __host__ __device__ cuDoubleComplex
operator-(const cuDoubleComplex& z) {
  return make_cuComplex(-z.x, -z.y);
}
// Truncate a float to an unsigned int by direct mantissa manipulation.
// NOTE(review): the function is partially reconstructed — no handling is
// visible for x < 1 (exponent underflow) or exponent > 23; confirm the
// original's preconditions before relying on those cases.
__inline__ __device__ unsigned int cvtFloatToUint(float x) {
  // Bit-level reinterpret via the CUDA intrinsic; the original used a
  // pointer cast (*((int*)&x)), which violates strict aliasing.
  unsigned int raw = __float_as_uint(x);
  // Unbiased exponent: mask 2139095040 == 0x7F800000, bias 127.
  unsigned int exponent = ((raw&2139095040)>>23)-127;
  // Mantissa bits: mask 8388607 == 0x007FFFFF.
  unsigned int significand = raw&8388607;
  // Restore the implicit leading 1 (8388608 == 0x00800000).
  significand |= 8388608;
  // Shift the binary point into place: floor(|x|) for 0 <= exponent <= 23.
  unsigned int val = significand>>(23-exponent);
  return val;
}
// Integer floor(log2(val)); returns 0 for val in {0, 1}.
__inline__ __host__ __device__ unsigned int log2u(unsigned int val) {
  unsigned int output = 0;
  // Count how many times val can be halved before reaching zero.
  while (val >>= 1) ++output;
  return output;
}
// Butterfly sum across the 32 lanes of a warp; lane 0 ends up holding the
// warp total. Assumes all 32 lanes are active (full mask).
template <typename T>
__inline__ __device__ T warpReduceSum(T val) {
  const int warpSize = 32;
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    // Fix: legacy mask-less __shfl was removed for Volta+; use the _sync
    // variant with an explicit full-warp participant mask.
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}
// Sum 'val' across a thread block (canonical two-stage warp-shuffle
// reduction). Thread 0 ends up with the block total. Assumes blockDim.x is a
// multiple of 32 and at most 1024.
template <typename T>
__inline__ __device__ T blockReduceSum(T val) {
  const int warpSize = 32;
  static __shared__ T shared[warpSize];  // one partial per warp (<=32 warps)
  int lane = threadIdx.x % warpSize;
  int wid = threadIdx.x / warpSize;

  val = warpReduceSum(val);              // stage 1: each warp reduces itself

  if (lane==0) shared[wid]=val;          // warp leaders publish partials
  __syncthreads();                       // all partials visible before reading

  // Stage 2: the first warp gathers the per-warp partials (zero-filled for
  // warps that don't exist) and reduces them.
  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
  if (wid==0) val = warpReduceSum(val);
  return val;
}
// Grid-stride partial-sum kernel: each block writes one partial sum of
// in[0..N) to out[blockIdx.x]; out must hold gridDim.x elements. Launch a
// second pass (or reduce on the host) to combine the partials.
template <typename T>
__global__
void BLOCK_REDUCE_KEPLER(T *in, T* out, int N) {
  T sum = 0;
  // Grid-stride loop: correct for any grid size, including a single block.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i<N; i += blockDim.x * gridDim.x) {
    sum += in[i];
  }
  sum = blockReduceSum(sum);
  if (threadIdx.x==0) {
    out[blockIdx.x] = sum;
  }
}
// Currently used device memory (bytes) on the active device, as reported by
// cudaMemGetInfo (total minus free).
inline size_t getUsedMemoryBytes() {
  size_t freeBytes, totalBytes;
  CUDA_CHK(cudaMemGetInfo(&freeBytes, &totalBytes));
  size_t usedBytes = totalBytes-freeBytes;
  return usedBytes;
}
// Used device memory in gigabytes (decimal SI: 1 GB = 1e9 bytes).
// Fix: previously divided by 1024^3, which is the binary GiB conversion —
// the divisors in getUsedMemoryGB/getUsedMemoryGiB were swapped.
inline double getUsedMemoryGB() {
  size_t usedBytes = getUsedMemoryBytes();
  double usedGB = ((double)usedBytes)/(1000.0*1000.0*1000.0);
  return usedGB;
}
// Used device memory in gibibytes (binary: 1 GiB = 1024^3 bytes).
// Fix: previously divided by 1000^3, which is the decimal GB conversion —
// the divisors in getUsedMemoryGB/getUsedMemoryGiB were swapped.
inline double getUsedMemoryGiB() {
  size_t usedBytes = getUsedMemoryBytes();
  double usedGiB = ((double)usedBytes)/(1024.0*1024.0*1024.0);
  return usedGiB;
}
// Core clock rate of the given device in GHz.
// cudaDeviceProp::clockRate is reported in kHz, hence the 1e6 divisor.
inline double getClockRateGHz(int deviceID) {
  cudaDeviceProp devProp;
  CUDA_CHK(cudaGetDeviceProperties(&devProp, deviceID));
  return ((double)devProp.clockRate)/1000000.0;
}
Definition: casesUtils.cpp:4
std::vector< T > operator*(const std::vector< T > &vec, const T scalar)
Definition: tensor.h:72
std::ostream & operator<<(std::ostream &out, const EulerAngles< T > &angles)
Definition: rotation.h:92
GSHCoeffs< T > operator/(const GSHCoeffs< T > &coeffs, const T val)
Definition: gsh.h:236
std::vector< T > operator-(const std::vector< T > &vec1, const std::vector< T > &vec2)
Definition: tensor.h:110
GSHCoeffs< T > operator+(const GSHCoeffs< T > &coeffs1, const GSHCoeffs< T > &coeffs2)
Definition: gsh.h:210