High Performance Plasticity  0.5.0
cudaUtils.h
Go to the documentation of this file.
1 
7 #ifndef HPP_CUDA_UTILS_H
8 #define HPP_CUDA_UTILS_H
9 
10 #include <cuda.h>
11 #include <cuda_runtime.h>
12 #include <math.h>
13 #include <memory>
14 #include <iostream>
15 #include <stdio.h>
16 #include <stdexcept>
17 #include <vector>
18 #include <cuComplex.h>
19 #include <hpp/config.h>
20 
// Aligned memory
// ALIGN(x): portable x-byte alignment specifier for struct/variable
// declarations. Uses __align__ when compiling with nvcc,
// __attribute__((aligned)) under GCC-compatible host compilers, and expands
// to nothing otherwise (alignment then falls back to the compiler default).
#ifdef __CUDACC__
    #define ALIGN(x) __align__(x)
#else
    #if defined(__GNUC__)
        #define ALIGN(x) __attribute__ ((aligned (x)))
    #else
        #define ALIGN(x)
    #endif
#endif
31 
// Atomic add for doubles on CC < 6.0
// Hardware double-precision atomicAdd only exists from compute capability
// 6.0 onward; on older devices it is emulated with a 64-bit
// compare-and-swap loop (the implementation recommended in the CUDA C
// Programming Guide). Returns the value stored at *address before the add,
// matching the built-in atomicAdd contract.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__inline__ __device__ double atomicAdd(double* address, double val)
{
    // Reinterpret the target as a 64-bit integer so atomicCAS can be used
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;

    do {
        assumed = old;
        // Try to swap in (assumed + val); if another thread changed the
        // value in the meantime, atomicCAS returns the new value and we retry
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);

    return __longlong_as_double(old);
}
#endif
52 
53 namespace hpp
54 {
55 #ifdef HPP_USE_CUDA
56 
// Based on http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// CUDA_CHK wraps a CUDA runtime call and reports any failure with file/line
// context. The do{...}while(0) wrapper makes the macro behave as a single
// statement, so `if (cond) CUDA_CHK(...); else ...` compiles correctly
// (the previous bare-brace form left a stray `;` that broke unbraced if/else).
#define CUDA_CHK(ans) do {cudaCheck((ans), __FILE__, __LINE__); } while (0)

/**
 * @brief Check a CUDA runtime return code and report errors.
 * @param code  return value of a CUDA runtime API call
 * @param file  source file of the call (supplied by CUDA_CHK)
 * @param line  source line of the call (supplied by CUDA_CHK)
 * @param abort if true, throw std::runtime_error after reporting;
 *              by default the error is only printed to stderr
 */
inline void cudaCheck(cudaError_t code, const char *file, int line, bool abort=false){
    if (code != cudaSuccess){
        fprintf(stderr,"CUDA error at %s:%d -> %s\n", file, line, cudaGetErrorString(code));
        // Clear the sticky error state so subsequent calls don't re-report it
        cudaGetLastError();
        if (abort) {
            throw std::runtime_error("CUDA error. See previous message.");
        }
    }
}
68 
/**
 * @brief Deleter functor for smart pointers that own cudaMalloc'd device
 * memory; releases the allocation through cudaFree.
 */
struct CudaFreeDelete {
    void operator()(void* devMem) {
        CUDA_CHK(cudaFree(devMem));
    }
};
81 
/**
 * @brief Wrap a raw device pointer in a std::shared_ptr whose deleter
 * calls cudaFree, so the allocation is released with the last owner.
 */
template <typename T>
std::shared_ptr<T> cudaSharedPtr(T *devPtr) {
    CudaFreeDelete deleter;
    return std::shared_ptr<T>(devPtr, deleter);
}
86 
/**
 * @brief Allocate device memory for a single value of type T.
 * @return raw device pointer; caller owns it (see cudaSharedPtr for RAII)
 */
template <typename T>
T* allocDeviceMemory() {
    T *ptr = nullptr;
    CUDA_CHK(cudaMalloc((void**)&ptr, sizeof(T)));
    return ptr;
}
93 
/**
 * @brief Allocate device memory for a single T, owned by a shared_ptr
 * that frees it with cudaFree.
 */
template <typename T>
std::shared_ptr<T> allocDeviceMemorySharedPtr() {
    T *rawPtr = allocDeviceMemory<T>();
    return cudaSharedPtr(rawPtr);
}
102 
/**
 * @brief Allocate device memory for an array of n values of type T.
 * @param n number of elements
 * @return raw device pointer; caller owns it
 */
template <typename T>
T* allocDeviceMemory(size_t n) {
    size_t nBytes = n*sizeof(T);
    T *ptr = nullptr;
    CUDA_CHK(cudaMalloc((void**)&ptr, nBytes));
    return ptr;
}
109 
/**
 * @brief Allocate device memory for n values of type T, owned by a
 * shared_ptr that frees it with cudaFree.
 */
template <typename T>
std::shared_ptr<T> allocDeviceMemorySharedPtr(size_t n) {
    T *rawPtr = allocDeviceMemory<T>(n);
    return cudaSharedPtr(rawPtr);
}
114 
/**
 * @brief Allocate device memory and copy a single host value into it.
 * @param hostVal value to copy to the device
 * @return raw device pointer to the copy; caller owns it
 */
template <typename T>
T *makeDeviceCopy(const T& hostVal) {
    T *devDst = allocDeviceMemory<T>();
    const void *hostSrc = (const void*)&hostVal;
    CUDA_CHK(cudaMemcpy(devDst, hostSrc, sizeof(T), cudaMemcpyHostToDevice));
    return devDst;
}
121 
/**
 * @brief Allocate device memory, copy a single host value into it, and
 * return an owning shared_ptr.
 */
template <typename T>
std::shared_ptr<T> makeDeviceCopySharedPtr(const T& hostVal) {
    auto devDst = allocDeviceMemorySharedPtr<T>();
    CUDA_CHK(cudaMemcpy(devDst.get(), &hostVal, sizeof(T), cudaMemcpyHostToDevice));
    return devDst;
}
128 
/**
 * @brief Allocate device memory and copy one value of type T from the given
 * host pointer into it, returning an owning shared_ptr.
 */
template <typename T>
std::shared_ptr<T> makeDeviceCopySharedPtrFromPtr(const T* hostPtr) {
    auto devDst = allocDeviceMemorySharedPtr<T>();
    CUDA_CHK(cudaMemcpy(devDst.get(), hostPtr, sizeof(T), cudaMemcpyHostToDevice));
    return devDst;
}
135 
/**
 * @brief Copy a single value of type T from device memory back to the host.
 * @param devPtr owning pointer to the device value
 * @return the value, copied to the host
 */
template <typename T>
T getHostValue(const std::shared_ptr<T>& devPtr) {
    T result;
    void *dst = (void*)&result;
    CUDA_CHK(cudaMemcpy(dst, devPtr.get(), sizeof(T), cudaMemcpyDeviceToHost));
    return result;
}
147 
/**
 * @brief Copy a single value of type T from device memory back to the host.
 * @param devPtr raw device pointer to the value
 * @return the value, copied to the host
 */
template <typename T>
T getHostValue(T *devPtr) {
    T result;
    void *dst = (void*)&result;
    CUDA_CHK(cudaMemcpy(dst, devPtr, sizeof(T), cudaMemcpyDeviceToHost));
    return result;
}
154 
/**
 * @brief Copy a single value of type T from the device into caller-provided
 * host storage.
 * @param devPtr owning pointer to the device value
 * @param hostPtr host destination (must point to valid storage for one T)
 */
template <typename T>
void copyToHost(const std::shared_ptr<T>& devPtr, T *hostPtr) {
    void *dst = (void*)hostPtr;
    CUDA_CHK(cudaMemcpy(dst, devPtr.get(), sizeof(T), cudaMemcpyDeviceToHost));
}
159 
/**
 * @brief Allocate a device array and copy the contents of a host vector
 * into it.
 * @param vec host vector to copy
 * @return raw device pointer to the array copy; caller owns it
 */
template<typename T, typename A>
T *makeDeviceCopyVec(const std::vector<T,A>& vec) {
    const size_t nBytes = vec.size()*sizeof(T);
    T *devArr = nullptr;
    CUDA_CHK(cudaMalloc((void**)&devArr, nBytes));
    CUDA_CHK(cudaMemcpy(devArr, vec.data(), nBytes, cudaMemcpyHostToDevice));
    return devArr;
}
169 
/**
 * @brief Copy the contents of a host vector into an already-allocated
 * device array.
 * @param vec    host source vector
 * @param devPtr device destination; must hold at least vec.size() elements
 */
template<typename T, typename A>
void copyVecToDeviceSharedPtr(const std::vector<T,A>& vec, std::shared_ptr<T>& devPtr) {
    const size_t nBytes = vec.size()*sizeof(T);
    CUDA_CHK(cudaMemcpy(devPtr.get(), vec.data(), nBytes, cudaMemcpyHostToDevice));
}
176 
/**
 * @brief Allocate a device array, copy a host vector into it, and return an
 * owning shared_ptr.
 */
template<typename T, typename A>
std::shared_ptr<T> makeDeviceCopyVecSharedPtr(const std::vector<T,A>& vec) {
    T *rawCopy = makeDeviceCopyVec(vec);
    return cudaSharedPtr(rawCopy);
}
181 
/**
 * @brief Copy `size` elements of a device array back into a new host vector.
 * @param devPtr owning pointer to the device array (taken by const reference
 *        — the function only reads it; the previous non-const reference
 *        rejected const pointers and temporaries for no reason)
 * @param size number of elements of type T to copy
 * @return host vector containing the copied data
 */
template<typename T>
std::vector<T> makeHostVecFromSharedPtr(const std::shared_ptr<T>& devPtr, size_t size) {
    std::vector<T> vec(size);
    size_t memSize = size*sizeof(T);
    CUDA_CHK(cudaMemcpy(vec.data(), devPtr.get(), memSize, cudaMemcpyDeviceToHost));
    return vec;
}
189 
/**
 * @brief Duplicate an n-element device array with a device-to-device copy.
 * @param devPtrIn source device array
 * @param n number of elements of type T
 * @return raw device pointer to the new copy; caller owns it
 */
template <typename T>
T *makeDevCopyOfDevArray(T *devPtrIn, size_t n) {
    const size_t nBytes = n*sizeof(T);
    T *devPtrOut = nullptr;
    CUDA_CHK(cudaMalloc((void**)&devPtrOut, nBytes));
    CUDA_CHK(cudaMemcpy(devPtrOut, devPtrIn, nBytes, cudaMemcpyDeviceToDevice));
    return devPtrOut;
}
198 
/**
 * @brief A kernel launch configuration: block dimensions, grid dimensions,
 * and the theoretical occupancy the configuration achieves.
 */
struct CudaKernelConfig {
    dim3 dB = {0,0,0};      // block dimensions
    dim3 dG = {0,0,0};      // grid dimensions
    float occupancy = 0.0f; // theoretical occupancy; was left uninitialized
};
204 
// Maximum number of warps that can be simultaneously resident on one
// multiprocessor of the given device (defined elsewhere in the project).
unsigned int maxResidentWarps(const cudaDeviceProp& devProp);

// Pick a block/grid configuration covering nThreads threads for the kernel at
// kernelPtr that maximizes theoretical occupancy on the given device
// (defined elsewhere in the project).
CudaKernelConfig getKernelConfigMaxOccupancy(const cudaDeviceProp& devProp, const void *kernelPtr, unsigned int nThreads);

// Pretty-print a kernel configuration (defined elsewhere in the project).
std::ostream& operator<<(std::ostream& out, const CudaKernelConfig& cfg);
210 
// Mathematical function wrappers
// For functionIntr, only calls it if it exists, otherwise defaults
// to non-intrinsic version

// sin: fast __sinf intrinsic for float (reduced accuracy); standard sin for
// double, where no fast intrinsic exists.
__inline__ __device__ float sinIntr(float x) {
    return __sinf(x);
}
__inline__ __device__ double sinIntr(double x) {
    return sin(x);
}
221 
// cos: fast __cosf intrinsic for float (reduced accuracy); standard cos for
// double, where no fast intrinsic exists.
__inline__ __device__ float cosIntr(float x) {
    return __cosf(x);
}
__inline__ __device__ double cosIntr(double x) {
    return cos(x);
}
228 
// Simultaneous sin and cos, written to *b (sin) and *c (cos): fast
// __sincosf intrinsic for float; standard sincos for double.
__inline__ __device__ void sincosIntr(float a, float *b, float *c) {
    __sincosf(a,b,c);
}
__inline__ __device__ void sincosIntr(double a, double *b, double *c) {
    sincos(a,b,c);
}
235 
// Full-accuracy simultaneous sin and cos for both precisions, written to
// *b (sin) and *c (cos); no fast-intrinsic shortcut is taken.
__inline__ __device__ void sincosFull(float a, float *b, float *c) {
    sincosf(a,b,c);
}
__inline__ __device__ void sincosFull(double a, double *b, double *c) {
    sincos(a,b,c);
}
242 
// a^b: fast __powf intrinsic for float (reduced accuracy); standard pow for
// double, where no fast intrinsic exists.
__inline__ __device__ float powIntr(float a, float b) {
    return __powf(a,b);
}
__inline__ __device__ double powIntr(double a, double b) {
    return pow(a,b);
}
249 
// Full-accuracy a^b for both precisions (powf/pow; no intrinsic shortcut).
__inline__ __device__ float powFull(float a, float b) {
    return powf(a,b);
}
__inline__ __device__ double powFull(double a, double b) {
    return pow(a,b);
}
256 
// Fused multiply-add: x*y + z in a single rounding step.
// Uses the round-to-nearest-even variants (__fmaf_rn/__fma_rn), which match
// the behavior of the standard fmaf()/fma() functions. The previous
// __fmaf_rd/__fma_rd variants round toward negative infinity — a directed
// rounding mode that biases results downward and was almost certainly a typo.
__inline__ __device__ float fmaIntr(float x, float y, float z) {
    return __fmaf_rn(x,y,z);
}
__inline__ __device__ double fmaIntr(double x, double y, double z) {
    return __fma_rn(x,y,z);
}
263 
// Square root for both precisions. Uses the standard sqrtf/sqrt functions
// (no reduced-accuracy intrinsic is selected here).
__inline__ __device__ float sqrtIntr(float x) {
    return sqrtf(x);
}
__inline__ __device__ double sqrtIntr(double x) {
    return sqrt(x);
}
270 
// e^x: fast __expf intrinsic for float (reduced accuracy); standard exp for
// double, where no fast intrinsic exists.
__inline__ __device__ float expIntr(float x) {
    return __expf(x);
}

__inline__ __device__ double expIntr(double x) {
    return exp(x);
}
278 
// Support for CUDA derived types based on underlying scalar type.
// cuTypes<T>::complex maps a real scalar type to the matching CUDA complex
// type. The primary template's typedef is private (class members default to
// private), so using cuTypes<T>::complex with any T other than float/double
// fails at compile time; only the public specializations below are usable.
template <typename T>
class cuTypes {
    typedef int complex;
};

template <>
class cuTypes<float>
{
    public:
    typedef cuFloatComplex complex;
};

template <>
class cuTypes<double>
{
    public:
    typedef cuDoubleComplex complex;
};
298 
// Complex number support

// Overloads of make_cuComplex so generic code can construct the correct
// complex type from either float or double real/imaginary components.
__inline__ __host__ __device__ cuFloatComplex make_cuComplex(float x, float y) {
    return make_cuFloatComplex(x,y);
}

__inline__ __host__ __device__ cuDoubleComplex make_cuComplex(double x, double y) {
    return make_cuDoubleComplex(x,y);
}
307 
// Float overload of cuConj (the float version in cuComplex.h is named
// cuConjf), so generic code can call cuConj for both precisions.
__inline__ __host__ __device__ cuFloatComplex cuConj(cuFloatComplex z) {
    return cuConjf(z);
}
311 
// Complex exponential exp(z) for z = x + iy: e^x * (cos y + i sin y).
// Uses sincosIntr, so the float version inherits the reduced accuracy of the
// fast __sincosf intrinsic.
__inline__ __device__ cuFloatComplex expIntr(cuFloatComplex z) {
    float sinY, cosY;
    sincosIntr(z.y, &sinY, &cosY);
    const float magnitude = expf(z.x);
    return make_cuComplex(magnitude*cosY, magnitude*sinY);
}

__inline__ __device__ cuDoubleComplex expIntr(cuDoubleComplex z) {
    double sinY, cosY;
    sincosIntr(z.y, &sinY, &cosY);
    const double magnitude = exp(z.x);
    return make_cuComplex(magnitude*cosY, magnitude*sinY);
}
325 
// Complex * complex, dispatching to the precision-correct cuComplex helper.
__inline__ __host__ __device__ cuFloatComplex operator*(cuFloatComplex z, cuFloatComplex w) {
    return cuCmulf(z, w);
}

__inline__ __host__ __device__ cuDoubleComplex operator*(cuDoubleComplex z, cuDoubleComplex w) {
    return cuCmul(z, w);
}
333 
// Complex + complex, dispatching to the precision-correct cuComplex helper.
__inline__ __host__ __device__ cuFloatComplex operator+(cuFloatComplex z, cuFloatComplex w) {
    return cuCaddf(z, w);
}

__inline__ __host__ __device__ cuDoubleComplex operator+(cuDoubleComplex z, cuDoubleComplex w) {
    return cuCadd(z, w);
}
341 
// Mixed complex/scalar operators parameterized on the scalar type T.
// Note: `typename cuTypes<T>::complex` is a non-deduced context, so T is
// deduced solely from the scalar operand `a`; the complex operand must then
// match cuTypes<T>::complex exactly.

// Component-wise division of a complex by a real scalar.
template<typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex operator/(const typename cuTypes<T>::complex z, T a) {
    return make_cuComplex(z.x/a, z.y/a);
}

// Scaling of a complex by a real scalar (scalar on the left).
template<typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex operator*(T a, const typename cuTypes<T>::complex z) {
    return make_cuComplex(a*z.x, a*z.y);
}

// Scaling of a complex by a real scalar (scalar on the right).
template<typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex operator*(const typename cuTypes<T>::complex z, T a) {
    return a*z;
}

// Subtraction of a real scalar: only the real part is affected.
template<typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex operator-(const typename cuTypes<T>::complex& z, T a) {
    return make_cuComplex(z.x-a, z.y);
}

// Addition of a real scalar: only the real part is affected.
template<typename T>
__inline__ __host__ __device__ typename cuTypes<T>::complex operator+(const typename cuTypes<T>::complex& z, T a) {
    return make_cuComplex(z.x+a, z.y);
}
366 
// Unary negation for both complex precisions (negates both components).
__inline__ __host__ __device__ cuFloatComplex operator-(const cuFloatComplex& z) {
    return make_cuComplex(-z.x, -z.y);
}

__inline__ __host__ __device__ cuDoubleComplex operator-(const cuDoubleComplex& z) {
    return make_cuComplex(-z.x, -z.y);
}
374 
/**
 * @brief Truncate a positive float to an unsigned int by direct IEEE-754
 * bit manipulation.
 *
 * Only valid for 1.0f <= x < 2^24, i.e. unbiased exponent in [0, 23];
 * otherwise the shift below is out of range and the result is meaningless.
 * NOTE(review): zero, negative, subnormal, inf and NaN inputs are not
 * handled — confirm callers guarantee the precondition.
 */
__inline__ __device__ unsigned int cvtFloatToUint(float x) {
    // Reinterpret the float's bit pattern via the device intrinsic
    // (the previous *((int*)&x) cast violated strict aliasing)
    unsigned int raw = __float_as_uint(x);
    // Unbiased exponent: bits 23..30 (mask 0x7F800000 == 2139095040),
    // minus the IEEE-754 single-precision bias of 127
    unsigned int exponent = ((raw & 0x7F800000u) >> 23) - 127u;
    // Significand: low 23 bits (mask 0x7FFFFF == 8388607)...
    unsigned int significand = raw & 0x007FFFFFu;
    // ...with the implicit leading 1 (0x800000 == 8388608) restored
    significand |= 0x00800000u;
    // value = significand * 2^(exponent-23); truncation via right shift
    unsigned int val = significand >> (23 - exponent);
    return val;
}
// floor(log2(val)) for val >= 1, computed by counting right shifts until the
// value collapses to 1. Returns 0 for val == 0 as well (no error signalled).
__inline__ __host__ __device__ unsigned int log2u(unsigned int val) {
    unsigned int result = 0;
    for (; val > 1; val >>= 1) {
        ++result;
    }
    return result;
}
399 
/**
 * @brief Sum `val` across all 32 lanes of a warp using shuffle-down steps.
 *
 * After the loop, lane 0 holds the full warp sum (other lanes hold partial
 * sums). Assumes all 32 lanes of the warp are active (full participant mask).
 */
template <typename T>
__inline__ __device__ T warpReduceSum(T val) {
    const int warpSize = 32;
    // Full-warp participant mask. The legacy mask-less __shfl_down used
    // previously was removed for Volta+ (CUDA 9); with independent thread
    // scheduling the _sync form with an explicit mask is required.
    const unsigned int fullMask = 0xffffffffu;
    for (int offset = warpSize/2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(fullMask, val, offset);
    }
    return val;
}
415 
/**
 * @brief Sum `val` across all threads of the block.
 *
 * Each warp reduces its values with warpReduceSum; lane 0 of each warp
 * writes its partial into shared memory; warp 0 then reduces the partials.
 * Only thread 0's return value is the full block sum. Requires
 * blockDim.x <= 1024 (at most 32 warps of partials).
 * NOTE(review): launch with blockDim.x a multiple of 32 — warpReduceSum
 * assumes a full warp, so a partial final warp's behavior is undefined.
 */
template <typename T>
__inline__ __device__ T blockReduceSum(T val) {
    const int warpSize = 32;
    static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    // Number of (possibly partial) warps in the block. The previous
    // truncating blockDim.x / warpSize dropped the last partial warp's sum
    // and returned 0 for blocks smaller than one warp.
    int nWarps = (blockDim.x + warpSize - 1) / warpSize;

    val = warpReduceSum(val); // Each warp performs partial reduction

    if (lane==0) shared[wid]=val; // Write reduced value to shared memory

    __syncthreads(); // Wait for all partial reductions

    // Read a partial back only if the corresponding warp existed
    val = (threadIdx.x < nWarps) ? shared[lane] : (T)0;

    if (wid==0) val = warpReduceSum(val); // Final reduce within first warp

    return val;
}
441 
/**
 * @brief Sum-reduce the N-element device array `in`, writing one partial sum
 * per block into `out` (so `out` must have at least gridDim.x elements).
 *
 * Uses a grid-stride loop, so any launch configuration covers all N inputs.
 * A second pass (e.g. a single-block launch over `out`) is needed to combine
 * the per-block partials into the final sum. NOTE(review): blockReduceSum
 * assumes blockDim.x <= 1024 — confirm launch configurations respect this.
 */
template <typename T>
__global__ void BLOCK_REDUCE_KEPLER(T *in, T* out, int N) {
    T sum = (T)0.0;
    //reduce multiple elements per thread
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i<N; i += blockDim.x * gridDim.x) {
        sum += in[i];
    }
    sum = blockReduceSum(sum);
    if (threadIdx.x==0) {
        out[blockIdx.x]=sum;
    }
}
461 
/**
 * @brief Currently used device memory, in bytes, as reported by
 * cudaMemGetInfo (total minus free on the current device).
 */
inline size_t getUsedMemoryBytes() {
    size_t freeBytes = 0;
    size_t totalBytes = 0;
    CUDA_CHK(cudaMemGetInfo(&freeBytes, &totalBytes));
    return totalBytes - freeBytes;
}
469 
/**
 * @brief Currently used device memory in decimal gigabytes (1 GB = 1000^3
 * bytes). The previous 1024^3 divisor computed GiB, not GB — the constants
 * were swapped with getUsedMemoryGiB.
 */
inline double getUsedMemoryGB() {
    size_t usedBytes = getUsedMemoryBytes();
    double usedGB = ((double)usedBytes)/(1000.0*1000.0*1000.0);
    return usedGB;
}
475 
/**
 * @brief Currently used device memory in binary gibibytes (1 GiB = 1024^3
 * bytes). The previous 1000^3 divisor computed decimal GB, not GiB — the
 * constants were swapped with getUsedMemoryGB.
 */
inline double getUsedMemoryGiB() {
    size_t usedBytes = getUsedMemoryBytes();
    double usedGiB = ((double)usedBytes)/(1024.0*1024.0*1024.0);
    return usedGiB;
}
481 
// Core clock rate of device `deviceID` in GHz.
// cudaDeviceProp::clockRate is reported in kHz, hence the divide by 1e6.
inline double getClockRateGHz(int deviceID) {
    cudaDeviceProp devProp;
    CUDA_CHK(cudaGetDeviceProperties(&devProp, deviceID));
    return ((double)devProp.clockRate)/1000000.0;
}
487 
488 #endif /* HPP_USE_CUDA */
489 }//END NAMESPACE HPP
490 
491 #endif /* HPP_CUDA_UTILS_H */
Definition: casesUtils.cpp:4
std::vector< T > operator*(const std::vector< T > &vec, const T scalar)
Definition: tensor.h:72
std::ostream & operator<<(std::ostream &out, const EulerAngles< T > &angles)
Definition: rotation.h:92
GSHCoeffs< T > operator/(const GSHCoeffs< T > &coeffs, const T val)
Definition: gsh.h:236
std::vector< T > operator-(const std::vector< T > &vec1, const std::vector< T > &vec2)
Definition: tensor.h:110
GSHCoeffs< T > operator+(const GSHCoeffs< T > &coeffs1, const GSHCoeffs< T > &coeffs2)
Definition: gsh.h:210