High Performance Plasticity  0.5.0
gshCUDA.h
Go to the documentation of this file.
1 #ifndef HPP_GSHCUDA_H
5 #define HPP_GSHCUDA_H
6 
#include <hpp/config.h>
#include <cuComplex.h>
#include <hpp/cudaUtils.h>

#include <vector>
11 
12 namespace hpp
13 {
14 
27 template <typename T>
29  public:
30  __host__ __device__ GSHCoeffsCUDA(){
31  for (int i=0; i<nl0; i++) {
32  l0[i].x = 0.0;
33  l0[i].y = 0.0;
34  }
35  for (int i=0; i<nl1; i++) {
36  l1[i].x = 0.0;
37  l1[i].y = 0.0;
38  }
39  for (int i=0; i<nl2; i++) {
40  l2[i].x = 0.0;
41  l2[i].y = 0.0;
42  }
43  for (int i=0; i<nl3; i++) {
44  l3[i].x = 0.0;
45  l3[i].y = 0.0;
46  }
47  for (int i=0; i<nl4; i++) {
48  l4[i].x = 0.0;
49  l4[i].y = 0.0;
50  }
51  }
52 
53  __host__ __device__ bool isInSymmetrizedSection(int l, int m, int n) {
54  int L = 2*l+1;
55  int nElements = (L*L+1)/2;
56  int mIdx = m+l;
57  int nIdx = n+l;
58  if (mIdx*L + nIdx > nElements-1) {
59  return true;
60  }
61  else {
62  return false;
63  }
64  }
65 
66  __host__ __device__ unsigned int getFlatIdx(int l, int m, int n) {
67  // If lower triangular, switch to upper triangular section
68  if (this->isInSymmetrizedSection(l,m,n)) {
69  n = -n;
70  m = -m;
71  }
72  unsigned int mIdx = m+l;
73  unsigned int nIdx = n+l;
74 
75  // mIdx and nIdx are indices in an LxL matrix
76  unsigned int L = 2*l+1;
77 
78  // Return
79  return mIdx*L + nIdx;
80  }
81 
83  __host__ __device__ typename cuTypes<T>::complex get(int l, int m, int n) {
84  unsigned int flatIdx = this->getFlatIdx(l,m,n);
85 
86  // Fetch the value
87  typename cuTypes<T>::complex val;
88  switch (l) {
89  case 0:
90  val = l0[flatIdx];
91  break;
92  case 1:
93  val = l1[flatIdx];
94  break;
95  case 2:
96  val = l2[flatIdx];
97  break;
98  case 3:
99  val = l3[flatIdx];
100  break;
101  case 4:
102  val = l4[flatIdx];
103  break;
104  default:
105  return;
106  }
107 
108  // Modify for symmetry if in symmetrized section
109  if (this->isInSymmetrizedSection(l,m,n)) {
110  typename cuTypes<T>::complex mult;
111  mult.x = pow(-1.0, m+n);
112  mult.y = 0.0;
113  val = mult*cuConj(val);
114  }
115 
116  // Return
117  return val;
118 
119  }
120 
122  __host__ __device__ void set(int l, int m, int n, typename cuTypes<T>::complex val) {
123  unsigned int flatIdx = this->getFlatIdx(l,m,n);
124 
125  // Modify for symmetry if in symmetrized section
126  if (this->isInSymmetrizedSection(l,m,n)) {
127  typename cuTypes<T>::complex mult;
128  mult.x = pow(-1.0, m+n);
129  mult.y = 0.0;
130  val = mult*cuConj(val);
131  }
132 
133  // Set value
134  switch (l) {
135  case 0:
136  l0[flatIdx] = val;
137  break;
138  case 1:
139  l1[flatIdx] = val;
140  break;
141  case 2:
142  l2[flatIdx] = val;
143  break;
144  case 3:
145  l3[flatIdx] = val;
146  break;
147  case 4:
148  l4[flatIdx] = val;
149  break;
150  default:
151  return;
152  }
153  }
154 
155  __host__ std::vector<T> getl0Reals() {
156  std::vector<T> vals(2*nl0);
157  for (int i=0; i<nl0; i++) {
158  vals[2*i] = l0[i].x;
159  vals[2*i+1] = l0[i].y;
160  }
161  return vals;
162  }
163 
164  __host__ std::vector<T> getl1Reals() {
165  std::vector<T> vals(2*nl1);
166  for (int i=0; i<nl1; i++) {
167  vals[2*i] = l1[i].x;
168  vals[2*i+1] = l1[i].y;
169  }
170  return vals;
171  }
172 
173  __host__ std::vector<T> getl2Reals() {
174  std::vector<T> vals(2*nl2);
175  for (int i=0; i<nl2; i++) {
176  vals[2*i] = l2[i].x;
177  vals[2*i+1] = l2[i].y;
178  }
179  return vals;
180  }
181 
182  __host__ std::vector<T> getl3Reals() {
183  std::vector<T> vals(2*nl3);
184  for (int i=0; i<nl3; i++) {
185  vals[2*i] = l3[i].x;
186  vals[2*i+1] = l3[i].y;
187  }
188  return vals;
189  }
190 
191  __host__ std::vector<T> getl4Reals() {
192  std::vector<T> vals(2*nl4);
193  for (int i=0; i<nl4; i++) {
194  vals[2*i] = l4[i].x;
195  vals[2*i+1] = l4[i].y;
196  }
197  return vals;
198  }
199 
200  int nl0 = 2*0*(0+1)+1;
201  int nl1 = 2*1*(1+1)+1;
202  int nl2 = 2*2*(2+1)+1;
203  int nl3 = 2*3*(3+1)+1;
204  int nl4 = 2*4*(4+1)+1;
205  typename cuTypes<T>::complex l0[2*0*(0+1)+1];
206  typename cuTypes<T>::complex l1[2*1*(1+1)+1];
207  typename cuTypes<T>::complex l2[2*2*(2+1)+1];
208  typename cuTypes<T>::complex l3[2*3*(3+1)+1];
209  typename cuTypes<T>::complex l4[2*4*(4+1)+1];
210 };
211 
212 template <typename T>
213 __host__ __device__ GSHCoeffsCUDA<T> operator+(const GSHCoeffsCUDA<T>& coeffs1, const GSHCoeffsCUDA<T>& coeffs2) {
214  GSHCoeffsCUDA<T> res;
215  for (int i=0; i<res.nl0; i++) {
216  res.l0[i] = coeffs1.l0[i]+coeffs2.l0[i];
217  }
218  for (int i=0; i<res.nl1; i++) {
219  res.l1[i] = coeffs1.l1[i]+coeffs2.l1[i];
220  }
221  for (int i=0; i<res.nl2; i++) {
222  res.l2[i] = coeffs1.l2[i]+coeffs2.l2[i];
223  }
224  for (int i=0; i<res.nl3; i++) {
225  res.l3[i] = coeffs1.l3[i]+coeffs2.l3[i];
226  }
227  for (int i=0; i<res.nl4; i++) {
228  res.l4[i] = coeffs1.l4[i]+coeffs2.l4[i];
229  }
230  return res;
231 }
232 
233 template <typename T>
234 __host__ __device__ void operator+=(GSHCoeffsCUDA<T>& A, const GSHCoeffsCUDA<T>& B) {
235  A = A+B;
236 }
237 
238 template <typename T>
239 __host__ __device__ GSHCoeffsCUDA<T> operator/(const GSHCoeffsCUDA<T>& coeffs, T val) {
240  GSHCoeffsCUDA<T> res;
241  for (int i=0; i<res.nl0; i++) {
242  res.l0[i] = coeffs.l0[i]/val;
243  }
244  for (int i=0; i<res.nl1; i++) {
245  res.l1[i] = coeffs.l1[i]/val;
246  }
247  for (int i=0; i<res.nl2; i++) {
248  res.l2[i] = coeffs.l2[i]/val;
249  }
250  for (int i=0; i<res.nl3; i++) {
251  res.l3[i] = coeffs.l3[i]/val;
252  }
253  for (int i=0; i<res.nl4; i++) {
254  res.l4[i] = coeffs.l4[i]/val;
255  }
256  return res;
257 }
258 
259 // PARALLEL REDUCTION //
267 template <typename T>
269  const int warpSize = 32;
270  for (unsigned int i=0; i<coeffs.nl0; i++) {
271  for (int offset = warpSize/2; offset > 0; offset /= 2) {
272  coeffs.l0[i].x += __shfl_down(coeffs.l0[i].x, offset);
273  }
274  for (int offset = warpSize/2; offset > 0; offset /= 2) {
275  coeffs.l0[i].y += __shfl_down(coeffs.l0[i].y, offset);
276  }
277  }
278  for (unsigned int i=0; i<coeffs.nl1; i++) {
279  for (int offset = warpSize/2; offset > 0; offset /= 2) {
280  coeffs.l1[i].x += __shfl_down(coeffs.l1[i].x, offset);
281  }
282  for (int offset = warpSize/2; offset > 0; offset /= 2) {
283  coeffs.l1[i].y += __shfl_down(coeffs.l1[i].y, offset);
284  }
285  }
286  for (unsigned int i=0; i<coeffs.nl2; i++) {
287  for (int offset = warpSize/2; offset > 0; offset /= 2) {
288  coeffs.l2[i].x += __shfl_down(coeffs.l2[i].x, offset);
289  }
290  for (int offset = warpSize/2; offset > 0; offset /= 2) {
291  coeffs.l2[i].y += __shfl_down(coeffs.l2[i].y, offset);
292  }
293  }
294  for (unsigned int i=0; i<coeffs.nl3; i++) {
295  for (int offset = warpSize/2; offset > 0; offset /= 2) {
296  coeffs.l3[i].x += __shfl_down(coeffs.l3[i].x, offset);
297  }
298  for (int offset = warpSize/2; offset > 0; offset /= 2) {
299  coeffs.l3[i].y += __shfl_down(coeffs.l3[i].y, offset);
300  }
301  }
302  for (unsigned int i=0; i<coeffs.nl4; i++) {
303  for (int offset = warpSize/2; offset > 0; offset /= 2) {
304  coeffs.l4[i].x += __shfl_down(coeffs.l4[i].x, offset);
305  }
306  for (int offset = warpSize/2; offset > 0; offset /= 2) {
307  coeffs.l4[i].y += __shfl_down(coeffs.l4[i].y, offset);
308  }
309  }
310  return coeffs;
311 }
312 
318 template <typename T>
320  const int warpSize = 32;
321  static __shared__ GSHCoeffsCUDA<T> shared[warpSize]; // Shared mem for 32 partial sums
322  __syncthreads();
323  int lane = threadIdx.x % warpSize;
324  int wid = threadIdx.x / warpSize;
325 
326  val = warpReduceSumGSHCoeffs(val); // Each warp performs partial reduction
327 
328  if (lane==0) shared[wid]=val; // Write reduced value to shared memory
329 
330  __syncthreads(); // Wait for all partial reductions
331 
332  //read from shared memory only if that warp existed
333  if (threadIdx.x < blockDim.x / warpSize) {
334  val = shared[lane];
335  }
336  else {
337  val = GSHCoeffsCUDA<T>();
338  }
339 
340  if (wid==0) val = warpReduceSumGSHCoeffs(val); //Final reduce within first warp
341 
342  return val;
343 }
344 
345 template <typename T>
346 __global__ void BLOCK_REDUCE_KEPLER_GSH_COEFFS(GSHCoeffsCUDA<T> *in, GSHCoeffsCUDA<T> *out, int nTerms) {
347  GSHCoeffsCUDA<T> sum;
348  //reduce multiple elements per thread
349  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i<nTerms; i += blockDim.x * gridDim.x) {
350  sum += in[i];
351  }
352  sum = blockReduceSumGSHCoeffs(sum);
353  if (threadIdx.x==0) {
354  out[blockIdx.x]=sum;
355  }
356 }
357 
358 } //END NAMESPACE HPP
359 #endif /* HPP_GSHCUDA_H */
cuTypes< T >::complex l4[2 *4 *(4+1)+1]
Definition: gshCUDA.h:209
int nl4
Definition: gshCUDA.h:204
void operator+=(GSHCoeffs< T > &A, const GSHCoeffs< T > &B)
Definition: gsh.h:231
cuTypes< T >::complex l1[2 *1 *(1+1)+1]
Definition: gshCUDA.h:206
Definition: casesUtils.cpp:4
__device__ GSHCoeffsCUDA< T > blockReduceSumGSHCoeffs(GSHCoeffsCUDA< T > val)
Definition: gshCUDA.h:319
cuTypes< T >::complex l3[2 *3 *(3+1)+1]
Definition: gshCUDA.h:208
#define HPP_CHECK_CUDA_ENABLED_BUILD
Definition: config.h:44
__global__ void BLOCK_REDUCE_KEPLER_GSH_COEFFS(GSHCoeffsCUDA< T > *in, GSHCoeffsCUDA< T > *out, int nTerms)
Definition: gshCUDA.h:346
__device__ GSHCoeffsCUDA< T > warpReduceSumGSHCoeffs(GSHCoeffsCUDA< T > coeffs)
Definition: gshCUDA.h:268
__host__ __device__ bool isInSymmetrizedSection(int l, int m, int n)
Definition: gshCUDA.h:53
GSHCoeffs< T > operator/(const GSHCoeffs< T > &coeffs, const T val)
Definition: gsh.h:236
int nl3
Definition: gshCUDA.h:203
__host__ std::vector< T > getl0Reals()
Definition: gshCUDA.h:155
__host__ std::vector< T > getl4Reals()
Definition: gshCUDA.h:191
Definition: gshCUDA.h:28
__host__ std::vector< T > getl3Reals()
Definition: gshCUDA.h:182
Header file CUDA utility functions.
__host__ __device__ GSHCoeffsCUDA()
Definition: gshCUDA.h:30
cuTypes< T >::complex l2[2 *2 *(2+1)+1]
Definition: gshCUDA.h:207
__host__ std::vector< T > getl1Reals()
Definition: gshCUDA.h:164
cuTypes< T >::complex l0[2 *0 *(0+1)+1]
Definition: gshCUDA.h:205
int nl0
Definition: gshCUDA.h:200
GSHCoeffs< T > operator+(const GSHCoeffs< T > &coeffs1, const GSHCoeffs< T > &coeffs2)
Definition: gsh.h:210
__host__ std::vector< T > getl2Reals()
Definition: gshCUDA.h:173
__host__ __device__ unsigned int getFlatIdx(int l, int m, int n)
Definition: gshCUDA.h:66
int nl1
Definition: gshCUDA.h:201
int nl2
Definition: gshCUDA.h:202