31 for (
int i=0; i<
nl0; i++) {
35 for (
int i=0; i<
nl1; i++) {
39 for (
int i=0; i<
nl2; i++) {
43 for (
int i=0; i<
nl3; i++) {
47 for (
int i=0; i<
nl4; i++) {
55 int nElements = (L*L+1)/2;
58 if (mIdx*L + nIdx > nElements-1) {
66 __host__ __device__
unsigned int getFlatIdx(
int l,
int m,
int n) {
72 unsigned int mIdx = m+l;
73 unsigned int nIdx = n+l;
76 unsigned int L = 2*l+1;
83 __host__ __device__
typename cuTypes<T>::complex
get(
int l,
int m,
int n) {
84 unsigned int flatIdx = this->
getFlatIdx(l,m,n);
87 typename cuTypes<T>::complex val;
110 typename cuTypes<T>::complex mult;
111 mult.x = pow(-1.0, m+n);
113 val = mult*cuConj(val);
122 __host__ __device__
void set(
int l,
int m,
int n,
typename cuTypes<T>::complex val) {
123 unsigned int flatIdx = this->
getFlatIdx(l,m,n);
127 typename cuTypes<T>::complex mult;
128 mult.x = pow(-1.0, m+n);
130 val = mult*cuConj(val);
156 std::vector<T> vals(2*
nl0);
157 for (
int i=0; i<
nl0; i++) {
159 vals[2*i+1] =
l0[i].y;
165 std::vector<T> vals(2*
nl1);
166 for (
int i=0; i<
nl1; i++) {
168 vals[2*i+1] =
l1[i].y;
174 std::vector<T> vals(2*
nl2);
175 for (
int i=0; i<
nl2; i++) {
177 vals[2*i+1] =
l2[i].y;
183 std::vector<T> vals(2*
nl3);
184 for (
int i=0; i<
nl3; i++) {
186 vals[2*i+1] =
l3[i].y;
192 std::vector<T> vals(2*
nl4);
193 for (
int i=0; i<
nl4; i++) {
195 vals[2*i+1] =
l4[i].y;
205 typename cuTypes<T>::complex
l0[2*0*(0+1)+1];
206 typename cuTypes<T>::complex
l1[2*1*(1+1)+1];
207 typename cuTypes<T>::complex
l2[2*2*(2+1)+1];
208 typename cuTypes<T>::complex
l3[2*3*(3+1)+1];
209 typename cuTypes<T>::complex
l4[2*4*(4+1)+1];
212 template <
typename T>
215 for (
int i=0; i<res.
nl0; i++) {
216 res.
l0[i] = coeffs1.
l0[i]+coeffs2.
l0[i];
218 for (
int i=0; i<res.
nl1; i++) {
219 res.
l1[i] = coeffs1.
l1[i]+coeffs2.
l1[i];
221 for (
int i=0; i<res.
nl2; i++) {
222 res.
l2[i] = coeffs1.
l2[i]+coeffs2.
l2[i];
224 for (
int i=0; i<res.
nl3; i++) {
225 res.
l3[i] = coeffs1.
l3[i]+coeffs2.
l3[i];
227 for (
int i=0; i<res.
nl4; i++) {
228 res.
l4[i] = coeffs1.
l4[i]+coeffs2.
l4[i];
233 template <
typename T>
238 template <
typename T>
241 for (
int i=0; i<res.
nl0; i++) {
242 res.
l0[i] = coeffs.
l0[i]/val;
244 for (
int i=0; i<res.
nl1; i++) {
245 res.
l1[i] = coeffs.
l1[i]/val;
247 for (
int i=0; i<res.
nl2; i++) {
248 res.
l2[i] = coeffs.
l2[i]/val;
250 for (
int i=0; i<res.
nl3; i++) {
251 res.
l3[i] = coeffs.
l3[i]/val;
253 for (
int i=0; i<res.
nl4; i++) {
254 res.
l4[i] = coeffs.
l4[i]/val;
267 template <
typename T>
269 const int warpSize = 32;
270 for (
unsigned int i=0; i<coeffs.
nl0; i++) {
271 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
272 coeffs.
l0[i].x += __shfl_down(coeffs.
l0[i].x, offset);
274 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
275 coeffs.
l0[i].y += __shfl_down(coeffs.
l0[i].y, offset);
278 for (
unsigned int i=0; i<coeffs.
nl1; i++) {
279 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
280 coeffs.
l1[i].x += __shfl_down(coeffs.
l1[i].x, offset);
282 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
283 coeffs.
l1[i].y += __shfl_down(coeffs.
l1[i].y, offset);
286 for (
unsigned int i=0; i<coeffs.
nl2; i++) {
287 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
288 coeffs.
l2[i].x += __shfl_down(coeffs.
l2[i].x, offset);
290 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
291 coeffs.
l2[i].y += __shfl_down(coeffs.
l2[i].y, offset);
294 for (
unsigned int i=0; i<coeffs.
nl3; i++) {
295 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
296 coeffs.
l3[i].x += __shfl_down(coeffs.
l3[i].x, offset);
298 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
299 coeffs.
l3[i].y += __shfl_down(coeffs.
l3[i].y, offset);
302 for (
unsigned int i=0; i<coeffs.
nl4; i++) {
303 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
304 coeffs.
l4[i].x += __shfl_down(coeffs.
l4[i].x, offset);
306 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
307 coeffs.
l4[i].y += __shfl_down(coeffs.
l4[i].y, offset);
318 template <
typename T>
320 const int warpSize = 32;
323 int lane = threadIdx.x % warpSize;
324 int wid = threadIdx.x / warpSize;
328 if (lane==0) shared[wid]=val;
333 if (threadIdx.x < blockDim.x / warpSize) {
345 template <
typename T>
349 for (
int i = blockIdx.x * blockDim.x + threadIdx.x; i<nTerms; i += blockDim.x * gridDim.x) {
353 if (threadIdx.x==0) {
cuTypes< T >::complex l4[2 *4 *(4+1)+1]
Definition: gshCUDA.h:209
int nl4
Definition: gshCUDA.h:204
void operator+=(GSHCoeffs< T > &A, const GSHCoeffs< T > &B)
Definition: gsh.h:231
cuTypes< T >::complex l1[2 *1 *(1+1)+1]
Definition: gshCUDA.h:206
Definition: casesUtils.cpp:4
__device__ GSHCoeffsCUDA< T > blockReduceSumGSHCoeffs(GSHCoeffsCUDA< T > val)
Definition: gshCUDA.h:319
cuTypes< T >::complex l3[2 *3 *(3+1)+1]
Definition: gshCUDA.h:208
#define HPP_CHECK_CUDA_ENABLED_BUILD
Definition: config.h:44
__global__ void BLOCK_REDUCE_KEPLER_GSH_COEFFS(GSHCoeffsCUDA< T > *in, GSHCoeffsCUDA< T > *out, int nTerms)
Definition: gshCUDA.h:346
__device__ GSHCoeffsCUDA< T > warpReduceSumGSHCoeffs(GSHCoeffsCUDA< T > coeffs)
Definition: gshCUDA.h:268
__host__ __device__ bool isInSymmetrizedSection(int l, int m, int n)
Definition: gshCUDA.h:53
GSHCoeffs< T > operator/(const GSHCoeffs< T > &coeffs, const T val)
Definition: gsh.h:236
int nl3
Definition: gshCUDA.h:203
__host__ std::vector< T > getl0Reals()
Definition: gshCUDA.h:155
__host__ std::vector< T > getl4Reals()
Definition: gshCUDA.h:191
__host__ std::vector< T > getl3Reals()
Definition: gshCUDA.h:182
Header file for CUDA utility functions.
__host__ __device__ GSHCoeffsCUDA()
Definition: gshCUDA.h:30
cuTypes< T >::complex l2[2 *2 *(2+1)+1]
Definition: gshCUDA.h:207
__host__ std::vector< T > getl1Reals()
Definition: gshCUDA.h:164
cuTypes< T >::complex l0[2 *0 *(0+1)+1]
Definition: gshCUDA.h:205
int nl0
Definition: gshCUDA.h:200
GSHCoeffs< T > operator+(const GSHCoeffs< T > &coeffs1, const GSHCoeffs< T > &coeffs2)
Definition: gsh.h:210
__host__ std::vector< T > getl2Reals()
Definition: gshCUDA.h:173
__host__ __device__ unsigned int getFlatIdx(int l, int m, int n)
Definition: gshCUDA.h:66
int nl1
Definition: gshCUDA.h:201
int nl2
Definition: gshCUDA.h:202