7 #ifndef HPP_SPECTRAL_UTILS_CUDA_H 8 #define HPP_SPECTRAL_UTILS_CUDA_H 27 template<
typename T,
unsigned int N>
33 template<
unsigned int N>
37 __host__ __device__
unsigned int getVal(
const unsigned int i)
const {
40 __host__ __device__
unsigned int&
operator()(
const unsigned int i) {
63 template<
typename T,
unsigned int N>
90 template<
typename T,
unsigned int N>
96 __device__ T getIDFTRealD(
unsigned int dsetIdx,
unsigned int *spatialCoord)
const;
98 T getIDFTRealH(
unsigned int dsetIdx, std::vector<unsigned int> spatialCoord)
const;
136 template <
typename T,
unsigned int N>
145 T expArgFactor = 2*((T)M_PI)/gridDims[0];
148 for (
unsigned int i=0; i<dset.
nTerms; i++) {
150 unsigned int expInd = 0;
151 for (
unsigned int j=0; j<N; j++) {
152 expInd += spatialCoord[j]*dset.
coords[i](j);
157 expInd = expInd&(gridDims[0]-1);
160 T expArg = expInd*expArgFactor;
162 sincosIntr(expArg, &(expVal[1]), &(expVal[0]));
165 val = fmaIntr(dset.
coeffs[i].re, expVal[0], val);
166 val = fmaIntr(dset.
coeffs[i].im, expVal[1], val);
191 template <
typename T,
unsigned int N>
197 unsigned int nTerms = dset.
nTerms;
203 T expArgFactor = 2*((T)M_PI)/gridDims[0];
209 unsigned int termsPerLargeRead = nShared;
210 unsigned int nReads = nTerms/termsPerLargeRead;
211 if (nTerms % termsPerLargeRead != 0) nReads++;
215 unsigned int termsPerBlockRead = blockDim.x;
218 for (
unsigned int iRead=0; iRead<nReads; iRead++) {
220 unsigned int readStartGlobal = iRead*termsPerLargeRead+threadIdx.x;
221 unsigned int readEndGlobal = umin((iRead+1)*termsPerLargeRead, nTerms);
222 for (
unsigned int readIdxGlobal = readStartGlobal; readIdxGlobal<readEndGlobal; readIdxGlobal+=termsPerBlockRead) {
223 unsigned int readIdxShared = readIdxGlobal%termsPerLargeRead;
224 sharedCoords[readIdxShared] = dset.
coords[readIdxGlobal];
225 sharedCoeffs[readIdxShared] = dset.
coeffs[readIdxGlobal];
232 unsigned int termsStartGlobal = iRead*termsPerLargeRead;
233 unsigned int termsEndGlobal = umin(termsStartGlobal+termsPerLargeRead, nTerms);
234 unsigned int termsStartShared = 0;
235 unsigned int termsEndShared = termsEndGlobal-termsStartGlobal;
236 for (
unsigned int i=termsStartShared; i<termsEndShared; i++) {
238 unsigned int expInd = spatialCoord[0]*sharedCoords[i](0);
239 for (
unsigned int j=1; j<N; j++) {
240 expInd += spatialCoord[j]*sharedCoords[i](j);
244 expInd = expInd&(gridDims[0]-1);
247 T expArg = expInd*expArgFactor;
249 sincosIntr(expArg, &(expVal[1]), &(expVal[0]));
252 val = fmaIntr(sharedCoeffs[i].re, expVal[0], val);
253 val = fmaIntr(sharedCoeffs[i].im, expVal[1], val);
265 template <
typename T,
unsigned int N>
284 template<
typename T,
unsigned int N,
unsigned int P>
303 template<
typename T,
unsigned int N,
unsigned int P>
309 __device__
void getIDFTRealDShared(
unsigned int *spatialCoord, T *outputs,
unsigned int nShared,
SpectralDataUnifiedCUDA<T,N,P> *sharedData)
const;
310 __device__
void getIDFTRealDSharedPair(
unsigned int *spatialCoord0, T *outputs0,
unsigned int *spatialCoord1, T *outputs1,
unsigned int nShared,
SpectralDataUnifiedCUDA<T,N,P> *sharedData)
const;
344 template <
typename T,
unsigned int N>
347 unsigned int expInd = spatialCoord[0]*coord(0);
348 for (
unsigned int j=1; j<N; j++) {
349 expInd += spatialCoord[j]*coord(j);
354 expInd = expInd&(gridDim-1);
357 T expArg = expInd*expArgFactor;
358 sincosIntr(expArg, expValIm, expValRe);
370 template <
typename T,
unsigned int N,
unsigned int P>
373 const unsigned int gridDimReg = gridDims[0];
374 const unsigned int nTermsReg = nTerms;
377 T expArgFactor = 2*((T)M_PI)/gridDimReg;
381 int *globalDataAsInt = (
int*)data;
382 int *sharedDataAsInt = (
int*)sharedData;
383 unsigned int readElementSize =
sizeof(int);
389 unsigned int termsPerLargeRead = nShared;
392 unsigned int nReads = nTermsReg/termsPerLargeRead;
393 if (nTermsReg % termsPerLargeRead != 0) nReads++;
397 unsigned int elementsPerBlockRead = blockDim.x;
400 for (
unsigned int i=0; i<P; i++) {
405 for (
unsigned int iRead=0; iRead<nReads; iRead++) {
407 unsigned int dataReadStartGlobal = iRead*elementsPerLargeRead+threadIdx.x;
408 unsigned int dataReadEndGlobal = umin((iRead+1)*elementsPerLargeRead, totalElementsToRead);
409 for (
unsigned int readIdxGlobal = dataReadStartGlobal; readIdxGlobal<dataReadEndGlobal; readIdxGlobal+=elementsPerBlockRead) {
410 unsigned int readIdxShared = readIdxGlobal%elementsPerLargeRead;
411 sharedDataAsInt[readIdxShared] = globalDataAsInt[readIdxGlobal];
418 unsigned int termsStartGlobal = iRead*termsPerLargeRead;
419 unsigned int termsEndGlobal = umin(termsStartGlobal+termsPerLargeRead, nTermsReg);
420 unsigned int termsStartShared = 0;
421 unsigned int termsEndShared = termsEndGlobal-termsStartGlobal;
424 for (
unsigned int i=termsStartShared; i<termsEndShared; i++) {
426 unsigned int expInd = spatialCoord[0]*sharedData[i].coord(0);
427 for (
unsigned int j=1; j<N; j++) {
428 expInd += spatialCoord[j]*sharedData[i].coord(j);
433 expInd = expInd&(gridDimReg-1);
436 T expArg = expInd*expArgFactor;
438 sincosIntr(expArg, &(expVal[1]), &(expVal[0]));
441 for (
unsigned int iDset = 0; iDset<P; iDset++) {
442 outputs[iDset] = fmaIntr(sharedData[i].coeffs[iDset].re, expVal[0], outputs[iDset]);
443 outputs[iDset] = fmaIntr(sharedData[i].coeffs[iDset].im, expVal[1], outputs[iDset]);
463 template <
typename T,
unsigned int N,
unsigned int P>
466 const unsigned int gridDimReg = gridDims[0];
467 const unsigned int nTermsReg = nTerms;
470 T expArgFactor = 2*((T)M_PI)/gridDimReg;
474 int *globalDataAsInt = (
int*)data;
475 int *sharedDataAsInt = (
int*)sharedData;
476 unsigned int readElementSize =
sizeof(int);
482 unsigned int termsPerLargeRead = nShared;
485 unsigned int nReads = nTermsReg/termsPerLargeRead;
486 if (nTermsReg % termsPerLargeRead != 0) nReads++;
490 unsigned int elementsPerBlockRead = blockDim.x;
493 for (
unsigned int i=0; i<P; i++) {
494 outputs0[i] = (T)0.0;
495 outputs1[i] = (T)0.0;
499 for (
unsigned int iRead=0; iRead<nReads; iRead++) {
501 unsigned int dataReadStartGlobal = iRead*elementsPerLargeRead+threadIdx.x;
502 unsigned int dataReadEndGlobal = umin((iRead+1)*elementsPerLargeRead, totalElementsToRead);
503 for (
unsigned int readIdxGlobal = dataReadStartGlobal; readIdxGlobal<dataReadEndGlobal; readIdxGlobal+=elementsPerBlockRead) {
504 unsigned int readIdxShared = readIdxGlobal%elementsPerLargeRead;
505 sharedDataAsInt[readIdxShared] = globalDataAsInt[readIdxGlobal];
512 unsigned int termsStartGlobal = iRead*termsPerLargeRead;
513 unsigned int termsEndGlobal = umin(termsStartGlobal+termsPerLargeRead, nTermsReg);
514 unsigned int termsStartShared = 0;
515 unsigned int termsEndShared = termsEndGlobal-termsStartGlobal;
518 for (
unsigned int i=termsStartShared; i<termsEndShared; i++) {
523 T expValRe, expValIm;
524 getExpVal(spatialCoord0, unifiedData.coord, gridDimReg, expArgFactor, &expValRe, &expValIm);
525 for (
unsigned int iDset = 0; iDset<P; iDset++) {
526 outputs0[iDset] = fmaIntr(unifiedData.coeffs[iDset].re, expValRe, outputs0[iDset]);
527 outputs0[iDset] = fmaIntr(unifiedData.coeffs[iDset].im, expValIm, outputs0[iDset]);
531 getExpVal(spatialCoord1, unifiedData.coord, gridDimReg, expArgFactor, &expValRe, &expValIm);
532 for (
unsigned int iDset = 0; iDset<P; iDset++) {
533 outputs1[iDset] = fmaIntr(unifiedData.coeffs[iDset].re, expValRe, outputs1[iDset]);
534 outputs1[iDset] = fmaIntr(unifiedData.coeffs[iDset].im, expValIm, outputs1[iDset]);
std::shared_ptr< SpectralDatasetCUDA< T, N > > dsetsSharedPtr
Definition: spectralUtilsCUDA.h:130
Definition: spectralUtils.h:317
__device__ T * getGridStarts()
Definition: spectralUtilsCUDA.h:101
SpectralDataUnifiedCUDA< T, N, P > * data
Definition: spectralUtilsCUDA.h:338
unsigned int * gridDims
Definition: spectralUtilsCUDA.h:322
T im
Definition: spectralUtilsCUDA.h:51
Definition: spectralUtilsCUDA.h:28
std::vector< std::shared_ptr< SpectralCoeffCUDA< T > > > coeffSharedPtrs
Definition: spectralUtilsCUDA.h:132
unsigned int getNDsets() const
Definition: spectralUtilsCUDA.h:104
std::shared_ptr< T > gridStartsSharedPtr
Definition: spectralUtilsCUDA.h:327
T * gridStarts
Definition: spectralUtilsCUDA.h:115
SpectralDatasetCUDA< T, N > * dsets
Definition: spectralUtilsCUDA.h:127
Definition: casesUtils.cpp:4
Definition: spectralUtilsCUDA.h:34
T re
Definition: spectralUtilsCUDA.h:50
std::shared_ptr< unsigned int > gridDimsSharedPtr
Definition: spectralUtilsCUDA.h:112
__device__ T * getGridSteps()
Definition: spectralUtilsCUDA.h:313
#define HPP_CHECK_CUDA_ENABLED_BUILD
Definition: config.h:44
unsigned int nTermsTypical
Definition: spectralUtilsCUDA.h:124
__host__ __device__ unsigned int & operator()(const unsigned int i)
Definition: spectralUtilsCUDA.h:40
Definition: spectralUtils.h:225
__host__ __device__ SpectralCoordCUDA()
Definition: spectralUtilsCUDA.h:36
unsigned int getNTerms() const
Definition: spectralUtilsCUDA.h:316
unsigned int coords[N]
Definition: spectralUtilsCUDA.h:29
Definition: spectralUtilsCUDA.h:304
Definition: spectralUtilsCUDA.h:64
__device__ void getIDFTRealDShared(unsigned int *spatialCoord, T *outputs, unsigned int nShared, SpectralDataUnifiedCUDA< T, N, P > *sharedData) const
Device IDFTD.
Definition: spectralUtilsCUDA.h:371
Definition: spectralUtilsCUDA.h:91
std::shared_ptr< T > gridStepsSharedPtr
Definition: spectralUtilsCUDA.h:118
Header file CUDA utility functions.
std::shared_ptr< unsigned int > gridDimsSharedPtr
Definition: spectralUtilsCUDA.h:323
__device__ T * getGridStarts()
Definition: spectralUtilsCUDA.h:312
T * gridSteps
Definition: spectralUtilsCUDA.h:328
unsigned int nDsets
Definition: spectralUtilsCUDA.h:121
std::shared_ptr< T > gridStepsSharedPtr
Definition: spectralUtilsCUDA.h:329
T * gridStarts
Definition: spectralUtilsCUDA.h:326
__host__ __device__ unsigned int getVal(const unsigned int i) const
Definition: spectralUtilsCUDA.h:37
T * gridSteps
Definition: spectralUtilsCUDA.h:117
__global__ void GET_IDFT_REAL(SpectralDatabaseCUDA< T, N > *db, unsigned int dsetIdx, unsigned int *spatialCoord, T *val)
Definition: spectralUtilsCUDA.h:266
Header file for crystal classes.
__device__ void getIDFTRealDSharedPair(unsigned int *spatialCoord0, T *outputs0, unsigned int *spatialCoord1, T *outputs1, unsigned int nShared, SpectralDataUnifiedCUDA< T, N, P > *sharedData) const
Device IDFTD.
Definition: spectralUtilsCUDA.h:464
__device__ T * getGridSteps()
Definition: spectralUtilsCUDA.h:102
std::shared_ptr< SpectralDataUnifiedCUDA< T, N, P > > dataSharedPtr
Definition: spectralUtilsCUDA.h:341
SpectralCoeffCUDA< T > * coeffs
Definition: spectralUtilsCUDA.h:65
__device__ void getExpVal(unsigned int *spatialCoord, SpectralCoordCUDA< N > &coord, unsigned int gridDim, T expArgFactor, T *expValRe, T *expValIm)
Definition: spectralUtilsCUDA.h:345
unsigned int nDsets
Definition: spectralUtilsCUDA.h:332
__device__ unsigned int * getGridDims()
Definition: spectralUtilsCUDA.h:103
SpectralCoordCUDA< N > * coords
Definition: spectralUtilsCUDA.h:66
std::vector< std::shared_ptr< SpectralCoordCUDA< N > > > coordSharedPtrs
Definition: spectralUtilsCUDA.h:131
unsigned int nTerms
Definition: spectralUtilsCUDA.h:335
unsigned int getNDsets() const
Definition: spectralUtilsCUDA.h:315
std::shared_ptr< T > gridStartsSharedPtr
Definition: spectralUtilsCUDA.h:116
unsigned int nTerms
Definition: spectralUtilsCUDA.h:67
__device__ T getIDFTRealDShared(unsigned int dsetIdx, unsigned int *spatialCoord, unsigned int nShared, SpectralCoordCUDA< N > *sharedCoords, SpectralCoeffCUDA< T > *sharedCoeffs) const
Device IDFTD.
Definition: spectralUtilsCUDA.h:192
unsigned int getNTermsTypical() const
Definition: spectralUtilsCUDA.h:105
T coeff[2]
Definition: spectralUtilsCUDA.h:30
struct ALIGN(16) SpectralDataUnifiedCUDA
Definition: spectralUtilsCUDA.h:285
Definition: spectralUtilsCUDA.h:49
__device__ T getIDFTRealD(unsigned int dsetIdx, unsigned int *spatialCoord) const
Definition: spectralUtilsCUDA.h:137
unsigned int * gridDims
Definition: spectralUtilsCUDA.h:111
__device__ unsigned int * getGridDims()
Definition: spectralUtilsCUDA.h:314