10 #ifndef HPP_TENSOR_CUDA_H 11 #define HPP_TENSOR_CUDA_H 15 #include <initializer_list> 25 template <
typename U,
unsigned int N>
27 template <
typename T,
unsigned int M,
unsigned int N>
29 template <
typename T,
unsigned int N>
31 template <
typename T,
unsigned int N>
38 template <
typename T,
unsigned int N>
42 for (
unsigned int i=0; i<N; i++) {
48 VecCUDA(
const std::vector<T>& in);
49 VecCUDA(
const std::initializer_list<T>& in);
58 __host__ __device__ T&
operator()(
const unsigned int i) {
62 __host__ __device__ T
getVal(
const unsigned int i)
const {
66 __host__ __device__
void setVal(
const unsigned int i,
const T val) {
72 T sumOfSquares = (T)0.0;
73 for (
unsigned int i=0; i<N; i++) {
76 return sqrtIntr(sumOfSquares);
83 template <
typename T,
unsigned int N>
91 std::copy(in.begin(), in.end(), &(
vals[0]));
94 template <
typename T,
unsigned int N>
102 std::copy(in.begin(), in.end(), &(
vals[0]));
106 template <
typename T,
unsigned int N>
109 for (
unsigned int i=0; i<N; i++) {
110 outVec(i) = inVec.
getVal(i)/scalar;
120 template <
typename T>
123 T r = cartVec.
norm();
127 T theta = atan2(unitVec(1), unitVec(0));
130 T phi = acos(unitVec(2));
135 sphereVec(1) = theta;
144 template <
typename T,
unsigned int M,
unsigned int N>
149 for (
unsigned int i=0; i<M; i++) {
150 for (
unsigned int j=0; j<N; j++) {
161 for (
unsigned int i=0; i<N; i++) {
162 for (
unsigned int j=0; j<N; j++) {
170 memcpy(
vals, input.
vals, M*N*
sizeof(T));
176 for (
unsigned int i=0; i<N; i++) {
177 for (
unsigned int j=0; j<N; j++) {
185 __host__ __device__ T&
operator()(
const unsigned int i,
const unsigned int j) {
189 __host__ __device__ T
getVal(
const unsigned int i,
const unsigned int j)
const {
193 __host__ __device__
void setVal(
const unsigned int i,
const unsigned int j,
const T val) {
200 for (
unsigned int i=0; i<M; i++) {
201 for (
unsigned int j=0; j<N; j++) {
202 A(j,i) = this->
getVal(i,j);
217 T ooDet = (T)1.0/det;
236 void printToStream(std::ostream& out)
const;
240 std::vector<hsize_t> tensorDims = {M, N};
241 writeSingleHDF5Array<T>(dataset, arrayOffset, tensorDims, &(
vals[0][0]));
251 template <
typename T,
unsigned int M,
unsigned int N>
254 if (M != in.
n1 || N != in.
n2) {
259 std::copy(&(in.
vals[0]), &(in.
vals[0])+M*N, &(
vals[0][0]));
263 template <
typename T,
unsigned int M,
unsigned N>
266 for (
unsigned int i=0; i<M; i++) {
267 for (
unsigned int j=0; j<N; j++) {
275 template <
typename T,
unsigned int M,
unsigned N>
278 for (
unsigned int i=0; i<M; i++) {
279 for (
unsigned int j=0; j<N; j++) {
285 template <
typename T,
unsigned int M,
unsigned N>
291 template <
typename T,
unsigned int M,
unsigned N>
294 for (
unsigned int i=0; i<M; i++) {
295 for (
unsigned int j=0; j<N; j++) {
296 B(i,j) = scalar*A.
getVal(i,j);
301 template <
typename T,
unsigned int M,
unsigned N>
307 template <
typename T,
unsigned int M,
unsigned N>
310 for (
unsigned int i=0; i<M; i++) {
311 for (
unsigned int j=0; j<N; j++) {
312 B(i,j) = A.
getVal(i,j)/scalar;
317 template <
typename T,
unsigned int M,
unsigned N>
323 template <
typename T,
unsigned int M,
unsigned int N,
unsigned int P>
326 for (
unsigned int i=0; i<M; i++) {
327 for (
unsigned int j=0; j<P; j++) {
328 for (
unsigned int k=0; k<N; k++) {
337 template <
typename T,
unsigned int M,
unsigned int N>
340 for (
unsigned int i=0; i<M; i++) {
341 for (
unsigned int j=0; j<N; j++) {
354 template <
typename T,
unsigned int M>
356 return Q.
trans()*A*Q;
365 template <
typename T,
unsigned int M>
367 return Q*A_star*Q.
trans();
370 template <
typename T,
unsigned int M>
372 return Q.
trans()*A*Q;
375 template <
typename T,
unsigned int M>
377 return Q*A_star*Q.
trans();
380 template <
typename T>
383 T c1, c2, c3, s1, s2, s3;
384 sincosIntr(alpha, &s1, &c1);
385 sincosIntr(beta, &s2, &c2);
386 sincosIntr(gamma, &s3, &c3);
387 R(0,0) = c1*c3 - c2*s1*s3;
388 R(0,1) = -c1*s3 - c2*c3*s1;
390 R(1,0) = c3*s1 + c1*c2*s3;
391 R(1,1) = c1*c2*c3 - s1*s3;
400 template <
typename T,
unsigned int M,
unsigned int N>
404 for (
unsigned int i=0; i<M; i++) {
406 for (
unsigned int j=0; j<N; j++) {
424 template <
typename T,
unsigned int M,
unsigned int N>
425 std::ostream& operator<<(std::ostream& out, const Tensor2CUDA<T,M,N>& A)
427 A.printToStream(out);
439 template <
typename T,
unsigned int M,
unsigned N>
441 const int warpSize = 32;
442 for (
unsigned int i=0; i<M; i++) {
443 for (
unsigned int j=0; j<N; j++) {
444 for (
int offset = warpSize/2; offset > 0; offset /= 2) {
445 A(i,j) += __shfl_down(A(i,j), offset);
457 template <
typename T,
unsigned int M,
unsigned N>
459 const int warpSize = 32;
462 int lane = threadIdx.x % warpSize;
463 int wid = threadIdx.x / warpSize;
467 if (lane==0) shared[wid]=val;
472 if (threadIdx.x < blockDim.x / warpSize) {
491 template <
typename T,
unsigned int M,
unsigned N>
495 for (
int i = blockIdx.x * blockDim.x + threadIdx.x; i<nTerms; i += blockDim.x * gridDim.x) {
499 if (threadIdx.x==0) {
509 template <
typename T,
unsigned int M,
unsigned int N>
518 float floatEpsilon = 1.19209e-07;
519 if (angle.
beta > 1e3*floatEpsilon) {
527 T alphaPlusGamma = atan2(-R.
getVal(0,1), R.
getVal(0,0));
530 angle.
alpha = alphaPlusGamma/2.0;
531 angle.
gamma = alphaPlusGamma/2.0;
535 if (angle.
alpha < 0) angle.
alpha += 2*(T)M_PI;
536 if (angle.
gamma < 0) angle.
gamma += 2*(T)M_PI;
557 template <
typename T,
unsigned int N>
562 for (
unsigned int i=0; i<this->getNelements(); i++) {
569 memcpy(
vals, input.
vals, this->getNelements()*
sizeof(T));
578 for (
unsigned int i=0; i<N; i++) {
579 for (
unsigned int j=i; j<N; j++) {
588 T closeEnough = 100*std::numeric_limits<T>::epsilon();
589 for (
unsigned int i=0; i<N; i++) {
590 for (
unsigned int j=i+1; j<N; j++) {
591 T val = input.
getVal(i,j);
592 T symmVal = input.
getVal(j,i);
593 if (
std::abs(val-symmVal) > closeEnough) {
594 std::cerr <<
"(" << i <<
"," << j <<
")=" << val << std::endl;
595 std::cerr <<
"(" << j <<
"," << i <<
")=" << symmVal << std::endl;
596 throw std::runtime_error(
"Input tensor is not symmetric.");
602 for (
unsigned int i=0; i<N; i++) {
603 for (
unsigned int j=i; j<N; j++) {
626 __host__ __device__
unsigned int getUpperFlatIdx(
const unsigned int i,
const unsigned int j)
const {
628 unsigned int idx = getNelements()-1;
631 idx -= ((N-i)*(N-i+1))/2;
641 __host__ __device__
void setVal(
const unsigned int i,
const unsigned int j,
const T val) {
643 vals[getUpperFlatIdx(i,j)] = val;
646 vals[getUpperFlatIdx(j,i)] = val;
651 __host__ __device__ T
getVal(
const unsigned int i,
const unsigned int j)
const {
653 return vals[getUpperFlatIdx(i,j)];
656 return vals[getUpperFlatIdx(j,i)];
661 template <
typename U,
unsigned int M>
663 template <
typename U,
unsigned int M>
677 template <
typename T,
unsigned int N>
687 template <
typename T,
unsigned int N>
698 template <
typename T,
unsigned int N,
unsigned int P>
701 for (
unsigned int i=0; i<N; i++) {
702 for (
unsigned int j=0; j<P; j++) {
703 for (
unsigned int k=0; k<N; k++) {
726 template <
typename T,
unsigned int N>
731 for (
unsigned int i=0; i<this->getNelements(); i++) {
738 memcpy(
vals, input.
vals, this->getNelements()*
sizeof(T));
745 T closeEnough = 100*std::numeric_limits<T>::epsilon();
746 for (
unsigned int i=0; i<N; i++) {
747 for (
unsigned int j=i; j<N; j++) {
748 T val = input.
getVal(i,j);
749 T asymmVal = input.
getVal(j,i);
750 if (
std::abs(val+asymmVal) > closeEnough) {
751 std::cerr <<
"(" << i <<
"," << j <<
")=" << val << std::endl;
752 std::cerr <<
"(" << j <<
"," << i <<
")=" << asymmVal << std::endl;
753 throw std::runtime_error(
"Input tensor is not anti-symmetric.");
759 for (
unsigned int i=0; i<N-1; i++) {
760 for (
unsigned int j=i+1; j<N; j++) {
782 __host__ __device__
unsigned int getUpperFlatIdx(
const unsigned int i,
const unsigned int j)
const {
784 unsigned int idx = getNelements()-1;
787 idx -= ((N-i-1)*(N-i))/2;
797 __host__ __device__
void setVal(
const unsigned int i,
const unsigned int j,
const T val) {
799 vals[getUpperFlatIdx(i,j)] = val;
802 vals[getUpperFlatIdx(j,i)] = -val;
807 __host__ __device__ T
getVal(
const unsigned int i,
const unsigned int j)
const {
813 return vals[getUpperFlatIdx(i,j)];
816 return -
vals[getUpperFlatIdx(j,i)];
822 template <
typename U,
unsigned int M>
824 template <
typename U,
unsigned int M>
838 template <
typename T,
unsigned int N>
847 template <
typename T,
unsigned int N>
850 for (
unsigned int i=0; i<N; i++) {
851 for (
unsigned int j=0; j<N; j++) {
859 template <
typename T,
unsigned int N>
870 template <
typename T,
unsigned int N,
unsigned int P>
873 for (
unsigned int i=0; i<N; i++) {
874 for (
unsigned int j=0; j<P; j++) {
875 for (
unsigned int k=0; k<N; k++) {
884 template <
typename T,
unsigned int N,
unsigned int P>
887 for (
unsigned int i=0; i<N; i++) {
888 for (
unsigned int j=0; j<P; j++) {
889 for (
unsigned int k=0; k<P; k++) {
901 template <
typename U,
unsigned int M,
unsigned int N,
unsigned int P,
unsigned int Q>
911 template <
typename U,
unsigned int M,
unsigned int N,
unsigned int P,
unsigned int Q>
914 if (M != in.
n1 || N != in.
n2 || P != in.
n3 || Q != in.
n4) {
919 std::copy(&(in.
vals[0]), &(in.
vals[0])+M*N*P*Q, &(
vals[0][0][0][0]));
Tensor2< T > transformOutOfFrame(const Tensor2< T > &A_star, const Tensor2< T > &Q)
Transform tensor out of the frame given by the columns of .
Definition: tensor.h:1613
void operator+=(GSHCoeffs< T > &A, const GSHCoeffs< T > &B)
Definition: gsh.h:231
T alpha
Definition: rotation.h:44
T beta
Definition: rotation.h:45
Tensor2< T > transformIntoFrame(const Tensor2< T > &A, const Tensor2< T > &Q)
Transform tensor into the frame given by the columns of .
Definition: tensor.h:1602
const T * end() const
Definition: tensorCUDA.h:55
__host__ __device__ T getVal(const unsigned int i, const unsigned int j) const
Definition: tensorCUDA.h:651
Definition: tensorCUDA.h:28
__host__ __device__ T getVal(const unsigned int i, const unsigned int j) const
Definition: tensorCUDA.h:807
T * begin()
Definition: tensorCUDA.h:52
void operator/=(GSHCoeffs< T > &A, const T B)
Definition: gsh.h:257
Definition: casesUtils.cpp:4
__host__ __device__ Tensor2CUDA< T, N, N > & operator=(const Tensor2SymmCUDA< T, N > &input)
Definition: tensorCUDA.h:175
Header file for helper functions with HDF, C++ API.
void printToStream(std::ostream &out) const
Definition: tensorCUDA.h:401
__host__ __device__ unsigned int getUpperFlatIdx(const unsigned int i, const unsigned int j) const
Get flat index for upper triangular portion.
Definition: tensorCUDA.h:782
std::vector< T > operator*(const std::vector< T > &vec, const T scalar)
Definition: tensor.h:72
__device__ Tensor2CUDA< T, M, N > warpReduceSumTensor2(Tensor2CUDA< T, M, N > A)
Definition: tensorCUDA.h:440
__host__ __device__ unsigned int getNelements() const
Definition: tensorCUDA.h:668
#define HPP_CHECK_CUDA_ENABLED_BUILD
Definition: config.h:44
Definition: tensorCUDA.h:30
__host__ __device__ Tensor2AsymmCUDA< T, N > & operator=(const Tensor2AsymmCUDA< T, N > &input)
Definition: tensorCUDA.h:737
__host__ Tensor2SymmCUDA(const Tensor2< T > &input)
Definition: tensorCUDA.h:611
__host__ __device__ Tensor2CUDA(const Tensor2SymmCUDA< T, N > &input)
Definition: tensorCUDA.h:160
__host__ __device__ Tensor2AsymmCUDA()
Definition: tensorCUDA.h:730
Header file for rotation classes and functions.
__host__ __device__ unsigned int getUpperFlatIdx(const unsigned int i, const unsigned int j) const
Get flat index for upper triangular portion.
Definition: tensorCUDA.h:626
GSHCoeffs< T > operator/(const GSHCoeffs< T > &coeffs, const T val)
Definition: gsh.h:236
__host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val)
Definition: tensorCUDA.h:193
Header file for tensor classes.
__host__ __device__ T getVal(const unsigned int i) const
Definition: tensorCUDA.h:62
__host__ __device__ Tensor2CUDA< T, 3, 3 > inv() const
3x3 inverse
Definition: tensorCUDA.h:212
Definition: tensorCUDA.h:32
A class for second order tensors.
Definition: tensor.h:303
unsigned int n2
Definition: tensor.h:1104
std::vector< T > cartesianToSpherical(const std::vector< T > &cartVec)
Definition: crystal.cpp:1575
__host__ __device__ Tensor2SymmCUDA< T, N > & operator=(const Tensor2SymmCUDA< T, N > &input)
Definition: tensorCUDA.h:568
T * end()
Definition: tensorCUDA.h:53
Header file CUDA utility functions.
__device__ Tensor2CUDA< T, 3, 3 > EulerZXZRotationMatrixCUDA(EulerAngles< T > angles)
Definition: crystalCUDA.h:371
__host__ Tensor2AsymmCUDA(const Tensor2< T > &input)
Definition: tensorCUDA.h:767
unsigned int n3
Definition: tensor.h:1105
unsigned int n1
Definition: tensor.h:1103
T vals[M][N]
Definition: tensorCUDA.h:247
std::vector< T > abs(const std::vector< T > &vec)
Definition: tensor.h:89
const T * begin() const
Definition: tensorCUDA.h:54
std::vector< T > operator-(const std::vector< T > &vec1, const std::vector< T > &vec2)
Definition: tensor.h:110
T gamma
Definition: rotation.h:46
__host__ __device__ Tensor2SymmCUDA()
Definition: tensorCUDA.h:561
__device__ T norm() const
Definition: tensorCUDA.h:71
std::valarray< T > vals
Definition: tensor.h:1110
__host__ __device__ VecCUDA()
Definition: tensorCUDA.h:41
Tensor4CUDA()
Definition: tensorCUDA.h:904
__host__ __device__ Tensor2CUDA()
Definition: tensorCUDA.h:148
unsigned int n1
the first dimension of the tensor
Definition: tensor.h:504
__host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val)
Definition: tensorCUDA.h:641
T vals[(N *(N-1))/2]
Definition: tensorCUDA.h:834
__host__ __device__ T & operator()(const unsigned int i, const unsigned int j)
Definition: tensorCUDA.h:185
EulerAngles< T > getEulerZXZAngles(Tensor2< T > R)
Get Euler angles from rotation matrix.
Definition: rotation.h:149
GSHCoeffs< T > operator+(const GSHCoeffs< T > &coeffs1, const GSHCoeffs< T > &coeffs2)
Definition: gsh.h:210
unsigned int n4
Definition: tensor.h:1106
__host__ Tensor2AsymmCUDA(const Tensor2CUDA< T, N, N > &input)
Definition: tensorCUDA.h:743
__host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val)
Definition: tensorCUDA.h:797
__global__ void BLOCK_REDUCE_KEPLER_TENSOR2(Tensor2CUDA< T, M, N > *in, Tensor2CUDA< T, M, N > *out, int nTerms)
Definition: tensorCUDA.h:492
std::valarray< T > vals
the underlying 1D array
Definition: tensor.h:511
__host__ __device__ T & operator()(const unsigned int i)
Definition: tensorCUDA.h:58
T vals[N]
Definition: tensorCUDA.h:79
__host__ __device__ unsigned int getNelements() const
Definition: tensorCUDA.h:829
__host__ __device__ Tensor2CUDA< T, M, N > & operator=(const Tensor2CUDA< T, M, N > &input)
Definition: tensorCUDA.h:169
__host__ __device__ T getVal(const unsigned int i, const unsigned int j) const
Definition: tensorCUDA.h:189
__host__ __device__ void setVal(const unsigned int i, const T val)
Definition: tensorCUDA.h:66
Definition: tensorCUDA.h:26
__host__ __device__ Tensor2CUDA< T, N, M > trans() const
Definition: tensorCUDA.h:198
Definition: rotation.h:43
void writeToExistingHDF5Dataset(H5::DataSet &dataset, std::vector< hsize_t > arrayOffset)
Definition: tensorCUDA.h:239
__device__ Tensor2CUDA< T, M, N > blockReduceSumTensor2(Tensor2CUDA< T, M, N > val)
Definition: tensorCUDA.h:458
T vals[(N *(N+1))/2]
Definition: tensorCUDA.h:673
__host__ Tensor2SymmCUDA(const Tensor2CUDA< T, N, N > &input)
Definition: tensorCUDA.h:586
unsigned int n2
the second dimension of the tensor
Definition: tensor.h:506
Definition: tensorCUDA.h:902