High Performance Plasticity  0.5.0
tensorCUDA.h
Go to the documentation of this file.
1 
10 #ifndef HPP_TENSOR_CUDA_H
11 #define HPP_TENSOR_CUDA_H
12 
13 #include <hpp/config.h>
15 #include <initializer_list>
16 #include <hpp/tensor.h>
17 #include <hpp/rotation.h>
18 #include <hpp/cudaUtils.h>
19 #include <hpp/hdfUtilsCpp.h>
20 
21 namespace hpp
22 {
23 
// Forward declarations.
// NOTE(review): the declarator lines were lost in the documentation extraction;
// the class names are restored from the definitions later in this file.
template <typename U, unsigned int N>
class VecCUDA;
template <typename T, unsigned int M, unsigned int N>
class Tensor2CUDA;
template <typename T, unsigned int N>
class Tensor2SymmCUDA;
template <typename T, unsigned int N>
class Tensor2AsymmCUDA;
34 
36 // VECCUDA //
template <typename T, unsigned int N>
class VecCUDA {
public:
    // Default constructor: all N components are zero-initialized.
    __host__ __device__ VecCUDA(){
        for (unsigned int idx=0; idx<N; idx++) {
            vals[idx] = (T)0.0;
        }
    }

    // Host-side constructors from standard containers (defined out of line);
    // both throw on a length mismatch with N.
    VecCUDA(const std::vector<T>& in);
    VecCUDA(const std::initializer_list<T>& in);

    // Iterators over the underlying flat storage
    T* begin() {return vals;}
    T* end() {return vals+N;}
    const T* begin() const {return vals;}
    const T* end() const {return vals+N;}

    // Read/write reference to component i (no bounds checking)
    __host__ __device__ T& operator()(const unsigned int i) {
        return vals[i];
    }
    // Read-only value of component i
    __host__ __device__ T getVal(const unsigned int i) const {
        return vals[i];
    }
    // Set component i
    __host__ __device__ void setVal(const unsigned int i, const T val) {
        vals[i] = val;
    }

    // Euclidean norm (device only: uses the sqrt intrinsic wrapper)
    __device__ T norm() const{
        T sumOfSquares = (T)0.0;
        for (unsigned int idx=0; idx<N; idx++) {
            sumOfSquares += vals[idx]*vals[idx];
        }
        return sqrtIntr(sumOfSquares);
    }
protected:
    // Vector components
    T vals[N];
};
81 
// Constructors
// Build from a std::vector; the length must equal the compile-time size N.
template <typename T, unsigned int N>
VecCUDA<T,N>::VecCUDA(const std::vector<T>& in) {
    // Reject containers whose length differs from N
    if (in.size() != N) {
        throw TensorError("Size mismatch.");
    }
    // Element-wise copy into the fixed-size storage
    std::copy(in.begin(), in.end(), vals);
}
93 
// Build from a braced initializer list; the length must equal N.
template <typename T, unsigned int N>
VecCUDA<T,N>::VecCUDA(const std::initializer_list<T>& in) {
    // Reject lists whose length differs from N
    if (in.size() != N) {
        throw TensorError("Size mismatch.");
    }
    // Element-wise copy into the fixed-size storage
    std::copy(in.begin(), in.end(), vals);
}
104 
// Operators
// Component-wise division of a vector by a scalar (device only).
template <typename T, unsigned int N>
__device__ VecCUDA<T,N> operator/(const VecCUDA<T,N>& inVec, T scalar) {
    VecCUDA<T,N> result;
    for (unsigned int idx=0; idx<N; idx++) {
        result(idx) = inVec.getVal(idx)/scalar;
    }
    return result;
}
114 
// Convert a 3D Cartesian vector to spherical coordinates (r, theta, phi),
// where theta is the azimuthal angle in the x-y plane and phi the polar
// angle from the +z axis. No guard against a zero-length input.
template <typename T>
__device__ VecCUDA<T,3> cartesianToSpherical(const VecCUDA<T,3>& cartVec) {
    // Radial distance and the corresponding unit direction
    T r = cartVec.norm();
    VecCUDA<T,3> unitVec = cartVec/r;

    // Assemble (r, theta, phi)
    VecCUDA<T,3> sphereVec;
    sphereVec(0) = r;
    sphereVec(1) = atan2(unitVec(1), unitVec(0)); // azimuthal angle
    sphereVec(2) = acos(unitVec(2));              // polar angle
    return sphereVec;
}
139 
141 // TENSOR 2 //
143 
// Fixed-size MxN second-order tensor usable in host and device code.
template <typename T, unsigned int M, unsigned int N>
class Tensor2CUDA {
public:
    // Default constructor: zero-initialize all M*N components.
    __host__ __device__ Tensor2CUDA(){
        for (unsigned int i=0; i<M; i++) {
            for (unsigned int j=0; j<N; j++) {
                vals[i][j] = (T)0.0;
            }
        }
    }

    // Construct from standard host-side Tensor2 (defined out of line)
    Tensor2CUDA(const Tensor2<T>& in);

    // Construct from symmetric CUDA tensor by expanding its packed storage
    __host__ __device__ Tensor2CUDA(const Tensor2SymmCUDA<T,N>& input) {
        for (unsigned int i=0; i<N; i++) {
            for (unsigned int j=0; j<N; j++) {
                vals[i][j] = input.getVal(i,j);
            }
        }
    }

    // Assignment: raw copy of the M*N value block
    __host__ __device__ Tensor2CUDA<T,M,N>& operator=(const Tensor2CUDA<T,M,N>& input) {
        memcpy(vals, input.vals, M*N*sizeof(T));
        return *this;
    }

    // Assign from symmetric CUDA tensor (expands packed storage)
    __host__ __device__ Tensor2CUDA<T,N,N>& operator=(const Tensor2SymmCUDA<T,N>& input) {
        for (unsigned int i=0; i<N; i++) {
            for (unsigned int j=0; j<N; j++) {
                vals[i][j] = input.getVal(i,j);
            }
        }
        return *this;
    }

    // Get read/write reference to value (no bounds checking)
    __host__ __device__ T& operator()(const unsigned int i, const unsigned int j) {
        return vals[i][j];
    }
    // Get read-only value
    __host__ __device__ T getVal(const unsigned int i, const unsigned int j) const {
        return vals[i][j];
    }
    // Set value
    __host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val) {
        vals[i][j] = val;
    }

    // Transpose
    __host__ __device__ Tensor2CUDA<T,N,M> trans() const {
        Tensor2CUDA<T,N,M> A; // result tensor (declaration restored; lost in extraction)
        for (unsigned int i=0; i<M; i++) {
            for (unsigned int j=0; j<N; j++) {
                A(j,i) = this->getVal(i,j);
            }
        }
        return A;
    }

    // 3x3 inverse via the adjugate. Only meaningful when M==N==3;
    // no singularity check, so the caller must ensure det != 0.
    __host__ __device__ Tensor2CUDA<T,3,3> inv() const {
        // Determinant (cofactor expansion along the first row)
        T det = this->getVal(0,0)*(this->getVal(1,1)*this->getVal(2,2) - this->getVal(2,1)*this->getVal(1,2));
        det -= this->getVal(0,1)*(this->getVal(1,0)*this->getVal(2,2) - this->getVal(1,2)*this->getVal(2,0));
        det += this->getVal(0,2)*(this->getVal(1,0)*this->getVal(2,1) - this->getVal(1,1)*this->getVal(2,0));
        T ooDet = (T)1.0/det;

        // Inverse = adjugate / det
        Tensor2CUDA<T,3,3> A; // result tensor (declaration restored; lost in extraction)
        A(0,0) = (this->getVal(1,1)*this->getVal(2,2) - this->getVal(2,1)*this->getVal(1,2))*ooDet;
        A(0,1) = (this->getVal(0,2)*this->getVal(2,1) - this->getVal(0,1)*this->getVal(2,2))*ooDet;
        A(0,2) = (this->getVal(0,1)*this->getVal(1,2) - this->getVal(0,2)*this->getVal(1,1))*ooDet;
        A(1,0) = (this->getVal(1,2)*this->getVal(2,0) - this->getVal(1,0)*this->getVal(2,2))*ooDet;
        A(1,1) = (this->getVal(0,0)*this->getVal(2,2) - this->getVal(0,2)*this->getVal(2,0))*ooDet;
        A(1,2) = (this->getVal(1,0)*this->getVal(0,2) - this->getVal(0,0)*this->getVal(1,2))*ooDet;
        A(2,0) = (this->getVal(1,0)*this->getVal(2,1) - this->getVal(2,0)*this->getVal(1,1))*ooDet;
        A(2,1) = (this->getVal(2,0)*this->getVal(0,1) - this->getVal(0,0)*this->getVal(2,1))*ooDet;
        A(2,2) = (this->getVal(0,0)*this->getVal(1,1) - this->getVal(1,0)*this->getVal(0,1))*ooDet;

        // Return
        return A;
    }

    // Print to stream (defined out of line)
    void printToStream(std::ostream& out) const;

    // Write the MxN block into an already-created HDF5 dataset at arrayOffset
    void writeToExistingHDF5Dataset(H5::DataSet& dataset, std::vector<hsize_t> arrayOffset) {
        std::vector<hsize_t> tensorDims = {M, N};
        writeSingleHDF5Array<T>(dataset, arrayOffset, tensorDims, &(vals[0][0]));
    }

    // Friends
private:
    // Tensor values (row-major)
    T vals[M][N];
};
249 
// Constructors
// Construct from a host-side Tensor2; dimensions must match (M,N) exactly.
// NOTE(review): the signature line was lost in extraction and is restored here.
template <typename T, unsigned int M, unsigned int N>
Tensor2CUDA<T,M,N>::Tensor2CUDA(const Tensor2<T>& in) {
    // Check size
    if (M != in.n1 || N != in.n2) {
        throw TensorError("Size mismatch.");
    }

    // Copy the flat host storage into the 2D array
    std::copy(&(in.vals[0]), &(in.vals[0])+M*N, &(vals[0][0]));
}
261 
// Subtraction: element-wise C = A - B
template <typename T, unsigned int M, unsigned int N>
__host__ __device__ Tensor2CUDA<T,M,N> operator-(const Tensor2CUDA<T,M,N>& A, const Tensor2CUDA<T,M,N>& B) {
    Tensor2CUDA<T,M,N> C; // result declaration restored (lost in extraction)
    for (unsigned int i=0; i<M; i++) {
        for (unsigned int j=0; j<N; j++) {
            C(i,j) = A.getVal(i,j) - B.getVal(i,j);
        }
    }
    return C;
}
273 
// Addition: element-wise C = A + B
template <typename T, unsigned int M, unsigned int N>
__host__ __device__ Tensor2CUDA<T,M,N> operator+(const Tensor2CUDA<T,M,N>& A, const Tensor2CUDA<T,M,N>& B) {
    Tensor2CUDA<T,M,N> C; // result declaration restored (lost in extraction)
    for (unsigned int i=0; i<M; i++) {
        for (unsigned int j=0; j<N; j++) {
            C(i,j) = A.getVal(i,j) + B.getVal(i,j);
        }
    }
    return C;
}
// In-place accumulation; equivalent to A = A + B.
template <typename T, unsigned int M, unsigned N>
__host__ __device__ void operator+=(Tensor2CUDA<T,M,N>& A, const Tensor2CUDA<T,M,N>& B) {
    for (unsigned int i=0; i<M; i++) {
        for (unsigned int j=0; j<N; j++) {
            A(i,j) += B.getVal(i,j);
        }
    }
}
289 
// Scalar Multiplication: element-wise B = scalar*A
template <typename T, unsigned int M, unsigned int N>
__host__ __device__ Tensor2CUDA<T,M,N> operator*(const Tensor2CUDA<T,M,N>& A, T scalar) {
    Tensor2CUDA<T,M,N> B; // result declaration restored (lost in extraction)
    for (unsigned int i=0; i<M; i++) {
        for (unsigned int j=0; j<N; j++) {
            B(i,j) = scalar*A.getVal(i,j);
        }
    }
    return B;
}
// Scalar multiplication commutes; delegate to the tensor-first overload.
template <typename T, unsigned int M, unsigned N>
__host__ __device__ Tensor2CUDA<T,M,N> operator*(T scalar, const Tensor2CUDA<T,M,N>& A) {
    return operator*(A, scalar);
}
305 
// Scalar division: element-wise B = A/scalar
template <typename T, unsigned int M, unsigned int N>
__host__ __device__ Tensor2CUDA<T,M,N> operator/(const Tensor2CUDA<T,M,N>& A, T scalar) {
    Tensor2CUDA<T,M,N> B; // result declaration restored (lost in extraction)
    for (unsigned int i=0; i<M; i++) {
        for (unsigned int j=0; j<N; j++) {
            B(i,j) = A.getVal(i,j)/scalar;
        }
    }
    return B;
}
// In-place scalar division; equivalent to A = A/scalar.
template <typename T, unsigned int M, unsigned N>
__host__ __device__ void operator/=(Tensor2CUDA<T,M,N>& A, T scalar) {
    for (unsigned int i=0; i<M; i++) {
        for (unsigned int j=0; j<N; j++) {
            A(i,j) /= scalar;
        }
    }
}
320 
321 
// Matrix multiplication: C(MxP) = A(MxN) * B(NxP)
template <typename T, unsigned int M, unsigned int N, unsigned int P>
__host__ __device__ Tensor2CUDA<T,M,P> operator*(const Tensor2CUDA<T,M,N>& A, const Tensor2CUDA<T,N,P>& B) {
    Tensor2CUDA<T,M,P> C; // zero-initialized accumulator (declaration restored; lost in extraction)
    for (unsigned int i=0; i<M; i++) {
        for (unsigned int j=0; j<P; j++) {
            for (unsigned int k=0; k<N; k++) {
                C(i,j) += A.getVal(i,k)*B.getVal(k,j);
            }
        }
    }
    return C;
}
335 
// Matrix-vector multiplication: b(M) = A(MxN) * x(N)
template <typename T, unsigned int M, unsigned int N>
__host__ __device__ VecCUDA<T,M> operator*(const Tensor2CUDA<T,M,N>& A, const VecCUDA<T,N>& x) {
    VecCUDA<T,M> b; // zero-initialized by VecCUDA's default constructor
    for (unsigned int row=0; row<M; row++) {
        T acc = (T)0.0;
        for (unsigned int col=0; col<N; col++) {
            acc += A.getVal(row,col)*x.getVal(col);
        }
        b(row) = acc;
    }
    return b;
}
347 
// Transform tensor A into the frame given by the columns of Q: Q^T * A * Q.
// NOTE(review): the signature line was lost in extraction; restored by symmetry
// with the complete transformOutOfFrame below.
template <typename T, unsigned int M>
__host__ __device__ Tensor2CUDA<T,M,M> transformIntoFrame(const Tensor2CUDA<T,M,M>& A, const Tensor2CUDA<T,M,M>& Q) {
    return Q.trans()*A*Q;
}
358 
// Transform A_star back out of the frame given by the columns of Q:
// the inverse of transformIntoFrame, i.e. Q * A_star * Q^T.
template <typename T, unsigned int M>
__host__ __device__ Tensor2CUDA<T,M,M> transformOutOfFrame(const Tensor2CUDA<T,M,M>& A_star, const Tensor2CUDA<T,M,M>& Q) {
    Tensor2CUDA<T,M,M> Qt = Q.trans();
    return Q*A_star*Qt;
}
369 
// NOTE(review): the signature line was lost in extraction. Reconstructed as the
// anti-symmetric overload: with the operators defined below, Q.trans()*A*Q is
// well-formed only when A is Tensor2AsymmCUDA — confirm against the original.
template <typename T, unsigned int M>
__host__ __device__ Tensor2CUDA<T,M,M> transformIntoFrame(const Tensor2AsymmCUDA<T,M>& A, const Tensor2CUDA<T,M,M>& Q) {
    return Q.trans()*A*Q;
}
374 
// NOTE(review): signature reconstructed (lost in extraction) — anti-symmetric
// overload of transformOutOfFrame; confirm against the original header.
template <typename T, unsigned int M>
__host__ __device__ Tensor2CUDA<T,M,M> transformOutOfFrame(const Tensor2AsymmCUDA<T,M>& A_star, const Tensor2CUDA<T,M,M>& Q) {
    return Q*A_star*Q.trans();
}
379 
// Build the ZXZ (proper Euler angle) rotation matrix from angles
// (alpha, beta, gamma) using the sincos intrinsic wrapper (device only).
template <typename T>
__device__ Tensor2CUDA<T,3,3> EulerZXZRotationMatrixCUDA(T alpha, T beta, T gamma) {
    Tensor2CUDA<T,3,3> R; // result declaration restored (lost in extraction)
    T c1, c2, c3, s1, s2, s3;
    sincosIntr(alpha, &s1, &c1);
    sincosIntr(beta, &s2, &c2);
    sincosIntr(gamma, &s3, &c3);
    R(0,0) = c1*c3 - c2*s1*s3;
    R(0,1) = -c1*s3 - c2*c3*s1;
    R(0,2) = s1*s2;
    R(1,0) = c3*s1 + c1*c2*s3;
    R(1,1) = c1*c2*c3 - s1*s3;
    R(1,2) = -c1*s2;
    R(2,0) = s2*s3;
    R(2,1) = c3*s2;
    R(2,2) = c2;
    return R;
}
398 
// Printing to stream
// Emit the tensor as nested bracketed rows, e.g. "[[a, b],\n[c, d]]\n",
// with one row per line.
template <typename T, unsigned int M, unsigned int N>
void Tensor2CUDA<T,M,N>::printToStream(std::ostream& out) const
{
    out << "[";
    for (unsigned int row=0; row<M; row++) {
        out << "[";
        for (unsigned int col=0; col<N; col++) {
            if (col > 0) {
                out << ", ";
            }
            out << this->getVal(row,col);
        }
        // Close the row; the final row also closes the outer bracket
        out << (row == M-1 ? "]]" : "],") << std::endl;
    }
}
422 
// Stream output
// Insert a Tensor2CUDA into an output stream by delegating to printToStream.
template <typename T, unsigned int M, unsigned int N>
std::ostream& operator<<(std::ostream& out, const Tensor2CUDA<T,M,N>& A)
{
    A.printToStream(out);
    return out;
}
430 
431 // PARALLEL REDUCTION //
// Warp-level sum reduction: after the call, lane 0 of each warp holds the sum
// of A over all 32 lanes (other lanes hold partial garbage, as usual for
// shuffle-down reductions).
// NOTE(review): the signature line was lost in extraction; restored from the
// documentation index. __shfl_down is the legacy (pre-Volta) intrinsic and is
// removed on SM70+ — porting requires __shfl_down_sync(0xffffffff, ...).
template <typename T, unsigned int M, unsigned int N>
__device__ Tensor2CUDA<T,M,N> warpReduceSumTensor2(Tensor2CUDA<T,M,N> A) {
    const int warpSize = 32;
    for (unsigned int i=0; i<M; i++) {
        for (unsigned int j=0; j<N; j++) {
            // Halving shuffle-down sum of each tensor component
            for (int offset = warpSize/2; offset > 0; offset /= 2) {
                A(i,j) += __shfl_down(A(i,j), offset);
            }
        }
    }
    return A;
}
451 
// Block-level sum reduction: after the call, thread 0 of the block holds the
// sum of val over all threads. Classic two-stage pattern: warp reductions into
// shared memory, then a final reduction by the first warp. Assumes a 1D block.
// NOTE(review): the signature line was lost in extraction; restored from the
// documentation index.
template <typename T, unsigned int M, unsigned int N>
__device__ Tensor2CUDA<T,M,N> blockReduceSumTensor2(Tensor2CUDA<T,M,N> val) {
    const int warpSize = 32;
    static __shared__ Tensor2CUDA<T,M,N> shared[warpSize]; // Shared mem for 32 partial sums
    __syncthreads();
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    val = warpReduceSumTensor2(val); // Each warp performs partial reduction

    if (lane==0) shared[wid]=val; // Write reduced value to shared memory

    __syncthreads(); // Wait for all partial reductions

    // Read from shared memory only if that warp existed
    if (threadIdx.x < blockDim.x / warpSize) {
        val = shared[lane];
    }
    else {
        val = Tensor2CUDA<T,M,N>(); // zero tensor for lanes past the warp count
    }

    if (wid==0) val = warpReduceSumTensor2(val); // Final reduce within first warp

    return val;
}
483 
// Kernel: sum nTerms tensors from `in`, writing one partial block total per
// block into out[blockIdx.x]. Uses a grid-stride loop, so any 1D launch
// configuration is valid; a second pass (or a single-block launch) is needed
// to combine the per-block results.
template <typename T, unsigned int M, unsigned N>
__global__ void BLOCK_REDUCE_KEPLER_TENSOR2(Tensor2CUDA<T,M,N> *in, Tensor2CUDA<T,M,N>* out, int nTerms) {
    // Each thread accumulates several input tensors (grid-stride loop)
    Tensor2CUDA<T,M,N> partial;
    int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nTerms; idx += stride) {
        partial += in[idx];
    }
    // Combine per-thread partials across the block
    partial = blockReduceSumTensor2(partial);
    // Thread 0 holds the block total
    if (threadIdx.x==0) {
        out[blockIdx.x] = partial;
    }
}
503 
// Recover ZXZ Euler angles from a rotation matrix R.
// NOTE(review): the signature line was lost in extraction; reconstructed from
// the template header and the body's use of R.getVal(i,j). Confirm the exact
// name and __host__/__device__ qualifiers against the original header.
template <typename T, unsigned int M, unsigned int N>
__host__ __device__ EulerAngles<T> getEulerZXZAngles(const Tensor2CUDA<T,M,N>& R)
{
    EulerAngles<T> angle;

    // Angle beta: in the ZXZ convention R(2,2) = cos(beta)
    angle.beta = acos(R.getVal(2,2));

    // Nonsingular case: beta sufficiently far from zero
    float floatEpsilon = 1.19209e-07; // hard-coded single-precision epsilon
    if (angle.beta > 1e3*floatEpsilon) {
        // The other 2 angles
        angle.alpha = atan2(R.getVal(0,2),-R.getVal(1,2));
        angle.gamma = atan2(R.getVal(2,0),R.getVal(2,1));
    }
    // Singular (gimbal-locked) case: only alpha+gamma is determined
    else {
        angle.beta = 0.0;
        T alphaPlusGamma = atan2(-R.getVal(0,1), R.getVal(0,0));

        // Not uniquely determined, so just pick a combination
        angle.alpha = alphaPlusGamma/2.0;
        angle.gamma = alphaPlusGamma/2.0;
    }

    // Correct the angle ranges to [0, 2*pi) if necessary
    if (angle.alpha < 0) angle.alpha += 2*(T)M_PI;
    if (angle.gamma < 0) angle.gamma += 2*(T)M_PI;

    // Return
    return angle;
}
541 
543 // TENSOR 2 SYMMETRIC//
545 
// Symmetric NxN tensor stored as the packed upper triangle (N*(N+1)/2 values).
template <typename T, unsigned int N>
class Tensor2SymmCUDA {
public:
    // Default constructor: zero out the packed storage
    __host__ __device__ Tensor2SymmCUDA(){
        for (unsigned int i=0; i<this->getNelements(); i++) {
            vals[i] = (T)0.0;
        }
    }

    // Assignment from another symmetric tensor: raw copy of packed storage
    __host__ __device__ Tensor2SymmCUDA<T,N>& operator=(const Tensor2SymmCUDA<T,N>& input) {
        memcpy(vals, input.vals, this->getNelements()*sizeof(T));
        return *this;
    }

    // Construction from a full NxN CUDA tensor
    #ifdef __CUDA_ARCH__
    // Device version: trusts the input and copies only the upper triangle
    __device__ Tensor2SymmCUDA(const Tensor2CUDA<T,N,N>& input) {
        // Assign values
        for (unsigned int i=0; i<N; i++) {
            for (unsigned int j=i; j<N; j++) {
                this->setVal(i,j,input.getVal(i,j));
            }
        }
    }
    #else
    // Host version: additionally verifies the input is symmetric
    __host__ Tensor2SymmCUDA(const Tensor2CUDA<T,N,N>& input) {
        // Check that input is indeed symmetric, within 100*machine epsilon
        // (original comment said "anti-symmetric"; the check below is for symmetry)
        T closeEnough = 100*std::numeric_limits<T>::epsilon();
        for (unsigned int i=0; i<N; i++) {
            for (unsigned int j=i+1; j<N; j++) {
                T val = input.getVal(i,j);
                T symmVal = input.getVal(j,i);
                if (std::abs(val-symmVal) > closeEnough) {
                    std::cerr << "(" << i << "," << j << ")=" << val << std::endl;
                    std::cerr << "(" << j << "," << i << ")=" << symmVal << std::endl;
                    throw std::runtime_error("Input tensor is not symmetric.");
                }
            }
        }

        // Assign values
        for (unsigned int i=0; i<N; i++) {
            for (unsigned int j=i; j<N; j++) {
                this->setVal(i,j,input.getVal(i,j));
            }
        }
    }
    #endif

    // Construction from a standard symmetric Tensor2 (host only)
    __host__ Tensor2SymmCUDA(const Tensor2<T>& input) {
        // Use the Tensor2CUDA copy constructor
        Tensor2CUDA<T,N,N> inputCUDA = input;

        // Copy construct self from Tensor2CUDA
        *this = inputCUDA;
    }

    // Map (i,j) with j>=i to an index into the packed upper-triangular storage
    __host__ __device__ unsigned int getUpperFlatIdx(const unsigned int i, const unsigned int j) const {
        // Start with the index of the final element
        unsigned int idx = getNelements()-1;

        // Subtract the triangular numbers below and including our row
        idx -= ((N-i)*(N-i+1))/2;

        // Add our column offset
        idx += (j-i+1);

        // Return
        return idx;
    }

    // Set value; (i,j) and (j,i) share storage, so both mirror entries update at once
    __host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val) {
        if (j>=i) {
            vals[getUpperFlatIdx(i,j)] = val;
        }
        else {
            vals[getUpperFlatIdx(j,i)] = val;
        }
    }

    // Get value; reads from the packed upper triangle for either index ordering
    __host__ __device__ T getVal(const unsigned int i, const unsigned int j) const {
        if (j>=i) {
            return vals[getUpperFlatIdx(i,j)];
        }
        else {
            return vals[getUpperFlatIdx(j,i)];
        }
    }

    // Friends
    template <typename U, unsigned int M>
    friend __host__ __device__ Tensor2SymmCUDA<U,M> operator-(const Tensor2SymmCUDA<U,M>& A, const Tensor2SymmCUDA<U,M>& B);
    template <typename U, unsigned int M>
    friend __host__ __device__ Tensor2SymmCUDA<U,M> operator+(const Tensor2SymmCUDA<U,M>& A, const Tensor2SymmCUDA<U,M>& B);

protected:
    // Total number of elements in underlying storage
    __host__ __device__ unsigned int getNelements() const {
        return (N*(N+1))/2;
    }

    // Tensor values: packed upper triangle, N*(N+1)/2 entries
    T vals[(N*(N+1))/2];
};
675 
// Subtraction of symmetric tensors, done directly on the packed storage.
// NOTE(review): the signature and result declaration were lost in extraction;
// restored from the in-class friend declaration.
template <typename T, unsigned int N>
__host__ __device__ Tensor2SymmCUDA<T,N> operator-(const Tensor2SymmCUDA<T,N>& A, const Tensor2SymmCUDA<T,N>& B) {
    Tensor2SymmCUDA<T,N> C;
    for (unsigned int idx=0; idx<A.getNelements(); idx++) {
        C.vals[idx] = A.vals[idx]-B.vals[idx];
    }
    return C;
}
685 
// Addition of symmetric tensors, done directly on the packed storage.
// NOTE(review): the signature and result declaration were lost in extraction;
// restored from the in-class friend declaration.
template <typename T, unsigned int N>
__host__ __device__ Tensor2SymmCUDA<T,N> operator+(const Tensor2SymmCUDA<T,N>& A, const Tensor2SymmCUDA<T,N>& B) {
    Tensor2SymmCUDA<T,N> C;
    for (unsigned int idx=0; idx<A.getNelements(); idx++) {
        C.vals[idx] = A.vals[idx]+B.vals[idx];
    }
    return C;
}
695 
// Matrix multiplication
// Symmetric NxN times arbitrary NxP; result is a full (generally non-symmetric) tensor
template <typename T, unsigned int N, unsigned int P>
__host__ __device__ Tensor2CUDA<T,N,P> operator*(const Tensor2SymmCUDA<T,N>& A, const Tensor2CUDA<T,N,P>& B) {
    Tensor2CUDA<T,N,P> C; // zero-initialized accumulator (declaration restored; lost in extraction)
    for (unsigned int i=0; i<N; i++) {
        for (unsigned int j=0; j<P; j++) {
            for (unsigned int k=0; k<N; k++) {
                C(i,j) += A.getVal(i,k)*B.getVal(k,j);
            }
        }
    }
    return C;
}
710 
712 // TENSOR 2 ANTI-SYMMETRIC//
714 
// Anti-symmetric NxN tensor stored as the packed strict upper triangle
// (N*(N-1)/2 values); the diagonal is implicitly zero.
template <typename T, unsigned int N>
class Tensor2AsymmCUDA {
public:
    // Default constructor: zero out the packed storage
    __host__ __device__ Tensor2AsymmCUDA(){
        for (unsigned int i=0; i<this->getNelements(); i++) {
            vals[i] = (T)0.0;
        }
    }

    // Assignment from another anti-symmetric tensor: raw copy of packed storage
    __host__ __device__ Tensor2AsymmCUDA<T,N>& operator=(const Tensor2AsymmCUDA<T,N>& input) {
        memcpy(vals, input.vals, this->getNelements()*sizeof(T));
        return *this;
    }

    // Construction from a full NxN CUDA tensor (host only; verifies anti-symmetry
    // within 100*machine epsilon, throwing on failure)
    __host__ Tensor2AsymmCUDA(const Tensor2CUDA<T,N,N>& input) {
        // Check that input is indeed anti-symmetric
        T closeEnough = 100*std::numeric_limits<T>::epsilon();
        for (unsigned int i=0; i<N; i++) {
            for (unsigned int j=i; j<N; j++) {
                T val = input.getVal(i,j);
                T asymmVal = input.getVal(j,i);
                if (std::abs(val+asymmVal) > closeEnough) {
                    std::cerr << "(" << i << "," << j << ")=" << val << std::endl;
                    std::cerr << "(" << j << "," << i << ")=" << asymmVal << std::endl;
                    throw std::runtime_error("Input tensor is not anti-symmetric.");
                }
            }
        }

        // Assign values (strict upper triangle only)
        for (unsigned int i=0; i<N-1; i++) {
            for (unsigned int j=i+1; j<N; j++) {
                this->setVal(i,j,input.getVal(i,j));
            }
        }
    }

    // Construction from a standard anti-symmetric Tensor2 (host only)
    __host__ Tensor2AsymmCUDA(const Tensor2<T>& input) {
        // Use the Tensor2CUDA copy constructor
        Tensor2CUDA<T,N,N> inputCUDA = input;

        // Copy construct self from Tensor2CUDA
        *this = inputCUDA;
    }

    // Map (i,j) with j>i to an index into the packed strict upper triangle
    __host__ __device__ unsigned int getUpperFlatIdx(const unsigned int i, const unsigned int j) const {
        // Start with the index of the final element
        unsigned int idx = getNelements()-1;

        // Subtract the triangular numbers below and including our row
        idx -= ((N-i-1)*(N-i))/2;

        // Add our column offset from the diagonal
        idx += (j-i);

        // Return
        return idx;
    }

    // Set value; the mirror entry is stored with a sign flip.
    // Diagonal writes (i==j) are silently ignored: the diagonal is fixed at zero.
    // (original comment "Get read/write reference to value" was wrong)
    __host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val) {
        if (j>i) {
            vals[getUpperFlatIdx(i,j)] = val;
        }
        else if (i>j) {
            vals[getUpperFlatIdx(j,i)] = -val;
        }
    }

    // Get read-only value: zero on the diagonal, negated below it
    __host__ __device__ T getVal(const unsigned int i, const unsigned int j) const {
        if (i==j) {
            return (T)0.0;
        }
        else {
            if (j>i) {
                return vals[getUpperFlatIdx(i,j)];
            }
            else {
                return -vals[getUpperFlatIdx(j,i)];
            }
        }
    }

    // Friends
    template <typename U, unsigned int M>
    friend __host__ __device__ Tensor2AsymmCUDA<U,M> operator-(const Tensor2AsymmCUDA<U,M>& A, const Tensor2AsymmCUDA<U,M>& B);
    template <typename U, unsigned int M>
    friend __host__ __device__ Tensor2AsymmCUDA<U,M> operator+(const Tensor2AsymmCUDA<U,M>& A, const Tensor2AsymmCUDA<U,M>& B);

protected:
    // Total number of elements in underlying storage
    __host__ __device__ unsigned int getNelements() const {
        return (N*(N-1))/2;
    }

    // Tensor values: packed strict upper triangle, N*(N-1)/2 entries
    T vals[(N*(N-1))/2];
};
836 
// Subtraction of anti-symmetric tensors, done directly on the packed storage.
// NOTE(review): the signature and result declaration were lost in extraction;
// restored from the in-class friend declaration.
template <typename T, unsigned int N>
__host__ __device__ Tensor2AsymmCUDA<T,N> operator-(const Tensor2AsymmCUDA<T,N>& A, const Tensor2AsymmCUDA<T,N>& B) {
    Tensor2AsymmCUDA<T,N> C;
    for (unsigned int idx=0; idx<A.getNelements(); idx++) {
        C.vals[idx] = A.vals[idx]-B.vals[idx];
    }
    return C;
}
846 
// Anti-symmetric minus full tensor; result is a full tensor
template <typename T, unsigned int N>
__host__ __device__ Tensor2CUDA<T,N,N> operator-(const Tensor2AsymmCUDA<T,N>& A, const Tensor2CUDA<T,N,N>& B) {
    Tensor2CUDA<T,N,N> C; // result declaration restored (lost in extraction)
    for (unsigned int i=0; i<N; i++) {
        for (unsigned int j=0; j<N; j++) {
            C(i,j) = A.getVal(i,j) - B.getVal(i,j);
        }
    }
    return C;
}
857 
// Addition of anti-symmetric tensors, done directly on the packed storage.
// NOTE(review): the signature and result declaration were lost in extraction;
// restored from the in-class friend declaration.
template <typename T, unsigned int N>
__host__ __device__ Tensor2AsymmCUDA<T,N> operator+(const Tensor2AsymmCUDA<T,N>& A, const Tensor2AsymmCUDA<T,N>& B) {
    Tensor2AsymmCUDA<T,N> C;
    for (unsigned int idx=0; idx<A.getNelements(); idx++) {
        C.vals[idx] = A.vals[idx]+B.vals[idx];
    }
    return C;
}
867 
// Matrix multiplication
// Anti-symmetric NxN times arbitrary NxP; result is a full tensor
template <typename T, unsigned int N, unsigned int P>
__host__ __device__ Tensor2CUDA<T,N,P> operator*(const Tensor2AsymmCUDA<T,N>& A, const Tensor2CUDA<T,N,P>& B) {
    Tensor2CUDA<T,N,P> C; // zero-initialized accumulator (declaration restored; lost in extraction)
    for (unsigned int i=0; i<N; i++) {
        for (unsigned int j=0; j<P; j++) {
            for (unsigned int k=0; k<N; k++) {
                C(i,j) += A.getVal(i,k)*B.getVal(k,j);
            }
        }
    }
    return C;
}
882 
// Arbitrary NxP times anti-symmetric PxP; result is a full tensor
template <typename T, unsigned int N, unsigned int P>
__host__ __device__ Tensor2CUDA<T,N,P> operator*(const Tensor2CUDA<T,N,P>& A, const Tensor2AsymmCUDA<T,P>& B) {
    Tensor2CUDA<T,N,P> C; // zero-initialized accumulator (declaration restored; lost in extraction)
    for (unsigned int i=0; i<N; i++) {
        for (unsigned int j=0; j<P; j++) {
            for (unsigned int k=0; k<P; k++) {
                C(i,j) += A.getVal(i,k)*B.getVal(k,j);
            }
        }
    }
    return C;
}
896 
898 // TENSOR 4 //
900 
// Fixed-size MxNxPxQ fourth-order tensor.
template <typename U, unsigned int M, unsigned int N, unsigned int P, unsigned int Q>
class Tensor4CUDA {
public:
    // Default constructor.
    // NOTE(review): the definition on this line was lost in extraction; the
    // documentation index confirms a default constructor exists here, but its
    // body is unknown — the original may zero-initialize vals. TODO confirm.
    Tensor4CUDA(){}
    // Construct from a standard host-side Tensor4 (defined out of line)
    Tensor4CUDA(const Tensor4<U>& in);
private:
    // Tensor values (row-major 4D array)
    U vals[M][N][P][Q];
};
909 
// Constructor from a standard Tensor4; dimensions must match (M,N,P,Q).
// NOTE(review): the signature line was lost in extraction and is restored here.
template <typename U, unsigned int M, unsigned int N, unsigned int P, unsigned int Q>
Tensor4CUDA<U,M,N,P,Q>::Tensor4CUDA(const Tensor4<U>& in) {
    // Check size
    if (M != in.n1 || N != in.n2 || P != in.n3 || Q != in.n4) {
        throw TensorError("Size mismatch.");
    }

    // Copy the flat host storage into the 4D array
    std::copy(&(in.vals[0]), &(in.vals[0])+M*N*P*Q, &(vals[0][0][0][0]));
}
921 
922 }//END NAMESPACE HPP
923 
924 #endif /* HPP_TENSOR_CUDA_H */
Tensor2< T > transformOutOfFrame(const Tensor2< T > &A_star, const Tensor2< T > &Q)
Transform tensor out of the frame given by the columns of .
Definition: tensor.h:1613
void operator+=(GSHCoeffs< T > &A, const GSHCoeffs< T > &B)
Definition: gsh.h:231
T alpha
Definition: rotation.h:44
T beta
Definition: rotation.h:45
Tensor2< T > transformIntoFrame(const Tensor2< T > &A, const Tensor2< T > &Q)
Transform tensor into the frame given by the columns of .
Definition: tensor.h:1602
const T * end() const
Definition: tensorCUDA.h:55
__host__ __device__ T getVal(const unsigned int i, const unsigned int j) const
Definition: tensorCUDA.h:651
Definition: tensorCUDA.h:28
__host__ __device__ T getVal(const unsigned int i, const unsigned int j) const
Definition: tensorCUDA.h:807
T * begin()
Definition: tensorCUDA.h:52
void operator/=(GSHCoeffs< T > &A, const T B)
Definition: gsh.h:257
Definition: casesUtils.cpp:4
__host__ __device__ Tensor2CUDA< T, N, N > & operator=(const Tensor2SymmCUDA< T, N > &input)
Definition: tensorCUDA.h:175
Header file for helper functions with HDF, C++ API.
void printToStream(std::ostream &out) const
Definition: tensorCUDA.h:401
__host__ __device__ unsigned int getUpperFlatIdx(const unsigned int i, const unsigned int j) const
Get flat index for upper triangular portion.
Definition: tensorCUDA.h:782
std::vector< T > operator*(const std::vector< T > &vec, const T scalar)
Definition: tensor.h:72
__device__ Tensor2CUDA< T, M, N > warpReduceSumTensor2(Tensor2CUDA< T, M, N > A)
Definition: tensorCUDA.h:440
__host__ __device__ unsigned int getNelements() const
Definition: tensorCUDA.h:668
#define HPP_CHECK_CUDA_ENABLED_BUILD
Definition: config.h:44
Definition: tensorCUDA.h:30
__host__ __device__ Tensor2AsymmCUDA< T, N > & operator=(const Tensor2AsymmCUDA< T, N > &input)
Definition: tensorCUDA.h:737
__host__ Tensor2SymmCUDA(const Tensor2< T > &input)
Definition: tensorCUDA.h:611
__host__ __device__ Tensor2CUDA(const Tensor2SymmCUDA< T, N > &input)
Definition: tensorCUDA.h:160
__host__ __device__ Tensor2AsymmCUDA()
Definition: tensorCUDA.h:730
Header file for rotation classes and functions.
__host__ __device__ unsigned int getUpperFlatIdx(const unsigned int i, const unsigned int j) const
Get flat index for upper triangular portion.
Definition: tensorCUDA.h:626
GSHCoeffs< T > operator/(const GSHCoeffs< T > &coeffs, const T val)
Definition: gsh.h:236
__host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val)
Definition: tensorCUDA.h:193
Header file for tensor classes.
__host__ __device__ T getVal(const unsigned int i) const
Definition: tensorCUDA.h:62
__host__ __device__ Tensor2CUDA< T, 3, 3 > inv() const
3x3 inverse
Definition: tensorCUDA.h:212
Definition: tensorCUDA.h:32
A class for second order tensors.
Definition: tensor.h:303
unsigned int n2
Definition: tensor.h:1104
std::vector< T > cartesianToSpherical(const std::vector< T > &cartVec)
Definition: crystal.cpp:1575
Definition: tensor.h:166
__host__ __device__ Tensor2SymmCUDA< T, N > & operator=(const Tensor2SymmCUDA< T, N > &input)
Definition: tensorCUDA.h:568
T * end()
Definition: tensorCUDA.h:53
Header file CUDA utility functions.
__device__ Tensor2CUDA< T, 3, 3 > EulerZXZRotationMatrixCUDA(EulerAngles< T > angles)
Definition: crystalCUDA.h:371
__host__ Tensor2AsymmCUDA(const Tensor2< T > &input)
Definition: tensorCUDA.h:767
unsigned int n3
Definition: tensor.h:1105
unsigned int n1
Definition: tensor.h:1103
T vals[M][N]
Definition: tensorCUDA.h:247
std::vector< T > abs(const std::vector< T > &vec)
Definition: tensor.h:89
const T * begin() const
Definition: tensorCUDA.h:54
std::vector< T > operator-(const std::vector< T > &vec1, const std::vector< T > &vec2)
Definition: tensor.h:110
T gamma
Definition: rotation.h:46
__host__ __device__ Tensor2SymmCUDA()
Definition: tensorCUDA.h:561
__device__ T norm() const
Definition: tensorCUDA.h:71
std::valarray< T > vals
Definition: tensor.h:1110
__host__ __device__ VecCUDA()
Definition: tensorCUDA.h:41
Tensor4CUDA()
Definition: tensorCUDA.h:904
__host__ __device__ Tensor2CUDA()
Definition: tensorCUDA.h:148
unsigned int n1
the first dimension of the tensor
Definition: tensor.h:504
__host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val)
Definition: tensorCUDA.h:641
T vals[(N *(N-1))/2]
Definition: tensorCUDA.h:834
__host__ __device__ T & operator()(const unsigned int i, const unsigned int j)
Definition: tensorCUDA.h:185
EulerAngles< T > getEulerZXZAngles(Tensor2< T > R)
Get Euler angles from rotation matrix.
Definition: rotation.h:149
GSHCoeffs< T > operator+(const GSHCoeffs< T > &coeffs1, const GSHCoeffs< T > &coeffs2)
Definition: gsh.h:210
unsigned int n4
Definition: tensor.h:1106
__host__ Tensor2AsymmCUDA(const Tensor2CUDA< T, N, N > &input)
Definition: tensorCUDA.h:743
__host__ __device__ void setVal(const unsigned int i, const unsigned int j, const T val)
Definition: tensorCUDA.h:797
__global__ void BLOCK_REDUCE_KEPLER_TENSOR2(Tensor2CUDA< T, M, N > *in, Tensor2CUDA< T, M, N > *out, int nTerms)
Definition: tensorCUDA.h:492
std::valarray< T > vals
the underlying 1D array
Definition: tensor.h:511
__host__ __device__ T & operator()(const unsigned int i)
Definition: tensorCUDA.h:58
T vals[N]
Definition: tensorCUDA.h:79
__host__ __device__ unsigned int getNelements() const
Definition: tensorCUDA.h:829
__host__ __device__ Tensor2CUDA< T, M, N > & operator=(const Tensor2CUDA< T, M, N > &input)
Definition: tensorCUDA.h:169
__host__ __device__ T getVal(const unsigned int i, const unsigned int j) const
Definition: tensorCUDA.h:189
__host__ __device__ void setVal(const unsigned int i, const T val)
Definition: tensorCUDA.h:66
Definition: tensorCUDA.h:26
__host__ __device__ Tensor2CUDA< T, N, M > trans() const
Definition: tensorCUDA.h:198
Definition: rotation.h:43
void writeToExistingHDF5Dataset(H5::DataSet &dataset, std::vector< hsize_t > arrayOffset)
Definition: tensorCUDA.h:239
__device__ Tensor2CUDA< T, M, N > blockReduceSumTensor2(Tensor2CUDA< T, M, N > val)
Definition: tensorCUDA.h:458
T vals[(N *(N+1))/2]
Definition: tensorCUDA.h:673
__host__ Tensor2SymmCUDA(const Tensor2CUDA< T, N, N > &input)
Definition: tensorCUDA.h:586
unsigned int n2
the second dimension of the tensor
Definition: tensor.h:506
Definition: tensorCUDA.h:902