Reference documentation for deal.II version 8.4.2
vectorization.h
1 // ---------------------------------------------------------------------
2 //
3 // Copyright (C) 2011 - 2015 by the deal.II authors
4 //
5 // This file is part of the deal.II library.
6 //
7 // The deal.II library is free software; you can use it, redistribute
8 // it, and/or modify it under the terms of the GNU Lesser General
9 // Public License as published by the Free Software Foundation; either
10 // version 2.1 of the License, or (at your option) any later version.
11 // The full text of the license can be found in the file LICENSE at
12 // the top level of the deal.II distribution.
13 //
14 // ---------------------------------------------------------------------
15 
16 
17 #ifndef dealii__vectorization_h
18 #define dealii__vectorization_h
19 
20 #include <deal.II/base/config.h>
21 #include <deal.II/base/exceptions.h>
22 
23 #include <cmath>
24 
25 // Note:
26 // The flag DEAL_II_COMPILER_VECTORIZATION_LEVEL is essentially constructed
27 // according to the following scheme
28 // #ifdef __AVX512F__
29 // #define DEAL_II_COMPILER_VECTORIZATION_LEVEL 3
30 // #elif defined (__AVX__)
31 // #define DEAL_II_COMPILER_VECTORIZATION_LEVEL 2
32 // #elif defined (__SSE2__)
33 // #define DEAL_II_COMPILER_VECTORIZATION_LEVEL 1
34 // #else
35 // #define DEAL_II_COMPILER_VECTORIZATION_LEVEL 0
36 // #endif
37 // In addition to checking the flags __AVX__ and __SSE2__, a CMake test,
38 // 'check_01_cpu_features.cmake', ensures that these feature are not only
39 // present in the compilation unit but also working properly.
40 
41 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 // AVX, AVX-512
42 #include <immintrin.h>
43 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL == 1 // SSE2
44 #include <emmintrin.h>
45 #endif
46 
47 
48 // forward declarations
49 DEAL_II_NAMESPACE_OPEN
50 template <typename Number> class VectorizedArray;
51 template <typename T> struct EnableIfScalar;
52 DEAL_II_NAMESPACE_CLOSE
53 
54 
55 namespace std
56 {
57  template <typename Number> ::VectorizedArray<Number>
58  sqrt(const ::VectorizedArray<Number> &);
59  template <typename Number> ::VectorizedArray<Number>
60  abs(const ::VectorizedArray<Number> &);
61  template <typename Number> ::VectorizedArray<Number>
62  max(const ::VectorizedArray<Number> &, const ::VectorizedArray<Number> &);
63  template <typename Number> ::VectorizedArray<Number>
64  min (const ::VectorizedArray<Number> &, const ::VectorizedArray<Number> &);
65 }
66 
67 
68 DEAL_II_NAMESPACE_OPEN
69 
70 
71 // Enable the EnableIfScalar type trait for VectorizedArray<Number> such
72 // that it can be used as a Number type in Tensor<rank,dim,Number>, etc.
73 
74 template<typename Number>
75 struct EnableIfScalar<VectorizedArray<Number> >
76 {
78 };
79 
80 
131 template <typename Number>
132 class VectorizedArray
133 {
134 public:
138  static const unsigned int n_array_elements = 1;
139 
140  // POD means that there should be no user-defined constructors, destructors
141  // and copy functions (the standard is somewhat relaxed in C++2011, though).
142 
147  operator = (const Number scalar)
148  {
149  data = scalar;
150  return *this;
151  }
152 
156  Number &
157  operator [] (const unsigned int comp)
158  {
159  (void)comp;
160  AssertIndexRange (comp, 1);
161  return data;
162  }
163 
167  const Number &
168  operator [] (const unsigned int comp) const
169  {
170  (void)comp;
171  AssertIndexRange (comp, 1);
172  return data;
173  }
174 
179  operator += (const VectorizedArray<Number> &vec)
180  {
181  data+=vec.data;
182  return *this;
183  }
184 
189  operator -= (const VectorizedArray<Number> &vec)
190  {
191  data-=vec.data;
192  return *this;
193  }
194 
199  operator *= (const VectorizedArray<Number> &vec)
200  {
201  data*=vec.data;
202  return *this;
203  }
204 
209  operator /= (const VectorizedArray<Number> &vec)
210  {
211  data/=vec.data;
212  return *this;
213  }
214 
221  void load (const Number *ptr)
222  {
223  data = *ptr;
224  }
225 
232  void store (Number *ptr) const
233  {
234  *ptr = data;
235  }
236 
241  Number data;
242 
243 private:
249  get_sqrt () const
250  {
251  VectorizedArray res;
252  res.data = std::sqrt(data);
253  return res;
254  }
255 
261  get_abs () const
262  {
263  VectorizedArray res;
264  res.data = std::fabs(data);
265  return res;
266  }
267 
273  get_max (const VectorizedArray &other) const
274  {
275  VectorizedArray res;
276  res.data = std::max (data, other.data);
277  return res;
278  }
279 
285  get_min (const VectorizedArray &other) const
286  {
287  VectorizedArray res;
288  res.data = std::min (data, other.data);
289  return res;
290  }
291 
295  template <typename Number2> friend VectorizedArray<Number2>
296  std::sqrt (const VectorizedArray<Number2> &);
297  template <typename Number2> friend VectorizedArray<Number2>
298  std::abs (const VectorizedArray<Number2> &);
299  template <typename Number2> friend VectorizedArray<Number2>
300  std::max (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
301  template <typename Number2> friend VectorizedArray<Number2>
302  std::min (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
303 };
304 
305 
306 
313 template <typename Number>
314 inline
316 make_vectorized_array (const Number &u)
317 {
319  result = u;
320  return result;
321 }
322 
323 
324 
350 template <typename Number>
351 inline
352 void
353 vectorized_load_and_transpose(const unsigned int n_entries,
354  const Number *in,
355  const unsigned int *offsets,
357 {
358  for (unsigned int i=0; i<n_entries; ++i)
359  for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
360  out[i][v] = in[offsets[v]+i];
361 }
362 
363 
364 
403 template <typename Number>
404 inline
405 void
406 vectorized_transpose_and_store(const bool add_into,
407  const unsigned int n_entries,
408  const VectorizedArray<Number> *in,
409  const unsigned int *offsets,
410  Number *out)
411 {
412  if (add_into)
413  for (unsigned int i=0; i<n_entries; ++i)
414  for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
415  out[offsets[v]+i] += in[i][v];
416  else
417  for (unsigned int i=0; i<n_entries; ++i)
418  for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
419  out[offsets[v]+i] = in[i][v];
420 }
421 
422 
423 
424 // for safety, also check that __AVX512F__ is defined in case the user manually
425 // set some conflicting compile flags which prevent compilation
426 
427 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 3 && defined(__AVX512F__)
428 
432 template <>
433 class VectorizedArray<double>
434 {
435 public:
439  static const unsigned int n_array_elements = 8;
440 
445  operator = (const double x)
446  {
447  data = _mm512_set1_pd(x);
448  return *this;
449  }
450 
454  double &
455  operator [] (const unsigned int comp)
456  {
457  AssertIndexRange (comp, 8);
458  return *(reinterpret_cast<double *>(&data)+comp);
459  }
460 
464  const double &
465  operator [] (const unsigned int comp) const
466  {
467  AssertIndexRange (comp, 8);
468  return *(reinterpret_cast<const double *>(&data)+comp);
469  }
470 
475  operator += (const VectorizedArray &vec)
476  {
477  // if the compiler supports vector arithmetics, we can simply use +=
478  // operator on the given data type. this allows the compiler to combine
479  // additions with multiplication (fused multiply-add) if those
480  // instructions are available. Otherwise, we need to use the built-in
481  // intrinsic command for __m512d
482 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
483  data += vec.data;
484 #else
485  data = _mm512_add_pd(data,vec.data);
486 #endif
487  return *this;
488  }
489 
494  operator -= (const VectorizedArray &vec)
495  {
496 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
497  data -= vec.data;
498 #else
499  data = _mm512_sub_pd(data,vec.data);
500 #endif
501  return *this;
502  }
507  operator *= (const VectorizedArray &vec)
508  {
509 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
510  data *= vec.data;
511 #else
512  data = _mm512_mul_pd(data,vec.data);
513 #endif
514  return *this;
515  }
516 
521  operator /= (const VectorizedArray &vec)
522  {
523 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
524  data /= vec.data;
525 #else
526  data = _mm512_div_pd(data,vec.data);
527 #endif
528  return *this;
529  }
530 
536  void load (const double *ptr)
537  {
538  data = _mm512_loadu_pd (ptr);
539  }
540 
547  void store (double *ptr) const
548  {
549  _mm512_storeu_pd (ptr, data);
550  }
551 
556  __m512d data;
557 
558 private:
564  get_sqrt () const
565  {
566  VectorizedArray res;
567  res.data = _mm512_sqrt_pd(data);
568  return res;
569  }
570 
576  get_abs () const
577  {
578  // to compute the absolute value, perform bitwise andnot with -0. This
579  // will leave all value and exponent bits unchanged but force the sign
580  // value to +. Since there is no andnot for AVX512, we interpret the data
581  // as 64 bit integers and do the andnot on those types (note that andnot
582  // is a bitwise operation so the data type does not matter)
583  __m512d mask = _mm512_set1_pd (-0.);
584  VectorizedArray res;
585  res.data = (__m512d)_mm512_andnot_epi64 ((__m512i)mask, (__m512i)data);
586  return res;
587  }
588 
594  get_max (const VectorizedArray &other) const
595  {
596  VectorizedArray res;
597  res.data = _mm512_max_pd (data, other.data);
598  return res;
599  }
600 
606  get_min (const VectorizedArray &other) const
607  {
608  VectorizedArray res;
609  res.data = _mm512_min_pd (data, other.data);
610  return res;
611  }
612 
616  template <typename Number2> friend VectorizedArray<Number2>
617  std::sqrt (const VectorizedArray<Number2> &);
618  template <typename Number2> friend VectorizedArray<Number2>
619  std::abs (const VectorizedArray<Number2> &);
620  template <typename Number2> friend VectorizedArray<Number2>
621  std::max (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
622  template <typename Number2> friend VectorizedArray<Number2>
623  std::min (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
624 };
625 
626 
627 
631 template<>
632 class VectorizedArray<float>
633 {
634 public:
638  static const unsigned int n_array_elements = 16;
639 
644  operator = (const float x)
645  {
646  data = _mm512_set1_ps(x);
647  return *this;
648  }
649 
653  float &
654  operator [] (const unsigned int comp)
655  {
656  AssertIndexRange (comp, 16);
657  return *(reinterpret_cast<float *>(&data)+comp);
658  }
659 
663  const float &
664  operator [] (const unsigned int comp) const
665  {
666  AssertIndexRange (comp, 16);
667  return *(reinterpret_cast<const float *>(&data)+comp);
668  }
669 
674  operator += (const VectorizedArray &vec)
675  {
676  // if the compiler supports vector arithmetics, we can simply use +=
677  // operator on the given data type. this allows the compiler to combine
678  // additions with multiplication (fused multiply-add) if those
679  // instructions are available. Otherwise, we need to use the built-in
680  // intrinsic command for __m512d
681 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
682  data += vec.data;
683 #else
684  data = _mm512_add_ps(data,vec.data);
685 #endif
686  return *this;
687  }
688 
693  operator -= (const VectorizedArray &vec)
694  {
695 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
696  data -= vec.data;
697 #else
698  data = _mm512_sub_ps(data,vec.data);
699 #endif
700  return *this;
701  }
706  operator *= (const VectorizedArray &vec)
707  {
708 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
709  data *= vec.data;
710 #else
711  data = _mm512_mul_ps(data,vec.data);
712 #endif
713  return *this;
714  }
715 
720  operator /= (const VectorizedArray &vec)
721  {
722 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
723  data /= vec.data;
724 #else
725  data = _mm512_div_ps(data,vec.data);
726 #endif
727  return *this;
728  }
729 
735  void load (const float *ptr)
736  {
737  data = _mm512_loadu_ps (ptr);
738  }
739 
746  void store (float *ptr) const
747  {
748  _mm512_storeu_ps (ptr, data);
749  }
750 
755  __m512 data;
756 
757 private:
758 
764  get_sqrt () const
765  {
766  VectorizedArray res;
767  res.data = _mm512_sqrt_ps(data);
768  return res;
769  }
770 
776  get_abs () const
777  {
778  // to compute the absolute value, perform bitwise andnot with -0. This
779  // will leave all value and exponent bits unchanged but force the sign
780  // value to +. Since there is no andnot for AVX512, we interpret the data
781  // as 32 bit integers and do the andnot on those types (note that andnot
782  // is a bitwise operation so the data type does not matter)
783  __m512 mask = _mm512_set1_ps (-0.f);
784  VectorizedArray res;
785  res.data = (__m512)_mm512_andnot_epi32 ((__m512i)mask, (__m512i)data);
786  return res;
787  }
788 
794  get_max (const VectorizedArray &other) const
795  {
796  VectorizedArray res;
797  res.data = _mm512_max_ps (data, other.data);
798  return res;
799  }
800 
806  get_min (const VectorizedArray &other) const
807  {
808  VectorizedArray res;
809  res.data = _mm512_min_ps (data, other.data);
810  return res;
811  }
812 
816  template <typename Number2> friend VectorizedArray<Number2>
817  std::sqrt (const VectorizedArray<Number2> &);
818  template <typename Number2> friend VectorizedArray<Number2>
819  std::abs (const VectorizedArray<Number2> &);
820  template <typename Number2> friend VectorizedArray<Number2>
821  std::max (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
822  template <typename Number2> friend VectorizedArray<Number2>
823  std::min (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
824 };
825 
826 
827 
828 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 && defined(__AVX__)
829 
833 template <>
834 class VectorizedArray<double>
835 {
836 public:
840  static const unsigned int n_array_elements = 4;
841 
846  operator = (const double x)
847  {
848  data = _mm256_set1_pd(x);
849  return *this;
850  }
851 
855  double &
856  operator [] (const unsigned int comp)
857  {
858  AssertIndexRange (comp, 4);
859  return *(reinterpret_cast<double *>(&data)+comp);
860  }
861 
865  const double &
866  operator [] (const unsigned int comp) const
867  {
868  AssertIndexRange (comp, 4);
869  return *(reinterpret_cast<const double *>(&data)+comp);
870  }
871 
876  operator += (const VectorizedArray &vec)
877  {
878  // if the compiler supports vector arithmetics, we can simply use +=
879  // operator on the given data type. this allows the compiler to combine
880  // additions with multiplication (fused multiply-add) if those
881  // instructions are available. Otherwise, we need to use the built-in
882  // intrinsic command for __m256d
883 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
884  data += vec.data;
885 #else
886  data = _mm256_add_pd(data,vec.data);
887 #endif
888  return *this;
889  }
890 
895  operator -= (const VectorizedArray &vec)
896  {
897 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
898  data -= vec.data;
899 #else
900  data = _mm256_sub_pd(data,vec.data);
901 #endif
902  return *this;
903  }
908  operator *= (const VectorizedArray &vec)
909  {
910 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
911  data *= vec.data;
912 #else
913  data = _mm256_mul_pd(data,vec.data);
914 #endif
915  return *this;
916  }
917 
922  operator /= (const VectorizedArray &vec)
923  {
924 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
925  data /= vec.data;
926 #else
927  data = _mm256_div_pd(data,vec.data);
928 #endif
929  return *this;
930  }
931 
937  void load (const double *ptr)
938  {
939  data = _mm256_loadu_pd (ptr);
940  }
941 
948  void store (double *ptr) const
949  {
950  _mm256_storeu_pd (ptr, data);
951  }
952 
957  __m256d data;
958 
959 private:
965  get_sqrt () const
966  {
967  VectorizedArray res;
968  res.data = _mm256_sqrt_pd(data);
969  return res;
970  }
971 
977  get_abs () const
978  {
979  // to compute the absolute value, perform bitwise andnot with -0. This
980  // will leave all value and exponent bits unchanged but force the sign
981  // value to +.
982  __m256d mask = _mm256_set1_pd (-0.);
983  VectorizedArray res;
984  res.data = _mm256_andnot_pd(mask, data);
985  return res;
986  }
987 
993  get_max (const VectorizedArray &other) const
994  {
995  VectorizedArray res;
996  res.data = _mm256_max_pd (data, other.data);
997  return res;
998  }
999 
1005  get_min (const VectorizedArray &other) const
1006  {
1007  VectorizedArray res;
1008  res.data = _mm256_min_pd (data, other.data);
1009  return res;
1010  }
1011 
1015  template <typename Number2> friend VectorizedArray<Number2>
1016  std::sqrt (const VectorizedArray<Number2> &);
1017  template <typename Number2> friend VectorizedArray<Number2>
1018  std::abs (const VectorizedArray<Number2> &);
1019  template <typename Number2> friend VectorizedArray<Number2>
1020  std::max (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1021  template <typename Number2> friend VectorizedArray<Number2>
1022  std::min (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1023 };
1024 
1025 
1026 
1030 template <>
1031 inline
1032 void
1033 vectorized_load_and_transpose(const unsigned int n_entries,
1034  const double *in,
1035  const unsigned int *offsets,
1037 {
1038  const unsigned int n_chunks = n_entries/4, remainder = n_entries%4;
1039  for (unsigned int i=0; i<n_chunks; ++i)
1040  {
1041  __m256d u0 = _mm256_loadu_pd(in+4*i+offsets[0]);
1042  __m256d u1 = _mm256_loadu_pd(in+4*i+offsets[1]);
1043  __m256d u2 = _mm256_loadu_pd(in+4*i+offsets[2]);
1044  __m256d u3 = _mm256_loadu_pd(in+4*i+offsets[3]);
1045  __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
1046  __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
1047  __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
1048  __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
1049  out[4*i+0].data = _mm256_unpacklo_pd (t0, t1);
1050  out[4*i+1].data = _mm256_unpackhi_pd (t0, t1);
1051  out[4*i+2].data = _mm256_unpacklo_pd (t2, t3);
1052  out[4*i+3].data = _mm256_unpackhi_pd (t2, t3);
1053  }
1054  if (remainder > 0 && n_chunks > 0)
1055  {
1056  // simple re-load all data in the last slot
1057  const unsigned int final_pos = n_chunks*4-4+remainder;
1058  Assert(final_pos+4 == n_entries, ExcInternalError());
1059  __m256d u0 = _mm256_loadu_pd(in+final_pos+offsets[0]);
1060  __m256d u1 = _mm256_loadu_pd(in+final_pos+offsets[1]);
1061  __m256d u2 = _mm256_loadu_pd(in+final_pos+offsets[2]);
1062  __m256d u3 = _mm256_loadu_pd(in+final_pos+offsets[3]);
1063  __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
1064  __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
1065  __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
1066  __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
1067  out[final_pos+0].data = _mm256_unpacklo_pd (t0, t1);
1068  out[final_pos+1].data = _mm256_unpackhi_pd (t0, t1);
1069  out[final_pos+2].data = _mm256_unpacklo_pd (t2, t3);
1070  out[final_pos+3].data = _mm256_unpackhi_pd (t2, t3);
1071  }
1072  else if (remainder > 0)
1073  for (unsigned int i=0; i<n_entries; ++i)
1074  for (unsigned int v=0; v<4; ++v)
1075  out[i][v] = in[offsets[v]+i];
1076 }
1077 
1078 
1079 
1083 template <>
1084 inline
1085 void
1086 vectorized_transpose_and_store(const bool add_into,
1087  const unsigned int n_entries,
1088  const VectorizedArray<double> *in,
1089  const unsigned int *offsets,
1090  double *out)
1091 {
1092  const unsigned int n_chunks = n_entries/4;
1093  for (unsigned int i=0; i<n_chunks; ++i)
1094  {
1095  __m256d u0 = in[4*i+0].data;
1096  __m256d u1 = in[4*i+1].data;
1097  __m256d u2 = in[4*i+2].data;
1098  __m256d u3 = in[4*i+3].data;
1099  __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
1100  __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
1101  __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
1102  __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
1103  __m256d res0 = _mm256_unpacklo_pd (t0, t1);
1104  __m256d res1 = _mm256_unpackhi_pd (t0, t1);
1105  __m256d res2 = _mm256_unpacklo_pd (t2, t3);
1106  __m256d res3 = _mm256_unpackhi_pd (t2, t3);
1107 
1108  // Cannot use the same store instructions in both paths of the 'if'
1109  // because the compiler cannot know that there is no aliasing between
1110  // pointers
1111  if (add_into)
1112  {
1113  res0 = _mm256_add_pd(_mm256_loadu_pd(out+4*i+offsets[0]), res0);
1114  _mm256_storeu_pd(out+4*i+offsets[0], res0);
1115  res1 = _mm256_add_pd(_mm256_loadu_pd(out+4*i+offsets[1]), res1);
1116  _mm256_storeu_pd(out+4*i+offsets[1], res1);
1117  res2 = _mm256_add_pd(_mm256_loadu_pd(out+4*i+offsets[2]), res2);
1118  _mm256_storeu_pd(out+4*i+offsets[2], res2);
1119  res3 = _mm256_add_pd(_mm256_loadu_pd(out+4*i+offsets[3]), res3);
1120  _mm256_storeu_pd(out+4*i+offsets[3], res3);
1121  }
1122  else
1123  {
1124  _mm256_storeu_pd(out+4*i+offsets[0], res0);
1125  _mm256_storeu_pd(out+4*i+offsets[1], res1);
1126  _mm256_storeu_pd(out+4*i+offsets[2], res2);
1127  _mm256_storeu_pd(out+4*i+offsets[3], res3);
1128  }
1129  }
1130  const unsigned int shift = n_chunks * 4;
1131  if (add_into)
1132  for (unsigned int i=shift; i<n_entries; ++i)
1133  for (unsigned int v=0; v<4; ++v)
1134  out[offsets[v]+i] += in[i][v];
1135  else
1136  for (unsigned int i=shift; i<n_entries; ++i)
1137  for (unsigned int v=0; v<4; ++v)
1138  out[offsets[v]+i] = in[i][v];
1139 }
1140 
1141 
1142 
1146 template<>
1147 class VectorizedArray<float>
1148 {
1149 public:
1153  static const unsigned int n_array_elements = 8;
1154 
1158  VectorizedArray &
1159  operator = (const float x)
1160  {
1161  data = _mm256_set1_ps(x);
1162  return *this;
1163  }
1164 
1168  float &
1169  operator [] (const unsigned int comp)
1170  {
1171  AssertIndexRange (comp, 8);
1172  return *(reinterpret_cast<float *>(&data)+comp);
1173  }
1174 
1178  const float &
1179  operator [] (const unsigned int comp) const
1180  {
1181  AssertIndexRange (comp, 8);
1182  return *(reinterpret_cast<const float *>(&data)+comp);
1183  }
1184 
1188  VectorizedArray &
1189  operator += (const VectorizedArray &vec)
1190  {
1191  // if the compiler supports vector arithmetics, we can simply use +=
1192  // operator on the given data type. this allows the compiler to combine
1193  // additions with multiplication (fused multiply-add) if those
1194  // instructions are available. Otherwise, we need to use the built-in
1195  // intrinsic command for __m256d
1196 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1197  data += vec.data;
1198 #else
1199  data = _mm256_add_ps(data,vec.data);
1200 #endif
1201  return *this;
1202  }
1203 
1207  VectorizedArray &
1208  operator -= (const VectorizedArray &vec)
1209  {
1210 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1211  data -= vec.data;
1212 #else
1213  data = _mm256_sub_ps(data,vec.data);
1214 #endif
1215  return *this;
1216  }
1220  VectorizedArray &
1221  operator *= (const VectorizedArray &vec)
1222  {
1223 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1224  data *= vec.data;
1225 #else
1226  data = _mm256_mul_ps(data,vec.data);
1227 #endif
1228  return *this;
1229  }
1230 
1234  VectorizedArray &
1235  operator /= (const VectorizedArray &vec)
1236  {
1237 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1238  data /= vec.data;
1239 #else
1240  data = _mm256_div_ps(data,vec.data);
1241 #endif
1242  return *this;
1243  }
1244 
1250  void load (const float *ptr)
1251  {
1252  data = _mm256_loadu_ps (ptr);
1253  }
1254 
1261  void store (float *ptr) const
1262  {
1263  _mm256_storeu_ps (ptr, data);
1264  }
1265 
1270  __m256 data;
1271 
1272 private:
1273 
1279  get_sqrt () const
1280  {
1281  VectorizedArray res;
1282  res.data = _mm256_sqrt_ps(data);
1283  return res;
1284  }
1285 
1291  get_abs () const
1292  {
1293  // to compute the absolute value, perform bitwise andnot with -0. This
1294  // will leave all value and exponent bits unchanged but force the sign
1295  // value to +.
1296  __m256 mask = _mm256_set1_ps (-0.f);
1297  VectorizedArray res;
1298  res.data = _mm256_andnot_ps(mask, data);
1299  return res;
1300  }
1301 
1307  get_max (const VectorizedArray &other) const
1308  {
1309  VectorizedArray res;
1310  res.data = _mm256_max_ps (data, other.data);
1311  return res;
1312  }
1313 
1319  get_min (const VectorizedArray &other) const
1320  {
1321  VectorizedArray res;
1322  res.data = _mm256_min_ps (data, other.data);
1323  return res;
1324  }
1325 
1329  template <typename Number2> friend VectorizedArray<Number2>
1330  std::sqrt (const VectorizedArray<Number2> &);
1331  template <typename Number2> friend VectorizedArray<Number2>
1332  std::abs (const VectorizedArray<Number2> &);
1333  template <typename Number2> friend VectorizedArray<Number2>
1334  std::max (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1335  template <typename Number2> friend VectorizedArray<Number2>
1336  std::min (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1337 };
1338 
1339 
1340 
1344 template <>
1345 inline
1346 void
1347 vectorized_load_and_transpose(const unsigned int n_entries,
1348  const float *in,
1349  const unsigned int *offsets,
1351 {
1352  const unsigned int n_chunks = n_entries/4, remainder = n_entries%4;
1353  for (unsigned int i=0; i<n_chunks; ++i)
1354  {
1355  __m128 u0 = _mm_loadu_ps(in+4*i+offsets[0]);
1356  __m128 u1 = _mm_loadu_ps(in+4*i+offsets[1]);
1357  __m128 u2 = _mm_loadu_ps(in+4*i+offsets[2]);
1358  __m128 u3 = _mm_loadu_ps(in+4*i+offsets[3]);
1359  __m128 u4 = _mm_loadu_ps(in+4*i+offsets[4]);
1360  __m128 u5 = _mm_loadu_ps(in+4*i+offsets[5]);
1361  __m128 u6 = _mm_loadu_ps(in+4*i+offsets[6]);
1362  __m128 u7 = _mm_loadu_ps(in+4*i+offsets[7]);
1363  // To avoid warnings about uninitialized variables, need to initialize
1364  // one variable with zero before using it.
1365  __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
1366  t0 = _mm256_insertf128_ps (t3, u0, 0);
1367  t0 = _mm256_insertf128_ps (t0, u4, 1);
1368  t1 = _mm256_insertf128_ps (t3, u1, 0);
1369  t1 = _mm256_insertf128_ps (t1, u5, 1);
1370  t2 = _mm256_insertf128_ps (t3, u2, 0);
1371  t2 = _mm256_insertf128_ps (t2, u6, 1);
1372  t3 = _mm256_insertf128_ps (t3, u3, 0);
1373  t3 = _mm256_insertf128_ps (t3, u7, 1);
1374  __m256 v0 = _mm256_shuffle_ps (t0, t1, 0x44);
1375  __m256 v1 = _mm256_shuffle_ps (t0, t1, 0xee);
1376  __m256 v2 = _mm256_shuffle_ps (t2, t3, 0x44);
1377  __m256 v3 = _mm256_shuffle_ps (t2, t3, 0xee);
1378  out[4*i+0].data = _mm256_shuffle_ps (v0, v2, 0x88);
1379  out[4*i+1].data = _mm256_shuffle_ps (v0, v2, 0xdd);
1380  out[4*i+2].data = _mm256_shuffle_ps (v1, v3, 0x88);
1381  out[4*i+3].data = _mm256_shuffle_ps (v1, v3, 0xdd);
1382  }
1383  if (remainder > 0 && n_chunks > 0)
1384  {
1385  // simple re-load all data in the last slot
1386  const unsigned int final_pos = n_chunks*4-4+remainder;
1387  Assert(final_pos+4 == n_entries, ExcInternalError());
1388  __m128 u0 = _mm_loadu_ps(in+final_pos+offsets[0]);
1389  __m128 u1 = _mm_loadu_ps(in+final_pos+offsets[1]);
1390  __m128 u2 = _mm_loadu_ps(in+final_pos+offsets[2]);
1391  __m128 u3 = _mm_loadu_ps(in+final_pos+offsets[3]);
1392  __m128 u4 = _mm_loadu_ps(in+final_pos+offsets[4]);
1393  __m128 u5 = _mm_loadu_ps(in+final_pos+offsets[5]);
1394  __m128 u6 = _mm_loadu_ps(in+final_pos+offsets[6]);
1395  __m128 u7 = _mm_loadu_ps(in+final_pos+offsets[7]);
1396  __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
1397  t0 = _mm256_insertf128_ps (t3, u0, 0);
1398  t0 = _mm256_insertf128_ps (t0, u4, 1);
1399  t1 = _mm256_insertf128_ps (t3, u1, 0);
1400  t1 = _mm256_insertf128_ps (t1, u5, 1);
1401  t2 = _mm256_insertf128_ps (t3, u2, 0);
1402  t2 = _mm256_insertf128_ps (t2, u6, 1);
1403  t3 = _mm256_insertf128_ps (t3, u3, 0);
1404  t3 = _mm256_insertf128_ps (t3, u7, 1);
1405  __m256 v0 = _mm256_shuffle_ps (t0, t1, 0x44);
1406  __m256 v1 = _mm256_shuffle_ps (t0, t1, 0xee);
1407  __m256 v2 = _mm256_shuffle_ps (t2, t3, 0x44);
1408  __m256 v3 = _mm256_shuffle_ps (t2, t3, 0xee);
1409  out[final_pos+0].data = _mm256_shuffle_ps (v0, v2, 0x88);
1410  out[final_pos+1].data = _mm256_shuffle_ps (v0, v2, 0xdd);
1411  out[final_pos+2].data = _mm256_shuffle_ps (v1, v3, 0x88);
1412  out[final_pos+3].data = _mm256_shuffle_ps (v1, v3, 0xdd);
1413  }
1414  else if (remainder > 0)
1415  for (unsigned int i=0; i<n_entries; ++i)
1416  for (unsigned int v=0; v<8; ++v)
1417  out[i][v] = in[offsets[v]+i];
1418 }
1419 
1420 
1421 
1425 template <>
1426 inline
1427 void
1428 vectorized_transpose_and_store(const bool add_into,
1429  const unsigned int n_entries,
1430  const VectorizedArray<float> *in,
1431  const unsigned int *offsets,
1432  float *out)
1433 {
1434  const unsigned int n_chunks = n_entries/4;
1435  for (unsigned int i=0; i<n_chunks; ++i)
1436  {
1437  __m256 u0 = in[4*i+0].data;
1438  __m256 u1 = in[4*i+1].data;
1439  __m256 u2 = in[4*i+2].data;
1440  __m256 u3 = in[4*i+3].data;
1441  __m256 t0 = _mm256_shuffle_ps (u0, u1, 0x44);
1442  __m256 t1 = _mm256_shuffle_ps (u0, u1, 0xee);
1443  __m256 t2 = _mm256_shuffle_ps (u2, u3, 0x44);
1444  __m256 t3 = _mm256_shuffle_ps (u2, u3, 0xee);
1445  u0 = _mm256_shuffle_ps (t0, t2, 0x88);
1446  u1 = _mm256_shuffle_ps (t0, t2, 0xdd);
1447  u2 = _mm256_shuffle_ps (t1, t3, 0x88);
1448  u3 = _mm256_shuffle_ps (t1, t3, 0xdd);
1449  __m128 res0 = _mm256_extractf128_ps (u0, 0);
1450  __m128 res4 = _mm256_extractf128_ps (u0, 1);
1451  __m128 res1 = _mm256_extractf128_ps (u1, 0);
1452  __m128 res5 = _mm256_extractf128_ps (u1, 1);
1453  __m128 res2 = _mm256_extractf128_ps (u2, 0);
1454  __m128 res6 = _mm256_extractf128_ps (u2, 1);
1455  __m128 res3 = _mm256_extractf128_ps (u3, 0);
1456  __m128 res7 = _mm256_extractf128_ps (u3, 1);
1457 
1458  // Cannot use the same store instructions in both paths of the 'if'
1459  // because the compiler cannot know that there is no aliasing between
1460  // pointers
1461  if (add_into)
1462  {
1463  res0 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[0]), res0);
1464  _mm_storeu_ps(out+4*i+offsets[0], res0);
1465  res1 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[1]), res1);
1466  _mm_storeu_ps(out+4*i+offsets[1], res1);
1467  res2 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[2]), res2);
1468  _mm_storeu_ps(out+4*i+offsets[2], res2);
1469  res3 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[3]), res3);
1470  _mm_storeu_ps(out+4*i+offsets[3], res3);
1471  res4 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[4]), res4);
1472  _mm_storeu_ps(out+4*i+offsets[4], res4);
1473  res5 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[5]), res5);
1474  _mm_storeu_ps(out+4*i+offsets[5], res5);
1475  res6 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[6]), res6);
1476  _mm_storeu_ps(out+4*i+offsets[6], res6);
1477  res7 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[7]), res7);
1478  _mm_storeu_ps(out+4*i+offsets[7], res7);
1479  }
1480  else
1481  {
1482  _mm_storeu_ps(out+4*i+offsets[0], res0);
1483  _mm_storeu_ps(out+4*i+offsets[1], res1);
1484  _mm_storeu_ps(out+4*i+offsets[2], res2);
1485  _mm_storeu_ps(out+4*i+offsets[3], res3);
1486  _mm_storeu_ps(out+4*i+offsets[4], res4);
1487  _mm_storeu_ps(out+4*i+offsets[5], res5);
1488  _mm_storeu_ps(out+4*i+offsets[6], res6);
1489  _mm_storeu_ps(out+4*i+offsets[7], res7);
1490  }
1491  }
1492  const unsigned int shift = n_chunks * 4;
1493  if (add_into)
1494  for (unsigned int i=shift; i<n_entries; ++i)
1495  for (unsigned int v=0; v<8; ++v)
1496  out[offsets[v]+i] += in[i][v];
1497  else
1498  for (unsigned int i=shift; i<n_entries; ++i)
1499  for (unsigned int v=0; v<8; ++v)
1500  out[offsets[v]+i] = in[i][v];
1501 }
1502 
1503 
1504 
1505 // for safety, also check that __SSE2__ is defined in case the user manually
1506 // set some conflicting compile flags which prevent compilation
1507 
1508 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)
1509 
1513 template <>
1514 class VectorizedArray<double>
1515 {
1516 public:
1520  static const unsigned int n_array_elements = 2;
1521 
1525  VectorizedArray &
1526  operator = (const double x)
1527  {
1528  data = _mm_set1_pd(x);
1529  return *this;
1530  }
1531 
1535  double &
1536  operator [] (const unsigned int comp)
1537  {
1538  AssertIndexRange (comp, 2);
1539  return *(reinterpret_cast<double *>(&data)+comp);
1540  }
1541 
1545  const double &
1546  operator [] (const unsigned int comp) const
1547  {
1548  AssertIndexRange (comp, 2);
1549  return *(reinterpret_cast<const double *>(&data)+comp);
1550  }
1551 
1555  VectorizedArray &
1556  operator += (const VectorizedArray &vec)
1557  {
1558 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1559  data += vec.data;
1560 #else
1561  data = _mm_add_pd(data,vec.data);
1562 #endif
1563  return *this;
1564  }
1565 
1569  VectorizedArray &
1570  operator -= (const VectorizedArray &vec)
1571  {
1572 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1573  data -= vec.data;
1574 #else
1575  data = _mm_sub_pd(data,vec.data);
1576 #endif
1577  return *this;
1578  }
1582  VectorizedArray &
1583  operator *= (const VectorizedArray &vec)
1584  {
1585 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1586  data *= vec.data;
1587 #else
1588  data = _mm_mul_pd(data,vec.data);
1589 #endif
1590  return *this;
1591  }
1592 
1596  VectorizedArray &
1597  operator /= (const VectorizedArray &vec)
1598  {
1599 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1600  data /= vec.data;
1601 #else
1602  data = _mm_div_pd(data,vec.data);
1603 #endif
1604  return *this;
1605  }
1606 
1612  void load (const double *ptr)
1613  {
1614  data = _mm_loadu_pd (ptr);
1615  }
1616 
1623  void store (double *ptr) const
1624  {
1625  _mm_storeu_pd (ptr, data);
1626  }
1627 
1632  __m128d data;
1633 
1634 private:
1640  get_sqrt () const
1641  {
1642  VectorizedArray res;
1643  res.data = _mm_sqrt_pd(data);
1644  return res;
1645  }
1646 
1652  get_abs () const
1653  {
1654  // to compute the absolute value, perform
1655  // bitwise andnot with -0. This will leave all
1656  // value and exponent bits unchanged but force
1657  // the sign value to +.
1658  __m128d mask = _mm_set1_pd (-0.);
1659  VectorizedArray res;
1660  res.data = _mm_andnot_pd(mask, data);
1661  return res;
1662  }
1663 
1669  get_max (const VectorizedArray &other) const
1670  {
1671  VectorizedArray res;
1672  res.data = _mm_max_pd (data, other.data);
1673  return res;
1674  }
1675 
1681  get_min (const VectorizedArray &other) const
1682  {
1683  VectorizedArray res;
1684  res.data = _mm_min_pd (data, other.data);
1685  return res;
1686  }
1687 
1691  template <typename Number2> friend VectorizedArray<Number2>
1692  std::sqrt (const VectorizedArray<Number2> &);
1693  template <typename Number2> friend VectorizedArray<Number2>
1694  std::abs (const VectorizedArray<Number2> &);
1695  template <typename Number2> friend VectorizedArray<Number2>
1696  std::max (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1697  template <typename Number2> friend VectorizedArray<Number2>
1698  std::min (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1699 };
1700 
1701 
1702 
1706 template <>
1707 inline
1708 void vectorized_load_and_transpose(const unsigned int n_entries,
1709  const double *in,
1710  const unsigned int *offsets,
1712 {
1713  const unsigned int n_chunks = n_entries/2, remainder = n_entries%2;
1714  for (unsigned int i=0; i<n_chunks; ++i)
1715  {
1716  __m128d u0 = _mm_loadu_pd(in+2*i+offsets[0]);
1717  __m128d u1 = _mm_loadu_pd(in+2*i+offsets[1]);
1718  out[2*i+0].data = _mm_unpacklo_pd (u0, u1);
1719  out[2*i+1].data = _mm_unpackhi_pd (u0, u1);
1720  }
1721  if (remainder > 0)
1722  for (unsigned int i=0; i<n_entries; ++i)
1723  for (unsigned int v=0; v<2; ++v)
1724  out[i][v] = in[offsets[v]+i];
1725 }
1726 
1727 
1728 
1732 template <>
1733 inline
1734 void
1735 vectorized_transpose_and_store(const bool add_into,
1736  const unsigned int n_entries,
1737  const VectorizedArray<double> *in,
1738  const unsigned int *offsets,
1739  double *out)
1740 {
1741  const unsigned int n_chunks = n_entries/2;
1742  if (add_into)
1743  {
1744  for (unsigned int i=0; i<n_chunks; ++i)
1745  {
1746  __m128d u0 = in[2*i+0].data;
1747  __m128d u1 = in[2*i+1].data;
1748  __m128d res0 = _mm_unpacklo_pd (u0, u1);
1749  __m128d res1 = _mm_unpackhi_pd (u0, u1);
1750  _mm_storeu_pd(out+2*i+offsets[0], _mm_add_pd(_mm_loadu_pd(out+2*i+offsets[0]), res0));
1751  _mm_storeu_pd(out+2*i+offsets[1], _mm_add_pd(_mm_loadu_pd(out+2*i+offsets[1]), res1));
1752  }
1753  const unsigned int shift = n_chunks * 2;
1754  for (unsigned int i=shift; i<n_entries; ++i)
1755  for (unsigned int v=0; v<2; ++v)
1756  out[offsets[v]+i] += in[i][v];
1757  }
1758  else
1759  {
1760  for (unsigned int i=0; i<n_chunks; ++i)
1761  {
1762  __m128d u0 = in[2*i+0].data;
1763  __m128d u1 = in[2*i+1].data;
1764  __m128d res0 = _mm_unpacklo_pd (u0, u1);
1765  __m128d res1 = _mm_unpackhi_pd (u0, u1);
1766  _mm_storeu_pd(out+2*i+offsets[0], res0);
1767  _mm_storeu_pd(out+2*i+offsets[1], res1);
1768  }
1769  const unsigned int shift = n_chunks * 2;
1770  for (unsigned int i=shift; i<n_entries; ++i)
1771  for (unsigned int v=0; v<2; ++v)
1772  out[offsets[v]+i] = in[i][v];
1773  }
1774 }
1775 
1776 
1777 
1781 template <>
1782 class VectorizedArray<float>
1783 {
1784 public:
1788  static const unsigned int n_array_elements = 4;
1789 
1794  VectorizedArray &
1795  operator = (const float x)
1796  {
1797  data = _mm_set1_ps(x);
1798  return *this;
1799  }
1800 
1804  float &
1805  operator [] (const unsigned int comp)
1806  {
1807  AssertIndexRange (comp, 4);
1808  return *(reinterpret_cast<float *>(&data)+comp);
1809  }
1810 
1814  const float &
1815  operator [] (const unsigned int comp) const
1816  {
1817  AssertIndexRange (comp, 4);
1818  return *(reinterpret_cast<const float *>(&data)+comp);
1819  }
1820 
1824  VectorizedArray &
1825  operator += (const VectorizedArray &vec)
1826  {
1827 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1828  data += vec.data;
1829 #else
1830  data = _mm_add_ps(data,vec.data);
1831 #endif
1832  return *this;
1833  }
1834 
1838  VectorizedArray &
1839  operator -= (const VectorizedArray &vec)
1840  {
1841 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1842  data -= vec.data;
1843 #else
1844  data = _mm_sub_ps(data,vec.data);
1845 #endif
1846  return *this;
1847  }
1848 
1852  VectorizedArray &
1853  operator *= (const VectorizedArray &vec)
1854  {
1855 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1856  data *= vec.data;
1857 #else
1858  data = _mm_mul_ps(data,vec.data);
1859 #endif
1860  return *this;
1861  }
1862 
1866  VectorizedArray &
1867  operator /= (const VectorizedArray &vec)
1868  {
1869 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
1870  data /= vec.data;
1871 #else
1872  data = _mm_div_ps(data,vec.data);
1873 #endif
1874  return *this;
1875  }
1876 
1882  void load (const float *ptr)
1883  {
1884  data = _mm_loadu_ps (ptr);
1885  }
1886 
1893  void store (float *ptr) const
1894  {
1895  _mm_storeu_ps (ptr, data);
1896  }
1897 
1902  __m128 data;
1903 
1904 private:
1910  get_sqrt () const
1911  {
1912  VectorizedArray res;
1913  res.data = _mm_sqrt_ps(data);
1914  return res;
1915  }
1916 
1922  get_abs () const
1923  {
1924  // to compute the absolute value, perform bitwise andnot with -0. This
1925  // will leave all value and exponent bits unchanged but force the sign
1926  // value to +.
1927  __m128 mask = _mm_set1_ps (-0.f);
1928  VectorizedArray res;
1929  res.data = _mm_andnot_ps(mask, data);
1930  return res;
1931  }
1932 
1938  get_max (const VectorizedArray &other) const
1939  {
1940  VectorizedArray res;
1941  res.data = _mm_max_ps (data, other.data);
1942  return res;
1943  }
1944 
1950  get_min (const VectorizedArray &other) const
1951  {
1952  VectorizedArray res;
1953  res.data = _mm_min_ps (data, other.data);
1954  return res;
1955  }
1956 
1960  template <typename Number2> friend VectorizedArray<Number2>
1961  std::sqrt (const VectorizedArray<Number2> &);
1962  template <typename Number2> friend VectorizedArray<Number2>
1963  std::abs (const VectorizedArray<Number2> &);
1964  template <typename Number2> friend VectorizedArray<Number2>
1965  std::max (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1966  template <typename Number2> friend VectorizedArray<Number2>
1967  std::min (const VectorizedArray<Number2> &, const VectorizedArray<Number2> &);
1968 };
1969 
1970 
1971 
1975 template <>
1976 inline
1977 void vectorized_load_and_transpose(const unsigned int n_entries,
1978  const float *in,
1979  const unsigned int *offsets,
1981 {
1982  const unsigned int n_chunks = n_entries/4, remainder = n_entries%4;
1983  for (unsigned int i=0; i<n_chunks; ++i)
1984  {
1985  __m128 u0 = _mm_loadu_ps(in+4*i+offsets[0]);
1986  __m128 u1 = _mm_loadu_ps(in+4*i+offsets[1]);
1987  __m128 u2 = _mm_loadu_ps(in+4*i+offsets[2]);
1988  __m128 u3 = _mm_loadu_ps(in+4*i+offsets[3]);
1989  __m128 v0 = _mm_shuffle_ps (u0, u1, 0x44);
1990  __m128 v1 = _mm_shuffle_ps (u0, u1, 0xee);
1991  __m128 v2 = _mm_shuffle_ps (u2, u3, 0x44);
1992  __m128 v3 = _mm_shuffle_ps (u2, u3, 0xee);
1993  out[4*i+0].data = _mm_shuffle_ps (v0, v2, 0x88);
1994  out[4*i+1].data = _mm_shuffle_ps (v0, v2, 0xdd);
1995  out[4*i+2].data = _mm_shuffle_ps (v1, v3, 0x88);
1996  out[4*i+3].data = _mm_shuffle_ps (v1, v3, 0xdd);
1997  }
1998  if (remainder > 0 && n_chunks > 0)
1999  {
2000  // simple re-load all data in the last slot
2001  const unsigned int final_pos = n_chunks*4-4+remainder;
2002  Assert(final_pos+4 == n_entries, ExcInternalError());
2003  __m128 u0 = _mm_loadu_ps(in+final_pos+offsets[0]);
2004  __m128 u1 = _mm_loadu_ps(in+final_pos+offsets[1]);
2005  __m128 u2 = _mm_loadu_ps(in+final_pos+offsets[2]);
2006  __m128 u3 = _mm_loadu_ps(in+final_pos+offsets[3]);
2007  __m128 v0 = _mm_shuffle_ps (u0, u1, 0x44);
2008  __m128 v1 = _mm_shuffle_ps (u0, u1, 0xee);
2009  __m128 v2 = _mm_shuffle_ps (u2, u3, 0x44);
2010  __m128 v3 = _mm_shuffle_ps (u2, u3, 0xee);
2011  out[final_pos+0].data = _mm_shuffle_ps (v0, v2, 0x88);
2012  out[final_pos+1].data = _mm_shuffle_ps (v0, v2, 0xdd);
2013  out[final_pos+2].data = _mm_shuffle_ps (v1, v3, 0x88);
2014  out[final_pos+3].data = _mm_shuffle_ps (v1, v3, 0xdd);
2015  }
2016  else if (remainder > 0)
2017  for (unsigned int i=0; i<n_entries; ++i)
2018  for (unsigned int v=0; v<4; ++v)
2019  out[i][v] = in[offsets[v]+i];
2020 }
2021 
2022 
2023 
2027 template <>
2028 inline
2029 void
2030 vectorized_transpose_and_store(const bool add_into,
2031  const unsigned int n_entries,
2032  const VectorizedArray<float> *in,
2033  const unsigned int *offsets,
2034  float *out)
2035 {
2036  const unsigned int n_chunks = n_entries/4;
2037  for (unsigned int i=0; i<n_chunks; ++i)
2038  {
2039  __m128 u0 = in[4*i+0].data;
2040  __m128 u1 = in[4*i+1].data;
2041  __m128 u2 = in[4*i+2].data;
2042  __m128 u3 = in[4*i+3].data;
2043  __m128 t0 = _mm_shuffle_ps (u0, u1, 0x44);
2044  __m128 t1 = _mm_shuffle_ps (u0, u1, 0xee);
2045  __m128 t2 = _mm_shuffle_ps (u2, u3, 0x44);
2046  __m128 t3 = _mm_shuffle_ps (u2, u3, 0xee);
2047  u0 = _mm_shuffle_ps (t0, t2, 0x88);
2048  u1 = _mm_shuffle_ps (t0, t2, 0xdd);
2049  u2 = _mm_shuffle_ps (t1, t3, 0x88);
2050  u3 = _mm_shuffle_ps (t1, t3, 0xdd);
2051 
2052  // Cannot use the same store instructions in both paths of the 'if'
2053  // because the compiler cannot know that there is no aliasing between
2054  // pointers
2055  if (add_into)
2056  {
2057  u0 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[0]), u0);
2058  _mm_storeu_ps(out+4*i+offsets[0], u0);
2059  u1 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[1]), u1);
2060  _mm_storeu_ps(out+4*i+offsets[1], u1);
2061  u2 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[2]), u2);
2062  _mm_storeu_ps(out+4*i+offsets[2], u2);
2063  u3 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[3]), u3);
2064  _mm_storeu_ps(out+4*i+offsets[3], u3);
2065  }
2066  else
2067  {
2068  _mm_storeu_ps(out+4*i+offsets[0], u0);
2069  _mm_storeu_ps(out+4*i+offsets[1], u1);
2070  _mm_storeu_ps(out+4*i+offsets[2], u2);
2071  _mm_storeu_ps(out+4*i+offsets[3], u3);
2072  }
2073  }
2074  const unsigned int shift = n_chunks * 4;
2075  if (add_into)
2076  for (unsigned int i=shift; i<n_entries; ++i)
2077  for (unsigned int v=0; v<4; ++v)
2078  out[offsets[v]+i] += in[i][v];
2079  else
2080  for (unsigned int i=shift; i<n_entries; ++i)
2081  for (unsigned int v=0; v<4; ++v)
2082  out[offsets[v]+i] = in[i][v];
2083 }
2084 
2085 
2086 
2087 #endif // if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0
2088 
2089 
2095 template <typename Number>
2096 inline
2098 operator + (const VectorizedArray<Number> &u,
2099  const VectorizedArray<Number> &v)
2100 {
2101  VectorizedArray<Number> tmp = u;
2102  return tmp+=v;
2103 }
2104 
2110 template <typename Number>
2111 inline
2113 operator - (const VectorizedArray<Number> &u,
2114  const VectorizedArray<Number> &v)
2115 {
2116  VectorizedArray<Number> tmp = u;
2117  return tmp-=v;
2118 }
2119 
2125 template <typename Number>
2126 inline
2128 operator * (const VectorizedArray<Number> &u,
2129  const VectorizedArray<Number> &v)
2130 {
2131  VectorizedArray<Number> tmp = u;
2132  return tmp*=v;
2133 }
2134 
2140 template <typename Number>
2141 inline
2143 operator / (const VectorizedArray<Number> &u,
2144  const VectorizedArray<Number> &v)
2145 {
2146  VectorizedArray<Number> tmp = u;
2147  return tmp/=v;
2148 }
2149 
2156 template <typename Number>
2157 inline
2159 operator + (const Number &u,
2160  const VectorizedArray<Number> &v)
2161 {
2163  tmp = u;
2164  return tmp+=v;
2165 }
2166 
2175 inline
2177 operator + (const double &u,
2178  const VectorizedArray<float> &v)
2179 {
2181  tmp = u;
2182  return tmp+=v;
2183 }
2184 
2191 template <typename Number>
2192 inline
2194 operator + (const VectorizedArray<Number> &v,
2195  const Number &u)
2196 {
2197  return u + v;
2198 }
2199 
2208 inline
2210 operator + (const VectorizedArray<float> &v,
2211  const double &u)
2212 {
2213  return u + v;
2214 }
2215 
2222 template <typename Number>
2223 inline
2225 operator - (const Number &u,
2226  const VectorizedArray<Number> &v)
2227 {
2229  tmp = u;
2230  return tmp-=v;
2231 }
2232 
2241 inline
2243 operator - (const double &u,
2244  const VectorizedArray<float> &v)
2245 {
2247  tmp = float(u);
2248  return tmp-=v;
2249 }
2250 
2257 template <typename Number>
2258 inline
2260 operator - (const VectorizedArray<Number> &v,
2261  const Number &u)
2262 {
2264  tmp = u;
2265  return v-tmp;
2266 }
2267 
2276 inline
2278 operator - (const VectorizedArray<float> &v,
2279  const double &u)
2280 {
2282  tmp = float(u);
2283  return v-tmp;
2284 }
2285 
2292 template <typename Number>
2293 inline
2295 operator * (const Number &u,
2296  const VectorizedArray<Number> &v)
2297 {
2299  tmp = u;
2300  return tmp*=v;
2301 }
2302 
2311 inline
2313 operator * (const double &u,
2314  const VectorizedArray<float> &v)
2315 {
2317  tmp = float(u);
2318  return tmp*=v;
2319 }
2320 
2327 template <typename Number>
2328 inline
2330 operator * (const VectorizedArray<Number> &v,
2331  const Number &u)
2332 {
2333  return u * v;
2334 }
2335 
2344 inline
2346 operator * (const VectorizedArray<float> &v,
2347  const double &u)
2348 {
2349  return u * v;
2350 }
2351 
2358 template <typename Number>
2359 inline
2361 operator / (const Number &u,
2362  const VectorizedArray<Number> &v)
2363 {
2365  tmp = u;
2366  return tmp/=v;
2367 }
2368 
2377 inline
2379 operator / (const double &u,
2380  const VectorizedArray<float> &v)
2381 {
2383  tmp = float(u);
2384  return tmp/=v;
2385 }
2386 
2393 template <typename Number>
2394 inline
2396 operator / (const VectorizedArray<Number> &v,
2397  const Number &u)
2398 {
2400  tmp = u;
2401  return v/tmp;
2402 }
2403 
2412 inline
2414 operator / (const VectorizedArray<float> &v,
2415  const double &u)
2416 {
2418  tmp = float(u);
2419  return v/tmp;
2420 }
2421 
2427 template <typename Number>
2428 inline
2430 operator + (const VectorizedArray<Number> &u)
2431 {
2432  return u;
2433 }
2434 
2440 template <typename Number>
2441 inline
2443 operator - (const VectorizedArray<Number> &u)
2444 {
2445  // to get a negative sign, subtract the input from zero (could also
2446  // multiply by -1, but this one is slightly simpler)
2447  return VectorizedArray<Number>()-u;
2448 }
2449 
2450 
2451 DEAL_II_NAMESPACE_CLOSE
2452 
2453 
2460 namespace std
2461 {
2469  template <typename Number>
2470  inline
2471  ::VectorizedArray<Number>
2472  sin (const ::VectorizedArray<Number> &x)
2473  {
2474  // put values in an array and later read in that array with an unaligned
2475  // read. This should save some instructions as compared to directly
2476  // setting the individual elements and also circumvents a compiler
2477  // optimization bug in gcc-4.6 with SSE2 (see also deal.II developers list
2478  // from April 2014, topic "matrix_free/step-48 Test").
2480  for (unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2481  values[i] = std::sin(x[i]);
2483  out.load(&values[0]);
2484  return out;
2485  }
2486 
2487 
2488 
2496  template <typename Number>
2497  inline
2498  ::VectorizedArray<Number>
2499  cos (const ::VectorizedArray<Number> &x)
2500  {
2502  for (unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2503  values[i] = std::cos(x[i]);
2505  out.load(&values[0]);
2506  return out;
2507  }
2508 
2509 
2510 
2518  template <typename Number>
2519  inline
2520  ::VectorizedArray<Number>
2521  tan (const ::VectorizedArray<Number> &x)
2522  {
2524  for (unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2525  values[i] = std::tan(x[i]);
2527  out.load(&values[0]);
2528  return out;
2529  }
2530 
2531 
2532 
2540  template <typename Number>
2541  inline
2542  ::VectorizedArray<Number>
2543  exp (const ::VectorizedArray<Number> &x)
2544  {
2546  for (unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2547  values[i] = std::exp(x[i]);
2549  out.load(&values[0]);
2550  return out;
2551  }
2552 
2553 
2554 
2562  template <typename Number>
2563  inline
2564  ::VectorizedArray<Number>
2565  log (const ::VectorizedArray<Number> &x)
2566  {
2568  for (unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2569  values[i] = std::log(x[i]);
2571  out.load(&values[0]);
2572  return out;
2573  }
2574 
2575 
2576 
2584  template <typename Number>
2585  inline
2586  ::VectorizedArray<Number>
2587  sqrt (const ::VectorizedArray<Number> &x)
2588  {
2589  return x.get_sqrt();
2590  }
2591 
2592 
2593 
2601  template <typename Number>
2602  inline
2603  ::VectorizedArray<Number>
2604  pow (const ::VectorizedArray<Number> &x,
2605  const Number p)
2606  {
2608  for (unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2609  values[i] = std::pow(x[i], p);
2611  out.load(&values[0]);
2612  return out;
2613  }
2614 
2615 
2616 
2624  template <typename Number>
2625  inline
2626  ::VectorizedArray<Number>
2627  abs (const ::VectorizedArray<Number> &x)
2628  {
2629  return x.get_abs();
2630  }
2631 
2632 
2633 
2641  template <typename Number>
2642  inline
2643  ::VectorizedArray<Number>
2644  max (const ::VectorizedArray<Number> &x,
2645  const ::VectorizedArray<Number> &y)
2646  {
2647  return x.get_max(y);
2648  }
2649 
2650 
2651 
2659  template <typename Number>
2660  inline
2661  ::VectorizedArray<Number>
2662  min (const ::VectorizedArray<Number> &x,
2663  const ::VectorizedArray<Number> &y)
2664  {
2665  return x.get_min(y);
2666  }
2667 
2668 }
2669 
2670 #endif
void store(Number *ptr) const
VectorizedArray< Number > log(const ::VectorizedArray< Number > &x)
VectorizedArray get_max(const VectorizedArray &other) const
VectorizedArray< Number > make_vectorized_array(const Number &u)
VectorizedArray< Number > tan(const ::VectorizedArray< Number > &x)
#define AssertIndexRange(index, range)
Definition: exceptions.h:1081
STL namespace.
VectorizedArray< Number > exp(const ::VectorizedArray< Number > &x)
VectorizedArray get_abs() const
void vectorized_transpose_and_store(const bool add_into, const unsigned int n_entries, const VectorizedArray< Number > *in, const unsigned int *offsets, Number *out)
VectorizedArray< Number > min(const ::VectorizedArray< Number > &x, const ::VectorizedArray< Number > &y)
#define Assert(cond, exc)
Definition: exceptions.h:294
VectorizedArray get_sqrt() const
VectorizedArray< Number > pow(const ::VectorizedArray< Number > &x, const Number p)
VectorizedArray get_min(const VectorizedArray &other) const
void vectorized_load_and_transpose(const unsigned int n_entries, const Number *in, const unsigned int *offsets, VectorizedArray< Number > *out)
VectorizedArray< Number > sqrt(const ::VectorizedArray< Number > &x)
VectorizedArray< Number > sin(const ::VectorizedArray< Number > &x)
void load(const Number *ptr)
T min(const T &t, const MPI_Comm &mpi_communicator)
Definition: mpi.h:722
VectorizedArray< Number > abs(const ::VectorizedArray< Number > &x)
VectorizedArray< Number > max(const ::VectorizedArray< Number > &x, const ::VectorizedArray< Number > &y)
T max(const T &t, const MPI_Comm &mpi_communicator)
Definition: mpi.h:693
VectorizedArray< Number > cos(const ::VectorizedArray< Number > &x)