#ifndef dealii__vectorization_h
#define dealii__vectorization_h

#include <deal.II/base/config.h>
#include <deal.II/base/exceptions.h>

#include <cmath>
#include <algorithm>

#if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 // AVX, AVX-512
#include <immintrin.h>
#elif DEAL_II_COMPILER_VECTORIZATION_LEVEL == 1 // SSE2
#include <emmintrin.h>
#endif

DEAL_II_NAMESPACE_OPEN
// Forward declaration, so that the overloads declared in namespace std
// below can refer to the class.
template <typename Number> class VectorizedArray;

DEAL_II_NAMESPACE_CLOSE

// Declare the standard math functions for VectorizedArray in namespace
// std; the definitions appear at the bottom of this file.
namespace std
{
  template <typename Number> ::dealii::VectorizedArray<Number>
  sqrt (const ::dealii::VectorizedArray<Number> &);

  template <typename Number> ::dealii::VectorizedArray<Number>
  abs (const ::dealii::VectorizedArray<Number> &);

  template <typename Number> ::dealii::VectorizedArray<Number>
  max (const ::dealii::VectorizedArray<Number> &,
       const ::dealii::VectorizedArray<Number> &);

  template <typename Number> ::dealii::VectorizedArray<Number>
  min (const ::dealii::VectorizedArray<Number> &,
       const ::dealii::VectorizedArray<Number> &);
}
DEAL_II_NAMESPACE_OPEN

/**
 * A class that represents a short vector of entries of type Number on
 * which SIMD instructions operate in parallel. This generic version stores
 * a single element, i.e., it performs no actual vectorization; the
 * architecture-specific specializations below replace it whenever the
 * compiler and hardware support wider registers.
 */
template <typename Number>
class VectorizedArray
{
public:
  /**
   * The number of elements stored in this vectorized data type; it equals
   * one for the generic class.
   */
  static const unsigned int n_array_elements = 1;
  /** Assign a scalar to all elements of the array. */
  VectorizedArray &
  operator = (const Number scalar)
  {
    data = scalar;
    return *this;
  }

  /** Access operator; only component 0 is valid for the generic class. */
  Number &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 1);
    return data;
  }

  /** Constant access operator. */
  const Number &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 1);
    return data;
  }

  /** Compound assignment operators act directly on the scalar data field. */
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
    data += vec.data;
    return *this;
  }

  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
    data -= vec.data;
    return *this;
  }

  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
    data *= vec.data;
    return *this;
  }

  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
    data /= vec.data;
    return *this;
  }

  /** Load size() data items from memory; the pointer need not be aligned. */
  void load (const Number *ptr)
  {
    data = *ptr;
  }

  /** Write the content to memory; the pointer need not be aligned. */
  void store (Number *ptr) const
  {
    *ptr = data;
  }

  /** The actual data field; public to keep this class a POD. */
  Number data;

  /** Return the square root. Call std::sqrt(x) in user code instead. */
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = std::sqrt(data);
    return res;
  }

  /** Return the absolute value. Call std::abs(x) in user code instead. */
  VectorizedArray
  get_abs () const
  {
    VectorizedArray res;
    res.data = std::fabs(data);
    return res;
  }

  /** Return the component-wise maximum with another field. */
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = std::max (data, other.data);
    return res;
  }

  /** Return the component-wise minimum with another field. */
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = std::min (data, other.data);
    return res;
  }
};
/**
 * Create a vectorized array that sets all entries to the given scalar.
 */
template <typename Number>
inline
VectorizedArray<Number>
make_vectorized_array (const Number &u)
{
  VectorizedArray<Number> result;
  result = u;
  return result;
}
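// Usage sketch (illustrative only, not part of the header; the variable
// names are hypothetical): broadcast once, then operate on all lanes of
// the register at a time.
//
//   VectorizedArray<double> x = make_vectorized_array (1.5);
//   x += make_vectorized_array (2.0);    // every lane now holds 3.5
//   const double first = x[0];           // scalar access to lane 0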
/**
 * Load n_entries values stored at the positions offsets[v]+i of the input
 * array into the vectorized arrays out, effectively transposing an
 * [n_array_elements x n_entries] block of data. The generic version uses
 * scalar accesses; the specializations below use vector shuffles.
 */
template <typename Number>
inline
void
vectorized_load_and_transpose(const unsigned int n_entries,
                              const Number *in,
                              const unsigned int *offsets,
                              VectorizedArray<Number> *out)
{
  for (unsigned int i=0; i<n_entries; ++i)
    for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
      out[i][v] = in[offsets[v]+i];
}
/**
 * Store the vectorized arrays in transposed form into the output array at
 * the positions given by offsets, i.e., the inverse of
 * vectorized_load_and_transpose. If add_into is true, the values are
 * accumulated into the present content of out rather than overwriting it.
 */
template <typename Number>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<Number> *in,
                               const unsigned int *offsets,
                               Number *out)
{
  if (add_into)
    for (unsigned int i=0; i<n_entries; ++i)
      for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
        out[offsets[v]+i] += in[i][v];
  else
    for (unsigned int i=0; i<n_entries; ++i)
      for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
        out[offsets[v]+i] = in[i][v];
}
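// Worked example of the two helpers above (a sketch assuming
// n_array_elements == 4; the numbers are illustrative): with
// offsets = {0, 8, 16, 24} and n_entries = 3, the load variant gathers
//
//   out[0] = {in[0], in[8],  in[16], in[24]}
//   out[1] = {in[1], in[9],  in[17], in[25]}
//   out[2] = {in[2], in[10], in[18], in[26]}
//
// i.e., entry i of lane v comes from in[offsets[v]+i];
// vectorized_transpose_and_store applies the inverse mapping and either
// overwrites or accumulates, depending on add_into.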
#if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 3 && defined(__AVX512F__)

/**
 * Specialization of VectorizedArray for double and AVX-512.
 */
template <>
class VectorizedArray<double>
{
public:
  /** A 512-bit register holds eight doubles. */
  static const unsigned int n_array_elements = 8;
  /** Broadcast a scalar to all elements. */
  VectorizedArray &
  operator = (const double x)
  {
    data = _mm512_set1_pd(x);
    return *this;
  }

  /** Access operator. */
  double &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 8);
    return *(reinterpret_cast<double *>(&data)+comp);
  }

  /** Constant access operator. */
  const double &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 8);
    return *(reinterpret_cast<const double *>(&data)+comp);
  }
  /**
   * Compound assignment: if the compiler supports vector arithmetics, use
   * the plain operator so that fused multiply-add instructions can be
   * generated; otherwise fall back to the intrinsic.
   */
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm512_add_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm512_sub_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm512_mul_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm512_div_pd(data,vec.data);
#endif
    return *this;
  }
  /** Load size() data items from memory; the pointer need not be aligned. */
  void load (const double *ptr)
  {
    data = _mm512_loadu_pd (ptr);
  }

  /** Write the content to memory; the pointer need not be aligned. */
  void store (double *ptr) const
  {
    _mm512_storeu_pd (ptr, data);
  }
  /** The actual data field; public to keep the class a POD. */
  __m512d data;

  /** Return the square root. Call std::sqrt(x) in user code instead. */
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm512_sqrt_pd(data);
    return res;
  }

  /** Return the absolute value by clearing the sign bit of each element. */
  VectorizedArray
  get_abs () const
  {
    // -0. has only the sign bit set, so andnot clears exactly that bit;
    // the AVX-512 bitwise intrinsics operate on integer registers, hence
    // the casts
    __m512d mask = _mm512_set1_pd (-0.);
    VectorizedArray res;
    res.data = _mm512_castsi512_pd (_mm512_andnot_epi64 (_mm512_castpd_si512(mask),
                                                         _mm512_castpd_si512(data)));
    return res;
  }

  /** Return the component-wise maximum with another field. */
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm512_max_pd (data, other.data);
    return res;
  }

  /** Return the component-wise minimum with another field. */
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm512_min_pd (data, other.data);
    return res;
  }
};
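// A short aside on the get_abs() trick above (a sketch, not library code):
// an IEEE-754 double is laid out as [sign | exponent | mantissa], and -0.0
// is the bit pattern with only the sign bit set. Hence
//
//   abs(x) = andnot(-0.0, x) = (~sign_mask) & x
//
// clears the sign bit of every lane with a single bitwise instruction,
// without branches and without data-dependent latency.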
/**
 * Specialization of VectorizedArray for float and AVX-512.
 */
template <>
class VectorizedArray<float>
{
public:
  /** A 512-bit register holds sixteen floats. */
  static const unsigned int n_array_elements = 16;
  /** Broadcast a scalar to all elements. */
  VectorizedArray &
  operator = (const float x)
  {
    data = _mm512_set1_ps(x);
    return *this;
  }

  /** Access operator. */
  float &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 16);
    return *(reinterpret_cast<float *>(&data)+comp);
  }

  /** Constant access operator. */
  const float &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 16);
    return *(reinterpret_cast<const float *>(&data)+comp);
  }
  /** Compound assignment, with the same vector-arithmetics fallback as above. */
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm512_add_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm512_sub_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm512_mul_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm512_div_ps(data,vec.data);
#endif
    return *this;
  }
  /** Load size() data items from memory; the pointer need not be aligned. */
  void load (const float *ptr)
  {
    data = _mm512_loadu_ps (ptr);
  }

  /** Write the content to memory; the pointer need not be aligned. */
  void store (float *ptr) const
  {
    _mm512_storeu_ps (ptr, data);
  }
  /** The actual data field; public to keep the class a POD. */
  __m512 data;

  /** Return the square root. Call std::sqrt(x) in user code instead. */
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm512_sqrt_ps(data);
    return res;
  }

  /** Return the absolute value by clearing the sign bit of each element. */
  VectorizedArray
  get_abs () const
  {
    // -0.f has only the sign bit set; the bitwise intrinsics operate on
    // integer registers, hence the casts
    __m512 mask = _mm512_set1_ps (-0.f);
    VectorizedArray res;
    res.data = _mm512_castsi512_ps (_mm512_andnot_epi32 (_mm512_castps_si512(mask),
                                                         _mm512_castps_si512(data)));
    return res;
  }

  /** Return the component-wise maximum with another field. */
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm512_max_ps (data, other.data);
    return res;
  }

  /** Return the component-wise minimum with another field. */
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm512_min_ps (data, other.data);
    return res;
  }
};
#elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 && defined(__AVX__)

/**
 * Specialization of VectorizedArray for double and AVX.
 */
template <>
class VectorizedArray<double>
{
public:
  /** A 256-bit register holds four doubles. */
  static const unsigned int n_array_elements = 4;
  /** Broadcast a scalar to all elements. */
  VectorizedArray &
  operator = (const double x)
  {
    data = _mm256_set1_pd(x);
    return *this;
  }

  /** Access operator. */
  double &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 4);
    return *(reinterpret_cast<double *>(&data)+comp);
  }

  /** Constant access operator. */
  const double &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 4);
    return *(reinterpret_cast<const double *>(&data)+comp);
  }
  /** Compound assignment, with the same vector-arithmetics fallback as above. */
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm256_add_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm256_sub_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm256_mul_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm256_div_pd(data,vec.data);
#endif
    return *this;
  }
  /** Load size() data items from memory; the pointer need not be aligned. */
  void load (const double *ptr)
  {
    data = _mm256_loadu_pd (ptr);
  }

  /** Write the content to memory; the pointer need not be aligned. */
  void store (double *ptr) const
  {
    _mm256_storeu_pd (ptr, data);
  }
  /** The actual data field; public to keep the class a POD. */
  __m256d data;

  /** Return the square root. Call std::sqrt(x) in user code instead. */
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm256_sqrt_pd(data);
    return res;
  }

  /** Return the absolute value by clearing the sign bit of each element. */
  VectorizedArray
  get_abs () const
  {
    // -0. has only the sign bit set; andnot clears exactly that bit
    __m256d mask = _mm256_set1_pd (-0.);
    VectorizedArray res;
    res.data = _mm256_andnot_pd(mask, data);
    return res;
  }

  /** Return the component-wise maximum with another field. */
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm256_max_pd (data, other.data);
    return res;
  }

  /** Return the component-wise minimum with another field. */
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm256_min_pd (data, other.data);
    return res;
  }
};
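// Usage sketch for load()/store() (illustrative only; `src` and `dst` are
// hypothetical buffers). Unaligned pointers are fine, since the intrinsics
// used above are the unaligned variants; with AVX this moves four doubles
// per call.
//
//   std::vector<double> src(4, 1.0), dst(4);
//   VectorizedArray<double> v;
//   v.load (&src[0]);     // read four doubles
//   v *= v;               // square each lane
//   v.store (&dst[0]);    // write four doubles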
/**
 * Specialization of vectorized_load_and_transpose for double and AVX.
 */
template <>
inline
void
vectorized_load_and_transpose(const unsigned int n_entries,
                              const double *in,
                              const unsigned int *offsets,
                              VectorizedArray<double> *out)
{
  const unsigned int n_chunks = n_entries/4, remainder = n_entries%4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m256d u0 = _mm256_loadu_pd(in+4*i+offsets[0]);
      __m256d u1 = _mm256_loadu_pd(in+4*i+offsets[1]);
      __m256d u2 = _mm256_loadu_pd(in+4*i+offsets[2]);
      __m256d u3 = _mm256_loadu_pd(in+4*i+offsets[3]);
      __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
      __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
      __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
      __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
      out[4*i+0].data = _mm256_unpacklo_pd (t0, t1);
      out[4*i+1].data = _mm256_unpackhi_pd (t0, t1);
      out[4*i+2].data = _mm256_unpacklo_pd (t2, t3);
      out[4*i+3].data = _mm256_unpackhi_pd (t2, t3);
    }
  if (remainder > 0 && n_chunks > 0)
    {
      // Transpose the last four entries with vector loads that overlap the
      // previous chunk; this is valid because at least one full chunk was
      // processed above.
      const unsigned int final_pos = n_chunks*4-4+remainder;
      Assert(final_pos+4 == n_entries, ExcInternalError());
      __m256d u0 = _mm256_loadu_pd(in+final_pos+offsets[0]);
      __m256d u1 = _mm256_loadu_pd(in+final_pos+offsets[1]);
      __m256d u2 = _mm256_loadu_pd(in+final_pos+offsets[2]);
      __m256d u3 = _mm256_loadu_pd(in+final_pos+offsets[3]);
      __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
      __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
      __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
      __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
      out[final_pos+0].data = _mm256_unpacklo_pd (t0, t1);
      out[final_pos+1].data = _mm256_unpackhi_pd (t0, t1);
      out[final_pos+2].data = _mm256_unpacklo_pd (t2, t3);
      out[final_pos+3].data = _mm256_unpackhi_pd (t2, t3);
    }
  else if (remainder > 0)
    // fewer than four entries in total: fall back to scalar accesses
    for (unsigned int i=0; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[i][v] = in[offsets[v]+i];
}
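// The shuffle network above implements a 4x4 transpose (index sketch for
// illustration; subscripts denote lane positions within a register). With
// rows u0..u3 loaded from the four offset streams, the 128-bit permutes
// first exchange the lower/upper halves,
//
//   t0 = [u0_0 u0_1 u2_0 u2_1]    t2 = [u0_2 u0_3 u2_2 u2_3]
//   t1 = [u1_0 u1_1 u3_0 u3_1]    t3 = [u1_2 u1_3 u3_2 u3_3]
//
// and the unpack instructions then interleave even/odd elements per
// half, so that out[j] ends up holding element j of all four streams.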
/**
 * Specialization of vectorized_transpose_and_store for double and AVX.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<double> *in,
                               const unsigned int *offsets,
                               double *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m256d u0 = in[4*i+0].data;
      __m256d u1 = in[4*i+1].data;
      __m256d u2 = in[4*i+2].data;
      __m256d u3 = in[4*i+3].data;
      __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
      __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
      __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
      __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
      __m256d res0 = _mm256_unpacklo_pd (t0, t1);
      __m256d res1 = _mm256_unpackhi_pd (t0, t1);
      __m256d res2 = _mm256_unpacklo_pd (t2, t3);
      __m256d res3 = _mm256_unpackhi_pd (t2, t3);

      // Cannot use the same store instructions in both branches of the
      // 'if' because the compiler cannot know that there is no aliasing
      // between the pointers.
      if (add_into)
        {
          res0 = _mm256_add_pd(_mm256_loadu_pd(out+4*i+offsets[0]), res0);
          _mm256_storeu_pd(out+4*i+offsets[0], res0);
          res1 = _mm256_add_pd(_mm256_loadu_pd(out+4*i+offsets[1]), res1);
          _mm256_storeu_pd(out+4*i+offsets[1], res1);
          res2 = _mm256_add_pd(_mm256_loadu_pd(out+4*i+offsets[2]), res2);
          _mm256_storeu_pd(out+4*i+offsets[2], res2);
          res3 = _mm256_add_pd(_mm256_loadu_pd(out+4*i+offsets[3]), res3);
          _mm256_storeu_pd(out+4*i+offsets[3], res3);
        }
      else
        {
          _mm256_storeu_pd(out+4*i+offsets[0], res0);
          _mm256_storeu_pd(out+4*i+offsets[1], res1);
          _mm256_storeu_pd(out+4*i+offsets[2], res2);
          _mm256_storeu_pd(out+4*i+offsets[3], res3);
        }
    }

  // remainder that does not fill a whole chunk: scalar accesses
  const unsigned int shift = n_chunks * 4;
  if (add_into)
    for (unsigned int i=shift; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[offsets[v]+i] += in[i][v];
  else
    for (unsigned int i=shift; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[offsets[v]+i] = in[i][v];
}
/**
 * Specialization of VectorizedArray for float and AVX.
 */
template <>
class VectorizedArray<float>
{
public:
  /** A 256-bit register holds eight floats. */
  static const unsigned int n_array_elements = 8;
  /** Broadcast a scalar to all elements. */
  VectorizedArray &
  operator = (const float x)
  {
    data = _mm256_set1_ps(x);
    return *this;
  }

  /** Access operator. */
  float &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 8);
    return *(reinterpret_cast<float *>(&data)+comp);
  }

  /** Constant access operator. */
  const float &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 8);
    return *(reinterpret_cast<const float *>(&data)+comp);
  }
  /** Compound assignment, with the same vector-arithmetics fallback as above. */
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm256_add_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm256_sub_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm256_mul_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm256_div_ps(data,vec.data);
#endif
    return *this;
  }
  /** Load size() data items from memory; the pointer need not be aligned. */
  void load (const float *ptr)
  {
    data = _mm256_loadu_ps (ptr);
  }

  /** Write the content to memory; the pointer need not be aligned. */
  void store (float *ptr) const
  {
    _mm256_storeu_ps (ptr, data);
  }
  /** The actual data field; public to keep the class a POD. */
  __m256 data;

  /** Return the square root. Call std::sqrt(x) in user code instead. */
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm256_sqrt_ps(data);
    return res;
  }

  /** Return the absolute value by clearing the sign bit of each element. */
  VectorizedArray
  get_abs () const
  {
    // -0.f has only the sign bit set; andnot clears exactly that bit
    __m256 mask = _mm256_set1_ps (-0.f);
    VectorizedArray res;
    res.data = _mm256_andnot_ps(mask, data);
    return res;
  }

  /** Return the component-wise maximum with another field. */
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm256_max_ps (data, other.data);
    return res;
  }

  /** Return the component-wise minimum with another field. */
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm256_min_ps (data, other.data);
    return res;
  }
};
/**
 * Specialization of vectorized_load_and_transpose for float and AVX.
 */
template <>
inline
void
vectorized_load_and_transpose(const unsigned int n_entries,
                              const float *in,
                              const unsigned int *offsets,
                              VectorizedArray<float> *out)
{
  const unsigned int n_chunks = n_entries/4, remainder = n_entries%4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m128 u0 = _mm_loadu_ps(in+4*i+offsets[0]);
      __m128 u1 = _mm_loadu_ps(in+4*i+offsets[1]);
      __m128 u2 = _mm_loadu_ps(in+4*i+offsets[2]);
      __m128 u3 = _mm_loadu_ps(in+4*i+offsets[3]);
      __m128 u4 = _mm_loadu_ps(in+4*i+offsets[4]);
      __m128 u5 = _mm_loadu_ps(in+4*i+offsets[5]);
      __m128 u6 = _mm_loadu_ps(in+4*i+offsets[6]);
      __m128 u7 = _mm_loadu_ps(in+4*i+offsets[7]);
      // To avoid warnings about uninitialized variables, set one variable
      // to zero and use it as the basis of the 128-bit inserts.
      __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
      t0 = _mm256_insertf128_ps (t3, u0, 0);
      t0 = _mm256_insertf128_ps (t0, u4, 1);
      t1 = _mm256_insertf128_ps (t3, u1, 0);
      t1 = _mm256_insertf128_ps (t1, u5, 1);
      t2 = _mm256_insertf128_ps (t3, u2, 0);
      t2 = _mm256_insertf128_ps (t2, u6, 1);
      t3 = _mm256_insertf128_ps (t3, u3, 0);
      t3 = _mm256_insertf128_ps (t3, u7, 1);
      __m256 v0 = _mm256_shuffle_ps (t0, t1, 0x44);
      __m256 v1 = _mm256_shuffle_ps (t0, t1, 0xee);
      __m256 v2 = _mm256_shuffle_ps (t2, t3, 0x44);
      __m256 v3 = _mm256_shuffle_ps (t2, t3, 0xee);
      out[4*i+0].data = _mm256_shuffle_ps (v0, v2, 0x88);
      out[4*i+1].data = _mm256_shuffle_ps (v0, v2, 0xdd);
      out[4*i+2].data = _mm256_shuffle_ps (v1, v3, 0x88);
      out[4*i+3].data = _mm256_shuffle_ps (v1, v3, 0xdd);
    }
  if (remainder > 0 && n_chunks > 0)
    {
      // Transpose the last four entries with vector loads that overlap the
      // previous chunk; valid because at least one full chunk exists.
      const unsigned int final_pos = n_chunks*4-4+remainder;
      Assert(final_pos+4 == n_entries, ExcInternalError());
      __m128 u0 = _mm_loadu_ps(in+final_pos+offsets[0]);
      __m128 u1 = _mm_loadu_ps(in+final_pos+offsets[1]);
      __m128 u2 = _mm_loadu_ps(in+final_pos+offsets[2]);
      __m128 u3 = _mm_loadu_ps(in+final_pos+offsets[3]);
      __m128 u4 = _mm_loadu_ps(in+final_pos+offsets[4]);
      __m128 u5 = _mm_loadu_ps(in+final_pos+offsets[5]);
      __m128 u6 = _mm_loadu_ps(in+final_pos+offsets[6]);
      __m128 u7 = _mm_loadu_ps(in+final_pos+offsets[7]);
      __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
      t0 = _mm256_insertf128_ps (t3, u0, 0);
      t0 = _mm256_insertf128_ps (t0, u4, 1);
      t1 = _mm256_insertf128_ps (t3, u1, 0);
      t1 = _mm256_insertf128_ps (t1, u5, 1);
      t2 = _mm256_insertf128_ps (t3, u2, 0);
      t2 = _mm256_insertf128_ps (t2, u6, 1);
      t3 = _mm256_insertf128_ps (t3, u3, 0);
      t3 = _mm256_insertf128_ps (t3, u7, 1);
      __m256 v0 = _mm256_shuffle_ps (t0, t1, 0x44);
      __m256 v1 = _mm256_shuffle_ps (t0, t1, 0xee);
      __m256 v2 = _mm256_shuffle_ps (t2, t3, 0x44);
      __m256 v3 = _mm256_shuffle_ps (t2, t3, 0xee);
      out[final_pos+0].data = _mm256_shuffle_ps (v0, v2, 0x88);
      out[final_pos+1].data = _mm256_shuffle_ps (v0, v2, 0xdd);
      out[final_pos+2].data = _mm256_shuffle_ps (v1, v3, 0x88);
      out[final_pos+3].data = _mm256_shuffle_ps (v1, v3, 0xdd);
    }
  else if (remainder > 0)
    // fewer than four entries in total: fall back to scalar accesses
    for (unsigned int i=0; i<n_entries; ++i)
      for (unsigned int v=0; v<8; ++v)
        out[i][v] = in[offsets[v]+i];
}
/**
 * Specialization of vectorized_transpose_and_store for float and AVX.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<float> *in,
                               const unsigned int *offsets,
                               float *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m256 u0 = in[4*i+0].data;
      __m256 u1 = in[4*i+1].data;
      __m256 u2 = in[4*i+2].data;
      __m256 u3 = in[4*i+3].data;
      __m256 t0 = _mm256_shuffle_ps (u0, u1, 0x44);
      __m256 t1 = _mm256_shuffle_ps (u0, u1, 0xee);
      __m256 t2 = _mm256_shuffle_ps (u2, u3, 0x44);
      __m256 t3 = _mm256_shuffle_ps (u2, u3, 0xee);
      u0 = _mm256_shuffle_ps (t0, t2, 0x88);
      u1 = _mm256_shuffle_ps (t0, t2, 0xdd);
      u2 = _mm256_shuffle_ps (t1, t3, 0x88);
      u3 = _mm256_shuffle_ps (t1, t3, 0xdd);
      __m128 res0 = _mm256_extractf128_ps (u0, 0);
      __m128 res4 = _mm256_extractf128_ps (u0, 1);
      __m128 res1 = _mm256_extractf128_ps (u1, 0);
      __m128 res5 = _mm256_extractf128_ps (u1, 1);
      __m128 res2 = _mm256_extractf128_ps (u2, 0);
      __m128 res6 = _mm256_extractf128_ps (u2, 1);
      __m128 res3 = _mm256_extractf128_ps (u3, 0);
      __m128 res7 = _mm256_extractf128_ps (u3, 1);

      // Cannot use the same store instructions in both branches of the
      // 'if' because the compiler cannot know that there is no aliasing
      // between the pointers.
      if (add_into)
        {
          res0 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[0]), res0);
          _mm_storeu_ps(out+4*i+offsets[0], res0);
          res1 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[1]), res1);
          _mm_storeu_ps(out+4*i+offsets[1], res1);
          res2 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[2]), res2);
          _mm_storeu_ps(out+4*i+offsets[2], res2);
          res3 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[3]), res3);
          _mm_storeu_ps(out+4*i+offsets[3], res3);
          res4 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[4]), res4);
          _mm_storeu_ps(out+4*i+offsets[4], res4);
          res5 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[5]), res5);
          _mm_storeu_ps(out+4*i+offsets[5], res5);
          res6 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[6]), res6);
          _mm_storeu_ps(out+4*i+offsets[6], res6);
          res7 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[7]), res7);
          _mm_storeu_ps(out+4*i+offsets[7], res7);
        }
      else
        {
          _mm_storeu_ps(out+4*i+offsets[0], res0);
          _mm_storeu_ps(out+4*i+offsets[1], res1);
          _mm_storeu_ps(out+4*i+offsets[2], res2);
          _mm_storeu_ps(out+4*i+offsets[3], res3);
          _mm_storeu_ps(out+4*i+offsets[4], res4);
          _mm_storeu_ps(out+4*i+offsets[5], res5);
          _mm_storeu_ps(out+4*i+offsets[6], res6);
          _mm_storeu_ps(out+4*i+offsets[7], res7);
        }
    }

  // remainder that does not fill a whole chunk: scalar accesses
  const unsigned int shift = n_chunks * 4;
  if (add_into)
    for (unsigned int i=shift; i<n_entries; ++i)
      for (unsigned int v=0; v<8; ++v)
        out[offsets[v]+i] += in[i][v];
  else
    for (unsigned int i=shift; i<n_entries; ++i)
      for (unsigned int v=0; v<8; ++v)
        out[offsets[v]+i] = in[i][v];
}
#elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)

/**
 * Specialization of VectorizedArray for double and SSE2.
 */
template <>
class VectorizedArray<double>
{
public:
  /** A 128-bit register holds two doubles. */
  static const unsigned int n_array_elements = 2;
  /** Broadcast a scalar to all elements. */
  VectorizedArray &
  operator = (const double x)
  {
    data = _mm_set1_pd(x);
    return *this;
  }

  /** Access operator. */
  double &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 2);
    return *(reinterpret_cast<double *>(&data)+comp);
  }

  /** Constant access operator. */
  const double &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 2);
    return *(reinterpret_cast<const double *>(&data)+comp);
  }
  /** Compound assignment, with the same vector-arithmetics fallback as above. */
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm_add_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm_sub_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm_mul_pd(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm_div_pd(data,vec.data);
#endif
    return *this;
  }
  /** Load size() data items from memory; the pointer need not be aligned. */
  void load (const double *ptr)
  {
    data = _mm_loadu_pd (ptr);
  }

  /** Write the content to memory; the pointer need not be aligned. */
  void store (double *ptr) const
  {
    _mm_storeu_pd (ptr, data);
  }
  /** The actual data field; public to keep the class a POD. */
  __m128d data;

  /** Return the square root. Call std::sqrt(x) in user code instead. */
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm_sqrt_pd(data);
    return res;
  }

  /** Return the absolute value by clearing the sign bit of each element. */
  VectorizedArray
  get_abs () const
  {
    // -0. has only the sign bit set; andnot clears exactly that bit
    __m128d mask = _mm_set1_pd (-0.);
    VectorizedArray res;
    res.data = _mm_andnot_pd(mask, data);
    return res;
  }

  /** Return the component-wise maximum with another field. */
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm_max_pd (data, other.data);
    return res;
  }

  /** Return the component-wise minimum with another field. */
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm_min_pd (data, other.data);
    return res;
  }
};
/**
 * Specialization of vectorized_load_and_transpose for double and SSE2.
 */
template <>
inline
void vectorized_load_and_transpose(const unsigned int n_entries,
                                   const double *in,
                                   const unsigned int *offsets,
                                   VectorizedArray<double> *out)
{
  const unsigned int n_chunks = n_entries/2, remainder = n_entries%2;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m128d u0 = _mm_loadu_pd(in+2*i+offsets[0]);
      __m128d u1 = _mm_loadu_pd(in+2*i+offsets[1]);
      out[2*i+0].data = _mm_unpacklo_pd (u0, u1);
      out[2*i+1].data = _mm_unpackhi_pd (u0, u1);
    }
  if (remainder > 0)
    // an odd number of entries is handled by scalar accesses
    for (unsigned int i=0; i<n_entries; ++i)
      for (unsigned int v=0; v<2; ++v)
        out[i][v] = in[offsets[v]+i];
}
/**
 * Specialization of vectorized_transpose_and_store for double and SSE2.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<double> *in,
                               const unsigned int *offsets,
                               double *out)
{
  const unsigned int n_chunks = n_entries/2;
  if (add_into)
    {
      for (unsigned int i=0; i<n_chunks; ++i)
        {
          __m128d u0 = in[2*i+0].data;
          __m128d u1 = in[2*i+1].data;
          __m128d res0 = _mm_unpacklo_pd (u0, u1);
          __m128d res1 = _mm_unpackhi_pd (u0, u1);
          _mm_storeu_pd(out+2*i+offsets[0], _mm_add_pd(_mm_loadu_pd(out+2*i+offsets[0]), res0));
          _mm_storeu_pd(out+2*i+offsets[1], _mm_add_pd(_mm_loadu_pd(out+2*i+offsets[1]), res1));
        }
      const unsigned int shift = n_chunks * 2;
      for (unsigned int i=shift; i<n_entries; ++i)
        for (unsigned int v=0; v<2; ++v)
          out[offsets[v]+i] += in[i][v];
    }
  else
    {
      for (unsigned int i=0; i<n_chunks; ++i)
        {
          __m128d u0 = in[2*i+0].data;
          __m128d u1 = in[2*i+1].data;
          __m128d res0 = _mm_unpacklo_pd (u0, u1);
          __m128d res1 = _mm_unpackhi_pd (u0, u1);
          _mm_storeu_pd(out+2*i+offsets[0], res0);
          _mm_storeu_pd(out+2*i+offsets[1], res1);
        }
      const unsigned int shift = n_chunks * 2;
      for (unsigned int i=shift; i<n_entries; ++i)
        for (unsigned int v=0; v<2; ++v)
          out[offsets[v]+i] = in[i][v];
    }
}
/**
 * Specialization of VectorizedArray for float and SSE2.
 */
template <>
class VectorizedArray<float>
{
public:
  /** A 128-bit register holds four floats. */
  static const unsigned int n_array_elements = 4;
  /** Broadcast a scalar to all elements. */
  VectorizedArray &
  operator = (const float x)
  {
    data = _mm_set1_ps(x);
    return *this;
  }

  /** Access operator. */
  float &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 4);
    return *(reinterpret_cast<float *>(&data)+comp);
  }

  /** Constant access operator. */
  const float &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 4);
    return *(reinterpret_cast<const float *>(&data)+comp);
  }
  /** Compound assignment, with the same vector-arithmetics fallback as above. */
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm_add_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm_sub_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm_mul_ps(data,vec.data);
#endif
    return *this;
  }

  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm_div_ps(data,vec.data);
#endif
    return *this;
  }
  /** Load size() data items from memory; the pointer need not be aligned. */
  void load (const float *ptr)
  {
    data = _mm_loadu_ps (ptr);
  }

  /** Write the content to memory; the pointer need not be aligned. */
  void store (float *ptr) const
  {
    _mm_storeu_ps (ptr, data);
  }
  /** The actual data field; public to keep the class a POD. */
  __m128 data;

  /** Return the square root. Call std::sqrt(x) in user code instead. */
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm_sqrt_ps(data);
    return res;
  }

  /** Return the absolute value by clearing the sign bit of each element. */
  VectorizedArray
  get_abs () const
  {
    // -0.f has only the sign bit set; andnot clears exactly that bit
    __m128 mask = _mm_set1_ps (-0.f);
    VectorizedArray res;
    res.data = _mm_andnot_ps(mask, data);
    return res;
  }

  /** Return the component-wise maximum with another field. */
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm_max_ps (data, other.data);
    return res;
  }

  /** Return the component-wise minimum with another field. */
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm_min_ps (data, other.data);
    return res;
  }
};
/**
 * Specialization of vectorized_load_and_transpose for float and SSE2.
 */
template <>
inline
void vectorized_load_and_transpose(const unsigned int n_entries,
                                   const float *in,
                                   const unsigned int *offsets,
                                   VectorizedArray<float> *out)
{
  const unsigned int n_chunks = n_entries/4, remainder = n_entries%4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m128 u0 = _mm_loadu_ps(in+4*i+offsets[0]);
      __m128 u1 = _mm_loadu_ps(in+4*i+offsets[1]);
      __m128 u2 = _mm_loadu_ps(in+4*i+offsets[2]);
      __m128 u3 = _mm_loadu_ps(in+4*i+offsets[3]);
      __m128 v0 = _mm_shuffle_ps (u0, u1, 0x44);
      __m128 v1 = _mm_shuffle_ps (u0, u1, 0xee);
      __m128 v2 = _mm_shuffle_ps (u2, u3, 0x44);
      __m128 v3 = _mm_shuffle_ps (u2, u3, 0xee);
      out[4*i+0].data = _mm_shuffle_ps (v0, v2, 0x88);
      out[4*i+1].data = _mm_shuffle_ps (v0, v2, 0xdd);
      out[4*i+2].data = _mm_shuffle_ps (v1, v3, 0x88);
      out[4*i+3].data = _mm_shuffle_ps (v1, v3, 0xdd);
    }
  if (remainder > 0 && n_chunks > 0)
    {
      // Transpose the last four entries with vector loads that overlap the
      // previous chunk; valid because at least one full chunk exists.
      const unsigned int final_pos = n_chunks*4-4+remainder;
      Assert(final_pos+4 == n_entries, ExcInternalError());
      __m128 u0 = _mm_loadu_ps(in+final_pos+offsets[0]);
      __m128 u1 = _mm_loadu_ps(in+final_pos+offsets[1]);
      __m128 u2 = _mm_loadu_ps(in+final_pos+offsets[2]);
      __m128 u3 = _mm_loadu_ps(in+final_pos+offsets[3]);
      __m128 v0 = _mm_shuffle_ps (u0, u1, 0x44);
      __m128 v1 = _mm_shuffle_ps (u0, u1, 0xee);
      __m128 v2 = _mm_shuffle_ps (u2, u3, 0x44);
      __m128 v3 = _mm_shuffle_ps (u2, u3, 0xee);
      out[final_pos+0].data = _mm_shuffle_ps (v0, v2, 0x88);
      out[final_pos+1].data = _mm_shuffle_ps (v0, v2, 0xdd);
      out[final_pos+2].data = _mm_shuffle_ps (v1, v3, 0x88);
      out[final_pos+3].data = _mm_shuffle_ps (v1, v3, 0xdd);
    }
  else if (remainder > 0)
    // fewer than four entries in total: fall back to scalar accesses
    for (unsigned int i=0; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[i][v] = in[offsets[v]+i];
}
/**
 * Specialization of vectorized_transpose_and_store for float and SSE2.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<float> *in,
                               const unsigned int *offsets,
                               float *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m128 u0 = in[4*i+0].data;
      __m128 u1 = in[4*i+1].data;
      __m128 u2 = in[4*i+2].data;
      __m128 u3 = in[4*i+3].data;
      __m128 t0 = _mm_shuffle_ps (u0, u1, 0x44);
      __m128 t1 = _mm_shuffle_ps (u0, u1, 0xee);
      __m128 t2 = _mm_shuffle_ps (u2, u3, 0x44);
      __m128 t3 = _mm_shuffle_ps (u2, u3, 0xee);
      u0 = _mm_shuffle_ps (t0, t2, 0x88);
      u1 = _mm_shuffle_ps (t0, t2, 0xdd);
      u2 = _mm_shuffle_ps (t1, t3, 0x88);
      u3 = _mm_shuffle_ps (t1, t3, 0xdd);

      // Cannot use the same store instructions in both branches of the
      // 'if' because the compiler cannot know that there is no aliasing
      // between the pointers.
      if (add_into)
        {
          u0 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[0]), u0);
          _mm_storeu_ps(out+4*i+offsets[0], u0);
          u1 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[1]), u1);
          _mm_storeu_ps(out+4*i+offsets[1], u1);
          u2 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[2]), u2);
          _mm_storeu_ps(out+4*i+offsets[2], u2);
          u3 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[3]), u3);
          _mm_storeu_ps(out+4*i+offsets[3], u3);
        }
      else
        {
          _mm_storeu_ps(out+4*i+offsets[0], u0);
          _mm_storeu_ps(out+4*i+offsets[1], u1);
          _mm_storeu_ps(out+4*i+offsets[2], u2);
          _mm_storeu_ps(out+4*i+offsets[3], u3);
        }
    }

  // remainder that does not fill a whole chunk: scalar accesses
  const unsigned int shift = n_chunks * 4;
  if (add_into)
    for (unsigned int i=shift; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[offsets[v]+i] += in[i][v];
  else
    for (unsigned int i=shift; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[offsets[v]+i] = in[i][v];
}
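// Usage sketch for the gather/work/scatter pattern served by the two
// helpers above (illustrative only; `src` and `dst` are hypothetical
// buffers, sized at least 4*n floats):
//
//   const unsigned int n = 8;                           // entries per stream
//   const unsigned int offsets[4] = {0, n, 2*n, 3*n};   // four input streams
//   VectorizedArray<float> tmp[8];
//   vectorized_load_and_transpose (n, src, offsets, tmp);
//   for (unsigned int i=0; i<n; ++i)
//     tmp[i] *= tmp[i];                                 // work on whole lanes
//   vectorized_transpose_and_store (true, n, tmp, offsets, dst); // accumulate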
#endif // if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0


/** Addition of two vectorized arrays. */
template <typename Number>
inline
VectorizedArray<Number>
operator + (const VectorizedArray<Number> &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp = u;
  return tmp+=v;
}

/** Subtraction of two vectorized arrays. */
template <typename Number>
inline
VectorizedArray<Number>
operator - (const VectorizedArray<Number> &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp = u;
  return tmp-=v;
}

/** Multiplication of two vectorized arrays. */
template <typename Number>
inline
VectorizedArray<Number>
operator * (const VectorizedArray<Number> &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp = u;
  return tmp*=v;
}

/** Division of two vectorized arrays. */
template <typename Number>
inline
VectorizedArray<Number>
operator / (const VectorizedArray<Number> &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp = u;
  return tmp/=v;
}
/** Addition of a scalar (expanded to all elements) and a vectorized array. */
template <typename Number>
inline
VectorizedArray<Number>
operator + (const Number &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return tmp+=v;
}

/**
 * Addition of a double scalar and a vectorized array of floats; the scalar
 * is converted to float so that the result remains a float array.
 */
inline
VectorizedArray<float>
operator + (const double &u,
            const VectorizedArray<float> &v)
{
  VectorizedArray<float> tmp;
  tmp = float(u);
  return tmp+=v;
}

/** Addition of a vectorized array and a scalar. */
template <typename Number>
inline
VectorizedArray<Number>
operator + (const VectorizedArray<Number> &v,
            const Number &u)
{
  return u + v;
}
/** Subtraction of a vectorized array from a scalar (expanded to all elements). */
template <typename Number>
inline
VectorizedArray<Number>
operator - (const Number &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return tmp-=v;
}

/** Subtraction of a vectorized float array from a double scalar. */
inline
VectorizedArray<float>
operator - (const double &u,
            const VectorizedArray<float> &v)
{
  VectorizedArray<float> tmp;
  tmp = float(u);
  return tmp-=v;
}

/** Subtraction of a scalar from a vectorized array. */
template <typename Number>
inline
VectorizedArray<Number>
operator - (const VectorizedArray<Number> &v,
            const Number &u)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return v-tmp;
}
/** Multiplication of a scalar (expanded to all elements) and a vectorized array. */
template <typename Number>
inline
VectorizedArray<Number>
operator * (const Number &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return tmp*=v;
}

/** Multiplication of a double scalar and a vectorized array of floats. */
inline
VectorizedArray<float>
operator * (const double &u,
            const VectorizedArray<float> &v)
{
  VectorizedArray<float> tmp;
  tmp = float(u);
  return tmp*=v;
}

/** Multiplication of a vectorized array and a scalar. */
template <typename Number>
inline
VectorizedArray<Number>
operator * (const VectorizedArray<Number> &v,
            const Number &u)
{
  return u * v;
}
/** Division of a scalar (expanded to all elements) by a vectorized array. */
template <typename Number>
inline
VectorizedArray<Number>
operator / (const Number &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return tmp/=v;
}

/** Division of a double scalar by a vectorized array of floats. */
inline
VectorizedArray<float>
operator / (const double &u,
            const VectorizedArray<float> &v)
{
  VectorizedArray<float> tmp;
  tmp = float(u);
  return tmp/=v;
}

/** Division of a vectorized array by a scalar. */
template <typename Number>
inline
VectorizedArray<Number>
operator / (const VectorizedArray<Number> &v,
            const Number &u)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return v/tmp;
}
/** Unary plus: return the vectorized array unchanged. */
template <typename Number>
inline
VectorizedArray<Number>
operator + (const VectorizedArray<Number> &u)
{
  return u;
}

/** Unary minus: negate by subtracting the argument from zero. */
template <typename Number>
inline
VectorizedArray<Number>
operator - (const VectorizedArray<Number> &u)
{
  // value-initialization sets all elements to zero
  return VectorizedArray<Number>()-u;
}
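// Usage sketch for the operator set above (illustrative only): scalars mix
// freely with vectorized arrays, including double literals with float
// arrays, without writing explicit broadcasts.
//
//   VectorizedArray<float> x = make_vectorized_array (2.0f);
//   VectorizedArray<float> y = 0.5 * x + 1.0f;   // broadcast happens inside
//   y = -y / x;                                  // unary minus, division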
DEAL_II_NAMESPACE_CLOSE

/*
 * Implementation of the cmath functions for VectorizedArray in namespace
 * std, as declared near the top of this file.
 */
namespace std
{
  /** Compute the sine of a vectorized data field, element by element. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  sin (const ::dealii::VectorizedArray<Number> &x)
  {
    // Put the scalar results into an intermediate array and read the whole
    // vector back with one unaligned load; this is cheaper than inserting
    // the elements one by one.
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i < ::dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::sin(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }
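  // Usage sketch (illustrative only): because these overloads live in
  // namespace std, generic code can call std::sin on scalars and on
  // vectorized arrays alike.
  //
  //   dealii::VectorizedArray<double> phase = dealii::make_vectorized_array (0.25);
  //   dealii::VectorizedArray<double> s = std::sin (phase);  // element-wise sine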
  /** Compute the cosine of a vectorized data field, element by element. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  cos (const ::dealii::VectorizedArray<Number> &x)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i < ::dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::cos(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }
  /** Compute the tangent of a vectorized data field, element by element. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  tan (const ::dealii::VectorizedArray<Number> &x)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i < ::dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::tan(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }
  /** Compute the exponential of a vectorized data field, element by element. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  exp (const ::dealii::VectorizedArray<Number> &x)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i < ::dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::exp(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }
  /** Compute the natural logarithm of a vectorized data field, element by element. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  log (const ::dealii::VectorizedArray<Number> &x)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i < ::dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::log(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }
  /** Compute the square root of a vectorized data field, element by element. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  sqrt (const ::dealii::VectorizedArray<Number> &x)
  {
    return x.get_sqrt();
  }
  /** Raise the elements of a vectorized data field to the power p. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  pow (const ::dealii::VectorizedArray<Number> &x,
       const Number p)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i < ::dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::pow(x[i], p);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }
  /** Compute the absolute value of a vectorized data field, element by element. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  abs (const ::dealii::VectorizedArray<Number> &x)
  {
    return x.get_abs();
  }
  /** Compute the component-wise maximum of two vectorized data fields. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  max (const ::dealii::VectorizedArray<Number> &x,
       const ::dealii::VectorizedArray<Number> &y)
  {
    return x.get_max(y);
  }
  /** Compute the component-wise minimum of two vectorized data fields. */
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  min (const ::dealii::VectorizedArray<Number> &x,
       const ::dealii::VectorizedArray<Number> &y)
  {
    return x.get_min(y);
  }
}

#endif