27 #ifdef DEAL_II_WITH_THREADS 28 # include <tbb/blocked_range.h> 29 # include <tbb/parallel_for.h> 30 # define TBB_SUPPRESS_DEPRECATED_MESSAGES 1 31 # include <tbb/task.h> 32 # include <tbb/task_scheduler_init.h> 33 # undef TBB_SUPPRESS_DEPRECATED_MESSAGES 46 namespace MatrixFreeFunctions
48 #ifdef DEAL_II_WITH_THREADS 62 , worker_pointer(worker_pointer)
63 , partition(partition)
64 , task_info(task_info)
72 , partition(partition)
73 , task_info(task_info)
92 used_worker->
boundary(std::make_pair(
111 const bool is_blocked)
113 , work(worker, partition, task_info)
114 , is_blocked(is_blocked)
122 if (is_blocked ==
true)
123 tbb::empty_task::spawn(*
dummy);
140 const unsigned int partition_in,
142 const bool is_blocked_in =
false)
144 , function(function_in)
147 , is_blocked(is_blocked_in)
153 tbb::empty_task *root =
154 new (tbb::task::allocate_root()) tbb::empty_task;
157 const unsigned int n_blocked_workers =
159 const unsigned int n_workers =
161 std::vector<CellWork *>
worker(n_workers);
162 std::vector<CellWork *> blocked_worker(n_blocked_workers);
164 root->set_ref_count(evens + 1);
165 for (
unsigned int j = 0; j < evens; j++)
167 worker[j] =
new (root->allocate_child())
174 worker[j]->set_ref_count(2);
175 blocked_worker[j - 1]->dummy =
176 new (worker[j]->allocate_child()) tbb::empty_task;
177 tbb::task::spawn(*blocked_worker[j - 1]);
180 worker[j]->set_ref_count(1);
183 blocked_worker[j] =
new (worker[j]->allocate_child())
194 worker[evens] =
new (worker[j]->allocate_child())
200 tbb::task::spawn(*worker[evens]);
204 tbb::empty_task *child =
205 new (worker[j]->allocate_child()) tbb::empty_task();
206 tbb::task::spawn(*child);
211 root->wait_for_all();
212 root->destroy(*root);
213 if (is_blocked ==
true)
214 tbb::empty_task::spawn(*
dummy);
238 const unsigned int partition_in)
247 const unsigned int start_index =
250 const unsigned int end_index =
253 worker.
cell(std::make_pair(start_index, end_index));
273 const unsigned int partition_in,
275 const bool is_blocked_in)
280 , is_blocked(is_blocked_in)
286 const unsigned int n_chunks =
291 parallel_for(tbb::blocked_range<unsigned int>(0, n_chunks, 1),
293 if (is_blocked ==
true)
294 tbb::empty_task::spawn(*
dummy);
316 , do_compress(do_compress)
322 if (do_compress ==
false)
334 #endif // DEAL_II_WITH_THREADS 349 partition_row_index[partition_row_index.size() - 2]);
353 #ifdef DEAL_II_WITH_THREADS 358 if (scheme == partition_partition)
360 tbb::empty_task *root =
361 new (tbb::task::allocate_root()) tbb::empty_task;
362 root->set_ref_count(evens + 1);
363 std::vector<partition::PartitionWork *>
worker(n_workers);
364 std::vector<partition::PartitionWork *> blocked_worker(
368 worker_compr->set_ref_count(1);
369 for (
unsigned int j = 0; j < evens; j++)
373 worker[j] =
new (root->allocate_child())
375 worker[j]->set_ref_count(2);
376 blocked_worker[j - 1]->dummy =
377 new (worker[j]->allocate_child()) tbb::empty_task;
378 tbb::task::spawn(*blocked_worker[j - 1]);
382 worker[j] =
new (worker_compr->allocate_child())
384 worker[j]->set_ref_count(2);
386 new (worker[j]->allocate_child())
388 tbb::task::spawn(*worker_dist);
392 blocked_worker[j] =
new (worker[j]->allocate_child())
399 worker[evens] =
new (worker[j]->allocate_child())
404 tbb::task::spawn(*worker[evens]);
408 tbb::empty_task *child =
409 new (worker[j]->allocate_child()) tbb::empty_task();
410 tbb::task::spawn(*child);
415 root->wait_for_all();
416 root->destroy(*root);
424 tbb::empty_task *root =
425 new (tbb::task::allocate_root()) tbb::empty_task;
426 root->set_ref_count(evens + 1);
427 const unsigned int n_blocked_workers =
428 odds - (odds + evens + 1) % 2;
429 const unsigned int n_workers =
430 cell_partition_data.size() - 1 - n_blocked_workers;
431 std::vector<color::PartitionWork *>
worker(n_workers);
432 std::vector<color::PartitionWork *> blocked_worker(
434 unsigned int worker_index = 0, slice_index = 0;
435 int spawn_index_child = -2;
438 worker_compr->set_ref_count(1);
439 for (
unsigned int part = 0;
440 part < partition_row_index.size() - 1;
444 worker[worker_index] =
445 new (worker_compr->allocate_child())
451 worker[worker_index] =
new (root->allocate_child())
457 for (; slice_index < partition_row_index[part + 1];
460 worker[worker_index]->set_ref_count(1);
462 worker[worker_index] =
463 new (worker[worker_index - 1]->allocate_child())
469 worker[worker_index]->set_ref_count(2);
472 blocked_worker[(part - 1) / 2]->
dummy =
473 new (worker[worker_index]->allocate_child())
476 if (spawn_index_child == -1)
477 tbb::task::spawn(*blocked_worker[(part - 1) / 2]);
480 Assert(spawn_index_child >= 0,
482 tbb::task::spawn(*worker[spawn_index_child]);
484 spawn_index_child = -2;
489 new (worker[worker_index]->allocate_child())
491 tbb::task::spawn(*worker_dist);
495 if (part < partition_row_index.size() - 1)
497 if (part < partition_row_index.size() - 2)
499 blocked_worker[part / 2] =
500 new (worker[worker_index - 1]->allocate_child())
506 if (slice_index < partition_row_index[part + 1])
508 blocked_worker[part / 2]->set_ref_count(1);
509 worker[worker_index] =
new (
510 blocked_worker[part / 2]->allocate_child())
519 spawn_index_child = -1;
523 for (; slice_index < partition_row_index[part + 1];
526 if (slice_index > partition_row_index[part])
528 worker[worker_index]->set_ref_count(1);
531 worker[worker_index] =
532 new (worker[worker_index - 1]->allocate_child())
538 spawn_index_child = worker_index;
543 tbb::empty_task *
final =
544 new (worker[worker_index - 1]->allocate_child())
546 tbb::task::spawn(*
final);
547 spawn_index_child = worker_index - 1;
553 tbb::task::spawn(*worker[spawn_index_child]);
555 root->wait_for_all();
556 root->destroy(*root);
566 for (
unsigned int color = 0; color < partition_row_index[1];
569 tbb::empty_task *root =
570 new (tbb::task::allocate_root()) tbb::empty_task;
571 root->set_ref_count(2);
573 new (root->allocate_child())
575 tbb::empty_task::spawn(*worker);
576 root->wait_for_all();
577 root->destroy(*root);
589 for (
unsigned int part = 0; part < partition_row_index.size() - 2;
595 for (
unsigned int i = partition_row_index[part];
596 i < partition_row_index[part + 1];
602 if (cell_partition_data[i + 1] > cell_partition_data[i])
604 funct.
cell(std::make_pair(cell_partition_data[i],
605 cell_partition_data[i + 1]));
608 if (face_partition_data.empty() ==
false)
610 if (face_partition_data[i + 1] > face_partition_data[i])
611 funct.
face(std::make_pair(face_partition_data[i],
612 face_partition_data[i + 1]));
613 if (boundary_partition_data[i + 1] >
614 boundary_partition_data[i])
616 std::make_pair(boundary_partition_data[i],
617 boundary_partition_data[i + 1]));
632 partition_row_index[partition_row_index.size() - 2]);
649 vectorization_length = 1;
653 partition_row_index.clear();
654 partition_row_index.resize(2);
655 cell_partition_data.clear();
656 face_partition_data.clear();
657 boundary_partition_data.clear();
660 n_blocked_workers = 0;
662 partition_evens.clear();
663 partition_odds.clear();
664 partition_n_blocked_workers.clear();
665 partition_n_workers.clear();
666 communicator = MPI_COMM_SELF;
673 template <
typename StreamType>
676 const std::size_t data_length)
const 683 out << memory_c.
min <<
"/" << memory_c.
avg <<
"/" << memory_c.
max;
684 out <<
" MB" << std::endl;
708 std::vector<unsigned int> &boundary_cells)
712 unsigned int fillup_needed =
713 (vectorization_length - boundary_cells.size() % vectorization_length) %
714 vectorization_length;
720 std::vector<unsigned int> new_boundary_cells;
721 new_boundary_cells.reserve(boundary_cells.size());
723 unsigned int next_free_slot = 0, bound_index = 0;
724 while (fillup_needed > 0 && bound_index < boundary_cells.size())
726 if (next_free_slot < boundary_cells[bound_index])
730 if (next_free_slot + fillup_needed <=
731 boundary_cells[bound_index])
733 for (
unsigned int j =
734 boundary_cells[bound_index] - fillup_needed;
735 j < boundary_cells[bound_index];
737 new_boundary_cells.push_back(j);
744 for (
unsigned int j = next_free_slot;
745 j < boundary_cells[bound_index];
747 new_boundary_cells.push_back(j);
749 boundary_cells[bound_index] - next_free_slot;
752 new_boundary_cells.push_back(boundary_cells[bound_index]);
753 next_free_slot = boundary_cells[bound_index] + 1;
756 while (fillup_needed > 0 &&
757 (new_boundary_cells.size() == 0 ||
759 new_boundary_cells.push_back(new_boundary_cells.back() + 1);
760 while (bound_index < boundary_cells.size())
761 new_boundary_cells.push_back(boundary_cells[bound_index++]);
763 boundary_cells.swap(new_boundary_cells);
767 std::sort(boundary_cells.begin(), boundary_cells.end());
771 Assert(boundary_cells.size() % vectorization_length == 0 ||
780 const std::vector<unsigned int> &cells_with_comm,
781 const unsigned int dofs_per_cell,
782 const bool categories_are_hp,
783 const std::vector<unsigned int> &cell_vectorization_categories,
784 const bool cell_vectorization_categories_strict,
785 const std::vector<unsigned int> &parent_relation,
786 std::vector<unsigned int> & renumbering,
787 std::vector<unsigned char> & incompletely_filled_vectorization)
812 unsigned int vectorization_length_bits = 0;
813 unsigned int my_length = vectorization_length;
814 while (my_length >>= 1)
815 ++vectorization_length_bits;
816 const unsigned int n_lanes = 1 << vectorization_length_bits;
821 unsigned int n_categories = 1;
823 if (cell_vectorization_categories.empty() ==
false)
828 std::set<unsigned int> used_categories;
830 used_categories.insert(cell_vectorization_categories[i]);
831 std::vector<unsigned int> used_categories_vector(
832 used_categories.size());
834 for (
const auto &it : used_categories)
835 used_categories_vector[n_categories++] = it;
838 const unsigned int index =
840 used_categories_vector.end(),
841 cell_vectorization_categories[i]) -
842 used_categories_vector.begin();
844 tight_category_map[i] = index;
851 std::vector<std::vector<unsigned int>> renumbering_category(n_categories);
853 renumbering_category[tight_category_map[i]].push_back(i);
855 if (cell_vectorization_categories_strict ==
false && n_categories > 1)
856 for (
unsigned int j = n_categories - 1; j > 0; --j)
858 unsigned int lower_index = j - 1;
859 while (renumbering_category[j].size() % n_lanes)
861 while (renumbering_category[j].size() % n_lanes &&
862 !renumbering_category[lower_index].empty())
864 renumbering_category[j].push_back(
865 renumbering_category[lower_index].back());
866 renumbering_category[lower_index].pop_back();
868 if (lower_index == 0)
879 std::vector<unsigned int> temporary_numbering;
880 temporary_numbering.reserve(n_active_cells +
881 (n_lanes - 1) * n_categories);
882 const unsigned int n_cells_per_parent =
883 std::count(parent_relation.begin(), parent_relation.end(), 0);
884 std::vector<unsigned int> category_size;
885 for (
unsigned int j = 0; j < n_categories; ++j)
887 std::vector<std::pair<unsigned int, unsigned int>> grouped_cells;
888 std::vector<unsigned int> other_cells;
889 for (
const unsigned int cell : renumbering_category[j])
890 if (parent_relation.empty() ||
892 other_cells.push_back(cell);
894 grouped_cells.emplace_back(parent_relation[cell], cell);
897 std::sort(grouped_cells.begin(), grouped_cells.end());
898 std::vector<unsigned int> n_cells_per_group;
899 unsigned int length = 0;
900 for (
unsigned int i = 0; i < grouped_cells.size(); ++i, ++length)
901 if (i > 0 && grouped_cells[i].
first != grouped_cells[i - 1].
first)
903 n_cells_per_group.push_back(length);
907 n_cells_per_group.push_back(length);
912 auto group_it = grouped_cells.begin();
913 for (
unsigned int length : n_cells_per_group)
914 if (length < n_cells_per_parent)
915 for (
unsigned int j = 0; j < length; ++j)
916 other_cells.push_back((group_it++)->second);
922 for (
unsigned int j = 0; j < length; ++j)
923 temporary_numbering.push_back((group_it++)->second);
927 std::sort(other_cells.begin(), other_cells.end());
928 temporary_numbering.insert(temporary_numbering.end(),
932 while (temporary_numbering.size() % n_lanes != 0)
935 category_size.push_back(temporary_numbering.size());
939 std::vector<bool> batch_with_comm(temporary_numbering.size() / n_lanes,
941 std::vector<unsigned int> temporary_numbering_inverse(n_active_cells);
942 for (
unsigned int i = 0; i < temporary_numbering.size(); ++i)
944 temporary_numbering_inverse[temporary_numbering[i]] = i;
945 for (
const unsigned int cell : cells_with_comm)
946 batch_with_comm[temporary_numbering_inverse[cell] / n_lanes] =
true;
952 std::vector<std::array<unsigned int, 3>> batch_order;
953 std::vector<std::array<unsigned int, 3>> batch_order_comm;
954 for (
unsigned int i = 0; i < temporary_numbering.size(); i += n_lanes)
956 unsigned int max_index = 0;
957 for (
unsigned int j = 0; j < n_lanes; ++j)
959 max_index =
std::max(temporary_numbering[i + j], max_index);
960 const unsigned int category_hp =
962 std::upper_bound(category_size.begin(), category_size.end(), i) -
963 category_size.begin() :
965 const std::array<unsigned int, 3> next{{category_hp, max_index, i}};
966 if (batch_with_comm[i / n_lanes])
967 batch_order_comm.emplace_back(next);
969 batch_order.emplace_back(next);
972 std::sort(batch_order.begin(), batch_order.end());
973 std::sort(batch_order_comm.begin(), batch_order_comm.end());
980 std::vector<unsigned int> blocks;
983 if (batch_order.empty())
984 std::swap(batch_order_comm, batch_order);
986 partition_row_index.resize(3);
987 blocks = {0,
static_cast<unsigned int>(batch_order.size())};
991 partition_row_index.resize(5);
992 const unsigned int comm_begin = batch_order.size() / 2;
993 batch_order.insert(batch_order.begin() + comm_begin,
994 batch_order_comm.begin(),
995 batch_order_comm.end());
996 const unsigned int comm_end = comm_begin + batch_order_comm.size();
997 const unsigned int end = batch_order.size();
998 blocks = {0, comm_begin, comm_end, end};
1002 const unsigned int n_cell_batches = batch_order.size();
1003 const unsigned int n_ghost_batches =
1004 (n_ghost_cells + n_lanes - 1) / n_lanes;
1005 incompletely_filled_vectorization.resize(n_cell_batches +
1008 cell_partition_data.clear();
1009 cell_partition_data.resize(1, 0);
1011 renumbering.clear();
1012 renumbering.resize(n_active_cells + n_ghost_cells,
1015 unsigned int counter = 0;
1016 for (
unsigned int block = 0; block < blocks.size() - 1; ++block)
1018 const unsigned int grain_size =
1019 std::max((2048
U / dofs_per_cell) / 8 * 4, 2
U);
1020 for (
unsigned int k = blocks[block]; k < blocks[block + 1];
1022 cell_partition_data.push_back(
1023 std::min(k + grain_size, blocks[block + 1]));
1024 partition_row_index[block + 1] = cell_partition_data.size() - 1;
1027 for (
unsigned int k = blocks[block]; k < blocks[block + 1]; ++k)
1029 const unsigned int pos = batch_order[k][2];
1031 for (; j < n_lanes && temporary_numbering[pos + j] !=
1034 renumbering[counter++] = temporary_numbering[pos + j];
1036 incompletely_filled_vectorization[k] = j;
1042 for (
unsigned int cell = n_active_cells;
1043 cell < n_active_cells + n_ghost_cells;
1046 if (!cell_vectorization_categories.empty())
1048 cell_vectorization_categories[n_active_cells]);
1049 renumbering[cell] = cell;
1051 if (n_ghost_cells % n_lanes)
1052 incompletely_filled_vectorization.back() = n_ghost_cells % n_lanes;
1053 cell_partition_data.push_back(n_cell_batches + n_ghost_batches);
1054 partition_row_index.back() = cell_partition_data.size() - 1;
1057 std::vector<unsigned int> renumber_cpy(renumbering);
1058 std::sort(renumber_cpy.begin(), renumber_cpy.end());
1059 for (
unsigned int i = 0; i < renumber_cpy.size(); ++i)
1068 const std::vector<unsigned int> &boundary_cells,
1069 std::vector<unsigned int> & renumbering,
1070 std::vector<unsigned char> & incompletely_filled_vectorization)
1072 const unsigned int n_macro_cells =
1073 (
n_active_cells + vectorization_length - 1) / vectorization_length;
1074 const unsigned int n_ghost_slots =
1075 (n_ghost_cells + vectorization_length - 1) / vectorization_length;
1076 incompletely_filled_vectorization.resize(n_macro_cells + n_ghost_slots);
1078 incompletely_filled_vectorization[n_macro_cells - 1] =
1079 vectorization_length -
1081 if (n_ghost_slots * vectorization_length > n_ghost_cells)
1082 incompletely_filled_vectorization[n_macro_cells + n_ghost_slots - 1] =
1083 vectorization_length -
1084 (n_ghost_slots * vectorization_length - n_ghost_cells);
1086 std::vector<unsigned int> reverse_numbering(
1088 for (
unsigned int j = 0; j < boundary_cells.size(); ++j)
1089 reverse_numbering[boundary_cells[j]] = j;
1090 unsigned int counter = boundary_cells.size();
1093 reverse_numbering[j] = counter++;
1098 for (
unsigned int j = n_active_cells; j < n_active_cells + n_ghost_cells;
1100 renumbering.push_back(j);
1104 cell_partition_data.clear();
1105 cell_partition_data.push_back(0);
1108 const unsigned int n_macro_boundary_cells =
1109 (boundary_cells.size() + vectorization_length - 1) /
1110 vectorization_length;
1111 cell_partition_data.push_back(
1112 (n_macro_cells - n_macro_boundary_cells) / 2);
1113 cell_partition_data.push_back(cell_partition_data[1] +
1114 n_macro_boundary_cells);
1118 cell_partition_data.push_back(n_macro_cells);
1119 cell_partition_data.push_back(cell_partition_data.back() + n_ghost_slots);
1120 partition_row_index.resize(n_procs > 1 ? 4 : 2);
1121 partition_row_index[0] = 0;
1122 partition_row_index[1] = 1;
1125 partition_row_index[2] = 2;
1126 partition_row_index[3] = 3;
1141 vectorization_length);
1146 if (dofs_per_cell *
block_size < minimum_parallel_grain_size)
1147 block_size = (minimum_parallel_grain_size / dofs_per_cell + 1);
1152 1 <<
static_cast<unsigned int>(std::log2(
block_size + 1));
1163 std::vector<unsigned int> & renumbering,
1164 std::vector<unsigned char> &irregular_cells,
1167 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1168 if (n_macro_cells == 0)
1173 unsigned int partition = 0, counter = 0;
1178 make_connectivity_cells_to_blocks(irregular_cells,
1187 std::vector<unsigned int> cell_partition(
n_blocks,
1192 std::vector<unsigned int> partition_list(
n_blocks, 0);
1193 std::vector<unsigned int> partition_color_list(
n_blocks, 0);
1196 std::vector<unsigned int> partition_size(2, 0);
1202 unsigned int cluster_size = 1;
1205 make_partitioning(connectivity,
1213 make_coloring_within_partitions_pre_blocked(connectivity,
1218 partition_color_list);
1220 partition_list = renumbering;
1225 std::vector<unsigned int> sorted_pc_list(partition_color_list);
1226 std::sort(sorted_pc_list.begin(), sorted_pc_list.end());
1227 for (
unsigned int i = 0; i < sorted_pc_list.size(); ++i)
1234 std::vector<unsigned int> block_start(n_macro_cells + 1);
1235 std::vector<unsigned char> irregular(n_macro_cells);
1237 unsigned int mcell_start = 0;
1239 for (
unsigned int block = 0; block <
n_blocks; block++)
1241 block_start[block + 1] = block_start[block];
1242 for (
unsigned int mcell = mcell_start;
1246 unsigned int n_comp = (irregular_cells[mcell] > 0) ?
1247 irregular_cells[mcell] :
1248 vectorization_length;
1249 block_start[block + 1] += n_comp;
1255 unsigned int counter_macro = 0;
1256 unsigned int block_size_last =
1258 if (block_size_last == 0)
1261 unsigned int tick = 0;
1262 for (
unsigned int block = 0; block <
n_blocks; block++)
1264 unsigned int present_block = partition_color_list[block];
1265 for (
unsigned int cell = block_start[present_block];
1266 cell < block_start[present_block + 1];
1268 renumbering[counter++] = partition_list[cell];
1269 unsigned int this_block_size =
1270 (present_block == n_blocks - 1) ? block_size_last :
block_size;
1274 if (cell_partition_data[tick] == block)
1275 cell_partition_data[tick++] = counter_macro;
1277 for (
unsigned int j = 0; j < this_block_size; j++)
1278 irregular[counter_macro++] =
1279 irregular_cells[present_block *
block_size + j];
1282 cell_partition_data.back() = counter_macro;
1284 irregular_cells.swap(irregular);
1291 std::vector<unsigned int> sorted_renumbering(renumbering);
1292 std::sort(sorted_renumbering.begin(), sorted_renumbering.end());
1293 for (
unsigned int i = 0; i < sorted_renumbering.size(); ++i)
1309 const std::vector<unsigned int> &cell_active_fe_index,
1311 std::vector<unsigned int> & renumbering,
1312 std::vector<unsigned char> & irregular_cells,
1315 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1316 if (n_macro_cells == 0)
1324 make_connectivity_cells_to_blocks(irregular_cells,
1326 connectivity_blocks);
1329 if (scheme == partition_color ||
1338 std::vector<unsigned int> cell_partition(n_blocks,
1344 std::vector<unsigned int> partition_list(n_blocks, 0);
1345 std::vector<unsigned int> partition_2layers_list(n_blocks, 0);
1348 std::vector<unsigned int> partition_size(2, 0);
1356 unsigned int cluster_size = 1;
1357 if (scheme == partition_partition)
1358 cluster_size =
block_size * vectorization_length;
1361 if (scheme == partition_color || scheme == color)
1362 make_partitioning(connectivity_blocks,
1369 make_partitioning(connectivity,
1377 if (scheme == partition_partition)
1381 make_partitioning_within_partitions_post_blocked(
1383 cell_active_fe_index,
1390 partition_2layers_list,
1393 else if (scheme == partition_color || scheme == color)
1395 make_coloring_within_partitions_pre_blocked(connectivity_blocks,
1400 partition_2layers_list);
1406 std::vector<unsigned int> sorted_pc_list(partition_2layers_list);
1407 std::sort(sorted_pc_list.begin(), sorted_pc_list.end());
1408 for (
unsigned int i = 0; i < sorted_pc_list.size(); ++i)
1415 renumbering_in.swap(renumbering);
1416 if (scheme == partition_partition)
1421 for (
unsigned int j = 0; j < renumbering.size(); j++)
1422 renumbering[j] = renumbering_in[partition_2layers_list[j]];
1424 for (
unsigned int i = 0; i < n_ghost_cells; ++i)
1431 std::vector<unsigned int> block_start(n_macro_cells + 1);
1432 std::vector<unsigned char> irregular(n_macro_cells);
1434 unsigned int counter = 0;
1435 unsigned int mcell_start = 0;
1437 for (
unsigned int block = 0; block <
n_blocks; block++)
1439 block_start[block + 1] = block_start[block];
1440 for (
unsigned int mcell = mcell_start;
1444 unsigned int n_comp = (irregular_cells[mcell] > 0) ?
1445 irregular_cells[mcell] :
1446 vectorization_length;
1447 block_start[block + 1] += n_comp;
1453 unsigned int counter_macro = 0;
1454 unsigned int block_size_last =
1456 if (block_size_last == 0)
1459 unsigned int tick = 0;
1460 for (
unsigned int block = 0; block <
n_blocks; block++)
1462 unsigned int present_block = partition_2layers_list[block];
1463 for (
unsigned int cell = block_start[present_block];
1464 cell < block_start[present_block + 1];
1466 renumbering[counter++] = renumbering_in[cell];
1467 unsigned int this_block_size =
1468 (present_block == n_blocks - 1) ? block_size_last :
block_size;
1472 if (cell_partition_data[tick] == block)
1473 cell_partition_data[tick++] = counter_macro;
1475 for (
unsigned int j = 0; j < this_block_size; j++)
1476 irregular[counter_macro++] =
1477 irregular_cells[present_block *
block_size + j];
1480 cell_partition_data.back() = counter_macro;
1482 irregular_cells.swap(irregular);
1488 std::vector<unsigned int> sorted_renumbering(renumbering);
1489 std::sort(sorted_renumbering.begin(), sorted_renumbering.end());
1490 for (
unsigned int i = 0; i < sorted_renumbering.size(); ++i)
1497 update_task_info(partition);
1504 const std::vector<unsigned int> &cell_active_fe_index,
1506 std::vector<unsigned int> & renumbering,
1507 std::vector<unsigned char> & irregular_cells,
1510 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1511 if (n_macro_cells == 0)
1514 const unsigned int cluster_size =
block_size * vectorization_length;
1528 std::vector<unsigned int> partition_partition_list(
n_active_cells, 0);
1531 std::vector<unsigned int> partition_size(2, 0);
1538 make_partitioning(connectivity,
1546 make_partitioning_within_partitions_post_blocked(connectivity,
1547 cell_active_fe_index,
1554 partition_partition_list,
1557 partition_list.swap(renumbering);
1559 for (
unsigned int j = 0; j < renumbering.size(); j++)
1560 renumbering[j] = partition_list[partition_partition_list[j]];
1562 for (
unsigned int i = 0; i < n_ghost_cells; ++i)
1565 update_task_info(partition);
1572 const std::vector<unsigned char> &irregular_cells,
1576 std::vector<std::vector<unsigned int>> cell_blocks(
n_blocks);
1578 unsigned int cell = 0;
1579 for (
unsigned int i = 0, mcell = 0; i <
n_blocks; ++i)
1581 for (
unsigned int c = 0;
1582 c <
block_size && mcell < *(cell_partition_data.end() - 2);
1585 unsigned int ncomp = (irregular_cells[mcell] > 0) ?
1586 irregular_cells[mcell] :
1587 vectorization_length;
1588 for (
unsigned int c = 0; c < ncomp; ++c, ++cell)
1590 cell_blocks[i].push_back(cell);
1591 touched_cells[cell] = i;
1596 for (
unsigned int i = 0; i < cell_blocks.size(); ++i)
1597 for (
unsigned int col = 0; col < cell_blocks[i].size(); ++col)
1600 connectivity_cells.
begin(cell_blocks[i][col]);
1601 it != connectivity_cells.
end(cell_blocks[i][col]);
1604 if (touched_cells[it->column()] != i)
1605 connectivity_blocks.
add(i, touched_cells[it->column()]);
1617 const std::vector<unsigned int> &cell_active_fe_index,
1619 const unsigned int cluster_size,
1621 const std::vector<unsigned int> &cell_partition,
1622 const std::vector<unsigned int> &partition_list,
1623 const std::vector<unsigned int> &partition_size,
1624 std::vector<unsigned int> & partition_partition_list,
1625 std::vector<unsigned char> & irregular_cells)
1627 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1628 const unsigned int n_ghost_slots =
1629 *(cell_partition_data.end() - 1) - n_macro_cells;
1632 std::vector<unsigned int> neighbor_list;
1635 std::vector<unsigned int> neighbor_neighbor_list;
1639 irregular_cells.back() = 0;
1642 unsigned int max_fe_index = 0;
1643 for (
const unsigned int fe_index : cell_active_fe_index)
1644 max_fe_index =
std::max(fe_index, max_fe_index);
1650 unsigned int n_macro_cells_before = 0;
1656 std::vector<unsigned int> cell_partition_l2(
1658 partition_row_index.clear();
1659 partition_row_index.resize(partition + 1, 0);
1660 cell_partition_data.resize(1, 0);
1662 unsigned int counter = 0;
1663 unsigned int missing_macros;
1664 for (
unsigned int part = 0; part <
partition; ++part)
1666 neighbor_neighbor_list.resize(0);
1667 neighbor_list.resize(0);
1669 unsigned int partition_l2 = 0;
1670 unsigned int start_up = partition_size[part];
1671 unsigned int partition_counter = 0;
1674 if (neighbor_list.size() == 0)
1677 partition_counter = 0;
1678 for (
unsigned int j = start_up;
1679 j < partition_size[part + 1];
1681 if (cell_partition[partition_list[j]] == part &&
1682 cell_partition_l2[partition_list[j]] ==
1687 partition_counter = 1;
1691 cell_partition_l2[partition_list[start_up]] =
1693 neighbor_neighbor_list.push_back(
1694 partition_list[start_up]);
1695 partition_partition_list[counter++] =
1696 partition_list[start_up];
1703 partition_counter = 0;
1704 for (
const unsigned int neighbor : neighbor_list)
1706 Assert(cell_partition[neighbor] == part,
1708 Assert(cell_partition_l2[neighbor] == partition_l2 - 1,
1710 auto neighbor_it = connectivity.
begin(neighbor);
1711 const auto end_it = connectivity.
end(neighbor);
1712 for (; neighbor_it != end_it; ++neighbor_it)
1714 if (cell_partition[neighbor_it->column()] == part &&
1715 cell_partition_l2[neighbor_it->column()] ==
1718 cell_partition_l2[neighbor_it->column()] =
1720 neighbor_neighbor_list.push_back(
1721 neighbor_it->column());
1722 partition_partition_list[counter++] =
1723 neighbor_it->column();
1724 partition_counter++;
1729 if (partition_counter > 0)
1731 int index_before = neighbor_neighbor_list.size(),
1732 index = index_before;
1737 std::vector<unsigned int> remaining_per_macro_cell(
1739 std::vector<std::vector<unsigned int>>
1740 renumbering_fe_index;
1743 if (hp_bool ==
true)
1745 renumbering_fe_index.resize(max_fe_index + 1);
1746 for (cell = counter - partition_counter;
1750 renumbering_fe_index
1751 [cell_active_fe_index.empty() ?
1753 cell_active_fe_index
1754 [partition_partition_list[cell]]]
1755 .push_back(partition_partition_list[cell]);
1758 for (
unsigned int j = 0; j < max_fe_index + 1; j++)
1760 remaining_per_macro_cell[j] =
1761 renumbering_fe_index[j].size() %
1762 vectorization_length;
1763 if (remaining_per_macro_cell[j] != 0)
1766 ((renumbering_fe_index[j].size() +
1767 vectorization_length - 1) /
1768 vectorization_length);
1773 remaining_per_macro_cell.resize(1);
1774 remaining_per_macro_cell[0] =
1775 partition_counter % vectorization_length;
1777 partition_counter / vectorization_length;
1778 if (remaining_per_macro_cell[0] != 0)
1785 cluster_size - (missing_macros % cluster_size);
1788 while (missing_macros > 0 || filled ==
false)
1792 index = neighbor_neighbor_list.size();
1793 if (index == index_before)
1795 if (missing_macros != 0)
1797 neighbor_neighbor_list.resize(0);
1802 index_before = index;
1805 unsigned int additional =
1806 neighbor_neighbor_list[index];
1817 for (; neighbor !=
end; ++neighbor)
1819 if (cell_partition[neighbor->
column()] == part &&
1820 cell_partition_l2[neighbor->
column()] ==
1823 unsigned int this_index = 0;
1824 if (hp_bool ==
true)
1826 cell_active_fe_index.empty() ?
1828 cell_active_fe_index[neighbor
1835 if (missing_macros > 0 ||
1836 remaining_per_macro_cell[this_index] > 0)
1838 cell_partition_l2[neighbor->
column()] =
1840 neighbor_neighbor_list.push_back(
1842 if (hp_bool ==
true)
1843 renumbering_fe_index[this_index]
1844 .push_back(neighbor->
column());
1845 partition_partition_list[counter] =
1848 partition_counter++;
1849 if (remaining_per_macro_cell
1850 [this_index] == 0 &&
1853 remaining_per_macro_cell[this_index]++;
1854 if (remaining_per_macro_cell
1856 vectorization_length)
1858 remaining_per_macro_cell[this_index] =
1861 if (missing_macros == 0)
1864 for (
unsigned int fe_ind = 0;
1865 fe_ind < max_fe_index + 1;
1867 if (remaining_per_macro_cell
1877 if (hp_bool ==
true)
1882 cell = counter - partition_counter;
1883 for (
unsigned int j = 0; j < max_fe_index + 1; j++)
1885 for (
const unsigned int jj :
1886 renumbering_fe_index[j])
1887 renumbering[cell++] = jj;
1888 if (renumbering_fe_index[j].size() %
1889 vectorization_length !=
1891 irregular_cells[renumbering_fe_index[j].size() /
1892 vectorization_length +
1893 n_macro_cells_before] =
1894 renumbering_fe_index[j].size() %
1895 vectorization_length;
1896 n_macro_cells_before +=
1897 (renumbering_fe_index[j].size() +
1898 vectorization_length - 1) /
1899 vectorization_length;
1900 renumbering_fe_index[j].resize(0);
1905 n_macro_cells_before +=
1906 partition_counter / vectorization_length;
1907 if (partition_counter % vectorization_length != 0)
1909 irregular_cells[n_macro_cells_before] =
1910 partition_counter % vectorization_length;
1911 n_macro_cells_before++;
1915 cell_partition_data.push_back(n_macro_cells_before);
1918 neighbor_list = neighbor_neighbor_list;
1919 neighbor_neighbor_list.resize(0);
1921 partition_row_index[part + 1] =
1922 partition_row_index[part] + partition_l2;
1925 if (hp_bool ==
true)
1927 partition_partition_list.swap(renumbering);
// Visible portion of make_coloring_within_partitions_pre_blocked. The listing
// drops the start of the signature, every brace and a few statements, so the
// notes below only describe what the remaining lines show.
//
// For each partition `part` in [0, partition), the cells listed in
// partition_list[partition_size[part] .. partition_size[part + 1]) are given
// a color, and are then written to partition_color_list grouped by color.
// cell_partition_data is cleared and refilled with the start offset of every
// color group; partition_row_index is resized to partition + 1 entries.
1939 const std::vector<unsigned int> &cell_partition,
1940 const std::vector<unsigned int> &partition_list,
1941 const std::vector<unsigned int> &partition_size,
1942 std::vector<unsigned int> & partition_color_list)
// The second-to-last entry of cell_partition_data is read before the vector
// is cleared below; it becomes the initial value of all n_blocks entries of
// cell_color.
1944 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
1945 std::vector<unsigned int> cell_color(
n_blocks, n_macro_cells);
1946 std::vector<bool> color_finder;
// One slot per partition plus one closing slot (written after the loop).
1948 partition_row_index.resize(partition + 1);
1949 cell_partition_data.clear();
1950 unsigned int color_counter = 0, index_counter = 0;
1951 for (
unsigned int part = 0; part <
partition; part++)
1953 partition_row_index[part] = index_counter;
1954 unsigned int max_color = 0;
// First pass over the cells of this partition: choose a color per cell.
1955 for (
unsigned int k = partition_size[part];
1956 k < partition_size[part + 1];
1959 unsigned int cell = partition_list[k];
1960 unsigned int n_neighbors = connectivity.
row_length(cell);
// color_finder[c] == true marks color c as still usable for this cell;
// n_neighbors + 1 candidate colors are considered.
1964 color_finder.resize(n_neighbors + 1);
1965 for (
unsigned int j = 0; j <= n_neighbors; ++j)
1966 color_finder[j] =
true;
1968 connectivity.
begin(cell),
1969 end = connectivity.
end(cell);
1970 for (; neighbor !=
end; ++neighbor)
// Only a neighbor in the same partition blocks its color. The second test
// keeps the index inside color_finder, which has n_neighbors + 1 entries.
1974 if (cell_partition[neighbor->
column()] == part &&
1975 cell_color[neighbor->
column()] <= n_neighbors)
1976 color_finder[cell_color[neighbor->
column()]] =
false;
// Start from color 0 and move on while the current color is blocked.
// NOTE(review): the statement that advances cell_color[cell] is on a line
// missing from this listing - presumably an increment; confirm in the file.
1979 cell_color[cell] = 0;
1980 while (color_finder[cell_color[cell]] ==
false)
// Remember the largest color used within this partition.
1982 if (cell_color[cell] > max_color)
1983 max_color = cell_color[cell];
// Second pass: for every color, record the current write position in
// cell_partition_data and append all cells of that color to
// partition_color_list.
1988 for (
unsigned int color = 0; color <= max_color; color++)
1990 cell_partition_data.push_back(color_counter);
1992 for (
unsigned int k = partition_size[part];
1993 k < partition_size[part + 1];
1996 unsigned int cell = partition_list[k];
1997 if (cell_color[cell] == color)
1999 partition_color_list[color_counter++] = cell;
// Closing entries: n_blocks ends cell_partition_data, and the final
// index_counter ends partition_row_index (its updates are not visible here).
2004 cell_partition_data.push_back(
n_blocks);
2005 partition_row_index[
partition] = index_counter;
// Visible portion of make_partitioning. The listing drops the start of the
// signature, every brace and many statements (conditions, assignments to
// cell_partition, updates of remainder), so the notes below are limited to
// what the remaining lines show.
//
// The visible lines append cells to partition_list / partition_size, starting
// from a first group of cells and then repeatedly following the neighbors
// stored in `connectivity`; `remainder` controls how many further cells are
// pulled in relative to cluster_size.
2013 const unsigned int cluster_size,
2014 std::vector<unsigned int> & cell_partition,
2015 std::vector<unsigned int> & partition_list,
2016 std::vector<unsigned int> & partition_size,
// neighbor_list: cells added in the current round.
// neighbor_neighbor_list: cells found for the next round; it is copied into
// neighbor_list and emptied at the end of each round.
2026 std::vector<unsigned int> neighbor_list;
2029 std::vector<unsigned int> neighbor_neighbor_list;
// counter is the next write position in partition_list.
2039 unsigned int counter = 0;
// When cell_partition_data has exactly 5 entries, start_nonboundary is
// vectorization_length times (cell_partition_data[2] - cell_partition_data[1]).
// The other branch of the conditional is missing from this listing.
2040 unsigned int start_nonboundary =
2041 cell_partition_data.size() == 5 ?
2042 vectorization_length *
2043 (cell_partition_data[2] - cell_partition_data[1]) :
2046 const unsigned int n_macro_cells = *(cell_partition_data.end() - 2);
2047 if (n_macro_cells == 0)
// Scheme-specific adjustments; only the `color` case is fully visible.
2049 if (scheme == color)
2050 start_nonboundary = n_macro_cells;
2051 if (scheme == partition_color ||
2055 if (scheme == partition_color ||
2061 if (start_nonboundary > n_blocks)
2065 unsigned int start_up = 0;
2067 unsigned int remainder = cluster_size;
// Cells 0 .. start_nonboundary - 1 are appended first, in index order.
2075 if (start_nonboundary > 0)
2077 for (
unsigned int cell = 0; cell < start_nonboundary; ++cell)
2079 const unsigned int cell_nn = cell;
2081 neighbor_list.push_back(cell_nn);
2082 partition_list[counter++] = cell_nn;
2083 partition_size.back()++;
// NOTE(review): start_nonboundary is reset to 0 immediately before it is used
// in the modulo on the next line, so that subtraction is always 0 and
// remainder still equals cluster_size when it is tested afterwards. Compare
// with `remainder = cluster_size - (partition_counter % cluster_size)`
// further down; confirm whether the reset was meant to come after this line.
2085 start_nonboundary = 0;
2086 remainder -= (start_nonboundary % cluster_size);
2087 if (remainder == cluster_size)
// Otherwise (branch condition not visible) cell start_up is the seed.
2095 neighbor_list.push_back(start_up);
2096 partition_list[counter++] = start_up;
2097 partition_size.back()++;
2100 if (remainder == cluster_size)
// Fill-up loop: walk over entries of neighbor_list and append their
// neighbors while remainder > 0. index_before / index_stop delimit the range
// of entries still to visit; when no new entry was added since the last
// sweep, neighbor_list is emptied. The test on cell_partition and the update
// of remainder sit on lines missing from this listing.
2103 int index_before = neighbor_list.size(), index = index_before,
2105 while (remainder > 0)
2107 if (index == index_stop)
2109 index = neighbor_list.size();
2110 if (index == index_before)
2112 neighbor_list.resize(0);
2115 index_stop = index_before;
2116 index_before = index;
2119 unsigned int additional = neighbor_list[index];
2121 connectivity.
begin(additional),
2123 connectivity.
end(additional);
2124 for (; neighbor !=
end; ++neighbor)
2126 if (cell_partition[neighbor->
column()] ==
2129 partition_size.back()++;
2131 neighbor_list.push_back(neighbor->
column());
2132 partition_list[counter++] = neighbor->
column();
// Main loop: one round per new partition, for as long as the previous round
// produced cells.
2140 while (neighbor_list.size() > 0)
2145 unsigned int partition_counter = 0;
// The new partition's end offset starts out equal to the previous one.
2148 partition_size.push_back(partition_size.back());
// Each cell from the previous round must belong to partition - 1; its
// neighbors that pass the (partly hidden) test are assigned to `partition`
// and queued for the next round.
2152 for (
const unsigned int cell : neighbor_list)
2154 Assert(cell_partition[cell] == partition - 1,
2156 auto neighbor = connectivity.
begin(cell);
2157 const auto end = connectivity.
end(cell);
2158 for (; neighbor !=
end; ++neighbor)
2160 if (cell_partition[neighbor->column()] ==
2163 partition_size.back()++;
2164 cell_partition[neighbor->column()] =
partition;
2168 neighbor_neighbor_list.push_back(neighbor->column());
2169 partition_list[counter++] = neighbor->column();
2170 partition_counter++;
// Number of cells missing to reach the next multiple of cluster_size; a
// value equal to cluster_size means partition_counter already is a multiple
// (the statement guarded by that test is not visible).
2174 remainder = cluster_size - (partition_counter % cluster_size);
2175 if (remainder == cluster_size)
// Same fill-up loop as above, now operating on neighbor_neighbor_list.
2178 int index_before = neighbor_neighbor_list.size(),
2179 index = index_before;
2180 while (remainder > 0)
2182 if (index == index_stop)
2184 index = neighbor_neighbor_list.size();
2185 if (index == index_before)
2187 neighbor_neighbor_list.resize(0);
2190 index_stop = index_before;
2191 index_before = index;
2194 unsigned int additional = neighbor_neighbor_list[index];
2200 for (; neighbor !=
end; ++neighbor)
2202 if (cell_partition[neighbor->
column()] ==
2205 partition_size.back()++;
2207 neighbor_neighbor_list.push_back(neighbor->
column());
2208 partition_list[counter++] = neighbor->
column();
// Hand the newly found cells over to the next round.
2216 neighbor_list = neighbor_neighbor_list;
2217 neighbor_neighbor_list.resize(0);
// Scan from start_up up to n_blocks and reset remainder.
// NOTE(review): the loop body is missing here - presumably it looks for the
// next unassigned cell to seed a new sweep; confirm in the file.
2223 for (
unsigned int j = start_up; j <
n_blocks; ++j)
2229 remainder = cluster_size;
// Visible body of update_task_info (function header and braces are missing
// from this listing). Derives the worker counts from `partition` and, per
// partition, from the number of entries between consecutive
// partition_row_index values.
//
// With integer division evens + odds == partition, so (odds + evens + 1) % 2
// is 1 exactly when partition is even: n_blocked_workers is then odds - 1,
// otherwise odds. n_workers is whatever remains of partition.
2243 evens = (partition + 1) / 2;
2244 odds = partition / 2;
2245 n_blocked_workers = odds - (odds + evens + 1) % 2;
2246 n_workers = evens + odds - n_blocked_workers;
// One entry per partition in each of the per-partition arrays.
2248 partition_evens.resize(partition);
2249 partition_odds.resize(partition);
2250 partition_n_blocked_workers.resize(partition);
2251 partition_n_workers.resize(partition);
2252 for (
unsigned int part = 0; part <
partition; part++)
// Same formulas as above, applied to the entry count of this partition
// (partition_row_index[part + 1] - partition_row_index[part]).
2254 partition_evens[part] =
2255 (partition_row_index[part + 1] - partition_row_index[part] + 1) / 2;
2256 partition_odds[part] =
2257 (partition_row_index[part + 1] - partition_row_index[part]) / 2;
// NOTE(review): for an entry count of 0 this evaluates 0 - 1 in unsigned
// arithmetic and wraps around; confirm that empty partitions cannot occur.
2258 partition_n_blocked_workers[part] =
2259 partition_odds[part] -
2260 (partition_odds[part] + partition_evens[part] + 1) % 2;
2261 partition_n_workers[part] = partition_evens[part] +
2262 partition_odds[part] -
2263 partition_n_blocked_workers[part];
2273 internal::MatrixFreeFunctions::TaskInfo::print_memory_statistics<std::ostream>(
2275 const std::size_t)
const;
Iterator lower_bound(Iterator first, Iterator last, const T &val)
MPICommunication(MFWorkerInterface &worker_in, const bool do_compress)
void operator()(const tbb::blocked_range< unsigned int > &r) const
static const unsigned int invalid_unsigned_int
virtual void cell_loop_post_range(const unsigned int range_index)=0
MFWorkerInterface * worker
#define AssertDimension(dim1, dim2)
tbb::task * execute() override
PartitionWork(MFWorkerInterface &worker_in, const unsigned int partition_in, const TaskInfo &task_info_in, const bool is_blocked_in)
virtual void cell_loop_pre_range(const unsigned int range_index)=0
MFWorkerInterface & worker
virtual void cell(const std::pair< unsigned int, unsigned int > &cell_range)=0
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
void parallel_for(Iterator x_begin, Iterator x_end, const Functor &functor, const unsigned int grainsize)
MFWorkerInterface & worker
void create_blocks_serial(const std::vector< unsigned int > &cells_with_comm, const unsigned int dofs_per_cell, const bool categories_are_hp, const std::vector< unsigned int > &cell_vectorization_categories, const bool cell_vectorization_categories_strict, const std::vector< unsigned int > &parent_relation, std::vector< unsigned int > &renumbering, std::vector< unsigned char > &incompletely_filled_vectorization)
void loop(MFWorkerInterface &worker) const
void add(const size_type i, const size_type j)
CellWork(MFWorkerInterface &worker_in, const TaskInfo &task_info_in, const unsigned int partition_in)
void guess_block_size(const unsigned int dofs_per_cell)
std::vector< unsigned int > partition_row_index
#define AssertIndexRange(index, range)
std::vector< unsigned int > partition_n_workers
virtual void vector_update_ghosts_finish()=0
Finishes the communication for the update ghost values operation.
std::vector< unsigned int > cell_partition_data
#define AssertThrow(cond, exc)
void make_connectivity_cells_to_blocks(const std::vector< unsigned char > &irregular_cells, const DynamicSparsityPattern &connectivity_cells, DynamicSparsityPattern &connectivity_blocks) const
tbb::task * execute() override
std::vector< unsigned int > partition_n_blocked_workers
void make_partitioning_within_partitions_post_blocked(const DynamicSparsityPattern &connectivity, const std::vector< unsigned int > &cell_active_fe_index, const unsigned int partition, const unsigned int cluster_size, const bool hp_bool, const std::vector< unsigned int > &cell_partition, const std::vector< unsigned int > &partition_list, const std::vector< unsigned int > &partition_size, std::vector< unsigned int > &partition_partition_list, std::vector< unsigned char > &irregular_cells)
const TaskInfo & task_info
CellWork(MFWorkerInterface &worker, const unsigned int partition, const TaskInfo &task_info, const bool is_blocked)
void make_coloring_within_partitions_pre_blocked(const DynamicSparsityPattern &connectivity, const unsigned int partition, const std::vector< unsigned int > &cell_partition, const std::vector< unsigned int > &partition_list, const std::vector< unsigned int > &partition_size, std::vector< unsigned int > &partition_color_list)
const TaskInfo & task_info
const unsigned int partition
#define Assert(cond, exc)
void make_boundary_cells_divisible(std::vector< unsigned int > &boundary_cells)
tbb::task * execute() override
void make_thread_graph_partition_color(DynamicSparsityPattern &connectivity, std::vector< unsigned int > &renumbering, std::vector< unsigned char > &irregular_cells, const bool hp_bool)
void update_task_info(const unsigned int partition)
std::vector< unsigned int > boundary_partition_data
void initial_setup_blocks_tasks(const std::vector< unsigned int > &boundary_cells, std::vector< unsigned int > &renumbering, std::vector< unsigned char > &incompletely_filled_vectorization)
#define DEAL_II_NAMESPACE_CLOSE
virtual void zero_dst_vector_range(const unsigned int range_index)=0
VectorType::value_type * end(VectorType &V)
virtual void boundary(const std::pair< unsigned int, unsigned int > &face_range)=0
std::enable_if< IsBlockVector< VectorType >::value, unsigned int >::type n_blocks(const VectorType &vector)
std::vector< Integer > invert_permutation(const std::vector< Integer > &permutation)
unsigned int n_active_cells(const internal::TriangulationImplementation::NumberCache< 1 > &c)
const TaskInfo & task_info
ActualCellWork(MFWorkerInterface &worker, const unsigned int partition, const TaskInfo &task_info)
const types::global_dof_index * dummy()
virtual void vector_update_ghosts_start()=0
Starts the communication for the update ghost values operation.
virtual void face(const std::pair< unsigned int, unsigned int > &face_range)=0
std::vector< unsigned int > face_partition_data
void swap(MemorySpaceData< Number, MemorySpace > &, MemorySpaceData< Number, MemorySpace > &)
size_type row_length(const size_type row) const
void make_thread_graph_partition_partition(const std::vector< unsigned int > &cell_active_fe_index, DynamicSparsityPattern &connectivity, std::vector< unsigned int > &renumbering, std::vector< unsigned char > &irregular_cells, const bool hp_bool)
#define DEAL_II_NAMESPACE_OPEN
T min(const T &t, const MPI_Comm &mpi_communicator)
void make_partitioning(const DynamicSparsityPattern &connectivity, const unsigned int cluster_size, std::vector< unsigned int > &cell_partition, std::vector< unsigned int > &partition_list, std::vector< unsigned int > &partition_size, unsigned int &partition) const
unsigned int minimum_parallel_grain_size
virtual void vector_compress_start()=0
Starts the communication for the vector compress operation.
std::vector< unsigned int > partition_odds
MFWorkerInterface ** worker_pointer
std::size_t memory_consumption() const
static ::ExceptionBase & ExcNotImplemented()
PartitionWork(MFWorkerInterface &function_in, const unsigned int partition_in, const TaskInfo &task_info_in, const bool is_blocked_in=false)
static unsigned int n_threads()
const unsigned int partition
const TaskInfo & task_info
const unsigned int partition
ActualCellWork(MFWorkerInterface **worker_pointer, const unsigned int partition, const TaskInfo &task_info)
MinMaxAvg min_max_avg(const double my_value, const MPI_Comm &mpi_communicator)
MFWorkerInterface & worker
tbb::task * execute() override
const unsigned int partition
T max(const T &t, const MPI_Comm &mpi_communicator)
std::vector< unsigned int > partition_evens
void print_memory_statistics(StreamType &out, std::size_t data_length) const
std::enable_if< std::is_fundamental< T >::value, std::size_t >::type memory_consumption(const T &t)
static ::ExceptionBase & ExcInternalError()
void make_thread_graph(const std::vector< unsigned int > &cell_active_fe_index, DynamicSparsityPattern &connectivity, std::vector< unsigned int > &renumbering, std::vector< unsigned char > &irregular_cells, const bool hp_bool)
virtual void vector_compress_finish()=0
Finishes the communication for the vector compress operation.