Reference documentation for deal.II version 9.2.0
\(\newcommand{\dealvcentcolon}{\mathrel{\mathop{:}}}\) \(\newcommand{\dealcoloneq}{\dealvcentcolon\mathrel{\mkern-1.2mu}=}\) \(\newcommand{\jump}[1]{\left[\!\left[ #1 \right]\!\right]}\) \(\newcommand{\average}[1]{\left\{\!\left\{ #1 \right\}\!\right\}}\)
cuda_matrix_free.h
Go to the documentation of this file.
1 // ---------------------------------------------------------------------
2 //
3 // Copyright (C) 2016 - 2020 by the deal.II authors
4 //
5 // This file is part of the deal.II library.
6 //
7 // The deal.II library is free software; you can use it, redistribute
8 // it, and/or modify it under the terms of the GNU Lesser General
9 // Public License as published by the Free Software Foundation; either
10 // version 2.1 of the License, or (at your option) any later version.
11 // The full text of the license can be found in the file LICENSE.md at
12 // the top level directory of deal.II.
13 //
14 // ---------------------------------------------------------------------
15 
16 
17 #ifndef dealii_cuda_matrix_free_h
18 #define dealii_cuda_matrix_free_h
19 
20 #include <deal.II/base/config.h>
21 
22 #ifdef DEAL_II_COMPILER_CUDA_AWARE
23 
24 # include <deal.II/base/cuda_size.h>
25 # include <deal.II/base/mpi.h>
26 # include <deal.II/base/quadrature.h>
27 # include <deal.II/base/tensor.h>
28 
30 
32 # include <deal.II/fe/mapping.h>
33 # include <deal.II/fe/mapping_q1.h>
34 
36 # include <deal.II/lac/cuda_vector.h>
38 
39 
41 
42 namespace CUDAWrappers
43 {
44  // forward declaration
45 # ifndef DOXYGEN
46  namespace internal
47  {
48  template <int dim, typename Number>
49  class ReinitHelper;
50  }
51 # endif
52 
79  template <int dim, typename Number = double>
80  class MatrixFree : public Subscriptor
81  {
82  public:
85 
92  {
94  parallel_over_elem
95  };
96 
101  {
106  const ParallelizationScheme parallelization_scheme = parallel_in_elem,
107  const UpdateFlags mapping_update_flags = update_gradients |
110  const bool use_coloring = false,
111  const bool overlap_communication_computation = false)
112  : parallelization_scheme(parallelization_scheme)
113  , mapping_update_flags(mapping_update_flags)
114  , use_coloring(use_coloring)
115  , overlap_communication_computation(overlap_communication_computation)
116  {
117 # ifndef DEAL_II_MPI_WITH_CUDA_SUPPORT
118  AssertThrow(
119  overlap_communication_computation == false,
120  ExcMessage(
121  "Overlapping communication and computation requires CUDA-aware MPI."));
122 # endif
123  if (overlap_communication_computation == true)
124  AssertThrow(
125  use_coloring == false || overlap_communication_computation == false,
126  ExcMessage(
127  "Overlapping communication and coloring are incompatible options. Only one of them can be enabled."));
128  }
144 
151 
157  };
158 
163  struct Data
164  {
169 
175 
179  Number *inv_jacobian;
180 
184  Number *JxW;
185 
189  unsigned int n_cells;
190 
194  unsigned int padding_length;
195 
199  unsigned int row_start;
200 
204  unsigned int *constraint_mask;
205 
211  };
212 
216  MatrixFree();
217 
221  unsigned int
222  get_padding_length() const;
223 
232  void
233  reinit(const Mapping<dim> & mapping,
234  const DoFHandler<dim> & dof_handler,
235  const AffineConstraints<Number> &constraints,
236  const Quadrature<1> & quad,
237  const AdditionalData & additional_data = AdditionalData());
238 
242  void
243  reinit(const DoFHandler<dim> & dof_handler,
244  const AffineConstraints<Number> &constraints,
245  const Quadrature<1> & quad,
247 
251  Data
252  get_data(unsigned int color) const;
253 
254  // clang-format off
272  // clang-format on
273  template <typename Functor, typename VectorType>
274  void
275  cell_loop(const Functor & func,
276  const VectorType &src,
277  VectorType & dst) const;
278 
294  template <typename Functor>
295  void
296  evaluate_coefficients(Functor func) const;
297 
302  template <typename VectorType>
303  void
304  copy_constrained_values(const VectorType &src, VectorType &dst) const;
305 
311  template <typename VectorType>
312  void
313  set_constrained_values(const Number val, VectorType &dst) const;
314 
319  void
320  initialize_dof_vector(
322 
328  void
329  initialize_dof_vector(
331 
335  void
336  free();
337 
341  std::size_t
342  memory_consumption() const;
343 
344  private:
348  void
349  internal_reinit(const Mapping<dim> & mapping,
350  const DoFHandler<dim> & dof_handler,
351  const AffineConstraints<Number> &constraints,
352  const Quadrature<1> & quad,
353  std::shared_ptr<const MPI_Comm> comm,
354  const AdditionalData additional_data);
355 
360  template <typename Functor, typename VectorType>
361  void
362  serial_cell_loop(const Functor & func,
363  const VectorType &src,
364  VectorType & dst) const;
365 
370  template <typename Functor>
371  void
372  distributed_cell_loop(
373  const Functor & func,
376 
382  template <typename Functor>
383  void
384  distributed_cell_loop(
385  const Functor & func,
388 
393  template <typename VectorType>
394  void
395  serial_copy_constrained_values(const VectorType &src,
396  VectorType & dst) const;
397 
402  void
403  distributed_copy_constrained_values(
406 
413  void
414  distributed_copy_constrained_values(
417 
422  template <typename VectorType>
423  void
424  serial_set_constrained_values(const Number val, VectorType &dst) const;
425 
430  void
431  distributed_set_constrained_values(
432  const Number val,
434 
441  void
442  distributed_set_constrained_values(
443  const Number val,
445 
451 
458 
464 
469 
473  unsigned int fe_degree;
474 
478  unsigned int dofs_per_cell;
479 
483  unsigned int n_constrained_dofs;
484 
488  unsigned int q_points_per_cell;
489 
493  unsigned int n_colors;
494 
498  std::vector<unsigned int> n_cells;
499 
504  std::vector<point_type *> q_points;
505 
510  std::vector<types::global_dof_index *> local_to_global;
511 
516  std::vector<Number *> inv_jacobian;
517 
522  std::vector<Number *> JxW;
523 
528 
532  std::vector<unsigned int *> constraint_mask;
533 
538  std::vector<dim3> grid_dim;
539 
544  std::vector<dim3> block_dim;
545 
550  std::shared_ptr<const Utilities::MPI::Partitioner> partitioner;
551 
555  unsigned int cells_per_block;
556 
562 
568 
573  unsigned int padding_length;
574 
578  std::vector<unsigned int> row_start;
579 
580  friend class internal::ReinitHelper<dim, Number>;
581  };
582 
583 
584 
585  // TODO find a better place to put these things
586 
590  template <int dim, typename Number>
591  struct SharedData
592  {
596  __device__
597  SharedData(Number *vd, Number *gq[dim])
598  : values(vd)
599  {
600  for (int d = 0; d < dim; ++d)
601  gradients[d] = gq[d];
602  }
603 
607  Number *values;
608 
614  Number *gradients[dim];
615  };
616 
617 
618 
619  // This function determines the number of cells per block, possibly at compile
620  // time (by virtue of being 'constexpr')
621  // TODO this function should be rewritten using meta-programming
622  __host__ __device__ constexpr unsigned int
623  cells_per_block_shmem(int dim, int fe_degree)
624  {
625  /* clang-format off */
626  // We are limiting the number of threads according to the
627  // following formulas:
628  // - in 2D: `threads = cells * (k+1)^d <= 4*CUDAWrappers::warp_size`
629  // - in 3D: `threads = cells * (k+1)^d <= 2*CUDAWrappers::warp_size`
630  return dim==2 ? (fe_degree==1 ? CUDAWrappers::warp_size : // 128
631  fe_degree==2 ? CUDAWrappers::warp_size/4 : // 72
632  fe_degree==3 ? CUDAWrappers::warp_size/8 : // 64
633  fe_degree==4 ? CUDAWrappers::warp_size/8 : // 100
634  1) :
635  dim==3 ? (fe_degree==1 ? CUDAWrappers::warp_size/4 : // 64
636  fe_degree==2 ? CUDAWrappers::warp_size/16 : // 54
637  1) : 1;
638  /* clang-format on */
639  }
640 
641 
642 
648  template <int dim>
649  __device__ inline unsigned int
650  q_point_id_in_cell(const unsigned int n_q_points_1d)
651  {
652  return (dim == 1 ?
653  threadIdx.x % n_q_points_1d :
654  dim == 2 ?
655  threadIdx.x % n_q_points_1d + n_q_points_1d * threadIdx.y :
656  threadIdx.x % n_q_points_1d +
657  n_q_points_1d * (threadIdx.y + n_q_points_1d * threadIdx.z));
658  }
659 
660 
661 
668  template <int dim, typename Number>
669  __device__ inline unsigned int
671  const unsigned int cell,
672  const typename CUDAWrappers::MatrixFree<dim, Number>::Data *data,
673  const unsigned int n_q_points_1d,
674  const unsigned int n_q_points)
675  {
676  return (data->row_start / data->padding_length + cell) * n_q_points +
677  q_point_id_in_cell<dim>(n_q_points_1d);
678  }
679 
680 
681 
687  template <int dim, typename Number>
688  __device__ inline typename CUDAWrappers::MatrixFree<dim, Number>::point_type &
690  const unsigned int cell,
691  const typename CUDAWrappers::MatrixFree<dim, Number>::Data *data,
692  const unsigned int n_q_points_1d)
693  {
694  return *(data->q_points + data->padding_length * cell +
695  q_point_id_in_cell<dim>(n_q_points_1d));
696  }
697 } // namespace CUDAWrappers
698 
700 
701 #endif
702 
703 #endif
Transformed quadrature weights.
constexpr int warp_size
Definition: cuda_size.h:40
ParallelizationScheme parallelization_scheme
std::shared_ptr< const Utilities::MPI::Partitioner > partitioner
void reinit(MatrixBlock< MatrixType > &v, const BlockSparsityPattern &p)
Definition: matrix_block.h:621
CUDAWrappers::MatrixFree< dim, Number >::point_type & get_quadrature_point(const unsigned int cell, const typename CUDAWrappers::MatrixFree< dim, Number >::Data *data, const unsigned int n_q_points_1d)
__host__ constexpr unsigned int cells_per_block_shmem(int dim, int fe_degree)
Transformed quadrature points.
types::global_dof_index * constrained_dofs
#define AssertThrow(cond, exc)
Definition: exceptions.h:1531
types::global_dof_index n_dofs
Definition: point.h:111
AdditionalData(const ParallelizationScheme parallelization_scheme=parallel_in_elem, const UpdateFlags mapping_update_flags=update_gradients|update_JxW_values|update_quadrature_points, const bool use_coloring=false, const bool overlap_communication_computation=false)
static ::ExceptionBase & ExcMessage(std::string arg1)
std::vector< point_type * > q_points
unsigned int q_point_id_in_cell(const unsigned int n_q_points_1d)
SharedData(Number *vd, Number *gq[dim])
std::vector< unsigned int > row_start
UpdateFlags
std::vector< Number * > inv_jacobian
#define DEAL_II_NAMESPACE_CLOSE
Definition: config.h:359
unsigned int local_q_point_id(const unsigned int cell, const typename CUDAWrappers::MatrixFree< dim, Number >::Data *data, const unsigned int n_q_points_1d, const unsigned int n_q_points)
std::vector< dim3 > block_dim
std::vector< unsigned int * > constraint_mask
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
std::vector< dim3 > grid_dim
std::vector< types::global_dof_index * > local_to_global
Definition: tensor.h:450
#define DEAL_II_NAMESPACE_OPEN
Definition: config.h:358
Shape function gradients.
std::vector< Number * > JxW
void free(T *&pointer)
Definition: cuda.h:96
std::vector< unsigned int > n_cells
types::global_dof_index * local_to_global
std::enable_if< std::is_fundamental< T >::value, std::size_t >::type memory_consumption(const T &t)