trilinos · iyamazaki · Dec 12, 2024 · Nov 9, 2024 · Nov 9, 2024 · Nov 14, 2024
@@ -196,6 +196,8 @@ class TachoSolver : public SolverCore<Amesos2::TachoSolver, Matrix, Vector>
     int small_problem_threshold_size;
     int streams;
     bool verbose;
+    int dofs_per_node;
+    bool pivot_pert;
     // int num_kokkos_threads;
     // int max_num_superblocks;
   } data_;

@@ -27,10 +27,12 @@ TachoSolver<Matrix,Vector>::TachoSolver(
   Teuchos::RCP<const Vector> B )
   : SolverCore<Amesos2::TachoSolver,Matrix,Vector>(A, X, B)
 {
-  data_.method  = 1;     // Cholesky
-  data_.variant = 2;     // solver variant
-  data_.streams = 1;     // # of streams
-  data_.verbose = false; // verbose
+  data_.method    = 1;      // Cholesky
+  data_.variant   = 2;      // solver variant
+  data_.streams   = 1;      // # of streams
+  data_.dofs_per_node = 1;  // DoFs / node
+  data_.pivot_pert = false; // Diagonal pertubation
+  data_.verbose    = false; // verbose
 }
 
 
@@ -82,7 +84,11 @@ TachoSolver<Matrix,Vector>::symbolicFactorization_impl()
     // data_.solver.setMaxNumberOfSuperblocks(data_.max_num_superblocks);
 
     // Symbolic factorization currently must be done on host
-    data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_);
+    if (data_.dofs_per_node > 1) {
+      data_.solver.analyze(this->globalNumCols_, data_.dofs_per_node, host_row_ptr_view_, host_cols_view_);
+    } else {
+      data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_);
+    }
     data_.solver.initialize();
   }
   return status;
@@ -102,6 +108,11 @@ TachoSolver<Matrix,Vector>::numericFactorization_impl()
     if(do_optimization()) {
      this->matrixA_->returnValues_kokkos_view(device_nzvals_view_);
     }
+    if (data_.pivot_pert) {
+      data_.solver.useDefaultPivotTolerance();
+    } else {
+      data_.solver.useNoPivotTolerance();
+    }
     data_.solver.factorize(device_nzvals_view_);
   }
   return status;
@@ -223,6 +234,10 @@ TachoSolver<Matrix,Vector>::setParameters_impl(const Teuchos::RCP<Teuchos::Param
   data_.verbose = parameterList->get<bool> ("verbose", false);
   // # of streams
   data_.streams = parameterList->get<int> ("num-streams", 1);
+  // DoFs / node
+  data_.dofs_per_node = parameterList->get<int> ("dofs-per-node", 1);
+  // Perturb tiny pivots
+  data_.pivot_pert = parameterList->get<bool> ("perturb-pivot", false);
   // TODO: Confirm param options
   // data_.num_kokkos_threads = parameterList->get<int>("kokkos-threads", 1);
   // data_.max_num_superblocks = parameterList->get<int>("max-num-superblocks", 4);
@@ -243,6 +258,8 @@ TachoSolver<Matrix,Vector>::getValidParameters_impl() const
     pl->set("small problem threshold size", 1024, "Problem size threshold below with Tacho uses LAPACK.");
     pl->set("verbose", false, "Verbosity");
     pl->set("num-streams", 1, "Number of GPU streams");
+    pl->set("dofs-per-node", 1, "DoFs per node");
+    pl->set("perturb-pivot", false, "Perturb tiny pivots");
 
     // TODO: Confirm param options
     // pl->set("kokkos-threads", 1, "Number of threads");

@@ -25,6 +25,9 @@
 /* Define if want to build with CHOLMOD enabled */
 #cmakedefine TACHO_HAVE_SUITESPARSE
 
+/* Define if want to build with TrilinosSS enabled */
+#cmakedefine TACHO_HAVE_TRILINOS_SS
+
 /* Define if want to build with VTune enabled */
 #cmakedefine TACHO_HAVE_VTUNE
 

@@ -25,8 +25,11 @@ template <typename value_type> int driver(int argc, char *argv[]) {
   std::string file = "test.mtx";
   std::string graph_file = "";
   std::string weight_file = "";
+  int dofs_per_node = 1;
+  bool perturbPivot = false;
   int nrhs = 1;
   bool randomRHS = true;
+  bool onesRHS = false;
   std::string method_name = "chol";
   int method = 1; // 1 - Chol, 2 - LDL, 3 - SymLU
   int small_problem_thres = 1024;
@@ -47,6 +50,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
   opts.set_option<std::string>("file", "Input file (MatrixMarket SPD matrix)", &file);
   opts.set_option<std::string>("graph", "Input condensed graph", &graph_file);
   opts.set_option<std::string>("weight", "Input condensed graph weight", &weight_file);
+  opts.set_option<int>("dofs-per-node", "# DoFs per node", &dofs_per_node);
+  opts.set_option<bool>("perturb", "Flag to perturb tiny pivots", &perturbPivot);
   opts.set_option<int>("nrhs", "Number of RHS vectors", &nrhs);
   opts.set_option<std::string>("method", "Solution method: chol, ldl, lu", &method_name);
   opts.set_option<int>("small-problem-thres", "LAPACK is used smaller than this thres", &small_problem_thres);
@@ -55,6 +60,7 @@ template <typename value_type> int driver(int argc, char *argv[]) {
   opts.set_option<int>("device-solve-thres", "Device function is used above this subproblem size", &device_solve_thres);
   opts.set_option<int>("variant", "algorithm variant in levelset scheduling; 0, 1 and 2", &variant);
   opts.set_option<int>("nstreams", "# of streams used in CUDA; on host, it is ignored", &nstreams);
+  opts.set_option<bool>("one-rhs", "Set RHS to be ones", &onesRHS);
   opts.set_option<bool>("no-warmup", "Flag to turn off warmup", &no_warmup);
   opts.set_option<int>("nfacts", "# of factorizations to perform", &nfacts);
   opts.set_option<int>("nsolves", "# of solves to perform", &nsolves);
@@ -125,6 +131,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
           if (!in.good()) {
             std::cout << "Failed in open the file: " << graph_file << std::endl;
             return -1;
+          } else if (verbose) {
+            std::cout << " > Condensed graph file: " << graph_file << std::endl;
           }
           in >> m_graph;
 
@@ -135,8 +143,10 @@ template <typename value_type> int driver(int argc, char *argv[]) {
           aj_graph = ordinal_type_array_host("aj", ap_graph(m_graph));
           for (ordinal_type i = 0; i < m_graph; ++i) {
             const ordinal_type jbeg = ap_graph(i), jend = ap_graph(i + 1);
-            for (ordinal_type j = jbeg; j < jend; ++j)
+            for (ordinal_type j = jbeg; j < jend; ++j) {
               in >> aj_graph(j);
+              aj_graph(j) --; // base-one
+            }
           }
         }
 
@@ -146,6 +156,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
           if (!in.good()) {
             std::cout << "Failed in open the file: " << weight_file << std::endl;
             return -1;
+          } else if (verbose) {
+            std::cout << " > Weight file for condensed graph: " << weight_file << std::endl;
           }
           ordinal_type m(0);
           in >> m;
@@ -160,25 +172,32 @@ template <typename value_type> int driver(int argc, char *argv[]) {
     Tacho::Driver<value_type, device_type> solver;
 
     /// common options
-    solver.setSolutionMethod(method);
-    solver.setSmallProblemThresholdsize(small_problem_thres);
     solver.setVerbose(verbose);
+    solver.setSolutionMethod(method);
+    solver.setLevelSetOptionAlgorithmVariant(variant);
+    solver.setLevelSetOptionNumStreams(nstreams);
 
     /// graph options
     solver.setOrderConnectedGraphSeparately();
 
     /// levelset options
+    solver.setSmallProblemThresholdsize(small_problem_thres);
     solver.setLevelSetOptionDeviceFunctionThreshold(device_factor_thres, device_solve_thres);
-    solver.setLevelSetOptionAlgorithmVariant(variant);
-    solver.setLevelSetOptionNumStreams(nstreams);
+    if (perturbPivot) {
+      if (verbose) std::cout << " > perturb tiny pivots" << std::endl;
+      solver.useDefaultPivotTolerance();
+    }
 
     auto values_on_device = Kokkos::create_mirror_view(typename device_type::memory_space(), A.Values());
     Kokkos::deep_copy(values_on_device, A.Values());
 
     /// inputs are used for graph reordering and analysis
     if (m_graph > 0 && m_graph < A.NumRows())
       solver.analyze(A.NumRows(), A.RowPtr(), A.Cols(), m_graph, ap_graph, aj_graph, aw_graph);
-    else
+    else if (dofs_per_node > 1) {
+      if (verbose) std::cout << " > DoFs / node = " << dofs_per_node << std::endl;
+      solver.analyze(A.NumRows(), dofs_per_node, A.RowPtr(), A.Cols());
+    } else
       solver.analyze(A.NumRows(), A.RowPtr(), A.Cols());
 
     /// create numeric tools and levelset tools
@@ -202,7 +221,10 @@ template <typename value_type> int driver(int argc, char *argv[]) {
         t("t", A.NumRows(), nrhs);                  // temp workspace (store permuted rhs)
 
     {
-      if (randomRHS) {
+      if (onesRHS) {
+        const value_type one(1.0);
+        Kokkos::deep_copy (b, one);
+      } else if (randomRHS) {
         Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> random(13718);
         Kokkos::fill_random(b, random, value_type(1));
       } else {
@@ -235,6 +257,7 @@ template <typename value_type> int driver(int argc, char *argv[]) {
 
     std::cout << std::endl;
     std::cout << " Initi Time " << initi_time << std::endl;
+    std::cout << " > nnz = " << solver.getNumNonZerosU() << std::endl;
     std::cout << " Facto Time " << facto_time / (double)nfacts << std::endl;
     std::cout << " Solve Time " << solve_time / (double)nsolves << std::endl;
     std::cout << std::endl;

@@ -371,7 +371,8 @@ inline static void applyPermutationToCrsMatrixLower(/* */ CrsMatrixType &A, cons
 template <typename ValueType, typename DeviceType>
 inline double computeRelativeResidual(const CrsMatrixBase<ValueType, DeviceType> &A,
                                       const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &x,
-                                      const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &b) {
+                                      const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &b,
+                                      const bool verbose = false) {
   const bool test = (size_t(A.NumRows()) != size_t(A.NumCols()) || size_t(A.NumRows()) != size_t(b.extent(0)) ||
                      size_t(x.extent(0)) != size_t(b.extent(0)) || size_t(x.extent(1)) != size_t(b.extent(1)));
   if (test)
@@ -405,6 +406,8 @@ inline double computeRelativeResidual(const CrsMatrixBase<ValueType, DeviceType>
       diff += arith_traits::real((h_b(i, p) - s) * arith_traits::conj(h_b(i, p) - s));
     }
   }
+  if (verbose)
+    std::cout << " Relative residual norm = " << sqrt(diff) << " / " << sqrt(norm) << " = " << sqrt(diff/norm) << std::endl;
   return sqrt(diff / norm);
 }
 

@@ -16,6 +16,7 @@
 /// \author Kyungjoo Kim ([email protected])
 
 #include "Tacho.hpp"
+#include "Tacho_Util.hpp"
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Timer.hpp>
@@ -24,7 +25,7 @@ namespace Tacho {
 
 /// forward decl
 class Graph;
-#if defined(TACHO_HAVE_METIS)
+#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS)
 class GraphTools_Metis;
 #else
 class GraphTools;
@@ -42,6 +43,7 @@ template <typename ValueType, typename DeviceType, int Var> class NumericToolsLe
 template <typename ValueType, typename DeviceType> struct Driver {
 public:
   using value_type = ValueType;
+  using mag_type = typename ArithTraits<ValueType>::mag_type;
   using device_type = DeviceType;
   using exec_space = typename device_type::execution_space;
   using exec_memory_space = typename device_type::memory_space;
@@ -63,7 +65,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
   using crs_matrix_type = CrsMatrixBase<value_type, device_type>;
   using crs_matrix_type_host = CrsMatrixBase<value_type, host_device_type>;
 
-#if defined(TACHO_HAVE_METIS)
+#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS)
   using graph_tools_type = GraphTools_Metis;
 #else
   using graph_tools_type = GraphTools;
@@ -111,6 +113,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
   ordinal_type_array_host _h_peri_graph;
 
   // ** symbolic factorization output
+  ordinal_type _nnz_u;
   // supernodes output
   ordinal_type _nsupernodes;
   ordinal_type_array _supernodes;
@@ -160,6 +163,8 @@ template <typename ValueType, typename DeviceType> struct Driver {
   ordinal_type _variant;             // algorithmic variant in levelset 0: naive, 1: invert diagonals
   ordinal_type _nstreams;            // on cuda, multi streams are used
 
+  mag_type _pivot_tol;               // tolerance for tiny pivot perturbation
+
   // parallelism and memory constraint is made via this parameter
   ordinal_type _max_num_superblocks; // # of superblocks in the memoyrpool
 
@@ -206,9 +211,14 @@ template <typename ValueType, typename DeviceType> struct Driver {
   void setLevelSetOptionNumStreams(const ordinal_type nstreams);
   void setLevelSetOptionAlgorithmVariant(const ordinal_type variant);
 
+  void setPivotTolerance(const mag_type pivot_tol);
+  void useNoPivotTolerance();
+  void useDefaultPivotTolerance();
+
   ///
   /// get interface
   ///
+  ordinal_type getNumNonZerosU() const;
   ordinal_type getNumSupernodes() const;
   ordinal_type_array getSupernodes() const;
   ordinal_type_array getPermutationVector() const;
@@ -222,6 +232,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
   template <typename arg_size_type_array, typename arg_ordinal_type_array>
   int analyze(const ordinal_type m, const arg_size_type_array &ap, const arg_ordinal_type_array &aj,
               const bool duplicate = false) {
+
     _m = m;
 
     if (duplicate) {
@@ -270,6 +281,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
               const arg_perm_type_array &perm, const arg_perm_type_array &peri, const bool duplicate = false) {
     _m = m;
 
+    // this takes the user-specified perm, such that analyze() won't call graph partitioner
     if (duplicate) {
       /// for most cases, ap and aj are from host; so construct ap and aj and mirror to device
       _h_ap = size_type_array_host(Kokkos::ViewAllocateWithoutInitializing("h_ap"), ap.extent(0));
@@ -375,6 +387,46 @@ template <typename ValueType, typename DeviceType> struct Driver {
     return analyze();
   }
 
+  template <typename arg_size_type_array, typename arg_ordinal_type_array>
+  int analyze(const ordinal_type m, const ordinal_type blk_size,
+              const arg_size_type_array &ap, const arg_ordinal_type_array &aj,
+              const bool duplicate = false) {
+
+    if (blk_size > 1) {
+      //condense graph before calling analyze
+      const size_type nnz = ap(m);
+      ordinal_type m_graph = m / blk_size;
+      size_type nnz_graph = nnz / (blk_size*blk_size);
+      TACHO_TEST_FOR_EXCEPTION((m != blk_size * m_graph || nnz != size_type(blk_size*blk_size) * nnz_graph),
+        std::logic_error, "Failed to initialize the condensed graph");
+
+      size_type_array_host ap_graph
+        (Kokkos::ViewAllocateWithoutInitializing("ap_graph"), 1+m_graph);
+      ordinal_type_array_host aj_graph
+        (Kokkos::ViewAllocateWithoutInitializing("aj_graph"), nnz_graph);
+      ordinal_type_array_host aw_graph
+        (Kokkos::ViewAllocateWithoutInitializing("wgs"), m_graph);
+      // condense the graph
+      nnz_graph = 0;
+      ap_graph(0) = 0;
+      for (ordinal_type i = 0; i < m; i += blk_size) {
+        for (size_type k = ap(i); k < ap(i+1); k++) {
+          if (aj(k)%blk_size == 0) {
+            aj_graph(nnz_graph) = aj(k)/blk_size;
+            nnz_graph++;
+          }
+          aw_graph(i/blk_size) = blk_size;
+          ap_graph((i/blk_size)+1) = nnz_graph;
+        }
+      }
+      TACHO_TEST_FOR_EXCEPTION((nnz != size_type(blk_size*blk_size) * nnz_graph),
+        std::logic_error, "Failed to condense graph");
+      return analyze(m, ap, aj, m_graph, ap_graph, aj_graph, aw_graph, duplicate);
+    } else {
+      return analyze(m, ap, aj, duplicate);
+    }
+  }
+
   int initialize();
 
   int factorize(const value_type_array &ax);

@@ -32,10 +32,10 @@ struct CholAlgorithm {
 };
 
 struct CholAlgorithm_Team {
-#if defined(KOKKOS_ENABLE_OPENMP)
-  using type = ActiveHostAlgorithm<runsWithOMP()>::type;
-#else
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
   using type = ActiveAlgorithm<runsOnCudaOrHIP()>::type;
+#else
+  using type = ActiveHostAlgorithm<runsWithOMP()>::type;
 #endif
 };
 } // namespace Tacho