From 1550fd99fec2caf862df4814ec21fef590f52f37 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Fri, 8 Nov 2024 20:25:52 -0700 Subject: [PATCH] Tacho : new options (dofs-per-node, pivot-tol, amd) Signed-off-by: iyamazaki --- packages/amesos2/src/Amesos2_Tacho_decl.hpp | 2 + packages/amesos2/src/Amesos2_Tacho_def.hpp | 27 +++++-- .../frosch/test/Thyra_Xpetra_Laplace/main.cpp | 1 + .../shylu_node/tacho/cmake/Tacho_config.h.in | 3 + .../tacho/example/Tacho_ExampleDriver.hpp | 36 +++++++-- .../tacho/src/Tacho_CrsMatrixBase.hpp | 5 +- .../shylu_node/tacho/src/Tacho_Driver.hpp | 54 ++++++++++++- .../tacho/src/impl/Tacho_Driver_Impl.hpp | 19 ++++- .../tacho/src/impl/Tacho_GraphTools.hpp | 5 ++ .../tacho/src/impl/Tacho_GraphTools_Metis.cpp | 35 ++++++-- .../tacho/src/impl/Tacho_GraphTools_Metis.hpp | 13 ++- .../tacho/src/impl/Tacho_LU_Internal.hpp | 19 ++++- .../tacho/src/impl/Tacho_Lapack_Team.hpp | 79 +++++++++++++++++-- .../src/impl/Tacho_NumericTools_Base.hpp | 3 +- .../src/impl/Tacho_NumericTools_LevelSet.hpp | 21 +++-- .../src/impl/Tacho_NumericTools_Serial.hpp | 3 +- .../impl/Tacho_TeamFunctor_FactorizeLU.hpp | 23 +++++- 17 files changed, 301 insertions(+), 47 deletions(-) diff --git a/packages/amesos2/src/Amesos2_Tacho_decl.hpp b/packages/amesos2/src/Amesos2_Tacho_decl.hpp index 95c71b184dc6..07acdaa91e49 100644 --- a/packages/amesos2/src/Amesos2_Tacho_decl.hpp +++ b/packages/amesos2/src/Amesos2_Tacho_decl.hpp @@ -196,6 +196,8 @@ class TachoSolver : public SolverCore int small_problem_threshold_size; int streams; bool verbose; + int dofs_per_node; + bool pivot_pert; // int num_kokkos_threads; // int max_num_superblocks; } data_; diff --git a/packages/amesos2/src/Amesos2_Tacho_def.hpp b/packages/amesos2/src/Amesos2_Tacho_def.hpp index 221e505dbc54..e4f1bd98566b 100644 --- a/packages/amesos2/src/Amesos2_Tacho_def.hpp +++ b/packages/amesos2/src/Amesos2_Tacho_def.hpp @@ -27,10 +27,12 @@ TachoSolver::TachoSolver( Teuchos::RCP B ) : SolverCore(A, X, B) { - data_.method = 1; // Cholesky - data_.variant = 2; // solver variant - data_.streams = 1; // # of streams - data_.verbose = false; // verbose + data_.method = 1; // Cholesky + data_.variant = 2; // solver variant + data_.streams = 1; // # of streams + data_.dofs_per_node = 1; // DoFs / node + data_.pivot_pert = false; // Diagonal pertubation + data_.verbose = false; // verbose } @@ -82,7 +84,11 @@ TachoSolver::symbolicFactorization_impl() // data_.solver.setMaxNumberOfSuperblocks(data_.max_num_superblocks); // Symbolic factorization currently must be done on host - data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_); + if (data_.dofs_per_node > 1) { + data_.solver.analyze(this->globalNumCols_, data_.dofs_per_node, host_row_ptr_view_, host_cols_view_); + } else { + data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_); + } data_.solver.initialize(); } return status; @@ -102,6 +108,11 @@ TachoSolver::numericFactorization_impl() if(do_optimization()) { this->matrixA_->returnValues_kokkos_view(device_nzvals_view_); } + if (data_.pivot_pert) { + data_.solver.useDefaultPivotTolerance(); + } else { + data_.solver.useNoPivotTolerance(); + } data_.solver.factorize(device_nzvals_view_); } return status; @@ -223,6 +234,10 @@ TachoSolver::setParameters_impl(const Teuchos::RCPget ("verbose", false); // # of streams data_.streams = parameterList->get ("num-streams", 1); + // DoFs / node + data_.dofs_per_node = parameterList->get ("dofs-per-node", 1); + // Perturb tiny pivots + data_.pivot_pert = parameterList->get ("perturb-pivot", false); // TODO: Confirm param options // data_.num_kokkos_threads = parameterList->get("kokkos-threads", 1); // data_.max_num_superblocks = parameterList->get("max-num-superblocks", 4); @@ -243,6 +258,8 @@ TachoSolver::getValidParameters_impl() const pl->set("small problem threshold size", 1024, "Problem size threshold below with Tacho uses LAPACK."); pl->set("verbose", false, "Verbosity"); pl->set("num-streams", 1, "Number of GPU streams"); + pl->set("dofs-per-node", 1, "DoFs per node"); + pl->set("perturb-pivot", false, "Perturb tiny pivots"); // TODO: Confirm param options // pl->set("kokkos-threads", 1, "Number of threads"); diff --git a/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Laplace/main.cpp b/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Laplace/main.cpp index 18b413f02d96..3e0fbdb81231 100644 --- a/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Laplace/main.cpp +++ b/packages/shylu/shylu_dd/frosch/test/Thyra_Xpetra_Laplace/main.cpp @@ -270,6 +270,7 @@ int main(int argc, char *argv[]) } else { assert(false); } + writeMM("Laplace.mtx",KMonolithic); RCP > xSolution = MultiVectorFactory::Build(KMonolithic->getMap(),1); RCP > xRightHandSide = MultiVectorFactory::Build(KMonolithic->getMap(),1); diff --git a/packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in b/packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in index 9daaa2f69860..a537bf9648c0 100644 --- a/packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in +++ b/packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in @@ -25,6 +25,9 @@ /* Define if want to build with CHOLMOD enabled */ #cmakedefine TACHO_HAVE_SUITESPARSE +/* Define if want to build with TrilinosSS enabled */ +#cmakedefine TACHO_HAVE_TRILINOS_SS + /* Define if want to build with VTune enabled */ #cmakedefine TACHO_HAVE_VTUNE diff --git a/packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp b/packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp index 450e04608954..dce1a645b801 100644 --- a/packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp +++ b/packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp @@ -25,8 +25,11 @@ template int driver(int argc, char *argv[]) { std::string file = "test.mtx"; std::string graph_file = ""; std::string weight_file = ""; + int dofs_per_node = 1; + bool perturbPivot = false; int nrhs = 1; bool randomRHS = true; + bool onesRHS = false; std::string method_name = "chol"; int method = 1; // 1 - Chol, 2 - LDL, 3 - SymLU int small_problem_thres = 1024; @@ -47,6 +50,8 @@ template int driver(int argc, char *argv[]) { opts.set_option("file", "Input file (MatrixMarket SPD matrix)", &file); opts.set_option("graph", "Input condensed graph", &graph_file); opts.set_option("weight", "Input condensed graph weight", &weight_file); + opts.set_option("dofs-per-node", "# DoFs per node", &dofs_per_node); + opts.set_option("perturb", "Flag to perturb tiny pivots", &perturbPivot); opts.set_option("nrhs", "Number of RHS vectors", &nrhs); opts.set_option("method", "Solution method: chol, ldl, lu", &method_name); opts.set_option("small-problem-thres", "LAPACK is used smaller than this thres", &small_problem_thres); @@ -55,6 +60,7 @@ template int driver(int argc, char *argv[]) { opts.set_option("device-solve-thres", "Device function is used above this subproblem size", &device_solve_thres); opts.set_option("variant", "algorithm variant in levelset scheduling; 0, 1 and 2", &variant); opts.set_option("nstreams", "# of streams used in CUDA; on host, it is ignored", &nstreams); + opts.set_option("one-rhs", "Set RHS to be ones", &onesRHS); opts.set_option("no-warmup", "Flag to turn off warmup", &no_warmup); opts.set_option("nfacts", "# of factorizations to perform", &nfacts); opts.set_option("nsolves", "# of solves to perform", &nsolves); @@ -125,6 +131,8 @@ template int driver(int argc, char *argv[]) { if (!in.good()) { std::cout << "Failed in open the file: " << graph_file << std::endl; return -1; + } else if (verbose) { + std::cout << " > Condensed graph file: " << graph_file << std::endl; } in >> m_graph; @@ -135,8 +143,10 @@ template int driver(int argc, char *argv[]) { aj_graph = ordinal_type_array_host("aj", ap_graph(m_graph)); for (ordinal_type i = 0; i < m_graph; ++i) { const ordinal_type jbeg = ap_graph(i), jend = ap_graph(i + 1); - for (ordinal_type j = jbeg; j < jend; ++j) + for (ordinal_type j = jbeg; j < jend; ++j) { in >> aj_graph(j); + aj_graph(j) --; // base-one + } } } @@ -146,6 +156,8 @@ template int driver(int argc, char *argv[]) { if (!in.good()) { std::cout << "Failed in open the file: " << weight_file << std::endl; return -1; + } else if (verbose) { + std::cout << " > Weight file for condensed graph: " << weight_file << std::endl; } ordinal_type m(0); in >> m; @@ -160,17 +172,21 @@ template int driver(int argc, char *argv[]) { Tacho::Driver solver; /// common options - solver.setSolutionMethod(method); - solver.setSmallProblemThresholdsize(small_problem_thres); solver.setVerbose(verbose); + solver.setSolutionMethod(method); + solver.setLevelSetOptionAlgorithmVariant(variant); + solver.setLevelSetOptionNumStreams(nstreams); /// graph options solver.setOrderConnectedGraphSeparately(); /// levelset options + solver.setSmallProblemThresholdsize(small_problem_thres); solver.setLevelSetOptionDeviceFunctionThreshold(device_factor_thres, device_solve_thres); - solver.setLevelSetOptionAlgorithmVariant(variant); - solver.setLevelSetOptionNumStreams(nstreams); + if (perturbPivot) { + if (verbose) std::cout << " > perturb tiny pivots" << std::endl; + solver.useDefaultPivotTolerance(); + } auto values_on_device = Kokkos::create_mirror_view(typename device_type::memory_space(), A.Values()); Kokkos::deep_copy(values_on_device, A.Values()); @@ -178,7 +194,10 @@ template int driver(int argc, char *argv[]) { /// inputs are used for graph reordering and analysis if (m_graph > 0 && m_graph < A.NumRows()) solver.analyze(A.NumRows(), A.RowPtr(), A.Cols(), m_graph, ap_graph, aj_graph, aw_graph); - else + else if (dofs_per_node > 1) { + if (verbose) std::cout << " > DoFs / node = " << dofs_per_node << std::endl; + solver.analyze(A.NumRows(), dofs_per_node, A.RowPtr(), A.Cols()); + } else solver.analyze(A.NumRows(), A.RowPtr(), A.Cols()); /// create numeric tools and levelset tools @@ -202,7 +221,10 @@ template int driver(int argc, char *argv[]) { t("t", A.NumRows(), nrhs); // temp workspace (store permuted rhs) { - if (randomRHS) { + if (onesRHS) { + const value_type one(1.0); + Kokkos::deep_copy (b, one); + } else if (randomRHS) { Kokkos::Random_XorShift64_Pool random(13718); Kokkos::fill_random(b, random, value_type(1)); } else { diff --git a/packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp b/packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp index 5f5278497a86..ed9b3e0ae693 100644 --- a/packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp +++ b/packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp @@ -371,7 +371,8 @@ inline static void applyPermutationToCrsMatrixLower(/* */ CrsMatrixType &A, cons template inline double computeRelativeResidual(const CrsMatrixBase &A, const Kokkos::View &x, - const Kokkos::View &b) { + const Kokkos::View &b, + const bool verbose = false) { const bool test = (size_t(A.NumRows()) != size_t(A.NumCols()) || size_t(A.NumRows()) != size_t(b.extent(0)) || size_t(x.extent(0)) != size_t(b.extent(0)) || size_t(x.extent(1)) != size_t(b.extent(1))); if (test) @@ -405,6 +406,8 @@ inline double computeRelativeResidual(const CrsMatrixBase diff += arith_traits::real((h_b(i, p) - s) * arith_traits::conj(h_b(i, p) - s)); } } + if (verbose) + std::cout << " Relative residual norm = " << sqrt(diff) << " / " << sqrt(norm) << " = " << sqrt(diff/norm) << std::endl; return sqrt(diff / norm); } diff --git a/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp b/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp index 274f4c952092..a8da49d93806 100644 --- a/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp +++ b/packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp @@ -16,6 +16,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "Tacho.hpp" +#include "Tacho_Util.hpp" #include #include @@ -24,7 +25,7 @@ namespace Tacho { /// forward decl class Graph; -#if defined(TACHO_HAVE_METIS) +#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS) class GraphTools_Metis; #else class GraphTools; @@ -42,6 +43,7 @@ template class NumericToolsLe template struct Driver { public: using value_type = ValueType; + using mag_type = typename ArithTraits::mag_type; using device_type = DeviceType; using exec_space = typename device_type::execution_space; using exec_memory_space = typename device_type::memory_space; @@ -63,7 +65,7 @@ template struct Driver { using crs_matrix_type = CrsMatrixBase; using crs_matrix_type_host = CrsMatrixBase; -#if defined(TACHO_HAVE_METIS) +#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS) using graph_tools_type = GraphTools_Metis; #else using graph_tools_type = GraphTools; @@ -160,6 +162,8 @@ template struct Driver { ordinal_type _variant; // algorithmic variant in levelset 0: naive, 1: invert diagonals ordinal_type _nstreams; // on cuda, multi streams are used + mag_type _pivot_tol; // tolerance for tiny pivot perturbation + // parallelism and memory constraint is made via this parameter ordinal_type _max_num_superblocks; // # of superblocks in the memoyrpool @@ -206,6 +210,10 @@ template struct Driver { void setLevelSetOptionNumStreams(const ordinal_type nstreams); void setLevelSetOptionAlgorithmVariant(const ordinal_type variant); + void setPivotTolerance(const mag_type pivot_tol); + void useNoPivotTolerance(); + void useDefaultPivotTolerance(); + /// /// get interface /// @@ -222,6 +230,7 @@ template struct Driver { template int analyze(const ordinal_type m, const arg_size_type_array &ap, const arg_ordinal_type_array &aj, const bool duplicate = false) { + _m = m; if (duplicate) { @@ -270,6 +279,7 @@ template struct Driver { const arg_perm_type_array &perm, const arg_perm_type_array &peri, const bool duplicate = false) { _m = m; + // this takes the user-specified perm, such that analyze() won't call graph partitioner if (duplicate) { /// for most cases, ap and aj are from host; so construct ap and aj and mirror to device _h_ap = size_type_array_host(Kokkos::ViewAllocateWithoutInitializing("h_ap"), ap.extent(0)); @@ -375,6 +385,46 @@ template struct Driver { return analyze(); } + template + int analyze(const ordinal_type m, const ordinal_type blk_size, + const arg_size_type_array &ap, const arg_ordinal_type_array &aj, + const bool duplicate = false) { + + if (blk_size > 1) { + //condense graph before calling analyze + const size_type nnz = ap(m); + size_type m_graph = m / blk_size; + size_type nnz_graph = nnz / (blk_size*blk_size); + TACHO_TEST_FOR_EXCEPTION((m != blk_size * m_graph || nnz != blk_size*blk_size * nnz_graph), + std::logic_error, "Failed to initialize the condensed graph"); + + size_type_array_host ap_graph + (Kokkos::ViewAllocateWithoutInitializing("ap_graph"), 1+m_graph); + ordinal_type_array_host aj_graph + (Kokkos::ViewAllocateWithoutInitializing("aj_graph"), nnz_graph); + ordinal_type_array_host aw_graph + (Kokkos::ViewAllocateWithoutInitializing("wgs"), m_graph); + // condense the graph + nnz_graph = 0; + ap_graph(0) = 0; + for (size_type i = 0; i < m; i += blk_size) { + for (size_type k = ap(i); k < ap(i+1); k++) { + if (aj(k)%blk_size == 0) { + aj_graph(nnz_graph) = aj(k)/blk_size; + nnz_graph++; + } + aw_graph(i/blk_size) = blk_size; + ap_graph((i/blk_size)+1) = nnz_graph; + } + } + TACHO_TEST_FOR_EXCEPTION((nnz != blk_size*blk_size * nnz_graph), + std::logic_error, "Failed to condense graph"); + return analyze(m, ap, aj, m_graph, ap_graph, aj_graph, aw_graph, duplicate); + } else { + return analyze(m, ap, aj, duplicate); + } + } + int initialize(); int factorize(const value_type_array &ax); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp index bf5e720265ee..605130e14fa9 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp @@ -26,7 +26,7 @@ Driver::Driver() _h_perm(), _peri(), _h_peri(), _m_graph(0), _nnz_graph(0), _h_ap_graph(), _h_aj_graph(), _h_perm_graph(), _h_peri_graph(), _nsupernodes(0), _N(nullptr), _verbose(0), _small_problem_thres(1024), _serial_thres_size(-1), _mb(-1), _nb(-1), _front_update_mode(-1), _levelset(0), _device_level_cut(0), _device_factor_thres(128), - _device_solve_thres(128), _variant(2), _nstreams(16), _max_num_superblocks(-1) {} + _device_solve_thres(128), _variant(2), _nstreams(16), _pivot_tol(0.0), _max_num_superblocks(-1) {} /// /// duplicate the object @@ -157,6 +157,19 @@ template void Driver::setLevelSetOptionNumStr _nstreams = nstreams; } +template void Driver::setPivotTolerance(const mag_type pivot_tol) { + _pivot_tol = pivot_tol; +} + +template void Driver::useNoPivotTolerance() { + _pivot_tol = 0.0; +} + +template void Driver::useDefaultPivotTolerance() { + using arith_traits = ArithTraits; + _pivot_tol = sqrt(arith_traits::epsilon()); +} + /// /// get interface /// @@ -373,7 +386,7 @@ template int Driver::factorize(const value_ty if (_m < _small_problem_thres) { factorize_small_host(ax); } else { - _N->factorize(ax, _verbose); + _N->factorize(ax, _pivot_tol, _verbose); } return 0; } @@ -541,7 +554,7 @@ double Driver::computeRelativeResidual(const value_type_array &ax, const CrsMatrixBase A; A.setExternalMatrix(_m, _m, _nnz, _ap, _aj, ax); - return Tacho::computeRelativeResidual(A, x, b); + return Tacho::computeRelativeResidual(A, x, b, _verbose); } template diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp index 9d48cd14fb96..a4a7c2948e46 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp @@ -67,6 +67,11 @@ class GraphTools { _perm(i) = i; _peri(i) = i; } + if (verbose) { + printf("Summary: GraphTools (Default)\n"); + printf("=============================\n"); + printf( " Use Natural Ordering\n\n" ); + } } ordinal_type_array PermVector() const { return _perm; } diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.cpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.cpp index a85ef651cc4a..a475f729f38a 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.cpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.cpp @@ -13,7 +13,7 @@ #include "Tacho_Util.hpp" -#if defined(TACHO_HAVE_METIS) +#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS) #include "Tacho_GraphTools_Metis.hpp" namespace Tacho { @@ -39,8 +39,15 @@ GraphTools_Metis::GraphTools_Metis(const Graph &g) { for (ordinal_type i = 0; i < static_cast(_adjncy.extent(0)); ++i) _adjncy(i) = g_col_idx(i); +#if defined(TACHO_HAVE_METIS) + _algo = 2; METIS_SetDefaultOptions(_options); _options[METIS_OPTION_NUMBERING] = 0; +#elif defined(TACHO_HAVE_TRILINOS_SS) + _algo = 1; +#else + _algo = 0; +#endif _perm_t = idx_t_array(do_not_initialize_tag("idx_t_perm"), _nvts); _peri_t = idx_t_array(do_not_initialize_tag("idx_t_peri"), _nvts); @@ -52,7 +59,12 @@ GraphTools_Metis::GraphTools_Metis(const Graph &g) { GraphTools_Metis::~GraphTools_Metis() {} void GraphTools_Metis::setVerbose(const bool verbose) { _verbose = verbose; } -void GraphTools_Metis::setOption(const int id, const idx_t value) { _options[id] = value; } +void GraphTools_Metis::setOption(const int id, const idx_t value) { +#if defined(TACHO_HAVE_METIS) + _options[id] = value; +#endif +} +void GraphTools_Metis::setAlgorithm(const int algo) { _algo = algo; } /// /// reorder by amd @@ -81,13 +93,12 @@ void GraphTools_Metis::reorder(const ordinal_type verbose) { Kokkos::Timer timer; double t_metis = 0; - int algo = 2; - if (algo == 0) { + if (_algo == 0) { for (ordinal_type i = 0; i < _nvts; ++i) { _perm(i) = i; _peri(i) = i; } - } else if (algo == 1) { + } else if (_algo == 1) { int ierr = 0; double amd_info[TRILINOS_AMD_INFO]; @@ -100,8 +111,10 @@ void GraphTools_Metis::reorder(const ordinal_type verbose) { _peri(_perm(i)) = i; } - TACHO_TEST_FOR_EXCEPTION(ierr != METIS_OK, std::runtime_error, "Failed in trilinos_amd"); + // ierr != TRILINOS_AMD_OK && ierr != TRILINOS_AMD_OK_BUT_JUMBLED + TACHO_TEST_FOR_EXCEPTION(ierr < TRILINOS_AMD_OK, std::runtime_error, "Failed in trilinos_amd"); } else { +#if defined(TACHO_HAVE_METIS) int ierr = 0; idx_t *xadj = (idx_t *)_xadj.data(); @@ -121,11 +134,19 @@ void GraphTools_Metis::reorder(const ordinal_type verbose) { } TACHO_TEST_FOR_EXCEPTION(ierr != METIS_OK, std::runtime_error, "Failed in METIS_NodeND"); +#else + TACHO_TEST_FOR_EXCEPTION(true, std::runtime_error, "METIS is not enabled"); +#endif } _is_ordered = true; if (verbose) { - printf("Summary: GraphTools (Metis)\n"); + if (_algo == 0) + printf("Summary: GraphTools (Natural)\n"); + else if (_algo == 1) + printf("Summary: GraphTools (AMD)\n"); + else + printf("Summary: GraphTools (Metis)\n"); printf("===========================\n"); switch (verbose) { diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.hpp index e3dd1856e601..87119b84de0f 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools_Metis.hpp @@ -16,11 +16,12 @@ #include "Tacho_Util.hpp" -#if defined(TACHO_HAVE_METIS) #include "Tacho_Graph.hpp" #include "trilinos_amd.h" -#include "metis.h" +#if defined(TACHO_HAVE_METIS) + #include "metis.h" +#endif namespace Tacho { @@ -28,6 +29,9 @@ class GraphTools_Metis { public: typedef typename UseThisDevice::type host_device_type; + #if !defined(TACHO_HAVE_METIS) + typedef ordinal_type idx_t; + #endif typedef Kokkos::View idx_t_array; typedef Kokkos::View ordinal_type_array; @@ -36,7 +40,10 @@ class GraphTools_Metis { idx_t _nvts; idx_t_array _xadj, _adjncy, _vwgt; + int _algo; + #if defined(TACHO_HAVE_METIS) idx_t _options[METIS_NOPTIONS]; + #endif // metis output idx_t_array _perm_t, _peri_t; @@ -61,6 +68,7 @@ class GraphTools_Metis { void setVerbose(const bool verbose); void setOption(const int id, const idx_t value); + void setAlgorithm(const int algo); template ordering_type amd_order(ordering_type n, const ordering_type *xadj, @@ -82,4 +90,3 @@ class GraphTools_Metis { } // namespace Tacho #endif -#endif diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_LU_Internal.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_LU_Internal.hpp index b30c0c85c34f..26f4c4c202c6 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_LU_Internal.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_LU_Internal.hpp @@ -25,7 +25,6 @@ template <> struct LU { template KOKKOS_INLINE_FUNCTION static int invoke(MemberType &member, const ViewTypeA &A, const ViewTypeP &P) { typedef typename ViewTypeA::non_const_value_type value_type; - // typedef typename ViewTypeP::non_const_value_type p_value_type; static_assert(ViewTypeA::rank == 2, "A is not rank 2 view."); static_assert(ViewTypeP::rank == 1, "P is not rank 1 view."); @@ -41,6 +40,24 @@ template <> struct LU { return r_val; } + template + KOKKOS_INLINE_FUNCTION static int invoke(MemberType &member, const double tol, const ViewTypeA &A, const ViewTypeP &P) { + typedef typename ViewTypeA::non_const_value_type value_type; + + static_assert(ViewTypeA::rank == 2, "A is not rank 2 view."); + static_assert(ViewTypeP::rank == 1, "P is not rank 1 view."); + + TACHO_TEST_FOR_ABORT(P.extent(0) < 4 * A.extent(0), "P should be 4*A.extent(0) ."); + + int r_val(0); + const ordinal_type m = A.extent(0), n = A.extent(1); + if (m > 0 && n > 0) { + /// factorize LU + LapackTeam::getrf(member, tol, m, n, A.data(), A.stride_1(), P.data(), &r_val); + } + return r_val; + } + template KOKKOS_INLINE_FUNCTION static int modify(const MemberType &member, const ordinal_type m, const ViewTypeP &P) { static_assert(ViewTypeP::rank == 1, "P is not rank 1 view."); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp index cde52b82693a..939ff6f240d8 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Lapack_Team.hpp @@ -231,13 +231,13 @@ template struct LapackTeam { template static KOKKOS_INLINE_FUNCTION void getrf(const MemberType &member, const int m, const int n, T *KOKKOS_RESTRICT A, const int as1, int *KOKKOS_RESTRICT ipiv, int *info) { + *info = 0; if (m <= 0 || n <= 0) return; using arith_traits = ArithTraits; using mag_type = typename arith_traits::mag_type; - - const T zero(0); + const mag_type zero(0); const int as0 = 1; for (int p = 0; p < m; ++p) { const int iend = m - p - 1, jend = n - p - 1; @@ -248,8 +248,9 @@ template struct LapackTeam { *KOKKOS_RESTRICT a12 = A + (p) * as0 + (p + 1) * as1, *KOKKOS_RESTRICT A22 = A + (p + 1) * as0 + (p + 1) * as1; + int idx(0); + mag_type val(0.0); { - int idx(0); using reducer_value_type = typename Kokkos::MaxLoc::value_type; reducer_value_type value; Kokkos::MaxLoc reducer_value(value); @@ -265,10 +266,11 @@ template struct LapackTeam { reducer_value); member.team_barrier(); idx = value.loc; + val = value.val; /// pivot Kokkos::single(Kokkos::PerThread(member), [&]() { - if (*info == 0 && *alpha11 == zero) { + if (*info == 0 && val == zero) { *info = 1+p; } ipiv[p] = p + idx + 1; @@ -279,9 +281,74 @@ template struct LapackTeam { member.team_barrier(); } } - + const T alpha = *alpha11; // swapped, so contains new pivot + if(val != zero) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, iend), [&](const int &i) { a21[i * as0] /= alpha; }); + member.team_barrier(); + } + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, jend), [&](const int &j) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, iend), + [&](const int &i) { A22[i * as0 + j * as1] -= a21[i * as0] * a12[j * as1]; }); + }); member.team_barrier(); - const T alpha = *alpha11; + } + } + + template + static KOKKOS_INLINE_FUNCTION void getrf(const MemberType &member, const double tol, const int m, const int n, T *KOKKOS_RESTRICT A, + const int as1, int *KOKKOS_RESTRICT ipiv, int *info) { + *info = 0; + if (m <= 0 || n <= 0) + return; + + using arith_traits = ArithTraits; + using mag_type = typename arith_traits::mag_type; + const mag_type zero(0); + //const mag_type tol = sqrt(arith_traits::epsilon()); + const int as0 = 1; + for (int p = 0; p < m; ++p) { + const int iend = m - p - 1, jend = n - p - 1; + T *KOKKOS_RESTRICT alpha11 = A + (p)*as0 + (p)*as1, // as0 & as1 are leading dimension for rows & cols + *KOKKOS_RESTRICT AB = A + (p) * as0, + *KOKKOS_RESTRICT ABR = alpha11, + *KOKKOS_RESTRICT a21 = A + (p + 1) * as0 + (p) * as1, + *KOKKOS_RESTRICT a12 = A + (p) * as0 + (p + 1) * as1, + *KOKKOS_RESTRICT A22 = A + (p + 1) * as0 + (p + 1) * as1; + + int idx(0); + mag_type val(0.0); + { + using reducer_value_type = typename Kokkos::MaxLoc::value_type; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, 1 + iend), + [&](const int &i, reducer_value_type &update) { + const mag_type val = arith_traits::abs(ABR[i * as0]); + if (val > update.val) { + update.val = val; + update.loc = i; + } + }, + reducer_value); + member.team_barrier(); + idx = value.loc; + val = value.val; + + /// pivot + Kokkos::single(Kokkos::PerThread(member), [&]() { + if (val < tol) { + ABR[idx * as0] = (arith_traits::real(ABR[idx * as0]) < zero ? -T(tol) : T(tol)); + } + ipiv[p] = p + idx + 1; + }); + if (idx) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), + [&](const int &j) { swap(AB[j * as1], AB[idx * as0 + j * as1]); }); + member.team_barrier(); + } + } + const T alpha = *alpha11; // swapped, so contains new pivot Kokkos::parallel_for(Kokkos::TeamVectorRange(member, iend), [&](const int &i) { a21[i * as0] /= alpha; }); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, jend), [&](const int &j) { diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp index 312c2bfcefd9..5430e789a462 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp @@ -24,6 +24,7 @@ namespace Tacho { template class NumericToolsBase { public: using value_type = ValueType; + using mag_type = typename ArithTraits::mag_type; using device_type = DeviceType; using exec_space = typename device_type::execution_space; using exec_memory_space = typename device_type::memory_space; @@ -243,7 +244,7 @@ template class NumericToolsBase { } } - inline virtual void factorize(const value_type_array &ax, const ordinal_type verbose = 0) { + inline virtual void factorize(const value_type_array &ax, const mag_type pivot_tol = 0.0, const ordinal_type verbose = 0) { TACHO_TEST_FOR_EXCEPTION(true, std::logic_error, "The function should be overriden by derived classes"); } diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index 18897036922a..334d514821b7 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -88,6 +88,7 @@ #endif #endif + namespace Tacho { template @@ -112,6 +113,7 @@ class NumericToolsLevelSet : public NumericToolsBase { using typename base_type::supernode_info_type; using typename base_type::supernode_type_array_host; using typename base_type::value_type; + using typename base_type::mag_type; using typename base_type::int_type_array; using typename base_type::value_type_array; using typename base_type::value_type_matrix; @@ -3669,7 +3671,7 @@ class NumericToolsLevelSet : public NumericToolsBase { Kokkos::parallel_for( policy, KOKKOS_LAMBDA(const ordinal_type &i) { buf_solve_nrhs_ptr(i) = nrhs * buf_solve_ptr(i); }); Kokkos::deep_copy(_h_buf_solve_nrhs_ptr, _buf_solve_nrhs_ptr); - _nrhs = nrhs; + _nrhs = nrhs; } } } @@ -4204,7 +4206,7 @@ class NumericToolsLevelSet : public NumericToolsBase { } } - inline void factorizeLU(const value_type_array &ax, const ordinal_type verbose) { + inline void factorizeLU(const value_type_array &ax, const mag_type pivot_tol, const ordinal_type verbose) { constexpr bool is_host = std::is_same::value; Kokkos::Timer timer; Kokkos::Timer tick; @@ -4278,7 +4280,12 @@ class NumericToolsLevelSet : public NumericToolsBase { team_policy_factor policy_factor(1, 1, 1); team_policy_update policy_update(1, 1, 1); functor_type functor(_info, _factorize_mode, _level_sids, _piv, _buf, &rval); - + if (pivot_tol > 0.0) { + using arith_traits = ArithTraits; + using mag_type = typename arith_traits::mag_type; + const mag_type tol = sqrt(arith_traits::epsilon()); + functor.setDiagPertubationTol(pivot_tol); + } // get max vector length const ordinal_type vmax = policy_factor.vector_length_max(); { @@ -4333,7 +4340,9 @@ class NumericToolsLevelSet : public NumericToolsBase { if (rval != 0) { TACHO_TEST_FOR_EXCEPTION(rval, std::runtime_error, "GETRF (team) returns non-zero error code."); } - + if (_status != 0) { + TACHO_TEST_FOR_EXCEPTION(rval, std::runtime_error, "GETRF (device) returns non-zero error code."); + } Kokkos::parallel_for("update factor", policy_update, functor); if (verbose) { Kokkos::fence(); time_update += tick.seconds(); @@ -4564,7 +4573,7 @@ class NumericToolsLevelSet : public NumericToolsBase { } } - inline void factorize(const value_type_array &ax, const ordinal_type verbose = 0) override { + inline void factorize(const value_type_array &ax, const mag_type pivot_tol = 0.0, const ordinal_type verbose = 0) override { Kokkos::deep_copy(_superpanel_buf, value_type(0)); switch (this->getSolutionMethod()) { case 1: { /// Cholesky @@ -4600,7 +4609,7 @@ class NumericToolsLevelSet : public NumericToolsBase { track_alloc(_piv.span() * sizeof(ordinal_type)); } } - factorizeLU(ax, verbose); + factorizeLU(ax, pivot_tol, verbose); break; } default: { diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Serial.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Serial.hpp index 584930b56525..86b65e7ef78f 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Serial.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Serial.hpp @@ -45,6 +45,7 @@ class NumericToolsSerial : public NumericToolsBase { using typename base_type::ordinal_type_array; using typename base_type::ordinal_type_array_host; using typename base_type::size_type_array; + using typename base_type::mag_type; using typename base_type::value_type; using typename base_type::value_type_array; using typename base_type::value_type_matrix; @@ -475,7 +476,7 @@ class NumericToolsSerial : public NumericToolsBase { /// /// main interface /// - inline void factorize(const value_type_array &ax, const ordinal_type verbose = 0) override { + inline void factorize(const value_type_array &ax, const mag_type pivot_tol = 0.0, const ordinal_type verbose = 0) override { { const bool test = !std::is_same::value; TACHO_TEST_FOR_EXCEPTION(test, std::logic_error, "Serial interface works on host device only"); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeLU.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeLU.hpp index 3ad435b8e853..33caa7532fb0 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeLU.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeLU.hpp @@ -34,6 +34,9 @@ template struct TeamFunctor_FactorizeLU { using value_type_array = typename supernode_info_type::value_type_array; using value_type_matrix = typename supernode_info_type::value_type_matrix; + using arith_traits = ArithTraits; + using mag_type = typename arith_traits::mag_type; + private: supernode_info_type _info; ordinal_type_array _compute_mode, _level_sids; @@ -44,6 +47,7 @@ template struct TeamFunctor_FactorizeLU { size_type_array _buf_ptr; value_type_array _buf; + mag_type _tol; int *_rval; public: @@ -54,7 +58,8 @@ template struct TeamFunctor_FactorizeLU { TeamFunctor_FactorizeLU(const supernode_info_type &info, const ordinal_type_array &compute_mode, const ordinal_type_array &level_sids, const ordinal_type_array &piv, const value_type_array buf, int *rval) - : _info(info), _compute_mode(compute_mode), _level_sids(level_sids), _piv(piv), _buf(buf), _rval(rval) {} + : _info(info), _compute_mode(compute_mode), _level_sids(level_sids), _piv(piv), _buf(buf), + _tol(0.0), _rval(rval) {} inline void setRange(const ordinal_type pbeg, const ordinal_type pend) { _pbeg = pbeg; @@ -62,6 +67,7 @@ template struct TeamFunctor_FactorizeLU { } inline void setBufferPtr(const size_type_array &buf_ptr) { _buf_ptr = buf_ptr; } + inline void setDiagPertubationTol(const mag_type tol) { _tol = tol; } /// /// Main functions @@ -78,7 +84,10 @@ template struct TeamFunctor_FactorizeLU { if (m > 0) { UnmanagedViewType AT(s.u_buf, m, n); - err = LU::invoke(member, AT, P); + if (_tol > 0.0) + err = LU::invoke(member, _tol, AT, P); + else + err = LU::invoke(member, AT, P); member.team_barrier(); if (err != 0) { Kokkos::atomic_add(_rval, 1); @@ -117,7 +126,10 @@ template struct TeamFunctor_FactorizeLU { if (m > 0) { UnmanagedViewType AT(s.u_buf, m, n); - err = LU::invoke(member, AT, P); + if (_tol > 0.0) + err = LU::invoke(member, _tol, AT, P); + else + err = LU::invoke(member, AT, P); member.team_barrier(); if (err != 0) { Kokkos::atomic_add(_rval, 1); @@ -178,7 +190,10 @@ template struct TeamFunctor_FactorizeLU { if (m > 0) { UnmanagedViewType AT(s.u_buf, m, n); - err = LU::invoke(member, AT, P); + if (_tol > 0.0) + err = LU::invoke(member, _tol, AT, P); + else + err = LU::invoke(member, AT, P); member.team_barrier(); if (err != 0) { Kokkos::atomic_add(_rval, 1);