Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tacho : new options (dofs-per-node, pivot-tol, amd) #13585

Merged
merged 7 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/amesos2/src/Amesos2_Tacho_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ class TachoSolver : public SolverCore<Amesos2::TachoSolver, Matrix, Vector>
int small_problem_threshold_size;
int streams;
bool verbose;
int dofs_per_node;
bool pivot_pert;
// int num_kokkos_threads;
// int max_num_superblocks;
} data_;
Expand Down
27 changes: 22 additions & 5 deletions packages/amesos2/src/Amesos2_Tacho_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ TachoSolver<Matrix,Vector>::TachoSolver(
Teuchos::RCP<const Vector> B )
: SolverCore<Amesos2::TachoSolver,Matrix,Vector>(A, X, B)
{
data_.method = 1; // Cholesky
data_.variant = 2; // solver variant
data_.streams = 1; // # of streams
data_.verbose = false; // verbose
data_.method = 1; // Cholesky
data_.variant = 2; // solver variant
data_.streams = 1; // # of streams
data_.dofs_per_node = 1; // DoFs / node
data_.pivot_pert = false; // Diagonal pertubation
data_.verbose = false; // verbose
}


Expand Down Expand Up @@ -82,7 +84,11 @@ TachoSolver<Matrix,Vector>::symbolicFactorization_impl()
// data_.solver.setMaxNumberOfSuperblocks(data_.max_num_superblocks);

// Symbolic factorization currently must be done on host
data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_);
if (data_.dofs_per_node > 1) {
data_.solver.analyze(this->globalNumCols_, data_.dofs_per_node, host_row_ptr_view_, host_cols_view_);
} else {
data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_);
}
data_.solver.initialize();
}
return status;
Expand All @@ -102,6 +108,11 @@ TachoSolver<Matrix,Vector>::numericFactorization_impl()
if(do_optimization()) {
this->matrixA_->returnValues_kokkos_view(device_nzvals_view_);
}
if (data_.pivot_pert) {
data_.solver.useDefaultPivotTolerance();
} else {
data_.solver.useNoPivotTolerance();
}
data_.solver.factorize(device_nzvals_view_);
}
return status;
Expand Down Expand Up @@ -223,6 +234,10 @@ TachoSolver<Matrix,Vector>::setParameters_impl(const Teuchos::RCP<Teuchos::Param
data_.verbose = parameterList->get<bool> ("verbose", false);
// # of streams
data_.streams = parameterList->get<int> ("num-streams", 1);
// DoFs / node
data_.dofs_per_node = parameterList->get<int> ("dofs-per-node", 1);
// Perturb tiny pivots
data_.pivot_pert = parameterList->get<bool> ("perturb-pivot", false);
// TODO: Confirm param options
// data_.num_kokkos_threads = parameterList->get<int>("kokkos-threads", 1);
// data_.max_num_superblocks = parameterList->get<int>("max-num-superblocks", 4);
Expand All @@ -243,6 +258,8 @@ TachoSolver<Matrix,Vector>::getValidParameters_impl() const
pl->set("small problem threshold size", 1024, "Problem size threshold below with Tacho uses LAPACK.");
pl->set("verbose", false, "Verbosity");
pl->set("num-streams", 1, "Number of GPU streams");
pl->set("dofs-per-node", 1, "DoFs per node");
pl->set("perturb-pivot", false, "Perturb tiny pivots");

// TODO: Confirm param options
// pl->set("kokkos-threads", 1, "Number of threads");
Expand Down
3 changes: 3 additions & 0 deletions packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
/* Define if want to build with CHOLMOD enabled */
#cmakedefine TACHO_HAVE_SUITESPARSE

/* Define if want to build with TrilinosSS enabled */
#cmakedefine TACHO_HAVE_TRILINOS_SS

/* Define if want to build with VTune enabled */
#cmakedefine TACHO_HAVE_VTUNE

Expand Down
37 changes: 30 additions & 7 deletions packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@ template <typename value_type> int driver(int argc, char *argv[]) {
std::string file = "test.mtx";
std::string graph_file = "";
std::string weight_file = "";
int dofs_per_node = 1;
bool perturbPivot = false;
int nrhs = 1;
bool randomRHS = true;
bool onesRHS = false;
std::string method_name = "chol";
int method = 1; // 1 - Chol, 2 - LDL, 3 - SymLU
int small_problem_thres = 1024;
Expand All @@ -47,6 +50,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
opts.set_option<std::string>("file", "Input file (MatrixMarket SPD matrix)", &file);
opts.set_option<std::string>("graph", "Input condensed graph", &graph_file);
opts.set_option<std::string>("weight", "Input condensed graph weight", &weight_file);
opts.set_option<int>("dofs-per-node", "# DoFs per node", &dofs_per_node);
opts.set_option<bool>("perturb", "Flag to perturb tiny pivots", &perturbPivot);
opts.set_option<int>("nrhs", "Number of RHS vectors", &nrhs);
opts.set_option<std::string>("method", "Solution method: chol, ldl, lu", &method_name);
opts.set_option<int>("small-problem-thres", "LAPACK is used smaller than this thres", &small_problem_thres);
Expand All @@ -55,6 +60,7 @@ template <typename value_type> int driver(int argc, char *argv[]) {
opts.set_option<int>("device-solve-thres", "Device function is used above this subproblem size", &device_solve_thres);
opts.set_option<int>("variant", "algorithm variant in levelset scheduling; 0, 1 and 2", &variant);
opts.set_option<int>("nstreams", "# of streams used in CUDA; on host, it is ignored", &nstreams);
opts.set_option<bool>("one-rhs", "Set RHS to be ones", &onesRHS);
opts.set_option<bool>("no-warmup", "Flag to turn off warmup", &no_warmup);
opts.set_option<int>("nfacts", "# of factorizations to perform", &nfacts);
opts.set_option<int>("nsolves", "# of solves to perform", &nsolves);
Expand Down Expand Up @@ -125,6 +131,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
if (!in.good()) {
std::cout << "Failed in open the file: " << graph_file << std::endl;
return -1;
} else if (verbose) {
std::cout << " > Condensed graph file: " << graph_file << std::endl;
}
in >> m_graph;

Expand All @@ -135,8 +143,10 @@ template <typename value_type> int driver(int argc, char *argv[]) {
aj_graph = ordinal_type_array_host("aj", ap_graph(m_graph));
for (ordinal_type i = 0; i < m_graph; ++i) {
const ordinal_type jbeg = ap_graph(i), jend = ap_graph(i + 1);
for (ordinal_type j = jbeg; j < jend; ++j)
for (ordinal_type j = jbeg; j < jend; ++j) {
in >> aj_graph(j);
aj_graph(j) --; // base-one
}
}
}

Expand All @@ -146,6 +156,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
if (!in.good()) {
std::cout << "Failed in open the file: " << weight_file << std::endl;
return -1;
} else if (verbose) {
std::cout << " > Weight file for condensed graph: " << weight_file << std::endl;
}
ordinal_type m(0);
in >> m;
Expand All @@ -160,25 +172,32 @@ template <typename value_type> int driver(int argc, char *argv[]) {
Tacho::Driver<value_type, device_type> solver;

/// common options
solver.setSolutionMethod(method);
solver.setSmallProblemThresholdsize(small_problem_thres);
solver.setVerbose(verbose);
solver.setSolutionMethod(method);
solver.setLevelSetOptionAlgorithmVariant(variant);
solver.setLevelSetOptionNumStreams(nstreams);

/// graph options
solver.setOrderConnectedGraphSeparately();

/// levelset options
solver.setSmallProblemThresholdsize(small_problem_thres);
solver.setLevelSetOptionDeviceFunctionThreshold(device_factor_thres, device_solve_thres);
solver.setLevelSetOptionAlgorithmVariant(variant);
solver.setLevelSetOptionNumStreams(nstreams);
if (perturbPivot) {
if (verbose) std::cout << " > perturb tiny pivots" << std::endl;
solver.useDefaultPivotTolerance();
}

auto values_on_device = Kokkos::create_mirror_view(typename device_type::memory_space(), A.Values());
Kokkos::deep_copy(values_on_device, A.Values());

/// inputs are used for graph reordering and analysis
if (m_graph > 0 && m_graph < A.NumRows())
solver.analyze(A.NumRows(), A.RowPtr(), A.Cols(), m_graph, ap_graph, aj_graph, aw_graph);
else
else if (dofs_per_node > 1) {
if (verbose) std::cout << " > DoFs / node = " << dofs_per_node << std::endl;
solver.analyze(A.NumRows(), dofs_per_node, A.RowPtr(), A.Cols());
} else
solver.analyze(A.NumRows(), A.RowPtr(), A.Cols());

/// create numeric tools and levelset tools
Expand All @@ -202,7 +221,10 @@ template <typename value_type> int driver(int argc, char *argv[]) {
t("t", A.NumRows(), nrhs); // temp workspace (store permuted rhs)

{
if (randomRHS) {
if (onesRHS) {
const value_type one(1.0);
Kokkos::deep_copy (b, one);
} else if (randomRHS) {
Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> random(13718);
Kokkos::fill_random(b, random, value_type(1));
} else {
Expand Down Expand Up @@ -235,6 +257,7 @@ template <typename value_type> int driver(int argc, char *argv[]) {

std::cout << std::endl;
std::cout << " Initi Time " << initi_time << std::endl;
std::cout << " > nnz = " << solver.getNumNonZerosU() << std::endl;
std::cout << " Facto Time " << facto_time / (double)nfacts << std::endl;
std::cout << " Solve Time " << solve_time / (double)nsolves << std::endl;
std::cout << std::endl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,8 @@ inline static void applyPermutationToCrsMatrixLower(/* */ CrsMatrixType &A, cons
template <typename ValueType, typename DeviceType>
inline double computeRelativeResidual(const CrsMatrixBase<ValueType, DeviceType> &A,
const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &x,
const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &b) {
const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &b,
const bool verbose = false) {
const bool test = (size_t(A.NumRows()) != size_t(A.NumCols()) || size_t(A.NumRows()) != size_t(b.extent(0)) ||
size_t(x.extent(0)) != size_t(b.extent(0)) || size_t(x.extent(1)) != size_t(b.extent(1)));
if (test)
Expand Down Expand Up @@ -405,6 +406,8 @@ inline double computeRelativeResidual(const CrsMatrixBase<ValueType, DeviceType>
diff += arith_traits::real((h_b(i, p) - s) * arith_traits::conj(h_b(i, p) - s));
}
}
if (verbose)
std::cout << " Relative residual norm = " << sqrt(diff) << " / " << sqrt(norm) << " = " << sqrt(diff/norm) << std::endl;
return sqrt(diff / norm);
}

Expand Down
56 changes: 54 additions & 2 deletions packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
/// \author Kyungjoo Kim ([email protected])

#include "Tacho.hpp"
#include "Tacho_Util.hpp"

#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
Expand All @@ -24,7 +25,7 @@ namespace Tacho {

/// forward decl
class Graph;
#if defined(TACHO_HAVE_METIS)
#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS)
class GraphTools_Metis;
#else
class GraphTools;
Expand All @@ -42,6 +43,7 @@ template <typename ValueType, typename DeviceType, int Var> class NumericToolsLe
template <typename ValueType, typename DeviceType> struct Driver {
public:
using value_type = ValueType;
using mag_type = typename ArithTraits<ValueType>::mag_type;
using device_type = DeviceType;
using exec_space = typename device_type::execution_space;
using exec_memory_space = typename device_type::memory_space;
Expand All @@ -63,7 +65,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
using crs_matrix_type = CrsMatrixBase<value_type, device_type>;
using crs_matrix_type_host = CrsMatrixBase<value_type, host_device_type>;

#if defined(TACHO_HAVE_METIS)
#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS)
using graph_tools_type = GraphTools_Metis;
#else
using graph_tools_type = GraphTools;
Expand Down Expand Up @@ -111,6 +113,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
ordinal_type_array_host _h_peri_graph;

// ** symbolic factorization output
ordinal_type _nnz_u;
// supernodes output
ordinal_type _nsupernodes;
ordinal_type_array _supernodes;
Expand Down Expand Up @@ -160,6 +163,8 @@ template <typename ValueType, typename DeviceType> struct Driver {
ordinal_type _variant; // algorithmic variant in levelset 0: naive, 1: invert diagonals
ordinal_type _nstreams; // on cuda, multi streams are used

mag_type _pivot_tol; // tolerance for tiny pivot perturbation

// parallelism and memory constraint is made via this parameter
ordinal_type _max_num_superblocks; // # of superblocks in the memoyrpool

Expand Down Expand Up @@ -206,9 +211,14 @@ template <typename ValueType, typename DeviceType> struct Driver {
void setLevelSetOptionNumStreams(const ordinal_type nstreams);
void setLevelSetOptionAlgorithmVariant(const ordinal_type variant);

void setPivotTolerance(const mag_type pivot_tol);
void useNoPivotTolerance();
void useDefaultPivotTolerance();

///
/// get interface
///
ordinal_type getNumNonZerosU() const;
ordinal_type getNumSupernodes() const;
ordinal_type_array getSupernodes() const;
ordinal_type_array getPermutationVector() const;
Expand All @@ -222,6 +232,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
template <typename arg_size_type_array, typename arg_ordinal_type_array>
int analyze(const ordinal_type m, const arg_size_type_array &ap, const arg_ordinal_type_array &aj,
const bool duplicate = false) {

_m = m;

if (duplicate) {
Expand Down Expand Up @@ -270,6 +281,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
const arg_perm_type_array &perm, const arg_perm_type_array &peri, const bool duplicate = false) {
_m = m;

// this takes the user-specified perm, such that analyze() won't call graph partitioner
if (duplicate) {
/// for most cases, ap and aj are from host; so construct ap and aj and mirror to device
_h_ap = size_type_array_host(Kokkos::ViewAllocateWithoutInitializing("h_ap"), ap.extent(0));
Expand Down Expand Up @@ -375,6 +387,46 @@ template <typename ValueType, typename DeviceType> struct Driver {
return analyze();
}

template <typename arg_size_type_array, typename arg_ordinal_type_array>
int analyze(const ordinal_type m, const ordinal_type blk_size,
const arg_size_type_array &ap, const arg_ordinal_type_array &aj,
const bool duplicate = false) {

if (blk_size > 1) {
//condense graph before calling analyze
const size_type nnz = ap(m);
ordinal_type m_graph = m / blk_size;
size_type nnz_graph = nnz / (blk_size*blk_size);
TACHO_TEST_FOR_EXCEPTION((m != blk_size * m_graph || nnz != size_type(blk_size*blk_size) * nnz_graph),
std::logic_error, "Failed to initialize the condensed graph");

size_type_array_host ap_graph
(Kokkos::ViewAllocateWithoutInitializing("ap_graph"), 1+m_graph);
ordinal_type_array_host aj_graph
(Kokkos::ViewAllocateWithoutInitializing("aj_graph"), nnz_graph);
ordinal_type_array_host aw_graph
(Kokkos::ViewAllocateWithoutInitializing("wgs"), m_graph);
// condense the graph
nnz_graph = 0;
ap_graph(0) = 0;
for (ordinal_type i = 0; i < m; i += blk_size) {
for (size_type k = ap(i); k < ap(i+1); k++) {
if (aj(k)%blk_size == 0) {
aj_graph(nnz_graph) = aj(k)/blk_size;
nnz_graph++;
}
aw_graph(i/blk_size) = blk_size;
ap_graph((i/blk_size)+1) = nnz_graph;
}
}
TACHO_TEST_FOR_EXCEPTION((nnz != size_type(blk_size*blk_size) * nnz_graph),
std::logic_error, "Failed to condense graph");
return analyze(m, ap, aj, m_graph, ap_graph, aj_graph, aw_graph, duplicate);
} else {
return analyze(m, ap, aj, duplicate);
}
}

int initialize();

int factorize(const value_type_array &ax);
Expand Down
6 changes: 3 additions & 3 deletions packages/shylu/shylu_node/tacho/src/impl/Tacho_Chol.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ struct CholAlgorithm {
};

struct CholAlgorithm_Team {
#if defined(KOKKOS_ENABLE_OPENMP)
using type = ActiveHostAlgorithm<runsWithOMP()>::type;
#else
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
using type = ActiveAlgorithm<runsOnCudaOrHIP()>::type;
#else
using type = ActiveHostAlgorithm<runsWithOMP()>::type;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iyamazaki does this also work when Serial backend is enabled as well? If so, maybe the algorithm could be renamed similar to the device version above like runsWithOMPOrSerial, otherwise is there a location in the code that will error/abort/throw if attempting to use this with the Serial backend?

#endif
};
} // namespace Tacho
Expand Down
Loading
Loading