Skip to content

Commit

Permalink
Tacho : new options (dofs-per-node, pivot-tol, amd)
Browse files Browse the repository at this point in the history
Signed-off-by: iyamazaki <[email protected]>
  • Loading branch information
iyamazaki committed Nov 9, 2024
1 parent 813f04e commit 1550fd9
Show file tree
Hide file tree
Showing 17 changed files with 301 additions and 47 deletions.
2 changes: 2 additions & 0 deletions packages/amesos2/src/Amesos2_Tacho_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ class TachoSolver : public SolverCore<Amesos2::TachoSolver, Matrix, Vector>
int small_problem_threshold_size;
int streams;
bool verbose;
int dofs_per_node;
bool pivot_pert;
// int num_kokkos_threads;
// int max_num_superblocks;
} data_;
Expand Down
27 changes: 22 additions & 5 deletions packages/amesos2/src/Amesos2_Tacho_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ TachoSolver<Matrix,Vector>::TachoSolver(
Teuchos::RCP<const Vector> B )
: SolverCore<Amesos2::TachoSolver,Matrix,Vector>(A, X, B)
{
data_.method = 1; // Cholesky
data_.variant = 2; // solver variant
data_.streams = 1; // # of streams
data_.verbose = false; // verbose
data_.method = 1; // Cholesky
data_.variant = 2; // solver variant
data_.streams = 1; // # of streams
data_.dofs_per_node = 1; // DoFs / node
data_.pivot_pert = false; // Diagonal pertubation
data_.verbose = false; // verbose
}


Expand Down Expand Up @@ -82,7 +84,11 @@ TachoSolver<Matrix,Vector>::symbolicFactorization_impl()
// data_.solver.setMaxNumberOfSuperblocks(data_.max_num_superblocks);

// Symbolic factorization currently must be done on host
data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_);
if (data_.dofs_per_node > 1) {
data_.solver.analyze(this->globalNumCols_, data_.dofs_per_node, host_row_ptr_view_, host_cols_view_);
} else {
data_.solver.analyze(this->globalNumCols_, host_row_ptr_view_, host_cols_view_);
}
data_.solver.initialize();
}
return status;
Expand All @@ -102,6 +108,11 @@ TachoSolver<Matrix,Vector>::numericFactorization_impl()
if(do_optimization()) {
this->matrixA_->returnValues_kokkos_view(device_nzvals_view_);
}
if (data_.pivot_pert) {
data_.solver.useDefaultPivotTolerance();
} else {
data_.solver.useNoPivotTolerance();
}
data_.solver.factorize(device_nzvals_view_);
}
return status;
Expand Down Expand Up @@ -223,6 +234,10 @@ TachoSolver<Matrix,Vector>::setParameters_impl(const Teuchos::RCP<Teuchos::Param
data_.verbose = parameterList->get<bool> ("verbose", false);
// # of streams
data_.streams = parameterList->get<int> ("num-streams", 1);
// DoFs / node
data_.dofs_per_node = parameterList->get<int> ("dofs-per-node", 1);
// Perturb tiny pivots
data_.pivot_pert = parameterList->get<bool> ("perturb-pivot", false);
// TODO: Confirm param options
// data_.num_kokkos_threads = parameterList->get<int>("kokkos-threads", 1);
// data_.max_num_superblocks = parameterList->get<int>("max-num-superblocks", 4);
Expand All @@ -243,6 +258,8 @@ TachoSolver<Matrix,Vector>::getValidParameters_impl() const
pl->set("small problem threshold size", 1024, "Problem size threshold below with Tacho uses LAPACK.");
pl->set("verbose", false, "Verbosity");
pl->set("num-streams", 1, "Number of GPU streams");
pl->set("dofs-per-node", 1, "DoFs per node");
pl->set("perturb-pivot", false, "Perturb tiny pivots");

// TODO: Confirm param options
// pl->set("kokkos-threads", 1, "Number of threads");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ int main(int argc, char *argv[])
} else {
assert(false);
}
writeMM("Laplace.mtx",KMonolithic);

RCP<MultiVector<SC,LO,GO,NO> > xSolution = MultiVectorFactory<SC,LO,GO,NO>::Build(KMonolithic->getMap(),1);
RCP<MultiVector<SC,LO,GO,NO> > xRightHandSide = MultiVectorFactory<SC,LO,GO,NO>::Build(KMonolithic->getMap(),1);
Expand Down
3 changes: 3 additions & 0 deletions packages/shylu/shylu_node/tacho/cmake/Tacho_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
/* Define if want to build with CHOLMOD enabled */
#cmakedefine TACHO_HAVE_SUITESPARSE

/* Define if want to build with TrilinosSS enabled */
#cmakedefine TACHO_HAVE_TRILINOS_SS

/* Define if want to build with VTune enabled */
#cmakedefine TACHO_HAVE_VTUNE

Expand Down
36 changes: 29 additions & 7 deletions packages/shylu/shylu_node/tacho/example/Tacho_ExampleDriver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@ template <typename value_type> int driver(int argc, char *argv[]) {
std::string file = "test.mtx";
std::string graph_file = "";
std::string weight_file = "";
int dofs_per_node = 1;
bool perturbPivot = false;
int nrhs = 1;
bool randomRHS = true;
bool onesRHS = false;
std::string method_name = "chol";
int method = 1; // 1 - Chol, 2 - LDL, 3 - SymLU
int small_problem_thres = 1024;
Expand All @@ -47,6 +50,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
opts.set_option<std::string>("file", "Input file (MatrixMarket SPD matrix)", &file);
opts.set_option<std::string>("graph", "Input condensed graph", &graph_file);
opts.set_option<std::string>("weight", "Input condensed graph weight", &weight_file);
opts.set_option<int>("dofs-per-node", "# DoFs per node", &dofs_per_node);
opts.set_option<bool>("perturb", "Flag to perturb tiny pivots", &perturbPivot);
opts.set_option<int>("nrhs", "Number of RHS vectors", &nrhs);
opts.set_option<std::string>("method", "Solution method: chol, ldl, lu", &method_name);
opts.set_option<int>("small-problem-thres", "LAPACK is used smaller than this thres", &small_problem_thres);
Expand All @@ -55,6 +60,7 @@ template <typename value_type> int driver(int argc, char *argv[]) {
opts.set_option<int>("device-solve-thres", "Device function is used above this subproblem size", &device_solve_thres);
opts.set_option<int>("variant", "algorithm variant in levelset scheduling; 0, 1 and 2", &variant);
opts.set_option<int>("nstreams", "# of streams used in CUDA; on host, it is ignored", &nstreams);
opts.set_option<bool>("one-rhs", "Set RHS to be ones", &onesRHS);
opts.set_option<bool>("no-warmup", "Flag to turn off warmup", &no_warmup);
opts.set_option<int>("nfacts", "# of factorizations to perform", &nfacts);
opts.set_option<int>("nsolves", "# of solves to perform", &nsolves);
Expand Down Expand Up @@ -125,6 +131,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
if (!in.good()) {
std::cout << "Failed in open the file: " << graph_file << std::endl;
return -1;
} else if (verbose) {
std::cout << " > Condensed graph file: " << graph_file << std::endl;
}
in >> m_graph;

Expand All @@ -135,8 +143,10 @@ template <typename value_type> int driver(int argc, char *argv[]) {
aj_graph = ordinal_type_array_host("aj", ap_graph(m_graph));
for (ordinal_type i = 0; i < m_graph; ++i) {
const ordinal_type jbeg = ap_graph(i), jend = ap_graph(i + 1);
for (ordinal_type j = jbeg; j < jend; ++j)
for (ordinal_type j = jbeg; j < jend; ++j) {
in >> aj_graph(j);
aj_graph(j) --; // base-one
}
}
}

Expand All @@ -146,6 +156,8 @@ template <typename value_type> int driver(int argc, char *argv[]) {
if (!in.good()) {
std::cout << "Failed in open the file: " << weight_file << std::endl;
return -1;
} else if (verbose) {
std::cout << " > Weight file for condensed graph: " << weight_file << std::endl;
}
ordinal_type m(0);
in >> m;
Expand All @@ -160,25 +172,32 @@ template <typename value_type> int driver(int argc, char *argv[]) {
Tacho::Driver<value_type, device_type> solver;

/// common options
solver.setSolutionMethod(method);
solver.setSmallProblemThresholdsize(small_problem_thres);
solver.setVerbose(verbose);
solver.setSolutionMethod(method);
solver.setLevelSetOptionAlgorithmVariant(variant);
solver.setLevelSetOptionNumStreams(nstreams);

/// graph options
solver.setOrderConnectedGraphSeparately();

/// levelset options
solver.setSmallProblemThresholdsize(small_problem_thres);
solver.setLevelSetOptionDeviceFunctionThreshold(device_factor_thres, device_solve_thres);
solver.setLevelSetOptionAlgorithmVariant(variant);
solver.setLevelSetOptionNumStreams(nstreams);
if (perturbPivot) {
if (verbose) std::cout << " > perturb tiny pivots" << std::endl;
solver.useDefaultPivotTolerance();
}

auto values_on_device = Kokkos::create_mirror_view(typename device_type::memory_space(), A.Values());
Kokkos::deep_copy(values_on_device, A.Values());

/// inputs are used for graph reordering and analysis
if (m_graph > 0 && m_graph < A.NumRows())
solver.analyze(A.NumRows(), A.RowPtr(), A.Cols(), m_graph, ap_graph, aj_graph, aw_graph);
else
else if (dofs_per_node > 1) {
if (verbose) std::cout << " > DoFs / node = " << dofs_per_node << std::endl;
solver.analyze(A.NumRows(), dofs_per_node, A.RowPtr(), A.Cols());
} else
solver.analyze(A.NumRows(), A.RowPtr(), A.Cols());

/// create numeric tools and levelset tools
Expand All @@ -202,7 +221,10 @@ template <typename value_type> int driver(int argc, char *argv[]) {
t("t", A.NumRows(), nrhs); // temp workspace (store permuted rhs)

{
if (randomRHS) {
if (onesRHS) {
const value_type one(1.0);
Kokkos::deep_copy (b, one);
} else if (randomRHS) {
Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> random(13718);
Kokkos::fill_random(b, random, value_type(1));
} else {
Expand Down
5 changes: 4 additions & 1 deletion packages/shylu/shylu_node/tacho/src/Tacho_CrsMatrixBase.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,8 @@ inline static void applyPermutationToCrsMatrixLower(/* */ CrsMatrixType &A, cons
template <typename ValueType, typename DeviceType>
inline double computeRelativeResidual(const CrsMatrixBase<ValueType, DeviceType> &A,
const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &x,
const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &b) {
const Kokkos::View<ValueType **, Kokkos::LayoutLeft, DeviceType> &b,
const bool verbose = false) {
const bool test = (size_t(A.NumRows()) != size_t(A.NumCols()) || size_t(A.NumRows()) != size_t(b.extent(0)) ||
size_t(x.extent(0)) != size_t(b.extent(0)) || size_t(x.extent(1)) != size_t(b.extent(1)));
if (test)
Expand Down Expand Up @@ -405,6 +406,8 @@ inline double computeRelativeResidual(const CrsMatrixBase<ValueType, DeviceType>
diff += arith_traits::real((h_b(i, p) - s) * arith_traits::conj(h_b(i, p) - s));
}
}
if (verbose)
std::cout << " Relative residual norm = " << sqrt(diff) << " / " << sqrt(norm) << " = " << sqrt(diff/norm) << std::endl;
return sqrt(diff / norm);
}

Expand Down
54 changes: 52 additions & 2 deletions packages/shylu/shylu_node/tacho/src/Tacho_Driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
/// \author Kyungjoo Kim ([email protected])

#include "Tacho.hpp"
#include "Tacho_Util.hpp"

#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>
Expand All @@ -24,7 +25,7 @@ namespace Tacho {

/// forward decl
class Graph;
#if defined(TACHO_HAVE_METIS)
#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS)
class GraphTools_Metis;
#else
class GraphTools;
Expand All @@ -42,6 +43,7 @@ template <typename ValueType, typename DeviceType, int Var> class NumericToolsLe
template <typename ValueType, typename DeviceType> struct Driver {
public:
using value_type = ValueType;
using mag_type = typename ArithTraits<ValueType>::mag_type;
using device_type = DeviceType;
using exec_space = typename device_type::execution_space;
using exec_memory_space = typename device_type::memory_space;
Expand All @@ -63,7 +65,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
using crs_matrix_type = CrsMatrixBase<value_type, device_type>;
using crs_matrix_type_host = CrsMatrixBase<value_type, host_device_type>;

#if defined(TACHO_HAVE_METIS)
#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS)
using graph_tools_type = GraphTools_Metis;
#else
using graph_tools_type = GraphTools;
Expand Down Expand Up @@ -160,6 +162,8 @@ template <typename ValueType, typename DeviceType> struct Driver {
ordinal_type _variant; // algorithmic variant in levelset 0: naive, 1: invert diagonals
ordinal_type _nstreams; // on cuda, multi streams are used

mag_type _pivot_tol; // tolerance for tiny pivot perturbation

// parallelism and memory constraint is made via this parameter
ordinal_type _max_num_superblocks; // # of superblocks in the memoyrpool

Expand Down Expand Up @@ -206,6 +210,10 @@ template <typename ValueType, typename DeviceType> struct Driver {
void setLevelSetOptionNumStreams(const ordinal_type nstreams);
void setLevelSetOptionAlgorithmVariant(const ordinal_type variant);

void setPivotTolerance(const mag_type pivot_tol);
void useNoPivotTolerance();
void useDefaultPivotTolerance();

///
/// get interface
///
Expand All @@ -222,6 +230,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
template <typename arg_size_type_array, typename arg_ordinal_type_array>
int analyze(const ordinal_type m, const arg_size_type_array &ap, const arg_ordinal_type_array &aj,
const bool duplicate = false) {

_m = m;

if (duplicate) {
Expand Down Expand Up @@ -270,6 +279,7 @@ template <typename ValueType, typename DeviceType> struct Driver {
const arg_perm_type_array &perm, const arg_perm_type_array &peri, const bool duplicate = false) {
_m = m;

// this takes the user-specified perm, such that analyze() won't call graph partitioner
if (duplicate) {
/// for most cases, ap and aj are from host; so construct ap and aj and mirror to device
_h_ap = size_type_array_host(Kokkos::ViewAllocateWithoutInitializing("h_ap"), ap.extent(0));
Expand Down Expand Up @@ -375,6 +385,46 @@ template <typename ValueType, typename DeviceType> struct Driver {
return analyze();
}

template <typename arg_size_type_array, typename arg_ordinal_type_array>
int analyze(const ordinal_type m, const ordinal_type blk_size,
const arg_size_type_array &ap, const arg_ordinal_type_array &aj,
const bool duplicate = false) {

if (blk_size > 1) {
//condense graph before calling analyze
const size_type nnz = ap(m);
size_type m_graph = m / blk_size;
size_type nnz_graph = nnz / (blk_size*blk_size);
TACHO_TEST_FOR_EXCEPTION((m != blk_size * m_graph || nnz != blk_size*blk_size * nnz_graph),
std::logic_error, "Failed to initialize the condensed graph");

size_type_array_host ap_graph
(Kokkos::ViewAllocateWithoutInitializing("ap_graph"), 1+m_graph);
ordinal_type_array_host aj_graph
(Kokkos::ViewAllocateWithoutInitializing("aj_graph"), nnz_graph);
ordinal_type_array_host aw_graph
(Kokkos::ViewAllocateWithoutInitializing("wgs"), m_graph);
// condense the graph
nnz_graph = 0;
ap_graph(0) = 0;
for (size_type i = 0; i < m; i += blk_size) {
for (size_type k = ap(i); k < ap(i+1); k++) {
if (aj(k)%blk_size == 0) {
aj_graph(nnz_graph) = aj(k)/blk_size;
nnz_graph++;
}
aw_graph(i/blk_size) = blk_size;
ap_graph((i/blk_size)+1) = nnz_graph;
}
}
TACHO_TEST_FOR_EXCEPTION((nnz != blk_size*blk_size * nnz_graph),
std::logic_error, "Failed to condense graph");
return analyze(m, ap, aj, m_graph, ap_graph, aj_graph, aw_graph, duplicate);
} else {
return analyze(m, ap, aj, duplicate);
}
}

int initialize();

int factorize(const value_type_array &ax);
Expand Down
19 changes: 16 additions & 3 deletions packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Driver<VT, DT>::Driver()
_h_perm(), _peri(), _h_peri(), _m_graph(0), _nnz_graph(0), _h_ap_graph(), _h_aj_graph(), _h_perm_graph(),
_h_peri_graph(), _nsupernodes(0), _N(nullptr), _verbose(0), _small_problem_thres(1024), _serial_thres_size(-1),
_mb(-1), _nb(-1), _front_update_mode(-1), _levelset(0), _device_level_cut(0), _device_factor_thres(128),
_device_solve_thres(128), _variant(2), _nstreams(16), _max_num_superblocks(-1) {}
_device_solve_thres(128), _variant(2), _nstreams(16), _pivot_tol(0.0), _max_num_superblocks(-1) {}

///
/// duplicate the object
Expand Down Expand Up @@ -157,6 +157,19 @@ template <typename VT, typename DT> void Driver<VT, DT>::setLevelSetOptionNumStr
_nstreams = nstreams;
}

template <typename VT, typename DT> void Driver<VT, DT>::setPivotTolerance(const mag_type pivot_tol) {
_pivot_tol = pivot_tol;
}

template <typename VT, typename DT> void Driver<VT, DT>::useNoPivotTolerance() {
_pivot_tol = 0.0;
}

template <typename VT, typename DT> void Driver<VT, DT>::useDefaultPivotTolerance() {
using arith_traits = ArithTraits<value_type>;
_pivot_tol = sqrt(arith_traits::epsilon());
}

///
/// get interface
///
Expand Down Expand Up @@ -373,7 +386,7 @@ template <typename VT, typename DT> int Driver<VT, DT>::factorize(const value_ty
if (_m < _small_problem_thres) {
factorize_small_host(ax);
} else {
_N->factorize(ax, _verbose);
_N->factorize(ax, _pivot_tol, _verbose);
}
return 0;
}
Expand Down Expand Up @@ -541,7 +554,7 @@ double Driver<VT, DT>::computeRelativeResidual(const value_type_array &ax, const
CrsMatrixBase<value_type, device_type> A;
A.setExternalMatrix(_m, _m, _nnz, _ap, _aj, ax);

return Tacho::computeRelativeResidual(A, x, b);
return Tacho::computeRelativeResidual(A, x, b, _verbose);
}

template <typename VT, typename DT>
Expand Down
5 changes: 5 additions & 0 deletions packages/shylu/shylu_node/tacho/src/impl/Tacho_GraphTools.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ class GraphTools {
_perm(i) = i;
_peri(i) = i;
}
if (verbose) {
printf("Summary: GraphTools (Default)\n");
printf("=============================\n");
printf( " Use Natural Ordering\n\n" );
}
}

ordinal_type_array PermVector() const { return _perm; }
Expand Down
Loading

0 comments on commit 1550fd9

Please sign in to comment.