-
Notifications
You must be signed in to change notification settings - Fork 572
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Tacho : new options (dofs-per-node, pivot-tol, amd) #13585
Merged
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
1550fd9
Tacho : new options (dofs-per-node, pivot-tol, amd)
iyamazaki 9990b44
Tacho : compiler warnings
iyamazaki 7cd69d1
Tacho : compile error with OpenMP (tol is used only by Team)
iyamazaki ead5778
Tacho : check for # of streams > 0
iyamazaki f590a0a
Tacho : function to return nnz of factors
iyamazaki 324ba20
Tacho : buid with CUDA + OpenMP
iyamazaki 808d111
Tacho : compile warnings
iyamazaki File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
/// \author Kyungjoo Kim ([email protected]) | ||
|
||
#include "Tacho.hpp" | ||
#include "Tacho_Util.hpp" | ||
|
||
#include <Kokkos_Core.hpp> | ||
#include <Kokkos_Timer.hpp> | ||
|
@@ -24,7 +25,7 @@ namespace Tacho { | |
|
||
/// forward decl | ||
class Graph; | ||
#if defined(TACHO_HAVE_METIS) | ||
#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS) | ||
class GraphTools_Metis; | ||
#else | ||
class GraphTools; | ||
|
@@ -42,6 +43,7 @@ template <typename ValueType, typename DeviceType, int Var> class NumericToolsLe | |
template <typename ValueType, typename DeviceType> struct Driver { | ||
public: | ||
using value_type = ValueType; | ||
using mag_type = typename ArithTraits<ValueType>::mag_type; | ||
using device_type = DeviceType; | ||
using exec_space = typename device_type::execution_space; | ||
using exec_memory_space = typename device_type::memory_space; | ||
|
@@ -63,7 +65,7 @@ template <typename ValueType, typename DeviceType> struct Driver { | |
using crs_matrix_type = CrsMatrixBase<value_type, device_type>; | ||
using crs_matrix_type_host = CrsMatrixBase<value_type, host_device_type>; | ||
|
||
#if defined(TACHO_HAVE_METIS) | ||
#if defined(TACHO_HAVE_METIS) || defined(TACHO_HAVE_TRILINOS_SS) | ||
using graph_tools_type = GraphTools_Metis; | ||
#else | ||
using graph_tools_type = GraphTools; | ||
|
@@ -111,6 +113,7 @@ template <typename ValueType, typename DeviceType> struct Driver { | |
ordinal_type_array_host _h_peri_graph; | ||
|
||
// ** symbolic factorization output | ||
ordinal_type _nnz_u; | ||
// supernodes output | ||
ordinal_type _nsupernodes; | ||
ordinal_type_array _supernodes; | ||
|
@@ -160,6 +163,8 @@ template <typename ValueType, typename DeviceType> struct Driver { | |
ordinal_type _variant; // algorithmic variant in levelset 0: naive, 1: invert diagonals | ||
ordinal_type _nstreams; // on cuda, multi streams are used | ||
|
||
mag_type _pivot_tol; // tolerance for tiny pivot perturbation | ||
|
||
// parallelism and memory constraint is made via this parameter | ||
ordinal_type _max_num_superblocks; // # of superblocks in the memory pool | ||
|
||
|
@@ -206,9 +211,14 @@ template <typename ValueType, typename DeviceType> struct Driver { | |
void setLevelSetOptionNumStreams(const ordinal_type nstreams); | ||
void setLevelSetOptionAlgorithmVariant(const ordinal_type variant); | ||
|
||
void setPivotTolerance(const mag_type pivot_tol); | ||
void useNoPivotTolerance(); | ||
void useDefaultPivotTolerance(); | ||
|
||
/// | ||
/// get interface | ||
/// | ||
ordinal_type getNumNonZerosU() const; | ||
ordinal_type getNumSupernodes() const; | ||
ordinal_type_array getSupernodes() const; | ||
ordinal_type_array getPermutationVector() const; | ||
|
@@ -222,6 +232,7 @@ template <typename ValueType, typename DeviceType> struct Driver { | |
template <typename arg_size_type_array, typename arg_ordinal_type_array> | ||
int analyze(const ordinal_type m, const arg_size_type_array &ap, const arg_ordinal_type_array &aj, | ||
const bool duplicate = false) { | ||
|
||
_m = m; | ||
|
||
if (duplicate) { | ||
|
@@ -270,6 +281,7 @@ template <typename ValueType, typename DeviceType> struct Driver { | |
const arg_perm_type_array &perm, const arg_perm_type_array &peri, const bool duplicate = false) { | ||
_m = m; | ||
|
||
// this takes the user-specified perm, such that analyze() won't call graph partitioner | ||
if (duplicate) { | ||
/// for most cases, ap and aj are from host; so construct ap and aj and mirror to device | ||
_h_ap = size_type_array_host(Kokkos::ViewAllocateWithoutInitializing("h_ap"), ap.extent(0)); | ||
|
@@ -375,6 +387,46 @@ template <typename ValueType, typename DeviceType> struct Driver { | |
return analyze(); | ||
} | ||
|
||
template <typename arg_size_type_array, typename arg_ordinal_type_array> | ||
int analyze(const ordinal_type m, const ordinal_type blk_size, | ||
const arg_size_type_array &ap, const arg_ordinal_type_array &aj, | ||
const bool duplicate = false) { | ||
|
||
if (blk_size > 1) { | ||
//condense graph before calling analyze | ||
const size_type nnz = ap(m); | ||
ordinal_type m_graph = m / blk_size; | ||
size_type nnz_graph = nnz / (blk_size*blk_size); | ||
TACHO_TEST_FOR_EXCEPTION((m != blk_size * m_graph || nnz != size_type(blk_size*blk_size) * nnz_graph), | ||
std::logic_error, "Failed to initialize the condensed graph"); | ||
|
||
size_type_array_host ap_graph | ||
(Kokkos::ViewAllocateWithoutInitializing("ap_graph"), 1+m_graph); | ||
ordinal_type_array_host aj_graph | ||
(Kokkos::ViewAllocateWithoutInitializing("aj_graph"), nnz_graph); | ||
ordinal_type_array_host aw_graph | ||
(Kokkos::ViewAllocateWithoutInitializing("wgs"), m_graph); | ||
// condense the graph | ||
nnz_graph = 0; | ||
ap_graph(0) = 0; | ||
for (ordinal_type i = 0; i < m; i += blk_size) { | ||
for (size_type k = ap(i); k < ap(i+1); k++) { | ||
if (aj(k)%blk_size == 0) { | ||
aj_graph(nnz_graph) = aj(k)/blk_size; | ||
nnz_graph++; | ||
} | ||
aw_graph(i/blk_size) = blk_size; | ||
ap_graph((i/blk_size)+1) = nnz_graph; | ||
} | ||
} | ||
TACHO_TEST_FOR_EXCEPTION((nnz != size_type(blk_size*blk_size) * nnz_graph), | ||
std::logic_error, "Failed to condense graph"); | ||
return analyze(m, ap, aj, m_graph, ap_graph, aj_graph, aw_graph, duplicate); | ||
} else { | ||
return analyze(m, ap, aj, duplicate); | ||
} | ||
} | ||
|
||
int initialize(); | ||
|
||
int factorize(const value_type_array &ax); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@iyamazaki does this also work when the Serial backend is enabled? If so, maybe the algorithm could be renamed, similar to the device version above, to something like runsWithOMPOrSerial. Otherwise, is there a location in the code that will error/abort/throw if attempting to use this with the Serial backend?