From 423aa60d16eaec412171a0b22ba959bb875887c0 Mon Sep 17 00:00:00 2001
From: Brian Kelley <bmkelle@sandia.gov>
Date: Wed, 15 May 2024 17:10:43 -0600
Subject: [PATCH 1/3] Tpetra: restore imbalanced row path

Bring back the tpetra imbalanced row heuristic for spmv.
This was removed in #12852 but it is still needed for CUDA < 11.2.1.
---
 .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp    | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
index 06522d5b68e9..febf5fc28ddd 100644
--- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
@@ -5087,9 +5087,24 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
 
 #if KOKKOSKERNELS_VERSION >= 40299
     auto A_lcl = getLocalMatrixDevice();
+
     if(!applyHelper.get()) {
       // The apply helper does not exist, so create it.
-      applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map);
+      // Decide now whether to use the imbalanced row path, or the default.
+      bool useMergePath = false;
+      LocalOrdinal nrows = getLocalNumRows();
+      LocalOrdinal maxRowImbalance = 0;
+      if(nrows != 0)
+        maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
+      //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
+      //we can use SPMV_MERGE_PATH for the native spmv as well.
+      //Take out this ifdef to enable that.
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+      if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
+        useMergePath = true;
+#endif
+      applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
+          useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
     }
 
     // Translate mode (Teuchos enum) to KokkosKernels (1-character string)

From 906d7be642e81e44d36a27880348ce7af8df27be Mon Sep 17 00:00:00 2001
From: Brian Kelley <bmkelle@sandia.gov>
Date: Thu, 16 May 2024 16:53:59 -0600
Subject: [PATCH 2/3] kokkos-kernels: patch in spmv regression fixes

---
 .../sparse/impl/KokkosSparse_spmv_spec.hpp    | 53 +++------------
 .../sparse/src/KokkosSparse_spmv.hpp          | 48 +++----------
 .../sparse/src/KokkosSparse_spmv_handle.hpp   |  9 +--
 ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 47 ++++++-------
 .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp    | 12 ++--
 .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp  | 68 +++++++++++--------
 6 files changed, 88 insertions(+), 149 deletions(-)

diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp
index da02b1af5abe..3405ae7575d2 100644
--- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp
+++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp
@@ -208,49 +208,18 @@ struct SPMV_MV<ExecutionSpace, Handle, AMatrix, XVector, YVector, false, false,
                       const AMatrix& A, const XVector& x,
                       const coefficient_type& beta, const YVector& y) {
     typedef Kokkos::ArithTraits<coefficient_type> KAT;
-    // Intercept special case: if x/y have only 1 column and both are
-    // contiguous, use the more efficient single-vector impl.
-    //
-    // We cannot do this if x or y is noncontiguous, because the column subview
-    // must be LayoutStride which is not ETI'd.
-    //
-    // Do not use a TPL even if one is available for the types:
-    // we don't want the same handle being used in both TPL and non-TPL versions
-    if (x.extent(1) == size_t(1) && x.span_is_contiguous() &&
-        y.span_is_contiguous()) {
-      Kokkos::View<typename XVector::const_value_type*, default_layout,
-                   typename XVector::device_type>
-          x0(x.data(), x.extent(0));
-      Kokkos::View<typename YVector::non_const_value_type*, default_layout,
-                   typename YVector::device_type>
-          y0(y.data(), y.extent(0));
-      if (beta == KAT::zero()) {
-        spmv_beta<ExecutionSpace, Handle, AMatrix, decltype(x0), decltype(y0),
-                  0>(space, handle, mode, alpha, A, x0, beta, y0);
-      } else if (beta == KAT::one()) {
-        spmv_beta<ExecutionSpace, Handle, AMatrix, decltype(x0), decltype(y0),
-                  1>(space, handle, mode, alpha, A, x0, beta, y0);
-      } else if (beta == -KAT::one()) {
-        spmv_beta<ExecutionSpace, Handle, AMatrix, decltype(x0), decltype(y0),
-                  -1>(space, handle, mode, alpha, A, x0, beta, y0);
-      } else {
-        spmv_beta<ExecutionSpace, Handle, AMatrix, decltype(x0), decltype(y0),
-                  2>(space, handle, mode, alpha, A, x0, beta, y0);
-      }
+    if (alpha == KAT::zero()) {
+      spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 0>(
+          space, mode, alpha, A, x, beta, y);
+    } else if (alpha == KAT::one()) {
+      spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 1>(
+          space, mode, alpha, A, x, beta, y);
+    } else if (alpha == -KAT::one()) {
+      spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, -1>(
+          space, mode, alpha, A, x, beta, y);
     } else {
-      if (alpha == KAT::zero()) {
-        spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 0>(
-            space, mode, alpha, A, x, beta, y);
-      } else if (alpha == KAT::one()) {
-        spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 1>(
-            space, mode, alpha, A, x, beta, y);
-      } else if (alpha == -KAT::one()) {
-        spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, -1>(
-            space, mode, alpha, A, x, beta, y);
-      } else {
-        spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 2>(
-            space, mode, alpha, A, x, beta, y);
-      }
+      spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 2>(
+          space, mode, alpha, A, x, beta, y);
     }
   }
 };
diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp
index f11b61f675e6..336bae4f1d30 100644
--- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp
+++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp
@@ -40,31 +40,6 @@ struct RANK_ONE {};
 struct RANK_TWO {};
 }  // namespace
 
-namespace Impl {
-template <typename ExecutionSpace, typename Handle, typename AMatrix,
-          typename XVector, class YVector>
-inline constexpr bool spmv_general_tpl_avail() {
-  constexpr bool isBSR = ::KokkosSparse::Experimental::is_bsr_matrix_v<AMatrix>;
-  if constexpr (!isBSR) {
-    // CRS
-    if constexpr (XVector::rank() == 1)
-      return spmv_tpl_spec_avail<ExecutionSpace, Handle, AMatrix, XVector,
-                                 YVector>::value;
-    else
-      return spmv_mv_tpl_spec_avail<ExecutionSpace, Handle, AMatrix, XVector,
-                                    YVector>::value;
-  } else {
-    // BSR
-    if constexpr (XVector::rank() == 1)
-      return spmv_bsrmatrix_tpl_spec_avail<ExecutionSpace, Handle, AMatrix,
-                                           XVector, YVector>::value;
-    else
-      return spmv_mv_bsrmatrix_tpl_spec_avail<ExecutionSpace, Handle, AMatrix,
-                                              XVector, YVector>::value;
-  }
-}
-}  // namespace Impl
-
 // clang-format off
 /// \brief Kokkos sparse matrix-vector multiply.
 /// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is
@@ -247,8 +222,8 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[],
       typename YVector::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
 
   // Special case: XVector/YVector are rank-2 but x,y both have one column and
-  // are contiguous. If a TPL is available for rank-1 vectors but not rank-2,
-  // take rank-1 subviews of x,y and call the rank-1 version.
+  // are contiguous. In this case take rank-1 subviews of x,y and call the
+  // rank-1 version.
   if constexpr (XVector::rank() == 2) {
     using XVector_SubInternal = Kokkos::View<
         typename XVector::const_value_type*,
@@ -259,19 +234,12 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[],
         typename YVector::non_const_value_type*,
         typename KokkosKernels::Impl::GetUnifiedLayout<YVector>::array_layout,
         typename YVector::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
-    if constexpr (!Impl::spmv_general_tpl_avail<
-                      ExecutionSpace, HandleImpl, AMatrix_Internal,
-                      XVector_Internal, YVector_Internal>() &&
-                  Impl::spmv_general_tpl_avail<
-                      ExecutionSpace, HandleImpl, AMatrix_Internal,
-                      XVector_SubInternal, YVector_SubInternal>()) {
-      if (x.extent(1) == size_t(1) && x.span_is_contiguous() &&
-          y.span_is_contiguous()) {
-        XVector_SubInternal xsub(x.data(), x.extent(0));
-        YVector_SubInternal ysub(y.data(), y.extent(0));
-        spmv(space, handle->get_impl(), mode, alpha, A, xsub, beta, ysub);
-        return;
-      }
+    if (x.extent(1) == size_t(1) && x.span_is_contiguous() &&
+        y.span_is_contiguous()) {
+      XVector_SubInternal xsub(x.data(), x.extent(0));
+      YVector_SubInternal ysub(y.data(), y.extent(0));
+      spmv(space, handle->get_impl(), mode, alpha, A, xsub, beta, ysub);
+      return;
     }
   }
 
diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp
index a2eecfd1ce3a..d930cbdfbe35 100644
--- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp
+++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp
@@ -235,7 +235,8 @@ struct SPMVHandleImpl {
                 "SPMVHandleImpl: Ordinal must not be a const type");
   SPMVHandleImpl(SPMVAlgorithm algo_) : algo(algo_) {}
   ~SPMVHandleImpl() {
-    if (tpl) delete tpl;
+    if (tpl_rank1) delete tpl_rank1;
+    if (tpl_rank2) delete tpl_rank2;
   }
 
   ImplType* get_impl() { return this; }
@@ -243,9 +244,9 @@ struct SPMVHandleImpl {
   /// Get the SPMVAlgorithm used by this handle
   SPMVAlgorithm get_algorithm() const { return this->algo; }
 
-  bool is_set_up                     = false;
-  const SPMVAlgorithm algo           = SPMV_DEFAULT;
-  TPL_SpMV_Data<ExecutionSpace>* tpl = nullptr;
+  const SPMVAlgorithm algo                 = SPMV_DEFAULT;
+  TPL_SpMV_Data<ExecutionSpace>* tpl_rank1 = nullptr;
+  TPL_SpMV_Data<ExecutionSpace>* tpl_rank2 = nullptr;
   // Expert tuning parameters for native SpMV
   // TODO: expose a proper Experimental interface to set these. Currently they
   // can be assigned directly in the SPMVHandle as they are public members.
diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
index e867038842d9..5e6fb1fa09b8 100644
--- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
+++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
@@ -43,8 +43,8 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
   Subhandle* subhandle;
   const MKLScalar* x_mkl = reinterpret_cast<const MKLScalar*>(x);
   MKLScalar* y_mkl       = reinterpret_cast<MKLScalar*>(y);
-  if (handle->is_set_up) {
-    subhandle = dynamic_cast<Subhandle*>(handle->tpl);
+  if (handle->tpl_rank1) {
+    subhandle = dynamic_cast<Subhandle*>(handle->tpl_rank1);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for MKL BSR");
@@ -54,7 +54,7 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
     // Use the default execution space instance, as classic MKL does not use
     // a specific instance.
     subhandle             = new Subhandle(ExecSpace());
-    handle->tpl           = subhandle;
+    handle->tpl_rank1     = subhandle;
     subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL;
     subhandle->descr.mode = SPARSE_FILL_MODE_FULL;
     subhandle->descr.diag = SPARSE_DIAG_NON_UNIT;
@@ -87,7 +87,6 @@ inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
           const_cast<MKL_INT*>(Arowptrs + 1), const_cast<MKL_INT*>(Aentries),
           Avalues_mkl));
     }
-    handle->is_set_up = true;
   }
   MKLScalar alpha_mkl = KokkosSparse::Impl::KokkosToMKLScalar<Scalar>(alpha);
   MKLScalar beta_mkl  = KokkosSparse::Impl::KokkosToMKLScalar<Scalar>(beta);
@@ -124,8 +123,8 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
   Subhandle* subhandle;
   const MKLScalar* x_mkl = reinterpret_cast<const MKLScalar*>(x);
   MKLScalar* y_mkl       = reinterpret_cast<MKLScalar*>(y);
-  if (handle->is_set_up) {
-    subhandle = dynamic_cast<Subhandle*>(handle->tpl);
+  if (handle->tpl_rank2) {
+    subhandle = dynamic_cast<Subhandle*>(handle->tpl_rank2);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for MKL BSR");
@@ -135,7 +134,7 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
     // Use the default execution space instance, as classic MKL does not use
     // a specific instance.
     subhandle             = new Subhandle(ExecSpace());
-    handle->tpl           = subhandle;
+    handle->tpl_rank2     = subhandle;
     subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL;
     subhandle->descr.mode = SPARSE_FILL_MODE_FULL;
     subhandle->descr.diag = SPARSE_DIAG_NON_UNIT;
@@ -168,7 +167,6 @@ inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
           const_cast<MKL_INT*>(Arowptrs + 1), const_cast<MKL_INT*>(Aentries),
           Avalues_mkl));
     }
-    handle->is_set_up = true;
   }
   MKLScalar alpha_mkl = KokkosSparse::Impl::KokkosToMKLScalar<Scalar>(alpha);
   MKLScalar beta_mkl  = KokkosSparse::Impl::KokkosToMKLScalar<Scalar>(beta);
@@ -390,23 +388,22 @@ void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle,
 
   KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle;
 
-  if (handle->is_set_up) {
-    subhandle =
-        dynamic_cast<KokkosSparse::Impl::CuSparse9_SpMV_Data*>(handle->tpl);
+  if (handle->tpl_rank1) {
+    subhandle = dynamic_cast<KokkosSparse::Impl::CuSparse9_SpMV_Data*>(
+        handle->tpl_rank1);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for cusparse");
     subhandle->set_exec_space(exec);
   } else {
     /* create and set the subhandle and matrix descriptor */
-    subhandle   = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec);
-    handle->tpl = subhandle;
+    subhandle         = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec);
+    handle->tpl_rank1 = subhandle;
     KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat));
     KOKKOS_CUSPARSE_SAFE_CALL(
         cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL));
     KOKKOS_CUSPARSE_SAFE_CALL(
         cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO));
-    handle->is_set_up = true;
   }
 
   cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW;
@@ -518,23 +515,22 @@ void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle,
 
   KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle;
 
-  if (handle->is_set_up) {
-    subhandle =
-        dynamic_cast<KokkosSparse::Impl::CuSparse9_SpMV_Data*>(handle->tpl);
+  if (handle->tpl_rank2) {
+    subhandle = dynamic_cast<KokkosSparse::Impl::CuSparse9_SpMV_Data*>(
+        handle->tpl_rank2);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for cusparse");
     subhandle->set_exec_space(exec);
   } else {
     /* create and set the subhandle and matrix descriptor */
-    subhandle   = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec);
-    handle->tpl = subhandle;
+    subhandle         = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec);
+    handle->tpl_rank2 = subhandle;
     KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat));
     KOKKOS_CUSPARSE_SAFE_CALL(
         cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL));
     KOKKOS_CUSPARSE_SAFE_CALL(
         cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO));
-    handle->is_set_up = true;
   }
   cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW;
 
@@ -886,16 +882,16 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle,
   rocsparse_value_type* y_ = reinterpret_cast<rocsparse_value_type*>(y.data());
 
   KokkosSparse::Impl::RocSparse_BSR_SpMV_Data* subhandle;
-  if (handle->is_set_up) {
-    subhandle =
-        dynamic_cast<KokkosSparse::Impl::RocSparse_BSR_SpMV_Data*>(handle->tpl);
+  if (handle->tpl_rank1) {
+    subhandle = dynamic_cast<KokkosSparse::Impl::RocSparse_BSR_SpMV_Data*>(
+        handle->tpl_rank1);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for rocsparse BSR");
     subhandle->set_exec_space(exec);
   } else {
-    subhandle   = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec);
-    handle->tpl = subhandle;
+    subhandle         = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec);
+    handle->tpl_rank1 = subhandle;
     KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(
         rocsparse_create_mat_descr(&subhandle->mat));
     // *_ex* functions deprecated in introduced in 6+
@@ -949,7 +945,6 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle,
                     "unsupported value type for rocsparse_*bsrmv");
     }
 #endif
-    handle->is_set_up = true;
   }
 
   // *_ex* functions deprecated in introduced in 6+
diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp
index 2ae6bf44f252..853b93f47e7c 100644
--- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp
+++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp
@@ -186,16 +186,16 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle,
   }
 
   KokkosSparse::Impl::CuSparse10_SpMV_Data *subhandle;
-  if (handle->is_set_up) {
-    subhandle =
-        dynamic_cast<KokkosSparse::Impl::CuSparse10_SpMV_Data *>(handle->tpl);
+  if (handle->tpl_rank2) {
+    subhandle = dynamic_cast<KokkosSparse::Impl::CuSparse10_SpMV_Data *>(
+        handle->tpl_rank2);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for cusparse");
     subhandle->set_exec_space(exec);
   } else {
-    subhandle   = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec);
-    handle->tpl = subhandle;
+    subhandle         = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec);
+    handle->tpl_rank2 = subhandle;
     /* create matrix */
     KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
         &subhandle->mat, A.numRows(), A.numCols(), A.nnz(),
@@ -209,8 +209,6 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle,
 
     KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaMalloc(&subhandle->buffer, subhandle->bufferSize));
-
-    handle->is_set_up = true;
   }
 
   KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM(cusparseHandle, opA, opB, &alpha,
diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
index e3f88e6e1122..bdde1d831492 100644
--- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
+++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp
@@ -96,25 +96,38 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[],
   KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(
       &vecY, y.extent_int(0), (void*)y.data(), myCudaDataType));
 
-  // use default cusparse algo for best performance
+  // Prior to CUDA 11.2.1, ALG2 was more performant than default for imbalanced
+  // matrices. After 11.2.1, the default is performant for imbalanced matrices,
+  // and ALG2 now means something else. CUDA >= 11.2.1 corresponds to
+  // CUSPARSE_VERSION >= 11402.
+#if CUSPARSE_VERSION >= 11402
+  const bool useAlg2 = false;
+#else
+  const bool useAlg2 = handle->get_algorithm() == SPMV_MERGE_PATH;
+#endif
+
+  // In CUDA 11.2.0, the algorithm enums were renamed.
+  // This corresponds to CUSPARSE_VERSION >= 11400.
 #if CUSPARSE_VERSION >= 11400
-  cusparseSpMVAlg_t algo = CUSPARSE_SPMV_ALG_DEFAULT;
+  cusparseSpMVAlg_t algo =
+      useAlg2 ? CUSPARSE_SPMV_CSR_ALG2 : CUSPARSE_SPMV_ALG_DEFAULT;
 #else
-  cusparseSpMVAlg_t algo = CUSPARSE_MV_ALG_DEFAULT;
+  cusparseSpMVAlg_t algo =
+      useAlg2 ? CUSPARSE_CSRMV_ALG2 : CUSPARSE_MV_ALG_DEFAULT;
 #endif
 
   KokkosSparse::Impl::CuSparse10_SpMV_Data* subhandle;
 
-  if (handle->is_set_up) {
-    subhandle =
-        dynamic_cast<KokkosSparse::Impl::CuSparse10_SpMV_Data*>(handle->tpl);
+  if (handle->tpl_rank1) {
+    subhandle = dynamic_cast<KokkosSparse::Impl::CuSparse10_SpMV_Data*>(
+        handle->tpl_rank1);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for cusparse");
     subhandle->set_exec_space(exec);
   } else {
-    subhandle   = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec);
-    handle->tpl = subhandle;
+    subhandle         = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec);
+    handle->tpl_rank1 = subhandle;
 
     /* create matrix */
     KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr(
@@ -135,7 +148,6 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[],
     KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaMalloc(&subhandle->buffer, subhandle->bufferSize));
 #endif
-    handle->is_set_up = true;
   }
 
   /* perform SpMV */
@@ -150,24 +162,23 @@ void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[],
 
   KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle;
 
-  if (handle->is_set_up) {
-    subhandle =
-        dynamic_cast<KokkosSparse::Impl::CuSparse9_SpMV_Data*>(handle->tpl);
+  if (handle->tpl_rank1) {
+    subhandle = dynamic_cast<KokkosSparse::Impl::CuSparse9_SpMV_Data*>(
+        handle->tpl_rank1);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for cusparse");
     subhandle->set_exec_space(exec);
   } else {
     /* create and set the subhandle and matrix descriptor */
-    subhandle   = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec);
-    handle->tpl = subhandle;
+    subhandle         = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec);
+    handle->tpl_rank1 = subhandle;
     cusparseMatDescr_t descrA = 0;
     KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat));
     KOKKOS_CUSPARSE_SAFE_CALL(
         cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL));
     KOKKOS_CUSPARSE_SAFE_CALL(
         cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO));
-    handle->is_set_up = true;
   }
 
   /* perform the actual SpMV operation */
@@ -419,16 +430,16 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[],
   rocsparse_spmv_alg alg = rocsparse_spmv_alg_default;
 
   KokkosSparse::Impl::RocSparse_CRS_SpMV_Data* subhandle;
-  if (handle->is_set_up) {
-    subhandle =
-        dynamic_cast<KokkosSparse::Impl::RocSparse_CRS_SpMV_Data*>(handle->tpl);
+  if (handle->tpl_rank1) {
+    subhandle = dynamic_cast<KokkosSparse::Impl::RocSparse_CRS_SpMV_Data*>(
+        handle->tpl_rank1);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for rocsparse CRS");
     subhandle->set_exec_space(exec);
   } else {
-    subhandle   = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec);
-    handle->tpl = subhandle;
+    subhandle         = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec);
+    handle->tpl_rank1 = subhandle;
     /* Create the rocsparse csr descr */
     // We need to do some casting to void*
     // Note that row_map is always a const view so const_cast is necessary,
@@ -476,7 +487,6 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[],
     KOKKOS_IMPL_HIP_SAFE_CALL(
         hipMalloc(&subhandle->buffer, subhandle->bufferSize));
 #endif
-    handle->is_set_up = true;
   }
 
   /* Perform the actual computation */
@@ -592,8 +602,8 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
   Subhandle* subhandle;
   const MKLScalar* x_mkl = reinterpret_cast<const MKLScalar*>(x);
   MKLScalar* y_mkl       = reinterpret_cast<MKLScalar*>(y);
-  if (handle->is_set_up) {
-    subhandle = dynamic_cast<Subhandle*>(handle->tpl);
+  if (handle->tpl_rank1) {
+    subhandle = dynamic_cast<Subhandle*>(handle->tpl_rank1);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for MKL CRS");
@@ -603,7 +613,7 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
     // Use the default execution space instance, as classic MKL does not use
     // a specific instance.
     subhandle             = new Subhandle(ExecSpace());
-    handle->tpl           = subhandle;
+    handle->tpl_rank1     = subhandle;
     subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL;
     subhandle->descr.mode = SPARSE_FILL_MODE_FULL;
     subhandle->descr.diag = SPARSE_DIAG_NON_UNIT;
@@ -632,7 +642,6 @@ inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha,
           const_cast<MKL_INT*>(Arowptrs), const_cast<MKL_INT*>(Arowptrs + 1),
           const_cast<MKL_INT*>(Aentries), Avalues_mkl));
     }
-    handle->is_set_up = true;
   }
   MKLScalar alpha_mkl = KokkosToMKLScalar<Scalar>(alpha);
   MKLScalar beta_mkl  = KokkosToMKLScalar<Scalar>(beta);
@@ -757,15 +766,15 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle,
     mkl_mode = oneapi::mkl::transpose::trans;
 
   OneMKL_SpMV_Data* subhandle;
-  if (handle->is_set_up) {
-    subhandle = dynamic_cast<OneMKL_SpMV_Data*>(handle->tpl);
+  if (handle->tpl_rank1) {
+    subhandle = dynamic_cast<OneMKL_SpMV_Data*>(handle->tpl_rank1);
     if (!subhandle)
       throw std::runtime_error(
           "KokkosSparse::spmv: subhandle is not set up for OneMKL CRS");
     subhandle->set_exec_space(exec);
   } else {
-    subhandle   = new OneMKL_SpMV_Data(exec);
-    handle->tpl = subhandle;
+    subhandle         = new OneMKL_SpMV_Data(exec);
+    handle->tpl_rank1 = subhandle;
     oneapi::mkl::sparse::init_matrix_handle(&subhandle->mat);
     // Even for out-of-order SYCL queue, the inputs here do not depend on
     // kernels being sequenced
@@ -780,7 +789,6 @@ inline void spmv_onemkl(const execution_space& exec, Handle* handle,
     // optimize_gemv has finished
     oneapi::mkl::sparse::optimize_gemv(exec.sycl_queue(), mkl_mode,
                                        subhandle->mat, {ev});
-    handle->is_set_up = true;
   }
 
   // Uncommon case: an out-of-order SYCL queue does not promise that previously

From 1bddb553d2151f49f70d7f23ce11ceeca55cbe11 Mon Sep 17 00:00:00 2001
From: Brian Kelley <bmkelle@sandia.gov>
Date: Wed, 22 May 2024 07:29:57 -0600
Subject: [PATCH 3/3] Fix unused param warning

---
 packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp
index 3405ae7575d2..67a2f0563962 100644
--- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp
+++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_spec.hpp
@@ -203,7 +203,8 @@ struct SPMV_MV<ExecutionSpace, Handle, AMatrix, XVector, YVector, false, false,
                KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
   typedef typename YVector::non_const_value_type coefficient_type;
 
-  static void spmv_mv(const ExecutionSpace& space, Handle* handle,
+  // TODO: pass handle through to implementation and use tuning parameters
+  static void spmv_mv(const ExecutionSpace& space, Handle* /* handle */,
                       const char mode[], const coefficient_type& alpha,
                       const AMatrix& A, const XVector& x,
                       const coefficient_type& beta, const YVector& y) {