diff --git a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp index aaacb45ede..99b0fa9dc3 100644 --- a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp @@ -49,12 +49,16 @@ struct SerialQR_FormQ_Internal { /// B is m x m // set identity - if (is_Q_zero) - SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1); - else - SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); + if (is_Q_zero) { + for (int idx = 0; idx < m; ++idx) { + Q[(qs0 + qs1) * idx] = value_type(1); + // SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1); + } + } else { + SerialSetIdentityInternal::invoke(m, m, Q, qs0, qs1); + } - return SerialApplyQ_LeftNoTransForwardInternal ::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w); + return SerialApplyQ_LeftForwardInternal::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 8aa4a6361c..851bd85ab9 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -53,7 +53,7 @@ struct SerialQR_Internal { A_part2x2.partWithATL(A, m, n, 0, 0); t_part2x1.partWithAT(t, m, 0); - for (int m_atl = 0; m_atl < m; ++m_atl) { + for (int m_atl = 0; m_atl < Kokkos::min(m, n); ++m_atl) { // part 2x2 into 3x3 A_part3x3.partWithABR(A_part2x2, 1, 1); const int m_A22 = m - m_atl - 1; diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 2378e5ff01..3a081b10cc 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -30,6 +30,7 @@ #include "Test_Batched_SerialLU.hpp" #include "Test_Batched_SerialLU_Real.hpp" #include "Test_Batched_SerialLU_Complex.hpp" +#include "Test_Batched_SerialQR.hpp" #include "Test_Batched_SerialSolveLU.hpp" #include "Test_Batched_SerialSolveLU_Real.hpp" #include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/batched/dense/unit_test/Test_Batched_SerialQR.hpp b/batched/dense/unit_test/Test_Batched_SerialQR.hpp new file mode 100644 index 0000000000..4ccb81400a --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialQR.hpp @@ -0,0 +1,407 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Luc Berger-Vergiat (lberg@sandia.gov) +/// \author Cameron Smith (smithc11@rpi.edu) + +#include "gtest/gtest.h" +#include + +#include // KokkosBatched::QR +#include "KokkosBatched_ApplyQ_Decl.hpp" // KokkosBatched::ApplyQ +#include "KokkosBatched_QR_FormQ_Serial_Internal.hpp" +#include // KokkosBlas::Algo +#include + +template +struct qrFunctor { + using Scalar = typename MatricesType::value_type; + + int maxMatSize; + MatricesType As; + IndexViewType numRows; + IndexViewType numCols; + IndexViewType offsetA; + TauViewType taus; + TmpViewType ws; + MatricesType Qs; + IndexViewType offsetQ; + + qrFunctor(const int maxMatSize_, MatricesType As_, IndexViewType numRows_, IndexViewType numCols_, + IndexViewType offsetA_, TauViewType taus_, TmpViewType ws_, MatricesType Qs_, IndexViewType offsetQ_) + : maxMatSize(maxMatSize_), + As(As_), + numRows(numRows_), + numCols(numCols_), + offsetA(offsetA_), + taus(taus_), + ws(ws_), + Qs(Qs_), + offsetQ(offsetQ_) {} + + KOKKOS_FUNCTION + void operator()(const int matIdx) const { + Kokkos::View> A(&As(offsetA(matIdx)), numRows(matIdx), + numCols(matIdx)); + Kokkos::View> tau(&taus(matIdx * maxMatSize), + Kokkos::min(numRows(matIdx), numCols(matIdx))); + Kokkos::View> w(&ws(matIdx * maxMatSize), + Kokkos::min(numRows(matIdx), numCols(matIdx))); + Kokkos::View> Q(&Qs(offsetQ(matIdx)), numRows(matIdx), + numRows(matIdx)); + + KokkosBatched::SerialQR::invoke(A, tau, w); + + // Store identity in Q + for (int idx = 0; idx < numRows(matIdx); ++idx) { + Q(matIdx, matIdx) = Kokkos::ArithTraits::one(); + } + + // Call ApplyQ on Q + KokkosBatched::SerialApplyQ::invoke(A, tau, Q, w); + } +}; + +template +void test_QR_square() { + // Analytical test with a rectangular matrix + // [12, -51, 4] [150, -69, 58] [14, 21, -14] + // A = [ 6, 167, -68] Q = 1/175 [ 75, 158, -6] R = [ 0, 175, -70] + // [-4, 24, -41] [-50, 30, 165] [ 0, 0, -35] + // + // Expected outputs: + // [ -5, -3] + // tau = [5/8, 10/18] A = [1/2, 5] + // [ 0, 1/3] + // + + using MatrixViewType = Kokkos::View; + using ColVectorViewType = Kokkos::View; + using ColWorkViewType = Kokkos::View; + + const Scalar tol = 10 * Kokkos::ArithTraits::eps(); + constexpr int m = 3, n = 3; + + MatrixViewType A("A", m, n), B("B", m, n), Q("Q", m, m); + ColVectorViewType t("t", n); + ColWorkViewType w("w", n); + + typename MatrixViewType::HostMirror A_h = Kokkos::create_mirror_view(A); + A_h(0, 0) = 12; + A_h(0, 1) = -51; + A_h(0, 2) = 4; + A_h(1, 0) = 6; + A_h(1, 1) = 167; + A_h(1, 2) = -68; + A_h(2, 0) = -4; + A_h(2, 1) = 24; + A_h(2, 2) = -41; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(B, A_h); + + Kokkos::parallel_for( + "serialQR", 1, KOKKOS_LAMBDA(int) { + // compute the QR factorization of A and store the results in A and t + // (tau) - see the lapack dgeqp3(...) documentation: + // www.netlib.org/lapack/explore-html-3.6.1/dd/d9a/group__double_g_ecomputational_ga1b0500f49e03d2771b797c6e88adabbb.html + KokkosBatched::SerialQR::invoke(A, t, w); + }); + + Kokkos::fence(); + Kokkos::deep_copy(A_h, A); + typename ColVectorViewType::HostMirror tau = Kokkos::create_mirror_view(t); + Kokkos::deep_copy(tau, t); + + Test::EXPECT_NEAR_KK_REL(A_h(0, 0), static_cast(-14), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 1), static_cast(-21), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 2), static_cast(14), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 0), static_cast(6. / 26.), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 1), static_cast(-175), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 2), static_cast(70), tol); + Test::EXPECT_NEAR_KK_REL(A_h(2, 0), static_cast(-4.0 / 26.0), tol); + // Test::EXPECT_NEAR_KK_REL(A_h(2, 1), 35.0, tol); // Analytical expression too painful to compute... + Test::EXPECT_NEAR_KK_REL(A_h(2, 2), static_cast(35), tol); + + Test::EXPECT_NEAR_KK_REL(tau(0), static_cast(7. / 13.), tol); + // Test::EXPECT_NEAR_KK_REL(tau(1), 25. / 32., tol); // Analytical expression too painful to compute... + Test::EXPECT_NEAR_KK_REL(tau(2), static_cast(1. / 2.), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, B, w); + }); + typename MatrixViewType::HostMirror B_h = Kokkos::create_mirror_view(B); + Kokkos::deep_copy(B_h, B); + + Test::EXPECT_NEAR_KK_REL(static_cast(-14), B_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-21), B_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(14), B_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-175), B_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(70), B_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(35), B_h(2, 2), tol); + + Kokkos::parallel_for( + "serialFormQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), + Q.data(), Q.stride(0), Q.stride(1), w.data()); + }); + typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(-6. / 7.), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(69. / 175.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-58. / 175.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-3. / 7.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-158. / 175.), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(6. / 175.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(2. / 7.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-6. / 35.), Q_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-33. / 35.), Q_h(2, 2), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, Q, w); + }); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(2, 2), tol); + + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 1), tol); +} + +template +void test_QR_rectangular() { + // Analytical test with a rectangular matrix + // [3, 5] [-0.60, -0.64, 0.48] [-5, -3] + // A = [4, 0] Q = [-0.80, -0.48, -0.36] R = [ 0, 5] + // [0, -3] [ 0.00, -0.60, -0.80] [ 0, 0] + // + // Expected outputs: + // [ -5, -3] + // tau = [5/8, 10/18] A = [1/2, 5] + // [ 0, 1/3] + // + + using MatrixViewType = Kokkos::View; + using ColVectorViewType = Kokkos::View; + using ColWorkViewType = Kokkos::View; + + const Scalar tol = 10 * Kokkos::ArithTraits::eps(); + constexpr int m = 3, n = 2; + + MatrixViewType A("A", m, n), B("B", m, n), Q("Q", m, m); + ColVectorViewType t("t", n); + ColWorkViewType w("w", n); + + typename MatrixViewType::HostMirror A_h = Kokkos::create_mirror_view(A); + A_h(0, 0) = 3; + A_h(0, 1) = 5; + A_h(1, 0) = 4; + A_h(1, 1) = 0; + A_h(2, 0) = 0; + A_h(2, 1) = -3; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(B, A_h); + + Kokkos::parallel_for( + "serialQR", 1, KOKKOS_LAMBDA(int) { + // compute the QR factorization of A and store the results in A and t + // (tau) - see the lapack dgeqp3(...) documentation: + // www.netlib.org/lapack/explore-html-3.6.1/dd/d9a/group__double_g_ecomputational_ga1b0500f49e03d2771b797c6e88adabbb.html + KokkosBatched::SerialQR::invoke(A, t, w); + }); + + Kokkos::fence(); + Kokkos::deep_copy(A_h, A); + typename ColVectorViewType::HostMirror tau = Kokkos::create_mirror_view(t); + Kokkos::deep_copy(tau, t); + + Test::EXPECT_NEAR_KK_REL(A_h(0, 0), static_cast(-5.0), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 1), static_cast(-3.0), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 0), static_cast(0.5), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 1), static_cast(5.0), tol); + Test::EXPECT_NEAR_KK(A_h(2, 0), static_cast(0.), tol); + Test::EXPECT_NEAR_KK_REL(A_h(2, 1), static_cast(1. / 3.), tol); + + Test::EXPECT_NEAR_KK_REL(tau(0), 5. / 8., tol); + Test::EXPECT_NEAR_KK_REL(tau(1), 10. / 18., tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, B, w); + }); + typename MatrixViewType::HostMirror B_h = Kokkos::create_mirror_view(B); + Kokkos::deep_copy(B_h, B); + + Test::EXPECT_NEAR_KK_REL(static_cast(-5.0), B_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-3.0), B_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(5.0), B_h(1, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(2, 1), tol); + + Kokkos::parallel_for( + "serialFormQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), + Q.data(), Q.stride(0), Q.stride(1), w.data()); + }); + typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(-0.60), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.64), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.48), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.80), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.48), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.36), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.60), Q_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.80), Q_h(2, 2), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, Q, w); + }); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(2, 2), tol); + + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 1), tol); +} + +template +void test_QR_batch() { + // Generate a batch of matrices + // Compute QR factorization + // Verify that R is triangular + // Verify that Q is unitary + // Check that Q*R = A + + using ExecutionSpace = typename Device::execution_space; + + constexpr int numMat = 314; + constexpr int maxMatSize = 100; + Kokkos::View numRows("rows in matrix", numMat); + Kokkos::View numCols("cols in matrix", numMat); + Kokkos::View offsetA("matrix offset", numMat + 1); + Kokkos::View offsetQ("matrix offset", numMat + 1); + Kokkos::View tau("tau", maxMatSize * numMat); + Kokkos::View tmp("work buffer", maxMatSize * numMat); + + typename Kokkos::View::HostMirror numRows_h = Kokkos::create_mirror_view(numRows); + typename Kokkos::View::HostMirror numCols_h = Kokkos::create_mirror_view(numCols); + typename Kokkos::View::HostMirror offsetA_h = Kokkos::create_mirror_view(offsetA); + typename Kokkos::View::HostMirror offsetQ_h = Kokkos::create_mirror_view(offsetQ); + + std::mt19937 gen; + gen.seed(27182818); + std::uniform_int_distribution distrib(1, maxMatSize); + + offsetA_h(0) = 0; + offsetQ_h(0) = 0; + int a = 0, b = 0; + for (int matIdx = 0; matIdx < numMat; ++matIdx) { + a = distrib(gen); + b = distrib(gen); + + numRows_h(matIdx) = Kokkos::max(a, b); + numCols_h(matIdx) = Kokkos::min(a, b); + offsetA_h(matIdx + 1) = offsetA_h(matIdx) + a * b; + offsetQ_h(matIdx + 1) = offsetQ_h(matIdx) + numRows_h(matIdx) * numRows_h(matIdx); + } + + Kokkos::deep_copy(numRows, numRows_h); + Kokkos::deep_copy(numCols, numCols_h); + Kokkos::deep_copy(offsetA, offsetA_h); + Kokkos::deep_copy(offsetQ, offsetQ_h); + + const int numVals = offsetA_h(numMat); + Kokkos::View mats("matrices", numVals); + Kokkos::View Qs("Q matrices", offsetQ_h(numMat)); + + Kokkos::Random_XorShift64_Pool rand_pool(2718); + constexpr double max_val = 1000; + { + Scalar randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(ExecutionSpace{}, mats, rand_pool, randStart, randEnd); + } + + qrFunctor myFunc(maxMatSize, mats, numRows, numCols, offsetA, tau, tmp, Qs, offsetQ); + Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, serial_qr_square_analytic_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_square(); +} +TEST_F(TestCategory, serial_qr_rectangular_analytic_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_rectangular(); +} +TEST_F(TestCategory, serial_qr_batch_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_batch(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, serial_qr_square_analytic_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_square(); +} +TEST_F(TestCategory, serial_qr_rectangular_analytic_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_rectangular(); +} +TEST_F(TestCategory, serial_qr_batch_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_batch(); +} +#endif + +// #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +// TEST_F(TestCategory, batched_scalar_serial_qr_scomplex) { +// typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; +// test_QR_rectangular, AlgoTagType>(); +// } +// #endif + +// #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +// TEST_F(TestCategory, batched_scalar_serial_qr_dcomplex) { +// typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; +// test_QR_rectangular, AlgoTagType>(); +// } +// #endif