diff --git a/Z4co/src/derivs.hxx b/Z4co/src/derivs.hxx index 89518948..a3b9a62e 100644 --- a/Z4co/src/derivs.hxx +++ b/Z4co/src/derivs.hxx @@ -152,7 +152,7 @@ template inline ARITH_INLINE ARITH_DEVICE ARITH_HOST simd deriv2_2d(const int vavail, const simdl &mask, const T *restrict const var, const ptrdiff_t di, const ptrdiff_t dj, const T dx, const T dy) { - constexpr size_t vsize = tuple_size_v >; + constexpr size_t vsize = tuple_size_v>; if (di == 1) { assert(vavail > 0); constexpr int maxnpoints = deriv_order + 1 + vsize - 1; @@ -160,7 +160,7 @@ deriv2_2d(const int vavail, const simdl &mask, const T *restrict const var, array, div_ceil(maxnpoints, int(vsize))> arrx; for (int i = 0; i < maxnpoints; i += vsize) { if (i < npoints) { - const simdl mask1 = mask_for_loop_tail >(i, npoints); + const simdl mask1 = mask_for_loop_tail>(i, npoints); arrx[div_floor(i, int(vsize))] = deriv1d(mask1, &var[i - deriv_order / 2], dj, dy); } @@ -177,7 +177,7 @@ deriv2_2d(const int vavail, const simdl &mask, const T *restrict const var, for (int j = -deriv_order / 2; j <= deriv_order / 2; ++j) if (j == 0) { #ifdef CCTK_DEBUG - arrx[deriv_order / 2 + j] = Arith::nan >()(); // unused + arrx[deriv_order / 2 + j] = Arith::nan>()(); // unused #endif } else { arrx[deriv_order / 2 + j] = deriv1d(mask, &var[j * dj], di, dx); @@ -233,10 +233,9 @@ deriv_upwind(const simdl &mask, const GF3D2 &gf_, } template -inline ARITH_INLINE - ARITH_DEVICE ARITH_HOST enable_if_t<(dir1 == dir2), simd > - deriv2(const int vavail, const simdl &mask, const GF3D2 &gf_, - const vect &I, const vec &dx) { +inline ARITH_INLINE ARITH_DEVICE ARITH_HOST enable_if_t<(dir1 == dir2), simd> +deriv2(const int vavail, const simdl &mask, const GF3D2 &gf_, + const vect &I, const vec &dx) { static_assert(dir1 >= 0 && dir1 < D, ""); static_assert(dir2 >= 0 && dir2 < D, ""); const auto &DI = vect::unit; @@ -245,10 +244,9 @@ inline ARITH_INLINE } template -inline ARITH_INLINE - ARITH_DEVICE ARITH_HOST enable_if_t<(dir1 != dir2), simd > - deriv2(const int vavail, const simdl &mask, const GF3D2 &gf_, - const vect &I, const vec &dx) { +inline ARITH_INLINE ARITH_DEVICE ARITH_HOST enable_if_t<(dir1 != dir2), simd> +deriv2(const int vavail, const simdl &mask, const GF3D2 &gf_, + const vect &I, const vec &dx) { static_assert(dir1 >= 0 && dir1 < D, ""); static_assert(dir2 >= 0 && dir2 < D, ""); const auto &DI = vect::unit; diff --git a/Z4co/src/enforce.cxx b/Z4co/src/enforce.cxx index e84f87dd..dbb43f75 100644 --- a/Z4co/src/enforce.cxx +++ b/Z4co/src/enforce.cxx @@ -44,7 +44,7 @@ extern "C" void Z4co_Enforce(CCTK_ARGUMENTS) { typedef simdl vbool; constexpr size_t vsize = tuple_size_v; - const auto delta3 = one >()(); + const auto delta3 = one>()(); #ifdef __CUDACC__ const nvtxRangeId_t range = nvtxRangeStartA("Z4co_Enforce::enforce"); diff --git a/Z4co/src/initial1.cxx b/Z4co/src/initial1.cxx index 388db65d..3f6e226a 100644 --- a/Z4co/src/initial1.cxx +++ b/Z4co/src/initial1.cxx @@ -74,7 +74,7 @@ extern "C" void Z4co_Initial1(CCTK_ARGUMENTS) { typedef simdl vbool; constexpr size_t vsize = tuple_size_v; - const auto delta3 = one >()(); + const auto delta3 = one>()(); const Loop::GridDescBaseDevice grid(cctkGH); #ifdef __CUDACC__ @@ -86,7 +86,7 @@ extern "C" void Z4co_Initial1(CCTK_ARGUMENTS) { const GF3D2index index1(layout1, p.I); // Load - const smat g = gf_g1(mask, index1, one >()()); + const smat g = gf_g1(mask, index1, one>()()); const smat K = gf_K1(mask, index1); const vreal alp = gf_alp1(mask, index1, 1); const vec beta = gf_beta1(mask, index1); diff --git a/Z4co/src/initial2.cxx b/Z4co/src/initial2.cxx index ae22b2f9..152066c7 100644 --- a/Z4co/src/initial2.cxx +++ b/Z4co/src/initial2.cxx @@ -44,7 +44,7 @@ extern "C" void Z4co_Initial2(CCTK_ARGUMENTS) { typedef simdl vbool; constexpr size_t vsize = tuple_size_v; - const auto delta3 = one >()(); + const auto delta3 = one>()(); const Loop::GridDescBaseDevice grid(cctkGH); #ifdef __CUDACC__ diff --git a/Z4co/src/make.code.defn b/Z4co/src/make.code.defn new file mode 100644 index 00000000..8f1a55a5 --- /dev/null +++ b/Z4co/src/make.code.defn @@ -0,0 +1,11 @@ +# Main make.code.defn file for thorn Z4c + +# Source files in this directory +SRCS = \ + enforce.cxx \ + initial1.cxx \ + initial2.cxx \ + rhs.cxx + +# Subdirectories containing source files +SUBDIRS = diff --git a/Z4co/src/rhs.cxx b/Z4co/src/rhs.cxx new file mode 100644 index 00000000..21c5b571 --- /dev/null +++ b/Z4co/src/rhs.cxx @@ -0,0 +1,260 @@ +#include + +#ifdef __CUDACC__ +// Disable CCTK_DEBUG since the debug information takes too much +// parameter space to launch the kernels +#ifdef CCTK_DEBUG +#undef CCTK_DEBUG +#endif +#endif + +#include "derivs.hxx" +#include "physics.hxx" + +#include +#include +#include +#include + +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#include + +namespace Z4co { +using namespace Arith; +using namespace Loop; +using namespace std; + +extern "C" void Z4co_RHS(CCTK_ARGUMENTS) { + DECLARE_CCTK_ARGUMENTS_Z4co_RHS; + DECLARE_CCTK_PARAMETERS; + + for (int d = 0; d < 3; ++d) + if (cctk_nghostzones[d] < deriv_order / 2 + 1) + CCTK_VERROR("Need at least %d ghost zones", deriv_order / 2 + 1); + + // + + const array indextype = {0, 0, 0}; + const array nghostzones = {cctk_nghostzones[0], cctk_nghostzones[1], + cctk_nghostzones[2]}; + vect imin, imax; + GridDescBase(cctkGH).box_int<0, 0, 0>(nghostzones, imin, imax); + // Suffix 1: with ghost zones, suffix 0: without ghost zones + const GF3D2layout layout2(cctkGH, indextype); + const GF3D5layout layout5(imin, imax); + + const GF3D2 gf_chi1(layout2, chi); + + const smat, 3> gf_gammat1{ + GF3D2(layout2, gammatxx), + GF3D2(layout2, gammatxy), + GF3D2(layout2, gammatxz), + GF3D2(layout2, gammatyy), + GF3D2(layout2, gammatyz), + GF3D2(layout2, gammatzz)}; + + const GF3D2 gf_Kh1(layout2, Kh); + + const smat, 3> gf_At1{ + GF3D2(layout2, Atxx), + GF3D2(layout2, Atxy), + GF3D2(layout2, Atxz), + GF3D2(layout2, Atyy), + GF3D2(layout2, Atyz), + GF3D2(layout2, Atzz)}; + + const vec, 3> gf_Gamt1{ + GF3D2(layout2, Gamtx), + GF3D2(layout2, Gamty), + GF3D2(layout2, Gamtz)}; + + const GF3D2 gf_Theta1(layout2, Theta); + + const GF3D2 gf_alphaG1(layout2, alphaG); + + const vec, 3> gf_betaG1{ + GF3D2(layout2, betaGx), + GF3D2(layout2, betaGy), + GF3D2(layout2, betaGz)}; + + // + + // Ideas: + // + // - Outline certain functions, e.g. `det` or `raise_index`. Ensure + // they are called with floating-point arguments, not tensor + // indices. + + const int ntmps = 154; + GF3D5vector tmps(layout5, ntmps); + int itmp = 0; + + const auto make_gf = [&]() { return GF3D5(tmps(itmp++)); }; + const auto make_vec = [&](const auto &f) { + return vec, 3>([&](int) { return f(); }); + }; + const auto make_mat = [&](const auto &f) { + return smat, 3>([&](int, int) { return f(); }); + }; + const auto make_vec_gf = [&]() { return make_vec(make_gf); }; + const auto make_mat_gf = [&]() { return make_mat(make_gf); }; + const auto make_vec_vec_gf = [&]() { return make_vec(make_vec_gf); }; + const auto make_vec_mat_gf = [&]() { return make_vec(make_mat_gf); }; + const auto make_mat_vec_gf = [&]() { return make_mat(make_vec_gf); }; + const auto make_mat_mat_gf = [&]() { return make_mat(make_mat_gf); }; + + const GF3D5 gf_chi0(make_gf()); + const vec, 3> gf_dchi0(make_vec_gf()); + const smat, 3> gf_ddchi0(make_mat_gf()); + calc_derivs2(cctkGH, gf_chi1, gf_chi0, gf_dchi0, gf_ddchi0, layout5); + + const smat, 3> gf_gammat0(make_mat_gf()); + const smat, 3>, 3> gf_dgammat0(make_mat_vec_gf()); + const smat, 3>, 3> gf_ddgammat0(make_mat_mat_gf()); + calc_derivs2(cctkGH, gf_gammat1, gf_gammat0, gf_dgammat0, gf_ddgammat0, + layout5); + + const GF3D5 gf_Kh0(make_gf()); + const vec, 3> gf_dKh0(make_vec_gf()); + calc_derivs(cctkGH, gf_Kh1, gf_Kh0, gf_dKh0, layout5); + + const smat, 3> gf_At0(make_mat_gf()); + const smat, 3>, 3> gf_dAt0(make_mat_vec_gf()); + calc_derivs(cctkGH, gf_At1, gf_At0, gf_dAt0, layout5); + + const vec, 3> gf_Gamt0(make_vec_gf()); + const vec, 3>, 3> gf_dGamt0(make_vec_vec_gf()); + calc_derivs(cctkGH, gf_Gamt1, gf_Gamt0, gf_dGamt0, layout5); + + const GF3D5 gf_Theta0(make_gf()); + const vec, 3> gf_dTheta0(make_vec_gf()); + calc_derivs(cctkGH, gf_Theta1, gf_Theta0, gf_dTheta0, layout5); + + const GF3D5 gf_alphaG0(make_gf()); + const vec, 3> gf_dalphaG0(make_vec_gf()); + const smat, 3> gf_ddalphaG0(make_mat_gf()); + calc_derivs2(cctkGH, gf_alphaG1, gf_alphaG0, gf_dalphaG0, gf_ddalphaG0, + layout5); + + const vec, 3> gf_betaG0(make_vec_gf()); + const vec, 3>, 3> gf_dbetaG0(make_vec_vec_gf()); + const vec, 3>, 3> gf_ddbetaG0(make_vec_mat_gf()); + calc_derivs2(cctkGH, gf_betaG1, gf_betaG0, gf_dbetaG0, gf_ddbetaG0, layout5); + + if (itmp != ntmps) + CCTK_VERROR("Wrong number of temporary variables: ntmps=%d itmp=%d", ntmps, + itmp); + itmp = -1; + + // + + const GF3D2 gf_eTtt1(layout2, eTtt); + + const vec, 3> gf_eTti1{ + GF3D2(layout2, eTtx), + GF3D2(layout2, eTty), + GF3D2(layout2, eTtz)}; + + const smat, 3> gf_eTij1{ + GF3D2(layout2, eTxx), + GF3D2(layout2, eTxy), + GF3D2(layout2, eTxz), + GF3D2(layout2, eTyy), + GF3D2(layout2, eTyz), + GF3D2(layout2, eTzz)}; + + // + + const GF3D2 gf_chi_rhs1(layout2, chi_rhs); + + const smat, 3> gf_gammat_rhs1{ + GF3D2(layout2, gammatxx_rhs), + GF3D2(layout2, gammatxy_rhs), + GF3D2(layout2, gammatxz_rhs), + GF3D2(layout2, gammatyy_rhs), + GF3D2(layout2, gammatyz_rhs), + GF3D2(layout2, gammatzz_rhs)}; + + const GF3D2 gf_Kh_rhs1(layout2, Kh_rhs); + + const smat, 3> gf_At_rhs1{ + GF3D2(layout2, Atxx_rhs), GF3D2(layout2, Atxy_rhs), + GF3D2(layout2, Atxz_rhs), GF3D2(layout2, Atyy_rhs), + GF3D2(layout2, Atyz_rhs), GF3D2(layout2, Atzz_rhs)}; + + const vec, 3> gf_Gamt_rhs1{ + GF3D2(layout2, Gamtx_rhs), + GF3D2(layout2, Gamty_rhs), + GF3D2(layout2, Gamtz_rhs)}; + + const GF3D2 gf_Theta_rhs1(layout2, Theta_rhs); + + const GF3D2 gf_alphaG_rhs1(layout2, alphaG_rhs); + + const vec, 3> gf_betaG_rhs1{ + GF3D2(layout2, betaGx_rhs), + GF3D2(layout2, betaGy_rhs), + GF3D2(layout2, betaGz_rhs)}; + + // + + typedef simd vreal; + typedef simdl vbool; + constexpr size_t vsize = tuple_size_v; + + const Loop::GridDescBaseDevice grid(cctkGH); + +#ifdef __CUDACC__ + const nvtxRangeId_t range = nvtxRangeStartA("Z4co_RHS::rhs"); +#endif + noinline([&]() __attribute__((__flatten__, __hot__)) { + grid.loop_int_device<0, 0, 0, vsize>( + grid.nghostzones, [=] ARITH_DEVICE(const PointDesc &p) ARITH_INLINE { + const vbool mask = mask_for_loop_tail(p.i, p.imax); + const GF3D2index index2(layout2, p.I); + const GF3D5index index5(layout5, p.I); + +#include "../wolfram/Z4co_set_rhs.hxx" + }); + }); +#ifdef __CUDACC__ + nvtxRangeEnd(range); +#endif + + // Upwind and dissipation terms + + // TODO: Consider fusing the loops to reduce memory bandwidth + + apply_upwind_diss(cctkGH, gf_chi1, gf_betaG1, gf_chi_rhs1); + + for (int a = 0; a < 3; ++a) + for (int b = a; b < 3; ++b) + apply_upwind_diss(cctkGH, gf_gammat1(a, b), gf_betaG1, + gf_gammat_rhs1(a, b)); + + apply_upwind_diss(cctkGH, gf_Kh1, gf_betaG1, gf_Kh_rhs1); + + for (int a = 0; a < 3; ++a) + for (int b = a; b < 3; ++b) + apply_upwind_diss(cctkGH, gf_At1(a, b), gf_betaG1, gf_At_rhs1(a, b)); + + for (int a = 0; a < 3; ++a) + apply_upwind_diss(cctkGH, gf_Gamt1(a), gf_betaG1, gf_Gamt_rhs1(a)); + + if (!set_Theta_zero) + apply_upwind_diss(cctkGH, gf_Theta1, gf_betaG1, gf_Theta_rhs1); + + apply_upwind_diss(cctkGH, gf_alphaG1, gf_betaG1, gf_alphaG_rhs1); + + for (int a = 0; a < 3; ++a) + apply_upwind_diss(cctkGH, gf_betaG1(a), gf_betaG1, gf_betaG_rhs1(a)); +} + +} // namespace Z4co