From a1406e44dde9a65ae5475b39a937b6e3ad00750a Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Wed, 15 Jan 2025 14:29:56 +0100 Subject: [PATCH 1/9] [oneDPL][ranges][merge] support size limit for output --- include/oneapi/dpl/pstl/algorithm_impl.h | 123 +++++++++++++++++- .../oneapi/dpl/pstl/algorithm_ranges_impl.h | 36 ++--- .../dpl/pstl/glue_algorithm_ranges_impl.h | 7 +- .../hetero/algorithm_ranges_impl_hetero.h | 50 ++++--- .../dpcpp/parallel_backend_sycl_merge.h | 88 ++++++++----- .../dpcpp/parallel_backend_sycl_utils.h | 52 ++++++-- include/oneapi/dpl/pstl/omp/parallel_for.h | 11 +- include/oneapi/dpl/pstl/parallel_backend.h | 3 + .../oneapi/dpl/pstl/parallel_backend_serial.h | 2 +- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 5 +- include/oneapi/dpl/pstl/unseq_backend_simd.h | 34 +++++ .../ranges/std_ranges_merge.pass.cpp | 58 +++++++-- test/parallel_api/ranges/std_ranges_test.h | 9 +- 13 files changed, 372 insertions(+), 106 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index ae9094f721a..e6513e10399 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -31,6 +31,7 @@ #include "parallel_backend.h" #include "parallel_impl.h" #include "iterator_impl.h" +#include "../functional" #if _ONEDPL_HETERO_BACKEND # include "hetero/algorithm_impl_hetero.h" // for __pattern_fill_n, __pattern_generate_n @@ -2948,6 +2949,49 @@ __pattern_remove_if(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, // merge //------------------------------------------------------------------------ +template +std::pair +__brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, _Comp __comp, + /* __is_vector = */ std::false_type) +{ + while(__it_1 != __it_1_e && __it_2 != __it_2_e) + { + if (__comp(*__it_1, *__it_2)) + { + *__it_out = *__it_1; + ++__it_out, ++__it_1; + } + else + { + *__it_out = *__it_2; + ++__it_out, ++__it_2; + } + if(__it_out == __it_out_e) + return {__it_1, __it_2}; + } + + if(__it_1 == __it_1_e) + { + for(; __it_2 != __it_2_e && __it_out != __it_out_e; ++__it_2, ++__it_out) + *__it_out = *__it_2; + } + else + { + //assert(__it_2 == __it_2_e); + for(; __it_1 != __it_1_e && __it_out != __it_out_e; ++__it_1, ++__it_out) + *__it_out = *__it_1; + } + return {__it_1, __it_2}; +} + +template +std::pair +__brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, _Comp __comp, + /* __is_vector = */ std::true_type) +{ + return __unseq_backend::__simd_merge(__it_1, __it_1_e, __it_2, __it_2_e, __it_out, __it_out_e, __comp); +} + template _OutputIterator __brick_merge(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, @@ -2980,10 +3024,87 @@ __pattern_merge(_Tag, _ExecutionPolicy&&, _ForwardIterator1 __first1, _ForwardIt typename _Tag::__is_vector{}); } +template +std::pair<_It1, _It2> +__pattern_merge_2(_Tag, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, + _Index2 __n_2, _OutIt __it_out, _Index3 __n_out, _Comp __comp) +{ + return __brick_merge_2(__it_1, __it_1 + __n_1, __it_2, __it_2 + __n_2, __it_out, __it_out + __n_out, __comp, + typename _Tag::__is_vector{}); +} + +template +std::pair<_It1, _It2> +__pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, + _Index2 __n_2, _OutIt __it_out, _Index3 __n_out, _Comp __comp) +{ + using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; + + _It1 __it_res_1; + _It2 __it_res_2; + + __internal::__except_handler([&]() { + __par_backend::__parallel_for(__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _Index3(0), __n_out, + [=, &__it_res_1, &__it_res_2](_Index3 __i, _Index3 __j) + { + //a start merging point on the merge path; for each thread + _Index1 __r = 0; //row index + _Index2 __c = 0; //column index + + if(__i > 0) + { + //calc merge path intersection: + const _Index3 __d_size = + std::abs(std::max<_Index2>(0, __i - __n_2) - (std::min<_Index1>(__i, __n_1) - 1)) + 1; + + auto __get_row = [__i, __n_1](auto __d) + { return std::min<_Index1>(__i, __n_1) - __d - 1; }; + auto __get_column = [__i, __n_1](auto __d) + { return std::max<_Index1>(0, __i - __n_1 - 1) + __d + (__i / (__n_1 + 1) > 0 ? 1 : 0); }; + + oneapi::dpl::counting_iterator<_Index3> __it_d(0); + + auto __res_d = *std::lower_bound(__it_d, __it_d + __d_size, 1, + [&](auto __d, auto __val) { + auto __r = __get_row(__d); + auto __c = __get_column(__d); + + oneapi::dpl::__internal::__compare<_Comp, oneapi::dpl::identity> + __cmp{__comp, oneapi::dpl::identity{}}; + const auto __res = (__cmp(__it_1[__r], __it_2[__c]) ? 1 : 0); + + return __res < __val; + } + ); + + //intersection point + __r = __get_row(__res_d); + __c = __get_column(__res_d); + ++__r; //to get a merge matrix ceil, lying on the current diagonal + } + + //serial merge n elements, starting from input x and y, to [i, j) output range + auto __res = __brick_merge_2(__it_1 + __r, __it_1 + __n_1, + __it_2 + __c, __it_2 + __n_2, + __it_out + __i, __it_out + __j, __comp, _IsVector{}); + + if(__j == __n_out) + { + __it_res_1 = __res.first; + __it_res_2 = __res.second; + } + }, _ONEDPL_MERGE_CUT_OFF); //grainsize + }); + + return {__it_res_1, __it_res_2}; +} + template _RandomAccessIterator3 -__pattern_merge(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, +__pattern_merge(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _RandomAccessIterator3 __d_first, _Compare __comp) { diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h index 55d29a56be8..2c7d9873079 100644 --- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h @@ -448,31 +448,31 @@ auto __pattern_merge(_Tag __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - static_assert(__is_parallel_tag_v<_Tag> || typename _Tag::__is_vector{}); - assert(std::ranges::size(__r1) + std::ranges::size(__r2) <= std::ranges::size(__out_r)); // for debug purposes only - + using __return_type = std::ranges::merge_result, std::ranges::borrowed_iterator_t<_R2>, + std::ranges::borrowed_iterator_t<_OutRange>>; + auto __comp_2 = [__comp, __proj1, __proj2](auto&& __val1, auto&& __val2) { return std::invoke(__comp, std::invoke(__proj1, std::forward(__val1)), std::invoke(__proj2, std::forward(__val2)));}; - auto __res = oneapi::dpl::__internal::__pattern_merge(__tag, std::forward<_ExecutionPolicy>(__exec), - std::ranges::begin(__r1), std::ranges::begin(__r1) + std::ranges::size(__r1), std::ranges::begin(__r2), - std::ranges::begin(__r2) + std::ranges::size(__r2), std::ranges::begin(__out_r), __comp_2); + using _Index1 = std::ranges::range_difference_t<_R1>; + using _Index2 = std::ranges::range_difference_t<_R2>; + using _Index3 = std::ranges::range_difference_t<_OutRange>; - using __return_type = std::ranges::merge_result, std::ranges::borrowed_iterator_t<_R2>, - std::ranges::borrowed_iterator_t<_OutRange>>; + _Index1 __n_1 = std::ranges::size(__r1); + _Index2 __n_2 = std::ranges::size(__r2); + _Index3 __n_out = std::min<_Index3>(__n_1 + __n_2, std::ranges::size(__out_r)); - return __return_type{std::ranges::begin(__r1) + std::ranges::size(__r1), std::ranges::begin(__r2) + std::ranges::size(__r2), __res}; -} + auto __it_1 = std::ranges::begin(__r1); + auto __it_2 = std::ranges::begin(__r2); + auto __it_out = std::ranges::begin(__out_r); -template -auto -__pattern_merge(__serial_tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, - _Proj1 __proj1, _Proj2 __proj2) -{ - return std::ranges::merge(std::forward<_R1>(__r1), std::forward<_R2>(__r2), std::ranges::begin(__out_r), __comp, __proj1, - __proj2); + if(__n_out == 0) + return __return_type{__it_1, __it_2, __it_out}; + + auto __res = __pattern_merge_2(__tag, std::forward<_ExecutionPolicy>(__exec), __it_2, __n_2, __it_1, __n_1, __it_out, __n_out, __comp_2); + + return __return_type{__res.second, __res.first, __it_out + __n_out}; } } // namespace __ranges diff --git a/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h index 77a4875ccde..f2fc3840632 100644 --- a/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h @@ -1173,9 +1173,12 @@ merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& _ { const auto __dispatch_tag = oneapi::dpl::__ranges::__select_backend(__exec, __rng1, __rng2, __rng3); - return oneapi::dpl::__internal::__ranges::__pattern_merge( + auto __view_res = views::all_write(::std::forward<_Range3>(__rng3)); + oneapi::dpl::__internal::__ranges::__pattern_merge( __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), views::all_read(::std::forward<_Range1>(__rng1)), - views::all_read(::std::forward<_Range2>(__rng2)), views::all_write(::std::forward<_Range3>(__rng3)), __comp); + views::all_read(::std::forward<_Range2>(__rng2)), __view_res, __comp); + + return __view_res.size(); } template diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index da7820b91a2..a16bdb6cdde 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -51,10 +51,11 @@ namespace __ranges //------------------------------------------------------------------------ template -void +auto __pattern_walk_n(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Function __f, _Ranges&&... __rngs) { - auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); + using _Size = std::make_unsigned_t...>>; + auto __n = std::min({_Size(__rngs.size())...}); if (__n > 0) { oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), @@ -62,6 +63,7 @@ __pattern_walk_n(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Function ::std::forward<_Ranges>(__rngs)...) .__deferrable_wait(); } + return __n; } #if _ONEDPL_CPP20_RANGES_PRESENT @@ -680,44 +682,44 @@ struct __copy2_wrapper; template -oneapi::dpl::__internal::__difference_t<_Range3> +std::pair, oneapi::dpl::__internal::__difference_t<_Range2>> __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) { auto __n1 = __rng1.size(); auto __n2 = __rng2.size(); - auto __n = __n1 + __n2; - if (__n == 0) - return 0; + if (__rng3.size() == 0) + return {0, 0}; //To consider the direct copying pattern call in case just one of sequences is empty. if (__n1 == 0) { - oneapi::dpl::__internal::__ranges::__pattern_walk_n( + auto __res = oneapi::dpl::__internal::__ranges::__pattern_walk_n( __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy1_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}, ::std::forward<_Range2>(__rng2), ::std::forward<_Range3>(__rng3)); + return {0, __res}; } - else if (__n2 == 0) + + if (__n2 == 0) { - oneapi::dpl::__internal::__ranges::__pattern_walk_n( + auto __res = oneapi::dpl::__internal::__ranges::__pattern_walk_n( __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy2_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range3>(__rng3)); - } - else - { - __par_backend_hetero::__parallel_merge(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), - ::std::forward<_Range3>(__rng3), __comp) - .__deferrable_wait(); + return {__res, 0}; } - return __n; + auto __res = __par_backend_hetero::__parallel_merge(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), + ::std::forward<_Range3>(__rng3), __comp); + + auto __val = __res.get(); + return {__val.first, __val.second}; } #if _ONEDPL_CPP20_RANGES_PRESENT @@ -727,12 +729,18 @@ auto __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - assert(std::ranges::size(__r1) + std::ranges::size(__r2) <= std::ranges::size(__out_r)); // for debug purposes only - auto __comp_2 = [__comp, __proj1, __proj2](auto&& __val1, auto&& __val2) { return std::invoke(__comp, std::invoke(__proj1, std::forward(__val1)), std::invoke(__proj2, std::forward(__val2)));}; + using _Index1 = std::ranges::range_difference_t<_R1>; + using _Index2 = std::ranges::range_difference_t<_R2>; + using _Index3 = std::ranges::range_difference_t<_OutRange>; + + _Index1 __n_1 = std::ranges::size(__r1); + _Index2 __n_2 = std::ranges::size(__r2); + _Index3 __n_out = std::min<_Index3>(__n_1 + __n_2, std::ranges::size(__out_r)); + auto __res = oneapi::dpl::__internal::__ranges::__pattern_merge(__tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::views::all_read(__r1), oneapi::dpl::__ranges::views::all_read(__r2), oneapi::dpl::__ranges::views::all_write(__out_r), __comp_2); @@ -740,8 +748,8 @@ __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _R1& using __return_t = std::ranges::merge_result, std::ranges::borrowed_iterator_t<_R2>, std::ranges::borrowed_iterator_t<_OutRange>>; - return __return_t{std::ranges::begin(__r1) + std::ranges::size(__r1), std::ranges::begin(__r2) + - std::ranges::size(__r2), std::ranges::begin(__out_r) + __res}; + return __return_t{std::ranges::begin(__r1) + __res.first, std::ranges::begin(__r2) + __res.second, + std::ranges::begin(__out_r) + __n_out}; } #endif //_ONEDPL_CPP20_RANGES_PRESENT diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9cfaa550e58..ae7533ba9b9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -133,9 +133,9 @@ __find_start_point(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_ // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing // to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) template -void +std::pair, oneapi::dpl::__internal::__difference_t<_Rng2>> __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _Index __start1, const _Index __start2, - const _Index __start3, const _Index __chunk, const _Index __n1, const _Index __n2, _Compare __comp) + const _Index __start3, const _Index __chunk, const _Index __n1, const _Index __n2, _Compare __comp, _Index __n3 = 0) { const _Index __rng1_size = std::min<_Index>(__n1 > __start1 ? __n1 - __start1 : _Index{0}, __chunk); const _Index __rng2_size = std::min<_Index>(__n2 > __start2 ? __n2 - __start2 : _Index{0}, __chunk); @@ -143,7 +143,9 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I const _Index __rng1_idx_end = __start1 + __rng1_size; const _Index __rng2_idx_end = __start2 + __rng2_size; - const _Index __rng3_idx_end = __start3 + __rng3_size; + _Index __rng3_idx_end = __start3 + __rng3_size; + if(__n3 > 0) + __rng3_idx_end = std::min<_Index>(__n3, __rng3_idx_end); _Index __rng1_idx = __start1; _Index __rng2_idx = __start2; @@ -162,6 +164,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I else __rng3[__rng3_idx] = __rng1[__rng1_idx++]; } + return {__rng1_idx, __rng2_idx}; } // Please see the comment for __parallel_for_submitter for optional kernel name explanation @@ -177,7 +180,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; + const _IdType __n = std::min<_IdType>(__n1 + __n2, __rng3.size()); assert(__n1 > 0 || __n2 > 0); @@ -188,20 +191,34 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + using __val_t = _split_point_t<_IdType>; + using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, __val_t>; + auto __p_res_storage = new __result_and_scratch_storage_t(__exec, 1, 0); + + // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. + std::shared_ptr<__result_and_scratch_storage_base> __p_result_base(__p_res_storage); + auto __event = __exec.queue().submit( - [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = - __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); + [&__rng1, &__rng2, &__rng3, __p_res_storage, __comp, __chunk, __steps, __n, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __result_acc = __p_res_storage->template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + auto __id = __item_id.get_linear_id(); + const _IdType __i_elem = __id * __chunk; + + const auto __n_merge = std::min<_IdType>(__chunk, __n - __i_elem); + const auto __start = __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); + auto __ends = __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __n_merge, __n1, __n2, __comp, __n); + + if(__id == __steps - 1) //the last WI does additional work + { + auto __res_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__result_acc); + *__res_ptr = __ends; + } }); - // We should return the same thing in the second param of __future for compatibility - // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, std::shared_ptr<__result_and_scratch_storage_base>{}); + }); + return __future(std::move(__event), std::move(__p_result_base)); } }; @@ -225,10 +242,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculate nd-range parameters template nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2, const std::size_t __n) const { - const std::size_t __n = __rng1.size() + __rng2.size(); - // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; @@ -244,13 +259,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculation of split points on each base diagonal template sycl::event - eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _IdType __n, _Compare __comp, const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; const _IdType __base_diag_chunk = __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; @@ -288,13 +302,16 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); + const _IdType __n = std::min<_IdType>(__n1 + __n2, __rng3.size()); - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __n, __comp, __nd_range_params, __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + auto __result_acc = __base_diagonals_sp_global_storage.template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); + __cgh.depends_on(__event); __cgh.parallel_for<_MergeKernelName...>( @@ -320,8 +337,13 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, - __nd_range_params.chunk, __n1, __n2, __comp); + auto __ends = __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp, __n); + if(__global_idx == __nd_range_params.steps - 1) + { + auto __res_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__result_acc); + *__res_ptr = __ends; + } }); }); } @@ -331,24 +353,28 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { - assert(__rng1.size() > 0 || __rng2.size() > 0); + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + assert(__n1 > 0 || __n2 > 0); + + const _IdType __n = std::min<_IdType>(__n1 + __n2, __rng3.size()); _PRINT_INFO_IN_DEBUG_MODE(__exec); // Calculate nd-range parameters - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2, __n); // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, __nd_range_params.base_diag_count + 1); + using __val_t = _split_point_t<_IdType>; + auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, __val_t>(__exec, + 1, __nd_range_params.base_diag_count + 1); // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); // Find split-points on the base diagonals - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __n, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); // Merge data using split points on each diagonal @@ -391,7 +417,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; - const std::size_t __n = __rng1.size() + __rng2.size(); + const std::uint64_t __n = std::min(__rng1.size() + __rng2.size(), __rng3.size()); if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) { using _WiIndex = std::uint32_t; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 348484f1477..11e7d313b5b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -524,6 +524,7 @@ struct __usm_or_buffer_accessor struct __result_and_scratch_storage_base { virtual ~__result_and_scratch_storage_base() = default; + virtual std::size_t __get_data(sycl::event, std::size_t* __p_buf) const = 0; }; template @@ -656,6 +657,16 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base #endif } + _T + __wait_and_get_value(sycl::event __event) const + { + if (is_USM()) + __event.wait_and_throw(); + + return __get_value(); + } + +private: bool is_USM() const { @@ -665,17 +676,17 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base // Note: this member function assumes the result is *ready*, since the __future has already // waited on the relevant event. _T - __get_value(size_t idx = 0) const + __get_value() const { - assert(idx < __result_n); + assert( __result_n == 1); if (__use_USM_host && __supports_USM_device) { - return *(__result_buf.get() + idx); + return *(__result_buf.get()); } else if (__supports_USM_device) { _T __tmp; - __exec.queue().memcpy(&__tmp, __scratch_buf.get() + __scratch_n + idx, 1 * sizeof(_T)).wait(); + __exec.queue().memcpy(&__tmp, __scratch_buf.get() + __scratch_n, 1 * sizeof(_T)).wait(); return __tmp; } else @@ -684,14 +695,29 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base } } - template - _T - __wait_and_get_value(_Event&& __event, size_t idx = 0) const + template + std::size_t + __fill_data(std::pair<_Type, _Type>&& __p, std::size_t* __p_buf) const + { + __p_buf[0] = __p.first; + __p_buf[1] = __p.second; + return 2; + } + + template + std::size_t + __fill_data(_Args&&...) const + { + assert(!"Unsupported return type"); + return 0; + } + + virtual std::size_t __get_data(sycl::event __event, std::size_t* __p_buf) const override { if (is_USM()) __event.wait_and_throw(); - return __get_value(idx); + return __fill_data(__get_value(), __p_buf); } }; @@ -731,6 +757,16 @@ class __future : private std::tuple<_Args...> return __storage.__wait_and_get_value(__my_event); } + constexpr auto + __wait_and_get_value(const std::shared_ptr<__result_and_scratch_storage_base>& __p_storage) + { + std::size_t __buf[2] = {0, 0}; + auto __n = __p_storage->__get_data(__my_event, __buf); + assert(__n == 2); + + return std::pair{__buf[0], __buf[1]}; + } + template constexpr auto __wait_and_get_value(const _T& __val) diff --git a/include/oneapi/dpl/pstl/omp/parallel_for.h b/include/oneapi/dpl/pstl/omp/parallel_for.h index 1a0ea24d798..917b3089059 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_for.h +++ b/include/oneapi/dpl/pstl/omp/parallel_for.h @@ -29,10 +29,10 @@ namespace __omp_backend template void -__parallel_for_body(_Index __first, _Index __last, _Fp __f) +__parallel_for_body(_Index __first, _Index __last, _Fp __f, std::size_t __grainsize) { // initial partition of the iteration space into chunks - auto __policy = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last); + auto __policy = oneapi::dpl::__omp_backend::__chunk_partitioner(__first, __last, __grainsize); // To avoid over-subscription we use taskloop for the nested parallelism _PSTL_PRAGMA(omp taskloop untied mergeable) @@ -49,20 +49,21 @@ __parallel_for_body(_Index __first, _Index __last, _Fp __f) template void -__parallel_for(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f) +__parallel_for(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f, + std::size_t __grainsize = __default_chunk_size) { if (omp_in_parallel()) { // we don't create a nested parallel region in an existing parallel // region: just create tasks - oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f); + oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f, __grainsize); } else { // in any case (nested or non-nested) one parallel region is created and // only one thread creates a set of tasks _PSTL_PRAGMA(omp parallel) - _PSTL_PRAGMA(omp single nowait) { oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f); } + _PSTL_PRAGMA(omp single nowait) { oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f, __grainsize); } } } diff --git a/include/oneapi/dpl/pstl/parallel_backend.h b/include/oneapi/dpl/pstl/parallel_backend.h index b243e8fb492..841a9357eb7 100644 --- a/include/oneapi/dpl/pstl/parallel_backend.h +++ b/include/oneapi/dpl/pstl/parallel_backend.h @@ -35,6 +35,9 @@ # endif #endif +//the parallel backend constants +#define _ONEDPL_MERGE_CUT_OFF 2000 + namespace oneapi { namespace dpl diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 6acd4b617f9..032306dbe69 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -45,7 +45,7 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag) template void __parallel_for(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, - _Fp __f) + _Fp __f, std::size_t __grainsize = 1) { __f(__first, __last); } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 59821e98156..a977fc3d1a9 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -92,10 +92,11 @@ class __parallel_for_body // wrapper over tbb::parallel_for template void -__parallel_for(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f) +__parallel_for(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f, + std::size_t __grainsize = 1) { tbb::this_task_arena::isolate([=]() { - tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last), __parallel_for_body<_Index, _Fp>(__f)); + tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last, __grainsize), __parallel_for_body<_Index, _Fp>(__f)); }); } diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index 7e454c80268..e3379456837 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -879,6 +879,40 @@ __simd_remove_if(_RandomAccessIterator __first, _DifferenceType __n, _UnaryPredi } return __current + __cnt; } + +template +std::pair<_Iterator1, _Iterator2> +__simd_merge(_Iterator1 __x, _Iterator1 __x_e, _Iterator2 __y, _Iterator2 __y_e, _Iterator3 __i, _Iterator3 __j, _Comp __comp) +{ + _ONEDPL_PRAGMA_SIMD + for(_Iterator3 __k = __i; __k < __j; ++__k) + { + if(__x >= __x_e) + { + assert(__y < __y_e); + *__k = *__y; + ++__y; + } + else if(__y >= __y_e) + { + assert(__x < __x_e); + *__k = *__x; + ++__x; + } + else if(std::invoke(__comp, *__x, *__y)) + { + *__k = *__x; + ++__x; + } + else + { + *__k = *__y; + ++__y; + } + } + return {__x, __y}; +} + } // namespace __unseq_backend } // namespace dpl } // namespace oneapi diff --git a/test/parallel_api/ranges/std_ranges_merge.pass.cpp b/test/parallel_api/ranges/std_ranges_merge.pass.cpp index 7752c82f275..2883d7db90f 100644 --- a/test/parallel_api/ranges/std_ranges_merge.pass.cpp +++ b/test/parallel_api/ranges/std_ranges_merge.pass.cpp @@ -25,25 +25,59 @@ main() //A checker below modifies a return type; a range based version with policy has another return type. auto merge_checker = [](std::ranges::random_access_range auto&& r_1, std::ranges::random_access_range auto&& r_2, - std::ranges::random_access_range auto&& r_out, auto&&... args) + std::ranges::random_access_range auto&& r_out, auto comp, auto proj1, + auto proj2) { - auto res = std::ranges::merge(std::forward(r_1), std::forward(r_2), - std::ranges::begin(r_out), std::forward(args)...); - using ret_type = std::ranges::merge_result, std::ranges::borrowed_iterator_t, std::ranges::borrowed_iterator_t>; - return ret_type{res.in1, res.in2, res.out}; + + auto it_out = std::ranges::begin(r_out); + auto it_1 = std::ranges::begin(r_1); + auto it_2 = std::ranges::begin(r_2); + auto it_1_e = std::ranges::end(r_1); + auto it_2_e = std::ranges::end(r_2); + auto it_out_e = std::ranges::end(r_out); + + + while(it_1 != it_1_e && it_2 != it_2_e) + { + if (std::invoke(comp, std::invoke(proj2, *it_2), std::invoke(proj1, *it_1))) + { + *it_out = *it_2; + ++it_out, ++it_2; + } + else + { + *it_out = *it_1; + ++it_out, ++it_1; + } + if(it_out == it_out_e) + return ret_type{it_1, it_2, it_out}; + } + + if(it_1 == it_1_e) + { + for(; it_2 != it_2_e && it_out != it_out_e; ++it_2, ++it_out) + *it_out = *it_2; + } + else + { + for(; it_1 != it_1_e && it_out != it_out_e; ++it_1, ++it_out) + *it_out = *it_1; + } + + return ret_type{it_1, it_2, it_out}; }; - test_range_algo<0, int, data_in_in_out>{big_sz}(dpl_ranges::merge, merge_checker, std::ranges::less{}); + test_range_algo<0, int, data_in_in_out_lim>{big_sz}(dpl_ranges::merge, merge_checker, std::ranges::less{}, std::identity{}, std::identity{}); - test_range_algo<1, int, data_in_in_out>{}(dpl_ranges::merge, merge_checker, std::ranges::less{}, proj, proj); - test_range_algo<2, P2, data_in_in_out>{}(dpl_ranges::merge, merge_checker, std::ranges::less{}, &P2::x, &P2::x); - test_range_algo<3, P2, data_in_in_out>{}(dpl_ranges::merge, merge_checker, std::ranges::less{}, &P2::proj, &P2::proj); + test_range_algo<1, int, data_in_in_out_lim>{}(dpl_ranges::merge, merge_checker, std::ranges::less{}, proj, proj); + test_range_algo<2, P2, data_in_in_out_lim>{}(dpl_ranges::merge, merge_checker, std::ranges::less{}, &P2::x, &P2::x); + test_range_algo<3, P2, data_in_in_out_lim>{}(dpl_ranges::merge, merge_checker, std::ranges::less{}, &P2::proj, &P2::proj); - test_range_algo<4, int, data_in_in_out>{}(dpl_ranges::merge, merge_checker, std::ranges::greater{}, proj, proj); - test_range_algo<5, P2, data_in_in_out>{}(dpl_ranges::merge, merge_checker, std::ranges::greater{}, &P2::x, &P2::x); - test_range_algo<6, P2, data_in_in_out>{}(dpl_ranges::merge, merge_checker, std::ranges::greater{}, &P2::proj, &P2::proj); + test_range_algo<4, int, data_in_in_out_lim>{}(dpl_ranges::merge, merge_checker, std::ranges::greater{}, proj, proj); + test_range_algo<5, P2, data_in_in_out_lim>{}(dpl_ranges::merge, merge_checker, std::ranges::greater{}, &P2::x, &P2::x); + test_range_algo<6, P2, data_in_in_out_lim>{}(dpl_ranges::merge, merge_checker, std::ranges::greater{}, &P2::proj, &P2::proj); #endif //_ENABLE_STD_RANGES_TESTING return TestUtils::done(_ENABLE_STD_RANGES_TESTING); diff --git a/test/parallel_api/ranges/std_ranges_test.h b/test/parallel_api/ranges/std_ranges_test.h index 6057ccebfcd..51e0665a27f 100644 --- a/test/parallel_api/ranges/std_ranges_test.h +++ b/test/parallel_api/ranges/std_ranges_test.h @@ -277,14 +277,12 @@ struct test Container cont_in1(exec, n_in1, [](auto i) { return i;}); Container cont_in2(exec, n_in2, [](auto i) { return i/3;}); - const int max_n_out = max_n*2; - Container cont_out(exec, max_n_out, [](auto i) { return 0;}); - Container cont_exp(exec, max_n_out, [](auto i) { return 0;}); + Container cont_out(exec, n_out, [](auto i) { return 0;}); + Container cont_exp(exec, n_out, [](auto i) { return 0;}); assert(n_in1 <= max_n); assert(n_in2 <= max_n); - assert(n_out <= max_n_out); - + auto src_view1 = tr_in(std::views::all(cont_in1())); auto src_view2 = tr_in(std::views::all(cont_in2())); auto expected_view = tr_out(std::views::all(cont_exp())); @@ -322,6 +320,7 @@ struct test { const int r_size = max_n; process_data_in_in_out(r_size, r_size, r_size, exec, algo, checker, args...); + process_data_in_in_out(r_size, r_size, r_size*2, exec, algo, checker, args...); process_data_in_in_out(r_size/2, r_size, r_size, exec, algo, checker, args...); process_data_in_in_out(r_size, r_size/2, r_size, exec, algo, checker, args...); process_data_in_in_out(r_size, r_size, r_size/2, std::forward(exec), algo, checker, args...); From 3a8b83d669d81602ec323f619330ba77d3b378f1 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Thu, 16 Jan 2025 18:00:21 +0100 Subject: [PATCH 2/9] [oneDPL][ranges][merge] minor changes --- include/oneapi/dpl/pstl/algorithm_impl.h | 2 -- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index e6513e10399..2a6230df0ef 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -31,7 +31,6 @@ #include "parallel_backend.h" #include "parallel_impl.h" #include "iterator_impl.h" -#include "../functional" #if _ONEDPL_HETERO_BACKEND # include "hetero/algorithm_impl_hetero.h" // for __pattern_fill_n, __pattern_generate_n @@ -2977,7 +2976,6 @@ __brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_o } else { - //assert(__it_2 == __it_2_e); for(; __it_1 != __it_1_e && __it_out != __it_out_e; ++__it_1, ++__it_out) *__it_out = *__it_1; } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ae7533ba9b9..4df4c0b5c4d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -143,9 +143,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I const _Index __rng1_idx_end = __start1 + __rng1_size; const _Index __rng2_idx_end = __start2 + __rng2_size; - _Index __rng3_idx_end = __start3 + __rng3_size; - if(__n3 > 0) - __rng3_idx_end = std::min<_Index>(__n3, __rng3_idx_end); + const _Index __rng3_idx_end = __n3 > 0 ? std::min<_Index>(__n3, __start3 + __rng3_size) : __start3 + __rng3_size; _Index __rng1_idx = __start1; _Index __rng2_idx = __start2; From 1eae44d3268ec75cff628eb155d3a36716c65c6a Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Thu, 16 Jan 2025 18:53:37 +0100 Subject: [PATCH 3/9] + #include "../functional" //for oneapi::dpl::identity --- include/oneapi/dpl/pstl/algorithm_impl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 2a6230df0ef..bcd64770bfe 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -31,6 +31,7 @@ #include "parallel_backend.h" #include "parallel_impl.h" #include "iterator_impl.h" +#include "../functional" //for oneapi::dpl::identity #if _ONEDPL_HETERO_BACKEND # include "hetero/algorithm_impl_hetero.h" // for __pattern_fill_n, __pattern_generate_n From 4ad3bda69ac82f4b8285f889eb3b54e965a49ff0 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Fri, 17 Jan 2025 12:42:02 +0100 Subject: [PATCH 4/9] [oneDPL][ranges][merge] minor changes --- include/oneapi/dpl/pstl/algorithm_impl.h | 2 +- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index bcd64770bfe..0937d95077a 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -3072,7 +3072,7 @@ __pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __i oneapi::dpl::__internal::__compare<_Comp, oneapi::dpl::identity> __cmp{__comp, oneapi::dpl::identity{}}; - const auto __res = (__cmp(__it_1[__r], __it_2[__c]) ? 1 : 0); + const auto __res = __cmp(__it_1[__r], __it_2[__c]) ? 1 : 0; return __res < __val; } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 4df4c0b5c4d..a7417c3e49f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -216,6 +216,8 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N } }); }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() return __future(std::move(__event), std::move(__p_result_base)); } }; @@ -335,7 +337,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } - auto __ends = __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + const auto __ends = __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __nd_range_params.chunk, __n1, __n2, __comp, __n); if(__global_idx == __nd_range_params.steps - 1) { @@ -415,7 +417,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; - const std::uint64_t __n = std::min(__rng1.size() + __rng2.size(), __rng3.size()); + const std::size_t __n = std::min(__rng1.size() + __rng2.size(), __rng3.size()); if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) { using _WiIndex = std::uint32_t; From 2288700acc1c66d926beb97a96651c3a506dbd2f Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Fri, 17 Jan 2025 12:52:38 +0100 Subject: [PATCH 5/9] [oneDPL][ranges][merge] minor changes --- include/oneapi/dpl/pstl/algorithm_impl.h | 2 +- .../oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 6 +++--- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 0937d95077a..b8296da7a24 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -3085,7 +3085,7 @@ __pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __i } //serial merge n elements, starting from input x and y, to [i, j) output range - auto __res = __brick_merge_2(__it_1 + __r, __it_1 + __n_1, + const auto __res = __brick_merge_2(__it_1 + __r, __it_1 + __n_1, __it_2 + __c, __it_2 + __n_2, __it_out + __i, __it_out + __j, __comp, _IsVector{}); diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index a16bdb6cdde..7830a8545d0 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -686,9 +686,9 @@ std::pair, oneapi::dpl::__inter __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) { - auto __n1 = __rng1.size(); - auto __n2 = __rng2.size(); - if (__rng3.size() == 0) + const auto __n1 = __rng1.size(); + const auto __n2 = __rng2.size(); + if (__rng3.empty()) return {0, 0}; //To consider the direct copying pattern call in case just one of sequences is empty. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a7417c3e49f..fc83f6ebb70 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -135,7 +135,8 @@ __find_start_point(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_ template std::pair, oneapi::dpl::__internal::__difference_t<_Rng2>> __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _Index __start1, const _Index __start2, - const _Index __start3, const _Index __chunk, const _Index __n1, const _Index __n2, _Compare __comp, _Index __n3 = 0) + const _Index __start3, const _Index __chunk, const _Index __n1, const _Index __n2, _Compare __comp, + const _Index __n3 = 0) { const _Index __rng1_size = std::min<_Index>(__n1 > __start1 ? __n1 - __start1 : _Index{0}, __chunk); const _Index __rng2_size = std::min<_Index>(__n2 > __start2 ? __n2 - __start2 : _Index{0}, __chunk); From 65cf8f42638e91bcff1e972a64e0720a137b240f Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Mon, 20 Jan 2025 11:14:01 +0100 Subject: [PATCH 6/9] [oneDPL][ranges][merge] _ONEDPL_MERGE_CUT_OFF -> __merge_algo_cut_off --- include/oneapi/dpl/pstl/algorithm_impl.h | 4 ++-- include/oneapi/dpl/pstl/parallel_backend.h | 3 --- include/oneapi/dpl/pstl/parallel_backend_tbb.h | 9 +++------ include/oneapi/dpl/pstl/parallel_backend_utils.h | 2 ++ 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index b8296da7a24..3d7cdd91abc 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -3094,7 +3094,7 @@ __pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __i __it_res_1 = __res.first; __it_res_2 = __res.second; } - }, _ONEDPL_MERGE_CUT_OFF); //grainsize + }, oneapi::dpl::__utils::__merge_algo_cut_off); //grainsize }); return {__it_res_1, __it_res_2}; @@ -3103,7 +3103,7 @@ __pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __i template _RandomAccessIterator3 -__pattern_merge(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, +__pattern_merge(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _RandomAccessIterator3 __d_first, _Compare __comp) { diff --git a/include/oneapi/dpl/pstl/parallel_backend.h b/include/oneapi/dpl/pstl/parallel_backend.h index 841a9357eb7..b243e8fb492 100644 --- a/include/oneapi/dpl/pstl/parallel_backend.h +++ b/include/oneapi/dpl/pstl/parallel_backend.h @@ -35,9 +35,6 @@ # endif #endif -//the parallel backend constants -#define _ONEDPL_MERGE_CUT_OFF 2000 - namespace oneapi { namespace dpl diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index a977fc3d1a9..ac4ecfa69c9 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -412,7 +412,6 @@ __parallel_transform_scan(oneapi::dpl::__internal::__tbb_backend_tag, _Execution // // These are used by parallel implementations but do not depend on them. //------------------------------------------------------------------------ -#define _ONEDPL_MERGE_CUT_OFF 2000 template class __func_task; @@ -731,7 +730,7 @@ class __merge_func _LeafMerge _M_leaf_merge; _SizeType _M_nsort; //number of elements to be sorted for partial_sort algorithm - static const _SizeType __merge_cut_off = _ONEDPL_MERGE_CUT_OFF; + static const _SizeType __merge_cut_off = oneapi::dpl::__utils::__merge_algo_cut_off; bool _root; //means a task is merging root task bool _x_orig; //"true" means X(or left ) subrange is in the original container; false - in the buffer @@ -1223,8 +1222,7 @@ operator()(__task* __self) typedef typename ::std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2; typedef typename ::std::common_type_t<_DifferenceType1, _DifferenceType2> _SizeType; const _SizeType __n = (_M_xe - _M_xs) + (_M_ye - _M_ys); - const _SizeType __merge_cut_off = _ONEDPL_MERGE_CUT_OFF; - if (__n <= __merge_cut_off) + if (__n <= oneapi::dpl::__utils::__merge_algo_cut_off) { _M_leaf_merge(_M_xs, _M_xe, _M_ys, _M_ye, _M_zs, _M_comp); return nullptr; @@ -1264,8 +1262,7 @@ __parallel_merge(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, typedef typename ::std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2; typedef typename ::std::common_type_t<_DifferenceType1, _DifferenceType2> _SizeType; const _SizeType __n = (__xe - __xs) + (__ye - __ys); - const _SizeType __merge_cut_off = _ONEDPL_MERGE_CUT_OFF; - if (__n <= __merge_cut_off) + if (__n <= oneapi::dpl::__utils::__merge_algo_cut_off) { // Fall back on serial merge __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp); diff --git a/include/oneapi/dpl/pstl/parallel_backend_utils.h b/include/oneapi/dpl/pstl/parallel_backend_utils.h index 9d50cb041f0..96d0b17820c 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_utils.h +++ b/include/oneapi/dpl/pstl/parallel_backend_utils.h @@ -29,6 +29,8 @@ namespace dpl namespace __utils { +inline constexpr auto __merge_algo_cut_off = 2000; + //------------------------------------------------------------------------ // raw buffer (with specified _TAllocator) //------------------------------------------------------------------------ From 7750787b7b98c51db737f7fda578b4ab875db629 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Mon, 20 Jan 2025 10:55:55 +0100 Subject: [PATCH 7/9] [oneDPL][ranges][merge] minor changes: writing return type for __wait_and_get_value explicitly. --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 11e7d313b5b..0fc8550f7a9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -743,7 +743,7 @@ class __future : private std::tuple<_Args...> _Event __my_event; template - constexpr auto + constexpr _T __wait_and_get_value(const sycl::buffer<_T>& __buf) { //according to a contract, returned value is one-element sycl::buffer @@ -757,18 +757,18 @@ class __future : private std::tuple<_Args...> return __storage.__wait_and_get_value(__my_event); } - constexpr auto + constexpr std::pair __wait_and_get_value(const std::shared_ptr<__result_and_scratch_storage_base>& __p_storage) { std::size_t __buf[2] = {0, 0}; auto __n = __p_storage->__get_data(__my_event, __buf); assert(__n == 2); - return std::pair{__buf[0], __buf[1]}; + return {__buf[0], __buf[1]}; } template - constexpr auto + constexpr _T __wait_and_get_value(const _T& __val) { wait(); From 350387af7b16cb20ab8c1a477ee045cf64104ba9 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Mon, 20 Jan 2025 11:38:39 +0100 Subject: [PATCH 8/9] [oneDPL][ranges][merge] + clang format --- include/oneapi/dpl/pstl/algorithm_impl.h | 150 +++++++++--------- .../oneapi/dpl/pstl/algorithm_ranges_impl.h | 12 +- .../dpl/pstl/glue_algorithm_ranges_impl.h | 2 +- .../hetero/algorithm_ranges_impl_hetero.h | 10 +- .../dpcpp/parallel_backend_sycl_merge.h | 35 ++-- .../dpcpp/parallel_backend_sycl_utils.h | 16 +- include/oneapi/dpl/pstl/omp/parallel_for.h | 5 +- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 3 +- include/oneapi/dpl/pstl/unseq_backend_simd.h | 13 +- 9 files changed, 129 insertions(+), 117 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 3d7cdd91abc..3975bc12cc2 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -2949,44 +2949,44 @@ __pattern_remove_if(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, // merge //------------------------------------------------------------------------ -template +template std::pair __brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, _Comp __comp, - /* __is_vector = */ std::false_type) + /* __is_vector = */ std::false_type) { - while(__it_1 != __it_1_e && __it_2 != __it_2_e) + while (__it_1 != __it_1_e && __it_2 != __it_2_e) { - if (__comp(*__it_1, *__it_2)) - { - *__it_out = *__it_1; - ++__it_out, ++__it_1; - } - else - { - *__it_out = *__it_2; - ++__it_out, ++__it_2; - } - if(__it_out == __it_out_e) + if (__comp(*__it_1, *__it_2)) + { + *__it_out = *__it_1; + ++__it_out, ++__it_1; + } + else + { + *__it_out = *__it_2; + ++__it_out, ++__it_2; + } + if (__it_out == __it_out_e) return {__it_1, __it_2}; } - if(__it_1 == __it_1_e) + if (__it_1 == __it_1_e) { - for(; __it_2 != __it_2_e && __it_out != __it_out_e; ++__it_2, ++__it_out) + for (; __it_2 != __it_2_e && __it_out != __it_out_e; ++__it_2, ++__it_out) *__it_out = *__it_2; } else { - for(; __it_1 != __it_1_e && __it_out != __it_out_e; ++__it_1, ++__it_out) + for (; __it_1 != __it_1_e && __it_out != __it_out_e; ++__it_1, ++__it_out) *__it_out = *__it_1; } return {__it_1, __it_2}; } -template +template std::pair __brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, _Comp __comp, - /* __is_vector = */ std::true_type) + /* __is_vector = */ std::true_type) { return __unseq_backend::__simd_merge(__it_1, __it_1_e, __it_2, __it_2_e, __it_out, __it_out_e, __comp); } @@ -3023,21 +3023,21 @@ __pattern_merge(_Tag, _ExecutionPolicy&&, _ForwardIterator1 __first1, _ForwardIt typename _Tag::__is_vector{}); } -template +template std::pair<_It1, _It2> -__pattern_merge_2(_Tag, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, - _Index2 __n_2, _OutIt __it_out, _Index3 __n_out, _Comp __comp) +__pattern_merge_2(_Tag, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, _Index2 __n_2, + _OutIt __it_out, _Index3 __n_out, _Comp __comp) { return __brick_merge_2(__it_1, __it_1 + __n_1, __it_2, __it_2 + __n_2, __it_out, __it_out + __n_out, __comp, typename _Tag::__is_vector{}); } -template +template std::pair<_It1, _It2> __pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, - _Index2 __n_2, _OutIt __it_out, _Index3 __n_out, _Comp __comp) + _Index2 __n_2, _OutIt __it_out, _Index3 __n_out, _Comp __comp) { using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; @@ -3045,56 +3045,54 @@ __pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __i _It2 __it_res_2; __internal::__except_handler([&]() { - __par_backend::__parallel_for(__backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _Index3(0), __n_out, - [=, &__it_res_1, &__it_res_2](_Index3 __i, _Index3 __j) - { - //a start merging point on the merge path; for each thread - _Index1 __r = 0; //row index - _Index2 __c = 0; //column index - - if(__i > 0) - { - //calc merge path intersection: - const _Index3 __d_size = - std::abs(std::max<_Index2>(0, __i - __n_2) - (std::min<_Index1>(__i, __n_1) - 1)) + 1; - - auto __get_row = [__i, __n_1](auto __d) - { return std::min<_Index1>(__i, __n_1) - __d - 1; }; - auto __get_column = [__i, __n_1](auto __d) - { return std::max<_Index1>(0, __i - __n_1 - 1) + __d + (__i / (__n_1 + 1) > 0 ? 1 : 0); }; - - oneapi::dpl::counting_iterator<_Index3> __it_d(0); - - auto __res_d = *std::lower_bound(__it_d, __it_d + __d_size, 1, - [&](auto __d, auto __val) { - auto __r = __get_row(__d); - auto __c = __get_column(__d); - - oneapi::dpl::__internal::__compare<_Comp, oneapi::dpl::identity> - __cmp{__comp, oneapi::dpl::identity{}}; - const auto __res = __cmp(__it_1[__r], __it_2[__c]) ? 1 : 0; - - return __res < __val; - } - ); - - //intersection point - __r = __get_row(__res_d); - __c = __get_column(__res_d); - ++__r; //to get a merge matrix ceil, lying on the current diagonal - } - - //serial merge n elements, starting from input x and y, to [i, j) output range - const auto __res = __brick_merge_2(__it_1 + __r, __it_1 + __n_1, - __it_2 + __c, __it_2 + __n_2, - __it_out + __i, __it_out + __j, __comp, _IsVector{}); - - if(__j == __n_out) - { - __it_res_1 = __res.first; - __it_res_2 = __res.second; - } - }, oneapi::dpl::__utils::__merge_algo_cut_off); //grainsize + __par_backend::__parallel_for( + __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _Index3(0), __n_out, + [=, &__it_res_1, &__it_res_2](_Index3 __i, _Index3 __j) { + //a start merging point on the merge path; for each thread + _Index1 __r = 0; //row index + _Index2 __c = 0; //column index + + if (__i > 0) + { + //calc merge path intersection: + const _Index3 __d_size = + std::abs(std::max<_Index2>(0, __i - __n_2) - (std::min<_Index1>(__i, __n_1) - 1)) + 1; + + auto __get_row = [__i, __n_1](auto __d) { return std::min<_Index1>(__i, __n_1) - __d - 1; }; + auto __get_column = [__i, __n_1](auto __d) { + return std::max<_Index1>(0, __i - __n_1 - 1) + __d + (__i / (__n_1 + 1) > 0 ? 1 : 0); + }; + + oneapi::dpl::counting_iterator<_Index3> __it_d(0); + + auto __res_d = *std::lower_bound(__it_d, __it_d + __d_size, 1, [&](auto __d, auto __val) { + auto __r = __get_row(__d); + auto __c = __get_column(__d); + + oneapi::dpl::__internal::__compare<_Comp, oneapi::dpl::identity> __cmp{__comp, + oneapi::dpl::identity{}}; + const auto __res = __cmp(__it_1[__r], __it_2[__c]) ? 1 : 0; + + return __res < __val; + }); + + //intersection point + __r = __get_row(__res_d); + __c = __get_column(__res_d); + ++__r; //to get a merge matrix ceil, lying on the current diagonal + } + + //serial merge n elements, starting from input x and y, to [i, j) output range + const auto __res = __brick_merge_2(__it_1 + __r, __it_1 + __n_1, __it_2 + __c, __it_2 + __n_2, + __it_out + __i, __it_out + __j, __comp, _IsVector{}); + + if (__j == __n_out) + { + __it_res_1 = __res.first; + __it_res_2 = __res.second; + } + }, + oneapi::dpl::__utils::__merge_algo_cut_off); //grainsize }); return {__it_res_1, __it_res_2}; diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h index 2c7d9873079..dfc630fb1d5 100644 --- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h @@ -448,9 +448,10 @@ auto __pattern_merge(_Tag __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _OutRange&& __out_r, _Comp __comp, _Proj1 __proj1, _Proj2 __proj2) { - using __return_type = std::ranges::merge_result, std::ranges::borrowed_iterator_t<_R2>, - std::ranges::borrowed_iterator_t<_OutRange>>; - + using __return_type = + std::ranges::merge_result, std::ranges::borrowed_iterator_t<_R2>, + std::ranges::borrowed_iterator_t<_OutRange>>; + auto __comp_2 = [__comp, __proj1, __proj2](auto&& __val1, auto&& __val2) { return std::invoke(__comp, std::invoke(__proj1, std::forward(__val1)), std::invoke(__proj2, std::forward(__val2)));}; @@ -467,10 +468,11 @@ __pattern_merge(_Tag __tag, _ExecutionPolicy&& __exec, _R1&& __r1, _R2&& __r2, _ auto __it_2 = std::ranges::begin(__r2); auto __it_out = std::ranges::begin(__out_r); - if(__n_out == 0) + if (__n_out == 0) return __return_type{__it_1, __it_2, __it_out}; - auto __res = __pattern_merge_2(__tag, std::forward<_ExecutionPolicy>(__exec), __it_2, __n_2, __it_1, __n_1, __it_out, __n_out, __comp_2); + auto __res = __pattern_merge_2(__tag, std::forward<_ExecutionPolicy>(__exec), __it_2, __n_2, __it_1, __n_1, + __it_out, __n_out, __comp_2); return __return_type{__res.second, __res.first, __it_out + __n_out}; } diff --git a/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h index f2fc3840632..048c3348cd6 100644 --- a/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/glue_algorithm_ranges_impl.h @@ -1177,7 +1177,7 @@ merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& _ oneapi::dpl::__internal::__ranges::__pattern_merge( __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), views::all_read(::std::forward<_Range1>(__rng1)), views::all_read(::std::forward<_Range2>(__rng2)), __view_res, __comp); - + return __view_res.size(); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 7830a8545d0..472027a6f57 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -700,7 +700,7 @@ __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ran ::std::forward<_ExecutionPolicy>(__exec)), oneapi::dpl::__internal::__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy>{}, ::std::forward<_Range2>(__rng2), ::std::forward<_Range3>(__rng3)); - return {0, __res}; + return {0, __res}; } if (__n2 == 0) @@ -714,9 +714,9 @@ __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ran return {__res, 0}; } - auto __res = __par_backend_hetero::__parallel_merge(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2), - ::std::forward<_Range3>(__rng3), __comp); + auto __res = __par_backend_hetero::__parallel_merge( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__rng1), + ::std::forward<_Range2>(__rng2), ::std::forward<_Range3>(__rng3), __comp); auto __val = __res.get(); return {__val.first, __val.second}; @@ -748,7 +748,7 @@ __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _R1& using __return_t = std::ranges::merge_result, std::ranges::borrowed_iterator_t<_R2>, std::ranges::borrowed_iterator_t<_OutRange>>; - return __return_t{std::ranges::begin(__r1) + __res.first, std::ranges::begin(__r2) + __res.second, + return __return_t{std::ranges::begin(__r1) + __res.first, std::ranges::begin(__r2) + __res.second, std::ranges::begin(__out_r) + __n_out}; } #endif //_ONEDPL_CPP20_RANGES_PRESENT diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fc83f6ebb70..847880d8ef0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -193,24 +193,27 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N using __val_t = _split_point_t<_IdType>; using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, __val_t>; auto __p_res_storage = new __result_and_scratch_storage_t(__exec, 1, 0); - + // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. std::shared_ptr<__result_and_scratch_storage_base> __p_result_base(__p_res_storage); - auto __event = __exec.queue().submit( - [&__rng1, &__rng2, &__rng3, __p_res_storage, __comp, __chunk, __steps, __n, __n1, __n2](sycl::handler& __cgh) { + auto __event = __exec.queue().submit([&__rng1, &__rng2, &__rng3, __p_res_storage, __comp, __chunk, __steps, __n, + __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __result_acc = __p_res_storage->template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); + auto __result_acc = + __p_res_storage->template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { auto __id = __item_id.get_linear_id(); const _IdType __i_elem = __id * __chunk; const auto __n_merge = std::min<_IdType>(__chunk, __n - __i_elem); - const auto __start = __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); - auto __ends = __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __n_merge, __n1, __n2, __comp, __n); + const auto __start = + __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); + auto __ends = __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __n_merge, + __n1, __n2, __comp, __n); - if(__id == __steps - 1) //the last WI does additional work + if (__id == __steps - 1) //the last WI does additional work { auto __res_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__result_acc); *__res_ptr = __ends; @@ -243,7 +246,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculate nd-range parameters template nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2, const std::size_t __n) const + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2, + const std::size_t __n) const { // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; @@ -260,8 +264,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculation of split points on each base diagonal template sycl::event - eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _IdType __n, _Compare __comp, - const nd_range_params& __nd_range_params, + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _IdType __n, + _Compare __comp, const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const { const _IdType __n1 = __rng1.size(); @@ -311,7 +315,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); - auto __result_acc = __base_diagonals_sp_global_storage.template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); + auto __result_acc = __base_diagonals_sp_global_storage.template __get_result_acc( + __cgh, __dpl_sycl::__no_init{}); __cgh.depends_on(__event); @@ -339,8 +344,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } const auto __ends = __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, - __nd_range_params.chunk, __n1, __n2, __comp, __n); - if(__global_idx == __nd_range_params.steps - 1) + __nd_range_params.chunk, __n1, __n2, __comp, __n); + if (__global_idx == __nd_range_params.steps - 1) { auto __res_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__result_acc); *__res_ptr = __ends; @@ -367,8 +372,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) using __val_t = _split_point_t<_IdType>; - auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, __val_t>(__exec, - 1, __nd_range_params.base_diag_count + 1); + auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, __val_t>( + __exec, 1, __nd_range_params.base_diag_count + 1); // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 0fc8550f7a9..41f0de7f672 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -524,7 +524,8 @@ struct __usm_or_buffer_accessor struct __result_and_scratch_storage_base { virtual ~__result_and_scratch_storage_base() = default; - virtual std::size_t __get_data(sycl::event, std::size_t* __p_buf) const = 0; + virtual std::size_t + __get_data(sycl::event, std::size_t* __p_buf) const = 0; }; template @@ -666,7 +667,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base return __get_value(); } -private: + private: bool is_USM() const { @@ -678,7 +679,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base _T __get_value() const { - assert( __result_n == 1); + assert(__result_n == 1); if (__use_USM_host && __supports_USM_device) { return *(__result_buf.get()); @@ -695,7 +696,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base } } - template + template std::size_t __fill_data(std::pair<_Type, _Type>&& __p, std::size_t* __p_buf) const { @@ -704,7 +705,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base return 2; } - template + template std::size_t __fill_data(_Args&&...) const { @@ -712,7 +713,8 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base return 0; } - virtual std::size_t __get_data(sycl::event __event, std::size_t* __p_buf) const override + virtual std::size_t + __get_data(sycl::event __event, std::size_t* __p_buf) const override { if (is_USM()) __event.wait_and_throw(); @@ -761,7 +763,7 @@ class __future : private std::tuple<_Args...> __wait_and_get_value(const std::shared_ptr<__result_and_scratch_storage_base>& __p_storage) { std::size_t __buf[2] = {0, 0}; - auto __n = __p_storage->__get_data(__my_event, __buf); + auto __n = __p_storage->__get_data(__my_event, __buf); assert(__n == 2); return {__buf[0], __buf[1]}; diff --git a/include/oneapi/dpl/pstl/omp/parallel_for.h b/include/oneapi/dpl/pstl/omp/parallel_for.h index 917b3089059..7fd0ec78636 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_for.h +++ b/include/oneapi/dpl/pstl/omp/parallel_for.h @@ -63,7 +63,10 @@ __parallel_for(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&&, _ // in any case (nested or non-nested) one parallel region is created and // only one thread creates a set of tasks _PSTL_PRAGMA(omp parallel) - _PSTL_PRAGMA(omp single nowait) { oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f, __grainsize); } + _PSTL_PRAGMA(omp single nowait) + { + oneapi::dpl::__omp_backend::__parallel_for_body(__first, __last, __f, __grainsize); + } } } diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index ac4ecfa69c9..3ae1ba31ba1 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -96,7 +96,8 @@ __parallel_for(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&&, _ std::size_t __grainsize = 1) { tbb::this_task_arena::isolate([=]() { - tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last, __grainsize), __parallel_for_body<_Index, _Fp>(__f)); + tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last, __grainsize), + __parallel_for_body<_Index, _Fp>(__f)); }); } diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index e3379456837..cf60a762ba2 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -880,26 +880,27 @@ __simd_remove_if(_RandomAccessIterator __first, _DifferenceType __n, _UnaryPredi return __current + __cnt; } -template +template std::pair<_Iterator1, _Iterator2> -__simd_merge(_Iterator1 __x, _Iterator1 __x_e, _Iterator2 __y, _Iterator2 __y_e, _Iterator3 __i, _Iterator3 __j, _Comp __comp) +__simd_merge(_Iterator1 __x, _Iterator1 __x_e, _Iterator2 __y, _Iterator2 __y_e, _Iterator3 __i, _Iterator3 __j, + _Comp __comp) { _ONEDPL_PRAGMA_SIMD - for(_Iterator3 __k = __i; __k < __j; ++__k) + for (_Iterator3 __k = __i; __k < __j; ++__k) { - if(__x >= __x_e) + if (__x >= __x_e) { assert(__y < __y_e); *__k = *__y; ++__y; } - else if(__y >= __y_e) + else if (__y >= __y_e) { assert(__x < __x_e); *__k = *__x; ++__x; } - else if(std::invoke(__comp, *__x, *__y)) + else if (std::invoke(__comp, *__x, *__y)) { *__k = *__x; ++__x; From c59b92ca0c223ca60c60ee27918cbbf5029e3143 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Thu, 23 Jan 2025 17:29:48 +0100 Subject: [PATCH 9/9] [oneDPL][ranges][merge] minor changes: __pattern_merge_2 -> __pattern_merge_path (renaming) --- include/oneapi/dpl/pstl/algorithm_fwd.h | 11 ++++++++++ include/oneapi/dpl/pstl/algorithm_impl.h | 28 ++++++++++++------------ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_fwd.h b/include/oneapi/dpl/pstl/algorithm_fwd.h index 5e2b3e504d9..3d127c72fcc 100644 --- a/include/oneapi/dpl/pstl/algorithm_fwd.h +++ b/include/oneapi/dpl/pstl/algorithm_fwd.h @@ -941,6 +941,17 @@ _RandomAccessIterator3 __pattern_merge(__parallel_tag<_IsVector>, _ExecutionPolicy&&, _RandomAccessIterator1, _RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator2, _RandomAccessIterator3, _Compare); +template +std::pair<_It1, _It2> +__pattern_merge_path(_Tag, _ExecutionPolicy&&, _It1, _Index1, _It2, _Index2, _OutIt, _Index3, _Comp); + +template +std::pair<_It1, _It2> +__pattern_merge_path(__parallel_tag<_IsVector>, _ExecutionPolicy&&, _It1, _Index1, _It2, _Index2, _OutIt, _Index3, + _Comp); + //------------------------------------------------------------------------ // inplace_merge //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index 3975bc12cc2..11aea9642b2 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -2951,12 +2951,12 @@ __pattern_remove_if(__parallel_tag<_IsVector> __tag, _ExecutionPolicy&& __exec, template std::pair -__brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, _Comp __comp, - /* __is_vector = */ std::false_type) +__brick_merge_out_lim(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, + _Comp __comp, /* __is_vector = */ std::false_type) { while (__it_1 != __it_1_e && __it_2 != __it_2_e) { - if (__comp(*__it_1, *__it_2)) + if (std::invoke(__comp, *__it_1, *__it_2)) { *__it_out = *__it_1; ++__it_out, ++__it_1; @@ -2985,8 +2985,8 @@ __brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_o template std::pair -__brick_merge_2(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, _Comp __comp, - /* __is_vector = */ std::true_type) +__brick_merge_out_lim(It1 __it_1, It1 __it_1_e, It2 __it_2, It2 __it_2_e, ItOut __it_out, ItOut __it_out_e, + _Comp __comp, /* __is_vector = */ std::true_type) { return __unseq_backend::__simd_merge(__it_1, __it_1_e, __it_2, __it_2_e, __it_out, __it_out_e, __comp); } @@ -3023,21 +3023,21 @@ __pattern_merge(_Tag, _ExecutionPolicy&&, _ForwardIterator1 __first1, _ForwardIt typename _Tag::__is_vector{}); } -template std::pair<_It1, _It2> -__pattern_merge_2(_Tag, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, _Index2 __n_2, - _OutIt __it_out, _Index3 __n_out, _Comp __comp) +__pattern_merge_path(_Tag, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, _Index2 __n_2, + _OutIt __it_out, _Index3 __n_out, _Comp __comp) { - return __brick_merge_2(__it_1, __it_1 + __n_1, __it_2, __it_2 + __n_2, __it_out, __it_out + __n_out, __comp, - typename _Tag::__is_vector{}); + return __brick_merge_out_lim(__it_1, __it_1 + __n_1, __it_2, __it_2 + __n_2, __it_out, __it_out + __n_out, __comp, + typename _Tag::__is_vector{}); } template std::pair<_It1, _It2> -__pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, - _Index2 __n_2, _OutIt __it_out, _Index3 __n_out, _Comp __comp) +__pattern_merge_path(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __it_1, _Index1 __n_1, _It2 __it_2, + _Index2 __n_2, _OutIt __it_out, _Index3 __n_out, _Comp __comp) { using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; @@ -3083,8 +3083,8 @@ __pattern_merge_2(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _It1 __i } //serial merge n elements, starting from input x and y, to [i, j) output range - const auto __res = __brick_merge_2(__it_1 + __r, __it_1 + __n_1, __it_2 + __c, __it_2 + __n_2, - __it_out + __i, __it_out + __j, __comp, _IsVector{}); + const auto __res = __brick_merge_out_lim(__it_1 + __r, __it_1 + __n_1, __it_2 + __c, __it_2 + __n_2, + __it_out + __i, __it_out + __j, __comp, _IsVector{}); if (__j == __n_out) {