Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Host Implementation of Histogram APIs #1974

Open
wants to merge 40 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
a89c292
histogram CPU initial implementation
danhoeflinger Nov 8, 2024
c2002b6
atomics histogram implementation
danhoeflinger Nov 8, 2024
99de690
clang format
danhoeflinger Nov 8, 2024
9a50c2f
comment about atomics
danhoeflinger Nov 8, 2024
9a35424
first draft of atomic increment (unchecked)
danhoeflinger Nov 12, 2024
40bc904
atomics include and fix builtin
danhoeflinger Nov 12, 2024
ba9b609
large case
danhoeflinger Nov 12, 2024
e3fc3ce
fix threshold check
danhoeflinger Nov 15, 2024
7fc9a5e
minor improvements
danhoeflinger Nov 19, 2024
022ea44
MSVC fixes
danhoeflinger Nov 20, 2024
6ca5a7a
parallelize initialization of openMP temp histograms
danhoeflinger Nov 20, 2024
1744e53
removing unnecessary type casting
danhoeflinger Dec 13, 2024
569db46
improving accumulation of local histograms (simd)
danhoeflinger Dec 13, 2024
c794d1f
Properly using IsVector
danhoeflinger Dec 14, 2024
cbf087c
typo fix
danhoeflinger Dec 16, 2024
6f0a9cb
special handling thread zero to use global mem
danhoeflinger Dec 16, 2024
8d61609
cleanup
danhoeflinger Dec 16, 2024
7c2b127
atomic version removal
danhoeflinger Dec 16, 2024
59d3162
Revert "cleanup"
danhoeflinger Dec 16, 2024
8fdbf15
Revert "special handling thread zero to use global mem"
danhoeflinger Dec 16, 2024
bda4466
comments and cleanup
danhoeflinger Dec 16, 2024
0be0aad
handling coarser grained parallelism
danhoeflinger Dec 16, 2024
34e9d9f
undo-ing broken thread restriction in openMP
danhoeflinger Dec 16, 2024
bae7c6a
lift pattern up to algorithm_impl level
danhoeflinger Dec 18, 2024
7a67c0b
cleanup unnecessary code
danhoeflinger Dec 18, 2024
e7d3952
further cleanup / formatting
danhoeflinger Dec 18, 2024
e95689d
add grain size todo
danhoeflinger Dec 20, 2024
6b5019d
more general thread local storage
danhoeflinger Dec 20, 2024
95d483f
implement omp on demand tls
danhoeflinger Dec 30, 2024
0942617
formatting
danhoeflinger Dec 30, 2024
ea1f413
formatting
danhoeflinger Dec 30, 2024
d79cd5e
comments and clarity
danhoeflinger Dec 30, 2024
cfcba7e
bugfix for serial impl
danhoeflinger Dec 30, 2024
8fdac77
removing debugging output
danhoeflinger Dec 30, 2024
f9ab0ae
formatting
danhoeflinger Dec 30, 2024
85f6b07
comment adjustment
danhoeflinger Dec 30, 2024
f7e8477
minor naming / formatting
danhoeflinger Dec 30, 2024
d5585fe
formatting
danhoeflinger Dec 30, 2024
2ce3fd0
Address review feedback
danhoeflinger Jan 6, 2025
907b1a5
formatting
danhoeflinger Jan 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$")
set(SET_BACKEND_${ONEDPL_BACKEND_NAME} TRUE)

if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp)$")
find_package(TBB 2021 REQUIRED tbb OPTIONAL_COMPONENTS tbbmalloc)
find_package(TBB 2021 REQUIRED tbb tbbmalloc)
message(STATUS "oneDPL uses oneTBB ${TBB_VERSION}")
target_link_libraries(oneDPL INTERFACE TBB::tbb)
endif()
Expand Down Expand Up @@ -330,7 +330,7 @@ elseif(ONEDPL_BACKEND MATCHES "^(omp)$")
if (OpenMP_CXX_FLAGS MATCHES ".*-fiopenmp.*")
set(_openmp_flag -fopenmp)
elseif (OpenMP_CXX_FLAGS MATCHES ".*[-/]Qiopenmp.*")
set(_openmp_flag /Qopenmp)
set(_openmp_flag -Qopenmp)
endif()
if (_openmp_flag)
message(STATUS "Using ${_openmp_flag} for openMP")
Expand Down
81 changes: 81 additions & 0 deletions include/oneapi/dpl/pstl/algorithm_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -4289,6 +4289,87 @@ __pattern_shift_right(_Tag __tag, _ExecutionPolicy&& __exec, _BidirectionalItera
return __res.base();
}

template <typename _ForwardIterator, typename _IdxHashFunc, typename _RandomAccessIterator, class _IsVector>
void
__brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func,
_RandomAccessIterator __histogram_first, _IsVector) noexcept
{
for (; __first != __last; ++__first)
{
std::int32_t __bin = __func.get_bin(*__first);
if (__bin >= 0)
{
++__histogram_first[__bin];
}
}
}

template <class _Tag, typename _ExecutionPolicy, typename _ForwardIterator, typename _Size, typename _IdxHashFunc,
          typename _RandomAccessIterator>
void
__pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last,
                    _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator __histogram_first)
{
    // Serial / forward-progress pattern: zero the output bins, then bin all inputs with the
    // serial histogram brick.
    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);

    using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator>::value_type;

    _RandomAccessIterator __histogram_last = __histogram_first + __num_bins;
    __pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_last,
                   _HistogramValueT{0});
    __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{});
}

template <class _IsVector, typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size,
          typename _IdxHashFunc, typename _RandomAccessIterator2>
void
__pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
                    _RandomAccessIterator1 __last, _Size __num_bins, _IdxHashFunc __func,
                    _RandomAccessIterator2 __histogram_first)
{
    using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
    using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type;
    using _DiffType = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;

    _DiffType __n = __last - __first;
    if (__n > 0)
    {
        // Per-thread local histograms; each thread's vector of bins is created lazily on first use.
        __par_backend::__thread_enumerable_storage<std::vector<_HistogramValueT>> __tls{__num_bins,
                                                                                        _HistogramValueT{0}};

        //main histogram loop
        //TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead
        // NOTE: pass __exec as an lvalue here; forwarding it in both calls could move from the
        // policy on the first call and leave the second call with a moved-from object.
        __par_backend::__parallel_for(
            __backend_tag{}, __exec, __first, __last,
            [__func, &__tls](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) {
                __internal::__brick_histogram(__first_local, __last_local, __func,
                                              __tls.get_for_current_thread().begin(), _IsVector{});
            });
        // now accumulate temporary storage into output global histogram
        __par_backend::__parallel_for(
            __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first,
            __histogram_first + __num_bins,
            [__histogram_first, &__tls](auto __global_histogram_first, auto __global_histogram_last) {
                _DiffType __local_n = __global_histogram_last - __global_histogram_first;
                std::uint32_t __num_temporary_copies = __tls.size();
                _DiffType __range_begin_id = __global_histogram_first - __histogram_first;
                //initialize output global histogram with first local histogram via assign
                __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __range_begin_id, __local_n,
                                            __global_histogram_first, oneapi::dpl::__internal::__pstl_assign(),
                                            _IsVector{});
                for (std::uint32_t __i = 1; __i < __num_temporary_copies; ++__i)
                {
                    //accumulate into output global histogram with other local histogram via += operator
                    __internal::__brick_walk2_n(
                        __tls.get_with_id(__i).begin() + __range_begin_id, __local_n, __global_histogram_first,
                        [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{});
                }
            });
    }
    else
    {
        // Empty input: the output histogram must still be zero-initialized.
        __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first,
                       __histogram_first + __num_bins, _HistogramValueT{0});
    }
}

} // namespace __internal
} // namespace dpl
} // namespace oneapi
Expand Down
18 changes: 1 addition & 17 deletions include/oneapi/dpl/pstl/histogram_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "histogram_extension_defs.h"
#include "histogram_binhash_utils.h"
#include "iterator_impl.h"
#include "algorithm_impl.h"

#if _ONEDPL_HETERO_BACKEND
# include "hetero/histogram_impl_hetero.h"
Expand All @@ -29,23 +30,6 @@ namespace oneapi
namespace dpl
{

namespace __internal
{

template <class _Tag, typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _IdxHashFunc,
typename _RandomAccessIterator2>
void
__pattern_histogram(_Tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last,
_Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first)
{
static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);

static_assert(sizeof(_Size) == 0 /*false*/,
"Histogram API is currently unsupported for policies other than device execution policies");
}

} // namespace __internal

template <typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _ValueType,
typename _RandomAccessIterator2>
oneapi::dpl::__internal::__enable_if_execution_policy<_ExecutionPolicy, _RandomAccessIterator2>
Expand Down
85 changes: 85 additions & 0 deletions include/oneapi/dpl/pstl/omp/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <vector>
#include <type_traits>
#include <omp.h>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <memory>
#include <tuple>

#include "../parallel_backend_utils.h"
#include "../unseq_backend_simd.h"
Expand Down Expand Up @@ -153,6 +154,90 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu
__f(__first, __last);
}

// abstract class to allow inclusion in __thread_enumerable_storage as member without requiring explicit template
// instantiation of param types
template <typename _StorageType>
class __construct_by_args_base
{
  public:
    virtual ~__construct_by_args_base() = default;
    // Factory hook: derived classes build a new _StorageType from their stored arguments.
    virtual std::unique_ptr<_StorageType> construct() = 0;
};

// Helper class to allow construction of _StorageType from a stored argument pack
template <typename _StorageType, typename... _P>
class __construct_by_args : public __construct_by_args_base<_StorageType>
{
  public:
    // Build a new _StorageType from the stored pack. The pack is copied into the lambda so the
    // helper can be reused to construct multiple objects. No std::move on the return value: the
    // result of std::apply is already a prvalue, and std::move would only inhibit copy elision
    // (the issue raised in review).
    std::unique_ptr<_StorageType>
    construct() override
    {
        return std::apply([](auto... __arg_pack) { return std::make_unique<_StorageType>(__arg_pack...); }, __pack);
    }
    __construct_by_args(_P&&... __args) : __pack(std::forward<_P>(__args)...) {}

  private:
    std::tuple<_P...> __pack; // argument pack captured at construction time
};

// Thread-local storage for the OpenMP backend. One _StorageType slot is reserved per OpenMP
// thread; the object itself is constructed lazily on a thread's first access so that threads
// which never participate pay nothing.
template <typename _StorageType>
struct __thread_enumerable_storage
{
    // Record the constructor argument pack for later on-demand construction, and size the slot
    // table to the number of threads (queried from inside a parallel region).
    template <typename... Args>
    __thread_enumerable_storage(Args&&... __args) : __num_elements(0)
    {
        __construct_helper =
            std::make_unique<__construct_by_args<_StorageType, Args...>>(std::forward<Args>(__args)...);
        _PSTL_PRAGMA(omp parallel)
        _PSTL_PRAGMA(omp single) { __thread_specific_storage.resize(omp_get_num_threads()); }
    }

    // Number of storage objects which have actually been instantiated so far.
    std::uint32_t
    size() const
    {
        // only count storage which has been instantiated
        return __num_elements.load();
    }

    // Return the __i-th instantiated storage object (0 <= __i < size()), skipping the slots of
    // threads which never touched their storage.
    _StorageType&
    get_with_id(std::uint32_t __i)
    {
        assert(__i < size());

        // Fast path (review suggestion): when every thread has instantiated its storage, ids map
        // directly onto slot indices — important on CPUs with many threads.
        if (size() == __thread_specific_storage.size())
        {
            return *__thread_specific_storage[__i];
        }

        std::uint32_t __count = 0;
        std::uint32_t __j = 0;

        for (; __j < __thread_specific_storage.size() && __count <= __i; ++__j)
        {
            // Only include storage from threads which have instantiated a storage object
            if (__thread_specific_storage[__j])
            {
                __count++;
            }
        }
        // Need to back up one once we have found a valid storage object
        return *__thread_specific_storage[__j - 1];
    }

    // Storage object belonging to the calling thread, constructed on first use.
    _StorageType&
    get_for_current_thread()
    {
        std::uint32_t __i = omp_get_thread_num();
        if (!__thread_specific_storage[__i])
        {
            // create temporary storage on first usage to avoid extra parallel region and unnecessary instantiation
            __thread_specific_storage[__i] = __construct_helper->construct();
            __num_elements.fetch_add(1);
        }
        return *__thread_specific_storage[__i];
    }

    std::vector<std::unique_ptr<_StorageType>> __thread_specific_storage;
    std::atomic<std::uint32_t> __num_elements;
    std::unique_ptr<__construct_by_args_base<_StorageType>> __construct_helper;
};

} // namespace __omp_backend
} // namespace dpl
} // namespace oneapi
Expand Down
29 changes: 29 additions & 0 deletions include/oneapi/dpl/pstl/parallel_backend_serial.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,35 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag)
{
}

template <typename _StorageType>
struct __thread_enumerable_storage
{
template <typename... Args>
__thread_enumerable_storage(Args&&... __args) : __storage(std::forward<Args>(__args)...)
{
}

std::uint32_t
size() const
{
return std::uint32_t{1};
}

_StorageType&
get_for_current_thread()
{
return __storage;
}

_StorageType&
get_with_id(std::uint32_t __i)
{
return get_for_current_thread();
}

_StorageType __storage;
};

template <class _ExecutionPolicy, class _Index, class _Fp>
void
__parallel_for(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last,
Expand Down
30 changes: 30 additions & 0 deletions include/oneapi/dpl/pstl/parallel_backend_tbb.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/enumerable_thread_specific.h>
#if TBB_INTERFACE_VERSION > 12000
# include <tbb/task.h>
#endif
Expand Down Expand Up @@ -1306,6 +1307,35 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy
tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); });
}

// Thread-local storage for the TBB backend, delegating lifetime management to
// tbb::enumerable_thread_specific: one _StorageType is lazily built per thread from the
// stored constructor arguments.
template <typename _StorageType>
struct __thread_enumerable_storage
{
    template <typename... Args>
    __thread_enumerable_storage(Args&&... __args) : __thread_specific_storage(std::forward<Args>(__args)...)
    {
    }

    // Number of per-thread storage objects instantiated so far.
    std::uint32_t
    size() const
    {
        return __thread_specific_storage.size();
    }

    // Storage object belonging to the calling thread (created on first access).
    _StorageType&
    get_for_current_thread()
    {
        return __thread_specific_storage.local();
    }

    // The __i-th instantiated storage object, in TBB enumeration order.
    _StorageType&
    get_with_id(std::uint32_t __i)
    {
        return *(__thread_specific_storage.begin() + __i);
    }

    tbb::enumerable_thread_specific<_StorageType> __thread_specific_storage;
};

} // namespace __tbb_backend
} // namespace dpl
} // namespace oneapi
Expand Down
Loading
Loading