Skip to content

Commit

Permalink
Make kernel compilation logic more portable (#1991)
Browse files Browse the repository at this point in the history
  • Loading branch information
dmitriy-sobolev authored Jan 22, 2025
1 parent 9a1dada commit e4e4b8f
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 21 deletions.
4 changes: 2 additions & 2 deletions include/oneapi/dpl/internal/scan_by_segment_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ struct __sycl_scan_by_segment_impl
__cgh.use_kernel_bundle(__seg_scan_wg_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegScanWgKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__seg_scan_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -272,7 +272,7 @@ struct __sycl_scan_by_segment_impl
__cgh.use_kernel_bundle(__seg_scan_prefix_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegScanPrefixKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__seg_scan_prefix_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down
4 changes: 2 additions & 2 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
__cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle());
#endif
__cgh.parallel_for<_LocalScanKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__kernel_1,
#endif
sycl::nd_range<1>(__n_groups * __wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) {
Expand All @@ -351,7 +351,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
__cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle());
#endif
__cgh.parallel_for<_GroupScanKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__kernel_2,
#endif
// TODO: try to balance work between several workgroups instead of one
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, :
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__kernel,
#endif
sycl::nd_range<1>(__segments * __wg_size, __wg_size), [=](sycl::nd_item<1> __self_item) {
Expand Down Expand Up @@ -303,7 +303,7 @@ __radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__kernel,
#endif
sycl::nd_range<1>(__radix_states * __scan_wg_size, __scan_wg_size), [=](sycl::nd_item<1> __self_item) {
Expand Down Expand Up @@ -548,7 +548,7 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments,
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__kernel,
#endif
//Each SYCL work group processes one data segment.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ struct __parallel_transform_reduce_impl
__cgh.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_ReduceKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__kernel,
#endif
sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
#endif
__cgh.parallel_for<_SegReduceCountKernel>(
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__seg_reduce_count_kernel,
#endif
sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -206,7 +206,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
__cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReduceOffsetKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__seg_reduce_offset_kernel,
#endif
sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand All @@ -229,7 +229,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
__cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReduceWgKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__seg_reduce_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -352,7 +352,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
__cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReducePrefixKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
__seg_reduce_prefix_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ using __kernel_name_generator =
_BaseName<_CustomName>;
#endif

#if _ONEDPL_COMPILE_KERNEL
template <typename... _KernelNames>
class __kernel_compiler
{
Expand Down Expand Up @@ -287,7 +288,7 @@ class __kernel_compiler
{
return __kernel_array_type{__kernel_bundle.get_kernel(__kernel_ids[_Ip])...};
}
#else
#elif _ONEDPL_LIBSYCL_PROGRAM_PRESENT
template <typename _Exec>
static auto
__compile(_Exec&& __exec)
Expand All @@ -300,6 +301,7 @@ class __kernel_compiler
}
#endif
};
#endif // _ONEDPL_COMPILE_KERNEL

#if _ONEDPL_DEBUG_SYCL
template <typename _Policy>
Expand Down
21 changes: 20 additions & 1 deletion include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
#endif
#define _ONEDPL_LIBSYCL_VERSION_LESS_THAN(_VERSION) (_ONEDPL_LIBSYCL_VERSION && _ONEDPL_LIBSYCL_VERSION < _VERSION)
#if defined(ACPP_VERSION_MAJOR) && defined(ACPP_VERSION_MINOR) && defined(ACPP_VERSION_PATCH)
# define _ONEDPL_ACPP_VERSION (ACPP_VERSION_MAJOR * 10000 + ACPP_VERSION_MINOR * 100 + ACPP_VERSION_PATCH)
#endif
#if _ONEDPL_FPGA_DEVICE
# if _ONEDPL_LIBSYCL_VERSION >= 50400
# include <sycl/ext/intel/fpga_extensions.hpp>
Expand All @@ -59,7 +63,9 @@
#define _ONEDPL_SYCL2020_COLLECTIVES_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
#define _ONEDPL_SYCL2020_BUFFER_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
#define _ONEDPL_SYCL2020_ACCESSOR_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
#define _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
// Kernel bundle support is not expected in ACPP, see https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1296.
#define _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT \
(!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300) && !_ONEDPL_ACPP_VERSION)
#define _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
#define _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
#define _ONEDPL_SYCL2020_REQD_SUB_GROUP_SIZE_PRESENT (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
Expand All @@ -83,10 +89,23 @@
// Feature macros for DPC++ SYCL runtime library alternatives to non-supported SYCL 2020 features
#define _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
#define _ONEDPL_LIBSYCL_PROGRAM_PRESENT (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
#define _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT (_ONEDPL_LIBSYCL_VERSION == 50200)
#define _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT \
(SYCL_EXT_ONEAPI_SUB_GROUP_MASK >= 1 && _ONEDPL_LIBSYCL_VERSION >= 50700)
// Compilation of a kernel is requiried to obtain valid work_group_size
// when target devices are CPU or FPGA emulator. Since CPU and GPU devices
// cannot be distinguished during compilation, the macro is enabled by default.
#define _ONEDPL_CAN_COMPILE_KERNEL (_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT || _ONEDPL_LIBSYCL_PROGRAM_PRESENT)
#if !defined(_ONEDPL_COMPILE_KERNEL)
# define _ONEDPL_COMPILE_KERNEL _ONEDPL_CAN_COMPILE_KERNEL
#else
# if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_CAN_COMPILE_KERNEL
# error "No SYCL kernel compilation method available (neither SYCL 2020 kernel bundle nor other alternatives)."
# endif
#endif

#define _ONEDPL_SYCL_DEVICE_COPYABLE_SPECIALIZATION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(70100))

// Macro to check if we are compiling for SPIR-V devices. This macro must only be used within
Expand Down
7 changes: 0 additions & 7 deletions include/oneapi/dpl/pstl/onedpl_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,13 +334,6 @@
# define _ONEDPL_USE_GROUP_ALGOS 1
# endif
// Compilation of a kernel is requiried to obtain valid work_group_size
// when target devices are CPU or FPGA emulator. Since CPU and GPU devices
// cannot be distinguished during compilation, the macro is enabled by default.
# if !defined(_ONEDPL_COMPILE_KERNEL)
# define _ONEDPL_COMPILE_KERNEL 1
# endif
# define _ONEDPL_BUILT_IN_STABLE_NAME_PRESENT __has_builtin(__builtin_sycl_unique_stable_name)
#endif // _ONEDPL_BACKEND_SYCL
Expand Down

0 comments on commit e4e4b8f

Please sign in to comment.