diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index 7a1a26bbc9..910d13c308 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -6,7 +6,7 @@ from __future__ import annotations # TODO: required for Python 3.7 docs env import ctypes -from typing import Callable +from typing import Callable, Optional import numba import numpy as np @@ -46,6 +46,30 @@ def _dtype_validation(dt1, dt2): raise TypeError(f"dtype mismatch: __init__={dt1}, __call__={dt2}") +def _validate_and_get_stream(stream) -> Optional[int]: + # null stream is allowed + if stream is None: + return None + + if not hasattr(stream, "__cuda_stream__"): + raise TypeError( + f"stream argument {stream} does not implement the '__cuda_stream__' protocol" + ) + + stream_property = stream.__cuda_stream__ + if ( + isinstance(stream_property, tuple) + and len(stream_property) == 2 + and all(isinstance(i, int) for i in stream_property) + ): + version, handle = stream_property + return handle + + raise TypeError( + f"__cuda_stream__ property of '{stream}' must return a 'Tuple[int, int]'; got {stream_property} instead" + ) + + class _Reduce: # TODO: constructor shouldn't require concrete `d_in`, `d_out`: def __init__( @@ -85,7 +109,9 @@ def __init__( if error != enums.CUDA_SUCCESS: raise ValueError("Error building reduce") - def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray): + def __call__( + self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray, stream=None + ): d_in_cccl = cccl.to_cccl_iter(d_in) if d_in_cccl.type.value == cccl.IteratorKind.ITERATOR: assert num_items is not None @@ -101,6 +127,7 @@ def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray ) _dtype_validation(self._ctor_d_out_dtype, d_out.dtype) _dtype_validation(self._ctor_init_dtype, h_init.dtype) + stream_handle = _validate_and_get_stream(stream) bindings = get_bindings() if temp_storage is None: temp_storage_bytes = ctypes.c_size_t() @@ -120,7 +147,7 @@ def __call__(self, temp_storage, d_in, d_out, num_items: int, h_init: np.ndarray ctypes.c_ulonglong(num_items), self.op_wrapper.handle(), cccl.host_array_to_value(h_init), - None, + stream_handle, ) if error != enums.CUDA_SUCCESS: raise ValueError("Error reducing") diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py index 9549ef7bee..f3ecb072e0 100644 --- a/python/cuda_parallel/tests/test_reduce.py +++ b/python/cuda_parallel/tests/test_reduce.py @@ -550,3 +550,88 @@ def binary_op(x, y): d_in = cp.zeros(size)[::2] with pytest.raises(ValueError, match="Non-contiguous arrays are not supported."): _ = algorithms.reduce_into(d_in, d_out, binary_op, h_init) + + +def test_reduce_with_stream(): + # Simple cupy stream wrapper that implements the __cuda_stream__ protocol for the purposes of this test + class Stream: + def __init__(self, cp_stream): + self.cp_stream = cp_stream + + @property + def __cuda_stream__(self): + return (0, self.cp_stream.ptr) + + def add_op(x, y): + return x + y + + h_init = np.asarray([0], dtype=np.int32) + h_in = random_int(5, np.int32) + + stream = cp.cuda.Stream() + with stream: + d_in = cp.asarray(h_in) + d_out = cp.empty(1, dtype=np.int32) + + stream_wrapper = Stream(stream) + reduce_into = algorithms.reduce_into( + d_in=d_in, d_out=d_out, op=add_op, h_init=h_init + ) + temp_storage_size = reduce_into( + None, + d_in=d_in, + d_out=d_out, + num_items=d_in.size, + h_init=h_init, + stream=stream_wrapper, + ) + d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8) + + reduce_into(d_temp_storage, d_in, d_out, d_in.size, h_init, stream=stream_wrapper) + np.testing.assert_allclose(d_in.sum().get(), d_out.get()) + + +def test_reduce_invalid_stream(): + # Invalid stream that doesn't implement __cuda_stream__ + class Stream1: + def __init__(self): + pass + + # Invalid stream that implements __cuda_stream__ but returns the wrong type + class Stream2: + def __init__(self): + pass + + @property + def __cuda_stream__(self): + return None + + def add_op(x, y): + return x + y + + d_out = cp.empty(1) + h_init = np.empty(1) + d_in = cp.empty(1) + reduce_into = algorithms.reduce_into(d_in, d_out, add_op, h_init) + + with pytest.raises( + TypeError, match="does not implement the '__cuda_stream__' protocol" + ): + _ = reduce_into( + None, + d_in=d_in, + d_out=d_out, + num_items=d_in.size, + h_init=h_init, + stream=Stream1(), + ) + + with pytest.raises(TypeError, match="must return a 'Tuple\\[int, int\\]';"): + _ = reduce_into( + None, + d_in=d_in, + d_out=d_out, + num_items=d_in.size, + h_init=h_init, + stream=Stream2(), + )