diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py new file mode 100644 index 00000000000..2647c1835ee --- /dev/null +++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import functools +from numba import cuda + + +def cache_with_key(key): + """ + Decorator to cache the result of the decorated function. Uses the + provided `key` function to compute the key for cache lookup. `key` + receives all arguments passed to the function. + + Notes + ----- + The CUDA compute capability of the current device is appended to + the cache key returned by `key`. + """ + + def deco(func): + cache = {} + + @functools.wraps(func) + def inner(*args, **kwargs): + cc = cuda.get_current_device().compute_capability + cache_key = (key(*args, **kwargs), cc) + if cache_key not in cache: + result = func(*args, **kwargs) + cache[cache_key] = result + return cache[cache_key] + + return inner + + return deco + + +class CachableFunction: + """ + A type that wraps a function and provides custom comparison + (__eq__) and hash (__hash__) implementations. + + The purpose of this class is to enable caching and comparison of + functions based on their bytecode, constants, and closures, while + ignoring other attributes such as their names or docstrings. + """ + + def __init__(self, func): + self._func = func + self._identity = ( + self._func.__code__.co_code, + self._func.__code__.co_consts, + self._func.__closure__, + ) + + def __eq__(self, other): + return self._identity == other._identity + + def __hash__(self): + return hash(self._identity) + + def __repr__(self): + return str(self._func) diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py new file mode 100644 index 00000000000..9c0718e71f0 --- /dev/null +++ b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Utilities for extracting information from `__cuda_array_interface__`. +""" + +import numpy as np + +from ..typing import DeviceArrayLike + + +def get_dtype(arr: DeviceArrayLike) -> np.dtype: + return np.dtype(arr.__cuda_array_interface__["typestr"]) diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index 29e7786b5f8..41843742827 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -3,6 +3,8 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from __future__ import annotations # TODO: required for Python 3.7 docs env + import ctypes import numba import numpy as np @@ -12,6 +14,10 @@ from .. import _cccl as cccl from .._bindings import get_paths, get_bindings +from .._caching import CachableFunction, cache_with_key +from ..typing import DeviceArrayLike +from ..iterators._iterators import IteratorBase +from .._utils import cai class _Op: @@ -41,12 +47,18 @@ def _dtype_validation(dt1, dt2): class _Reduce: # TODO: constructor shouldn't require concrete `d_in`, `d_out`: - def __init__(self, d_in, d_out, op: Callable, h_init: np.ndarray): + def __init__( + self, + d_in: DeviceArrayLike | IteratorBase, + d_out: DeviceArrayLike, + op: Callable, + h_init: np.ndarray, + ): d_in_cccl = cccl.to_cccl_iter(d_in) self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name( d_in_cccl.value_type.type.value ) - self._ctor_d_out_dtype = d_out.dtype + self._ctor_d_out_dtype = cai.get_dtype(d_out) self._ctor_init_dtype = h_init.dtype cc_major, cc_minor = cuda.get_current_device().compute_capability cub_path, thrust_path, libcudacxx_path, cuda_include_path = get_paths() @@ -119,9 +131,28 @@ def __del__(self): bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result)) +def make_cache_key( + d_in: DeviceArrayLike | IteratorBase, + d_out: DeviceArrayLike, + op: Callable, + h_init: np.ndarray, +): + d_in_key = d_in.kind if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in) + d_out_key = cai.get_dtype(d_out) + op_key = CachableFunction(op) + h_init_key = h_init.dtype + return (d_in_key, d_out_key, op_key, h_init_key) + + # TODO Figure out `sum` without operator and initial value # TODO Accept stream -def reduce_into(d_in, d_out, op: Callable, h_init: np.ndarray): +@cache_with_key(make_cache_key) +def reduce_into( + d_in: DeviceArrayLike | IteratorBase, + d_out: DeviceArrayLike, + op: Callable, + h_init: np.ndarray, +): """Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``. Example: diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py index e92578efbe7..babc4a12a11 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py +++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py @@ -1,15 +1,19 @@ import ctypes import operator +import uuid from functools import lru_cache from typing import Dict, Callable from llvmlite import ir from numba.core.extending import intrinsic, overload from numba.core.typing.ctypes_utils import to_ctypes +from numba.cuda.dispatcher import CUDADispatcher from numba import cuda, types import numba import numpy as np +from .._caching import CachableFunction + _DEVICE_POINTER_SIZE = 8 _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8 @@ -20,16 +24,35 @@ def cached_compile(func, sig, abi_name=None, **kwargs): return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs) +class IteratorKind: + def __init__(self, value_type): + self.value_type = value_type + + def __repr__(self): + return f"{self.__class__.__name__}[{str(self.value_type)}]" + + def __eq__(self, other): + return type(self) is type(other) and self.value_type == other.value_type + + def __hash__(self): + return hash(self.value_type) + + +@lru_cache(maxsize=None) +def _get_abi_suffix(kind: IteratorKind): + # given an IteratorKind, return a UUID. The value is cached so + # that the same UUID is always returned for a given IteratorKind. + return uuid.uuid4().hex + + class IteratorBase: """ An Iterator is a wrapper around a pointer, and must define the following: - - a `state` property that returns a `ctypes.c_void_p` object, representing - a pointer to some data. - - an `advance` (static) method that receives the state pointer and performs + - an `advance` (static) method that receives the pointer and performs an action that advances the pointer by the offset `distance` (returns nothing). - - a `dereference` (static) method that dereferences the state pointer + - a `dereference` (static) method that dereferences the pointer and returns a value. Iterators are not meant to be used directly. They are constructed and passed @@ -38,29 +61,40 @@ class IteratorBase: The `advance` and `dereference` must be compilable to device code by numba. """ - def __init__(self, numba_type: types.Type, value_type: types.Type, abi_name: str): + iterator_kind_type: type # must be a subclass of IteratorKind + + def __init__( + self, + cvalue: ctypes.c_void_p, + numba_type: types.Type, + value_type: types.Type, + ): """ Parameters ---------- + cvalue + A ctypes type representing the object pointed to by the iterator. numba_type - A numba type that specifies how to interpret the state pointer. + A numba type representing the type of the input to the advance + and dereference functions. value_type The numba type of the value returned by the dereference operation. - abi_name - A unique identifier that will determine the abi_names for the - advance and dereference operations. """ + self.cvalue = cvalue self.numba_type = numba_type self.value_type = value_type - self.abi_name = abi_name + + @property + def kind(self): + return self.__class__.iterator_kind_type(self.value_type) # TODO: should we cache this? Current docs environment doesn't allow # using Python > 3.7. We could use a hand-rolled cached_property if # needed. @property def ltoirs(self) -> Dict[str, bytes]: - advance_abi_name = self.abi_name + "_advance" - deref_abi_name = self.abi_name + "_dereference" + advance_abi_name = "advance_" + _get_abi_suffix(self.kind) + deref_abi_name = "dereference_" + _get_abi_suffix(self.kind) advance_ltoir, _ = cached_compile( self.__class__.advance, ( @@ -81,7 +115,7 @@ def ltoirs(self) -> Dict[str, bytes]: @property def state(self) -> ctypes.c_void_p: - raise NotImplementedError("Subclasses must override advance staticmethod") + return ctypes.cast(ctypes.pointer(self.cvalue), ctypes.c_void_p) @staticmethod def advance(state, distance): @@ -122,16 +156,20 @@ def impl(ptr, offset): return impl +class RawPointerType(IteratorKind): + pass + + class RawPointer(IteratorBase): - def __init__(self, ptr: int, ntype: types.Type): - value_type = ntype - self._cvalue = ctypes.c_void_p(ptr) + iterator_kind_type = RawPointerType + + def __init__(self, ptr: int, value_type: types.Type): + cvalue = ctypes.c_void_p(ptr) numba_type = types.CPointer(types.CPointer(value_type)) - abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( + cvalue=cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @staticmethod @@ -142,13 +180,9 @@ def advance(state, distance): def dereference(state): return state[0][0] - @property - def state(self) -> ctypes.c_void_p: - return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p) - -def pointer(container, ntype: types.Type) -> RawPointer: - return RawPointer(container.__cuda_array_interface__["data"][0], ntype) +def pointer(container, value_type: types.Type) -> RawPointer: + return RawPointer(container.__cuda_array_interface__["data"][0], value_type) @intrinsic @@ -172,16 +206,21 @@ def codegen(context, builder, sig, args): return base.dtype(base), codegen +class CacheModifiedPointerType(IteratorKind): + pass + + class CacheModifiedPointer(IteratorBase): + iterator_kind_type = CacheModifiedPointerType + def __init__(self, ptr: int, ntype: types.Type): - self._cvalue = ctypes.c_void_p(ptr) + cvalue = ctypes.c_void_p(ptr) value_type = ntype numba_type = types.CPointer(types.CPointer(value_type)) - abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( + cvalue=cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @staticmethod @@ -192,21 +231,22 @@ def advance(state, distance): def dereference(state): return load_cs(state[0]) - @property - def state(self) -> ctypes.c_void_p: - return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p) + +class ConstantIteratorKind(IteratorKind): + pass class ConstantIterator(IteratorBase): + iterator_kind_type = ConstantIteratorKind + def __init__(self, value: np.number): value_type = numba.from_dtype(value.dtype) - self._cvalue = to_ctypes(value_type)(value) + cvalue = to_ctypes(value_type)(value) numba_type = types.CPointer(value_type) - abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( + cvalue=cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @staticmethod @@ -217,21 +257,22 @@ def advance(state, distance): def dereference(state): return state[0] - @property - def state(self) -> ctypes.c_void_p: - return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p) + +class CountingIteratorKind(IteratorKind): + pass class CountingIterator(IteratorBase): + iterator_kind_type = CountingIteratorKind + def __init__(self, value: np.number): value_type = numba.from_dtype(value.dtype) - self._cvalue = to_ctypes(value_type)(value) + cvalue = to_ctypes(value_type)(value) numba_type = types.CPointer(value_type) - abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( + cvalue=cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @staticmethod @@ -242,9 +283,13 @@ def advance(state, distance): def dereference(state): return state[0] - @property - def state(self) -> ctypes.c_void_p: - return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p) + +class TransformIteratorKind(IteratorKind): + def __eq__(self, other): + return type(self) is type(other) and self.value_type == other.value_type + + def __hash__(self): + return hash(self.value_type) def make_transform_iterator(it, op: Callable): @@ -256,31 +301,32 @@ def make_transform_iterator(it, op: Callable): op = cuda.jit(op, device=True) class TransformIterator(IteratorBase): - def __init__(self, it: IteratorBase, op): + iterator_kind_type = TransformIteratorKind + + def __init__(self, it: IteratorBase, op: CUDADispatcher): self._it = it + self._op = CachableFunction(op.py_func) numba_type = it.numba_type - # TODO: the abi name below isn't unique enough when we have e.g., - # two identically named `op` functions with different - # signatures, bytecodes, and/or closure variables. - op_abi_name = f"{self.__class__.__name__}_{op.py_func.__name__}" - # TODO: it would be nice to not need to compile `op` to get # its return type, but there's nothing in the numba API # to do that (yet), _, op_retty = cached_compile( op, (self._it.value_type,), - abi_name=op_abi_name, + abi_name=f"{op.__name__}_{_get_abi_suffix(self.kind)}", output="ltoir", ) value_type = op_retty - abi_name = f"{self.__class__.__name__}_{it.abi_name}_{op_abi_name}" super().__init__( + cvalue=it.cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) + @property + def kind(self): + return self.__class__.iterator_kind_type((self._it.kind, self._op)) + @staticmethod def advance(state, distance): return it_advance(state, distance) @@ -289,8 +335,12 @@ def advance(state, distance): def dereference(state): return op(it_dereference(state)) - @property - def state(self) -> ctypes.c_void_p: - return it.state + def __hash__(self): + return hash((self._it, self._op)) + + def __eq__(self, other): + if not isinstance(other.kind, TransformIteratorKind): + return NotImplemented + return self._it == other._it and self._op == other._op return TransformIterator(it, op) diff --git a/python/cuda_parallel/cuda/parallel/experimental/typing.py b/python/cuda_parallel/cuda/parallel/experimental/typing.py new file mode 100644 index 00000000000..1c4e9c9975f --- /dev/null +++ b/python/cuda_parallel/cuda/parallel/experimental/typing.py @@ -0,0 +1,12 @@ +from typing_extensions import ( + Protocol, +) # TODO: typing_extensions required for Python 3.7 docs env + + +class DeviceArrayLike(Protocol): + """ + Objects representing a device array, having a `.__cuda_array_interface__` + attribute. + """ + + __cuda_array_interface__: dict diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py index 40c998fafee..0db6592bb05 100644 --- a/python/cuda_parallel/setup.py +++ b/python/cuda_parallel/setup.py @@ -100,7 +100,8 @@ def build_extension(self, ext): ], packages=find_namespace_packages(include=["cuda.*"]), python_requires=">=3.9", - install_requires=["numba>=0.60.0", "cuda-python", "jinja2"], + # TODO: typing_extensions required for Python 3.7 docs env + install_requires=["numba>=0.60.0", "cuda-python", "jinja2", "typing_extensions"], extras_require={ "test": [ "pytest", diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py new file mode 100644 index 00000000000..3b7910d404d --- /dev/null +++ b/python/cuda_parallel/tests/test_iterators.py @@ -0,0 +1,71 @@ +from cuda.parallel.experimental.iterators import ( + CacheModifiedInputIterator, + ConstantIterator, + CountingIterator, + TransformIterator, +) +import cupy as cp +import numpy as np + + +def test_constant_iterator_equality(): + it1 = ConstantIterator(np.int32(0)) + it2 = ConstantIterator(np.int32(0)) + it3 = ConstantIterator(np.int32(1)) + it4 = ConstantIterator(np.int64(9)) + + assert it1.kind == it2.kind == it3.kind + assert it1.kind != it4.kind + + +def test_counting_iterator_equality(): + it1 = CountingIterator(np.int32(0)) + it2 = CountingIterator(np.int32(0)) + it3 = CountingIterator(np.int32(1)) + it4 = CountingIterator(np.int64(9)) + + assert it1.kind == it2.kind == it3.kind + assert it1.kind != it4.kind + + +def test_cache_modified_input_iterator_equality(): + ary1 = cp.asarray([0, 1, 2], dtype="int32") + ary2 = cp.asarray([3, 4, 5], dtype="int32") + ary3 = cp.asarray([0, 1, 2], dtype="int64") + + it1 = CacheModifiedInputIterator(ary1, "stream") + it2 = CacheModifiedInputIterator(ary1, "stream") + it3 = CacheModifiedInputIterator(ary2, "stream") + it4 = CacheModifiedInputIterator(ary3, "stream") + + assert it1.kind == it2.kind == it3.kind + assert it1.kind != it4.kind + + +def test_equality_transform_iterator(): + def op1(x): + return x + + def op2(x): + return 2 * x + + def op3(x): + return x + + it = CountingIterator(np.int32(0)) + it1 = TransformIterator(it, op1) + it2 = TransformIterator(it, op1) + it3 = TransformIterator(it, op3) + + assert it1.kind == it2.kind == it3.kind + + ary1 = cp.asarray([0, 1, 2]) + ary2 = cp.asarray([3, 4, 5]) + it4 = TransformIterator(ary1, op1) + it5 = TransformIterator(ary1, op1) + it6 = TransformIterator(ary1, op2) + it7 = TransformIterator(ary1, op3) + it8 = TransformIterator(ary2, op1) + + assert it4.kind == it5.kind == it7.kind == it8.kind + assert it4.kind != it6.kind diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py index 0f454e3603b..ce5656f635d 100644 --- a/python/cuda_parallel/tests/test_reduce.py +++ b/python/cuda_parallel/tests/test_reduce.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp -import numpy +import numpy as np import pytest import random import numba.cuda @@ -14,31 +14,29 @@ def random_int(shape, dtype): - return numpy.random.randint(0, 5, size=shape).astype(dtype) + return np.random.randint(0, 5, size=shape).astype(dtype) def type_to_problem_sizes(dtype): - if dtype in [numpy.uint8, numpy.int8]: + if dtype in [np.uint8, np.int8]: return [2, 4, 5, 6] - elif dtype in [numpy.uint16, numpy.int16]: + elif dtype in [np.uint16, np.int16]: return [4, 8, 12, 14] - elif dtype in [numpy.uint32, numpy.int32]: + elif dtype in [np.uint32, np.int32]: return [16, 20, 24, 28] - elif dtype in [numpy.uint64, numpy.int64]: + elif dtype in [np.uint64, np.int64]: return [16, 20, 24, 28] else: raise ValueError("Unsupported dtype") -@pytest.mark.parametrize( - "dtype", [numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64] -) +@pytest.mark.parametrize("dtype", [np.uint8, np.uint16, np.uint32, np.uint64]) def test_device_reduce(dtype): def op(a, b): return a + b init_value = 42 - h_init = numpy.array([init_value], dtype=dtype) + h_init = np.array([init_value], dtype=dtype) d_output = numba.cuda.device_array(1, dtype=dtype) reduce_into = algorithms.reduce_into(d_output, d_output, op, h_init) @@ -47,7 +45,7 @@ def op(a, b): h_input = random_int(num_items, dtype) d_input = numba.cuda.to_device(h_input) temp_storage_size = reduce_into(None, d_input, d_output, None, h_init) - d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=numpy.uint8) + d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=np.uint8) reduce_into(d_temp_storage, d_input, d_output, None, h_init) h_output = d_output.copy_to_host() assert h_output[0] == sum(h_input) + init_value @@ -57,19 +55,19 @@ def test_complex_device_reduce(): def op(a, b): return a + b - h_init = numpy.array([40.0 + 2.0j], dtype=complex) + h_init = np.array([40.0 + 2.0j], dtype=complex) d_output = numba.cuda.device_array(1, dtype=complex) reduce_into = algorithms.reduce_into(d_output, d_output, op, h_init) for num_items in [42, 420000]: - h_input = numpy.random.random(num_items) + 1j * numpy.random.random(num_items) + h_input = np.random.random(num_items) + 1j * np.random.random(num_items) d_input = numba.cuda.to_device(h_input) temp_storage_bytes = reduce_into(None, d_input, d_output, None, h_init) - d_temp_storage = numba.cuda.device_array(temp_storage_bytes, numpy.uint8) + d_temp_storage = numba.cuda.device_array(temp_storage_bytes, np.uint8) reduce_into(d_temp_storage, d_input, d_output, None, h_init) result = d_output.copy_to_host()[0] - expected = numpy.sum(h_input, initial=h_init[0]) + expected = np.sum(h_input, initial=h_init[0]) assert result == pytest.approx(expected) @@ -77,9 +75,9 @@ def test_device_reduce_dtype_mismatch(): def min_op(a, b): return a if a < b else b - dtypes = [numpy.int32, numpy.int64] - h_inits = [numpy.array([], dt) for dt in dtypes] - h_inputs = [numpy.array([], dt) for dt in dtypes] + dtypes = [np.int32, np.int64] + h_inits = [np.array([], dt) for dt in dtypes] + h_inputs = [np.array([], dt) for dt in dtypes] d_outputs = [numba.cuda.device_array(1, dt) for dt in dtypes] d_inputs = [numba.cuda.to_device(h_inp) for h_inp in h_inputs] @@ -109,14 +107,14 @@ def add_op(a, b): expected_result = add_op(expected_result, v) if use_numpy_array: - h_input = numpy.array(l_varr, dtype_inp) + h_input = np.array(l_varr, dtype_inp) d_input = numba.cuda.to_device(h_input) else: d_input = i_input d_output = numba.cuda.device_array(1, dtype_out) # to store device sum - h_init = numpy.array([start_sum_with], dtype_out) + h_init = np.array([start_sum_with], dtype_out) reduce_into = algorithms.reduce_into( d_in=d_input, d_out=d_output, op=add_op, h_init=h_init @@ -125,7 +123,7 @@ def add_op(a, b): temp_storage_size = reduce_into( None, d_in=d_input, d_out=d_output, num_items=len(l_varr), h_init=h_init ) - d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=numpy.uint8) + d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=np.uint8) reduce_into(d_temp_storage, d_input, d_output, len(l_varr), h_init) @@ -168,9 +166,9 @@ def test_device_sum_cache_modified_input_it( ): rng = random.Random(0) l_varr = [rng.randrange(100) for _ in range(num_items)] - dtype_inp = numpy.dtype(supported_value_type) + dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp - input_devarr = numba.cuda.to_device(numpy.array(l_varr, dtype=dtype_inp)) + input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp)) i_input = iterators.CacheModifiedInputIterator(input_devarr, modifier="stream") _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array @@ -181,7 +179,7 @@ def test_device_sum_constant_it( use_numpy_array, supported_value_type, num_items=3, start_sum_with=10 ): l_varr = [42 for distance in range(num_items)] - dtype_inp = numpy.dtype(supported_value_type) + dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp i_input = iterators.ConstantIterator(dtype_inp.type(42)) _test_device_sum_with_iterator( @@ -193,7 +191,7 @@ def test_device_sum_counting_it( use_numpy_array, supported_value_type, num_items=3, start_sum_with=10 ): l_varr = [start_sum_with + distance for distance in range(num_items)] - dtype_inp = numpy.dtype(supported_value_type) + dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp i_input = iterators.CountingIterator(dtype_inp.type(start_sum_with)) _test_device_sum_with_iterator( @@ -217,8 +215,8 @@ def test_device_sum_map_mul2_count_it( ): l_varr = [2 * (start_sum_with + distance) for distance in range(num_items)] vtn_out, vtn_inp = value_type_name_pair - dtype_inp = numpy.dtype(vtn_inp) - dtype_out = numpy.dtype(vtn_out) + dtype_inp = np.dtype(vtn_inp) + dtype_out = np.dtype(vtn_out) i_input = iterators.TransformIterator( iterators.CountingIterator(dtype_inp.type(start_sum_with)), mul2 ) @@ -248,8 +246,8 @@ def test_device_sum_map_mul_map_mul_count_it( fac_out * (fac_mid * (start_sum_with + distance)) for distance in range(num_items) ] - dtype_inp = numpy.dtype(vtn_inp) - dtype_out = numpy.dtype(vtn_out) + dtype_inp = np.dtype(vtn_inp) + dtype_out = np.dtype(vtn_out) mul_funcs = {2: mul2, 3: mul3} i_input = iterators.TransformIterator( iterators.TransformIterator( @@ -275,8 +273,8 @@ def test_device_sum_map_mul2_cp_array_it( use_numpy_array, value_type_name_pair, num_items=3, start_sum_with=10 ): vtn_out, vtn_inp = value_type_name_pair - dtype_inp = numpy.dtype(vtn_inp) - dtype_out = numpy.dtype(vtn_out) + dtype_inp = np.dtype(vtn_inp) + dtype_out = np.dtype(vtn_out) rng = random.Random(0) l_d_in = [rng.randrange(100) for _ in range(num_items)] a_d_in = cp.array(l_d_in, dtype_inp) @@ -285,3 +283,221 @@ def test_device_sum_map_mul2_cp_array_it( _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array ) + + +def test_reducer_caching(): + def sum_op(x, y): + return x + y + + # inputs are device arrays + reducer_1 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are device arrays of different dtype: + reducer_1 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + cp.zeros(3, dtype="int32"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 + + # outputs are of different dtype: + reducer_1 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int32"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 + + # inputs are of same dtype but different size + # (should still use cached reducer): + reducer_1 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + cp.zeros(5, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are counting iterators of the + # same value type: + reducer_1 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are counting iterators of different value type: + reducer_1 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.CountingIterator(np.int64(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 + + def op1(x): + return x + + def op2(x): + return 2 * x + + def op3(x): + return x + + # inputs are TransformIterators + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are TransformIterators with different + # op: + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op2), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 + + # inputs are TransformIterators with same op + # but different name: + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op3), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + + # inputs are CountingIterators of same kind + # but different state: + reducer_1 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(1)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + + assert reducer_1 is reducer_2 + + # inputs are TransformIterators of same kind + # but different state: + ary1 = cp.asarray([0, 1, 2], dtype="int64") + ary2 = cp.asarray([0, 1], dtype="int64") + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(ary1, op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(ary2, op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are TransformIterators of same kind + # but different state: + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(1)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are TransformIterators with different kind: + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int64(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2