From cb0eccc89aa5ebd9477c80adf923a233012313cb Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 21 Dec 2024 08:19:41 -0500 Subject: [PATCH 01/12] Define __eq__ and __hash__ for Iterators --- .../experimental/iterators/_iterators.py | 101 ++++++++++++------ python/cuda_parallel/tests/test_iterators.py | 58 ++++++++++ 2 files changed, 128 insertions(+), 31 deletions(-) create mode 100644 python/cuda_parallel/tests/test_iterators.py diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py index e92578efbe7..6be218bc95d 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py +++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py @@ -6,6 +6,7 @@ from llvmlite import ir from numba.core.extending import intrinsic, overload from numba.core.typing.ctypes_utils import to_ctypes +from numba.cuda.dispatcher import CUDADispatcher from numba import cuda, types import numba import numpy as np @@ -15,6 +16,19 @@ _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8 +def _compare_funcs(func1, func2): + # return True if the functions compare equal for + # caching purposes, False otherwise + code1 = func1.__code__ + code2 = func2.__code__ + + return ( + code1.co_code == code2.co_code + and code1.co_consts == code2.co_consts + and func1.__closure__ == func2.__closure__ + ) + + @lru_cache(maxsize=256) # TODO: what's a reasonable value? def cached_compile(func, sig, abi_name=None, **kwargs): return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs) @@ -24,12 +38,10 @@ class IteratorBase: """ An Iterator is a wrapper around a pointer, and must define the following: - - a `state` property that returns a `ctypes.c_void_p` object, representing - a pointer to some data. - - an `advance` (static) method that receives the state pointer and performs + - an `advance` (static) method that receives the pointer and performs an action that advances the pointer by the offset `distance` (returns nothing). - - a `dereference` (static) method that dereferences the state pointer + - a `dereference` (static) method that dereferences the pointer and returns a value. Iterators are not meant to be used directly. They are constructed and passed @@ -38,18 +50,28 @@ class IteratorBase: The `advance` and `dereference` must be compilable to device code by numba. """ - def __init__(self, numba_type: types.Type, value_type: types.Type, abi_name: str): + def __init__( + self, + cvalue: ctypes.c_void_p, + numba_type: types.Type, + value_type: types.Type, + abi_name: str, + ): """ Parameters ---------- + cvalue + A ctypes type representing the object pointed to by the iterator. numba_type - A numba type that specifies how to interpret the state pointer. + A numba type representing the type of the input to the advance + and dereference functions. value_type The numba type of the value returned by the dereference operation. abi_name A unique identifier that will determine the abi_names for the advance and dereference operations. """ + self.cvalue = cvalue self.numba_type = numba_type self.value_type = value_type self.abi_name = abi_name @@ -81,7 +103,7 @@ def ltoirs(self) -> Dict[str, bytes]: @property def state(self) -> ctypes.c_void_p: - raise NotImplementedError("Subclasses must override advance staticmethod") + return ctypes.cast(ctypes.pointer(self.cvalue), ctypes.c_void_p) @staticmethod def advance(state, distance): @@ -91,6 +113,21 @@ def advance(state, distance): def dereference(state): raise NotImplementedError("Subclasses must override dereference staticmethod") + def __hash__(self): + return hash( + (self.cvalue.value, self.numba_type, self.value_type, self.abi_name) + ) + + def __eq__(self, other): + if not isinstance(other, self.__class__): + return NotImplemented + return ( + self.cvalue.value == other.cvalue.value + and self.numba_type == other.numba_type + and self.value_type == other.value_type + and self.abi_name == other.abi_name + ) + def sizeof_pointee(context, ptr): size = context.get_abi_sizeof(ptr.type.pointee) @@ -125,10 +162,11 @@ def impl(ptr, offset): class RawPointer(IteratorBase): def __init__(self, ptr: int, ntype: types.Type): value_type = ntype - self._cvalue = ctypes.c_void_p(ptr) + cvalue = ctypes.c_void_p(ptr) numba_type = types.CPointer(types.CPointer(value_type)) abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( + cvalue=cvalue, numba_type=numba_type, value_type=value_type, abi_name=abi_name, @@ -142,10 +180,6 @@ def advance(state, distance): def dereference(state): return state[0][0] - @property - def state(self) -> ctypes.c_void_p: - return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p) - def pointer(container, ntype: types.Type) -> RawPointer: return RawPointer(container.__cuda_array_interface__["data"][0], ntype) @@ -174,11 +208,12 @@ def codegen(context, builder, sig, args): class CacheModifiedPointer(IteratorBase): def __init__(self, ptr: int, ntype: types.Type): - self._cvalue = ctypes.c_void_p(ptr) + cvalue = ctypes.c_void_p(ptr) value_type = ntype numba_type = types.CPointer(types.CPointer(value_type)) abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( + cvalue=cvalue, numba_type=numba_type, value_type=value_type, abi_name=abi_name, @@ -192,18 +227,15 @@ def advance(state, distance): def dereference(state): return load_cs(state[0]) - @property - def state(self) -> ctypes.c_void_p: - return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p) - class ConstantIterator(IteratorBase): def __init__(self, value: np.number): value_type = numba.from_dtype(value.dtype) - self._cvalue = to_ctypes(value_type)(value) + cvalue = to_ctypes(value_type)(value) numba_type = types.CPointer(value_type) abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( + cvalue=cvalue, numba_type=numba_type, value_type=value_type, abi_name=abi_name, @@ -217,18 +249,15 @@ def advance(state, distance): def dereference(state): return state[0] - @property - def state(self) -> ctypes.c_void_p: - return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p) - class CountingIterator(IteratorBase): def __init__(self, value: np.number): value_type = numba.from_dtype(value.dtype) - self._cvalue = to_ctypes(value_type)(value) + cvalue = to_ctypes(value_type)(value) numba_type = types.CPointer(value_type) abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( + cvalue=cvalue, numba_type=numba_type, value_type=value_type, abi_name=abi_name, @@ -242,10 +271,6 @@ def advance(state, distance): def dereference(state): return state[0] - @property - def state(self) -> ctypes.c_void_p: - return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p) - def make_transform_iterator(it, op: Callable): if hasattr(it, "__cuda_array_interface__"): @@ -256,8 +281,9 @@ def make_transform_iterator(it, op: Callable): op = cuda.jit(op, device=True) class TransformIterator(IteratorBase): - def __init__(self, it: IteratorBase, op): + def __init__(self, it: IteratorBase, op: CUDADispatcher): self._it = it + self._op = op numba_type = it.numba_type # TODO: the abi name below isn't unique enough when we have e.g., # two identically named `op` functions with different @@ -276,6 +302,7 @@ def __init__(self, it: IteratorBase, op): value_type = op_retty abi_name = f"{self.__class__.__name__}_{it.abi_name}_{op_abi_name}" super().__init__( + cvalue=it.cvalue, numba_type=numba_type, value_type=value_type, abi_name=abi_name, @@ -289,8 +316,20 @@ def advance(state, distance): def dereference(state): return op(it_dereference(state)) - @property - def state(self) -> ctypes.c_void_p: - return it.state + def __hash__(self): + return hash( + ( + self._it, + self._op.py_func.__code__.co_code, + self._op.py_func.__closure__, + ) + ) + + def __eq__(self, other): + if not isinstance(other, IteratorBase): + return NotImplemented + return self._it == other._it and _compare_funcs( + self._op.py_func, other._op.py_func + ) return TransformIterator(it, op) diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py new file mode 100644 index 00000000000..d5d15e8174b --- /dev/null +++ b/python/cuda_parallel/tests/test_iterators.py @@ -0,0 +1,58 @@ +from cuda.parallel.experimental.iterators import ( + CacheModifiedInputIterator, + ConstantIterator, + CountingIterator, + TransformIterator, +) +import cupy as cp +import numpy as np + + +def test_constant_iterator_equality(): + assert ConstantIterator(np.int32(0)) == ConstantIterator(np.int32(0)) + assert ConstantIterator(np.int32(0)) != ConstantIterator(np.int32(1)) + assert ConstantIterator(np.int32(0)) != ConstantIterator(np.int64(0)) + + +def test_counting_iterator_equality(): + assert CountingIterator(np.int32(0)) == CountingIterator(np.int32(0)) + assert CountingIterator(np.int32(0)) != CountingIterator(np.int32(1)) + assert CountingIterator(np.int32(0)) != CountingIterator(np.int64(0)) + + +def test_cache_modified_input_iterator_equality(): + ary1 = cp.asarray([0, 1, 2]) + ary2 = cp.asarray([3, 4, 5]) + assert CacheModifiedInputIterator(ary1, "stream") == CacheModifiedInputIterator( + ary1, "stream" + ) + assert CacheModifiedInputIterator(ary1, "stream") != CacheModifiedInputIterator( + ary2, "stream" + ) + + +def test_equality_transform_iterator(): + def op1(x): + return x + + def op2(x): + return 2 * x + + def op3(x): + return x + + it = CountingIterator(np.int32(0)) + assert TransformIterator(it, op1) == TransformIterator(it, op1) + assert TransformIterator(it, op1) != TransformIterator(it, op2) + assert TransformIterator(it, op1) == TransformIterator(it, op3) + + ary1 = cp.asarray([0, 1, 2]) + ary2 = cp.asarray([3, 4, 5]) + assert TransformIterator(ary1, op1) == TransformIterator(ary1, op1) + assert TransformIterator(ary1, op1) != TransformIterator(ary1, op2) + assert TransformIterator(ary1, op1) == TransformIterator(ary1, op3) + assert TransformIterator(ary1, op1) != TransformIterator(ary2, op1) + + +def test_different_iterator_types_equality(): + assert CountingIterator(np.int32(0)) != ConstantIterator(np.int64(0)) From da652e1aecfd8f48740df32f7ef9021288161961 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 21 Dec 2024 08:20:30 -0500 Subject: [PATCH 02/12] Define cache_with_key utility and use it to cache Reduce objects --- .../cuda/parallel/experimental/_caching.py | 31 +++++++++++++++++++ .../experimental/algorithms/reduce.py | 10 ++++++ 2 files changed, 41 insertions(+) create mode 100644 python/cuda_parallel/cuda/parallel/experimental/_caching.py diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py new file mode 100644 index 00000000000..5713ae35d3f --- /dev/null +++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import functools + + +def cache_with_key(key): + """ + Decorator to cache the result of the decorated function. Uses the + provided `key` function to compute the key for cache lookup. `key` + receives all arguments passed to the function. + """ + + def deco(func): + cache = {} + + @functools.wraps(func) + def inner(*args, **kwargs): + cache_key = key(*args, **kwargs) + if cache_key not in cache: + result = func(*args, **kwargs) + cache[cache_key] = result + # `cache_key` *must* be in `cache`, use `.get()` + # as it is faster: + return cache.get(cache_key) + + return inner + + return deco diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index 29e7786b5f8..7ea9084ce82 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -12,6 +12,7 @@ from .. import _cccl as cccl from .._bindings import get_paths, get_bindings +from .._caching import cache_with_key class _Op: @@ -119,8 +120,17 @@ def __del__(self): bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result)) +def make_cache_key(d_in, d_out, op, h_init): + d_in_key = d_in.dtype if hasattr(d_in, "__cuda_array_interface__") else d_in + d_out_key = d_out.dtype if hasattr(d_out, "__cuda_array_interface__") else d_out + op_key = (op.__code__.co_code, op.__code__.co_consts, op.__closure__) + h_init_key = h_init.dtype + return (d_in_key, d_out_key, op_key, h_init_key) + + # TODO Figure out `sum` without operator and initial value # TODO Accept stream +@cache_with_key(make_cache_key) def reduce_into(d_in, d_out, op: Callable, h_init: np.ndarray): """Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``. From 221af5c5745c1349a872841eb546e6b906e9e07a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sat, 21 Dec 2024 08:20:54 -0500 Subject: [PATCH 03/12] Add tests for caching Reduce objects --- python/cuda_parallel/tests/test_reduce.py | 213 ++++++++++++++++++---- 1 file changed, 182 insertions(+), 31 deletions(-) diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py index 0f454e3603b..d890a656f3f 100644 --- a/python/cuda_parallel/tests/test_reduce.py +++ b/python/cuda_parallel/tests/test_reduce.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import cupy as cp -import numpy +import numpy as np import pytest import random import numba.cuda @@ -14,31 +14,29 @@ def random_int(shape, dtype): - return numpy.random.randint(0, 5, size=shape).astype(dtype) + return np.random.randint(0, 5, size=shape).astype(dtype) def type_to_problem_sizes(dtype): - if dtype in [numpy.uint8, numpy.int8]: + if dtype in [np.uint8, np.int8]: return [2, 4, 5, 6] - elif dtype in [numpy.uint16, numpy.int16]: + elif dtype in [np.uint16, np.int16]: return [4, 8, 12, 14] - elif dtype in [numpy.uint32, numpy.int32]: + elif dtype in [np.uint32, np.int32]: return [16, 20, 24, 28] - elif dtype in [numpy.uint64, numpy.int64]: + elif dtype in [np.uint64, np.int64]: return [16, 20, 24, 28] else: raise ValueError("Unsupported dtype") -@pytest.mark.parametrize( - "dtype", [numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64] -) +@pytest.mark.parametrize("dtype", [np.uint8, np.uint16, np.uint32, np.uint64]) def test_device_reduce(dtype): def op(a, b): return a + b init_value = 42 - h_init = numpy.array([init_value], dtype=dtype) + h_init = np.array([init_value], dtype=dtype) d_output = numba.cuda.device_array(1, dtype=dtype) reduce_into = algorithms.reduce_into(d_output, d_output, op, h_init) @@ -47,7 +45,7 @@ def op(a, b): h_input = random_int(num_items, dtype) d_input = numba.cuda.to_device(h_input) temp_storage_size = reduce_into(None, d_input, d_output, None, h_init) - d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=numpy.uint8) + d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=np.uint8) reduce_into(d_temp_storage, d_input, d_output, None, h_init) h_output = d_output.copy_to_host() assert h_output[0] == sum(h_input) + init_value @@ -57,19 +55,19 @@ def test_complex_device_reduce(): def op(a, b): return a + b - h_init = numpy.array([40.0 + 2.0j], dtype=complex) + h_init = np.array([40.0 + 2.0j], dtype=complex) d_output = numba.cuda.device_array(1, dtype=complex) reduce_into = algorithms.reduce_into(d_output, d_output, op, h_init) for num_items in [42, 420000]: - h_input = numpy.random.random(num_items) + 1j * numpy.random.random(num_items) + h_input = np.random.random(num_items) + 1j * np.random.random(num_items) d_input = numba.cuda.to_device(h_input) temp_storage_bytes = reduce_into(None, d_input, d_output, None, h_init) - d_temp_storage = numba.cuda.device_array(temp_storage_bytes, numpy.uint8) + d_temp_storage = numba.cuda.device_array(temp_storage_bytes, np.uint8) reduce_into(d_temp_storage, d_input, d_output, None, h_init) result = d_output.copy_to_host()[0] - expected = numpy.sum(h_input, initial=h_init[0]) + expected = np.sum(h_input, initial=h_init[0]) assert result == pytest.approx(expected) @@ -77,9 +75,9 @@ def test_device_reduce_dtype_mismatch(): def min_op(a, b): return a if a < b else b - dtypes = [numpy.int32, numpy.int64] - h_inits = [numpy.array([], dt) for dt in dtypes] - h_inputs = [numpy.array([], dt) for dt in dtypes] + dtypes = [np.int32, np.int64] + h_inits = [np.array([], dt) for dt in dtypes] + h_inputs = [np.array([], dt) for dt in dtypes] d_outputs = [numba.cuda.device_array(1, dt) for dt in dtypes] d_inputs = [numba.cuda.to_device(h_inp) for h_inp in h_inputs] @@ -109,14 +107,14 @@ def add_op(a, b): expected_result = add_op(expected_result, v) if use_numpy_array: - h_input = numpy.array(l_varr, dtype_inp) + h_input = np.array(l_varr, dtype_inp) d_input = numba.cuda.to_device(h_input) else: d_input = i_input d_output = numba.cuda.device_array(1, dtype_out) # to store device sum - h_init = numpy.array([start_sum_with], dtype_out) + h_init = np.array([start_sum_with], dtype_out) reduce_into = algorithms.reduce_into( d_in=d_input, d_out=d_output, op=add_op, h_init=h_init @@ -125,7 +123,7 @@ def add_op(a, b): temp_storage_size = reduce_into( None, d_in=d_input, d_out=d_output, num_items=len(l_varr), h_init=h_init ) - d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=numpy.uint8) + d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=np.uint8) reduce_into(d_temp_storage, d_input, d_output, len(l_varr), h_init) @@ -168,9 +166,9 @@ def test_device_sum_cache_modified_input_it( ): rng = random.Random(0) l_varr = [rng.randrange(100) for _ in range(num_items)] - dtype_inp = numpy.dtype(supported_value_type) + dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp - input_devarr = numba.cuda.to_device(numpy.array(l_varr, dtype=dtype_inp)) + input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp)) i_input = iterators.CacheModifiedInputIterator(input_devarr, modifier="stream") _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array @@ -181,7 +179,7 @@ def test_device_sum_constant_it( use_numpy_array, supported_value_type, num_items=3, start_sum_with=10 ): l_varr = [42 for distance in range(num_items)] - dtype_inp = numpy.dtype(supported_value_type) + dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp i_input = iterators.ConstantIterator(dtype_inp.type(42)) _test_device_sum_with_iterator( @@ -193,7 +191,7 @@ def test_device_sum_counting_it( use_numpy_array, supported_value_type, num_items=3, start_sum_with=10 ): l_varr = [start_sum_with + distance for distance in range(num_items)] - dtype_inp = numpy.dtype(supported_value_type) + dtype_inp = np.dtype(supported_value_type) dtype_out = dtype_inp i_input = iterators.CountingIterator(dtype_inp.type(start_sum_with)) _test_device_sum_with_iterator( @@ -217,8 +215,8 @@ def test_device_sum_map_mul2_count_it( ): l_varr = [2 * (start_sum_with + distance) for distance in range(num_items)] vtn_out, vtn_inp = value_type_name_pair - dtype_inp = numpy.dtype(vtn_inp) - dtype_out = numpy.dtype(vtn_out) + dtype_inp = np.dtype(vtn_inp) + dtype_out = np.dtype(vtn_out) i_input = iterators.TransformIterator( iterators.CountingIterator(dtype_inp.type(start_sum_with)), mul2 ) @@ -248,8 +246,8 @@ def test_device_sum_map_mul_map_mul_count_it( fac_out * (fac_mid * (start_sum_with + distance)) for distance in range(num_items) ] - dtype_inp = numpy.dtype(vtn_inp) - dtype_out = numpy.dtype(vtn_out) + dtype_inp = np.dtype(vtn_inp) + dtype_out = np.dtype(vtn_out) mul_funcs = {2: mul2, 3: mul3} i_input = iterators.TransformIterator( iterators.TransformIterator( @@ -275,8 +273,8 @@ def test_device_sum_map_mul2_cp_array_it( use_numpy_array, value_type_name_pair, num_items=3, start_sum_with=10 ): vtn_out, vtn_inp = value_type_name_pair - dtype_inp = numpy.dtype(vtn_inp) - dtype_out = numpy.dtype(vtn_out) + dtype_inp = np.dtype(vtn_inp) + dtype_out = np.dtype(vtn_out) rng = random.Random(0) l_d_in = [rng.randrange(100) for _ in range(num_items)] a_d_in = cp.array(l_d_in, dtype_inp) @@ -285,3 +283,156 @@ def test_device_sum_map_mul2_cp_array_it( _test_device_sum_with_iterator( l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array ) + + +def test_reducer_caching(): + def sum_op(x, y): + return x + y + + # inputs are device arrays + reducer_1 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are device arrays of different dtype: + reducer_1 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + cp.zeros(3, dtype="int32"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 + + # outputs are of different dtype: + reducer_1 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int32"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 + + # inputs are of same dtype but different size + # (should still use cached reducer): + reducer_1 = algorithms.reduce_into( + cp.zeros(3, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + cp.zeros(5, dtype="int64"), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are counting iterators of the + # same value type: + reducer_1 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are counting iterators of different value type: + reducer_1 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.CountingIterator(np.int64(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 + + def op1(x): + return x + + def op2(x): + return 2 * x + + def op3(x): + return x + + # inputs are TransformIterators + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are TransformIterators with different + # op: + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op2), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 + + # inputs are TransformIterators with same op + # but different name: + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op3), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 From 0eea142ebb72a4ca86bd3fb67a55a3169dcec9b2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Sun, 22 Dec 2024 07:12:53 -0500 Subject: [PATCH 04/12] Tighten up types --- .../parallel/experimental/_utils/__init__.py | 0 .../cuda/parallel/experimental/_utils/cai.py | 16 ++++++++++ .../experimental/algorithms/reduce.py | 31 +++++++++++++++---- .../cuda/parallel/experimental/typing.py | 10 ++++++ 4 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py create mode 100644 python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py create mode 100644 python/cuda_parallel/cuda/parallel/experimental/typing.py diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py new file mode 100644 index 00000000000..9c0718e71f0 --- /dev/null +++ b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Utilities for extracting information from `__cuda_array_interface__`. +""" + +import numpy as np + +from ..typing import DeviceArrayLike + + +def get_dtype(arr: DeviceArrayLike) -> np.dtype: + return np.dtype(arr.__cuda_array_interface__["typestr"]) diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index 7ea9084ce82..7cdde1c9132 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -13,6 +13,9 @@ from .. import _cccl as cccl from .._bindings import get_paths, get_bindings from .._caching import cache_with_key +from ..typing import DeviceArrayLike +from ..iterators._iterators import IteratorBase +from .._utils import cai as cai class _Op: @@ -42,12 +45,18 @@ def _dtype_validation(dt1, dt2): class _Reduce: # TODO: constructor shouldn't require concrete `d_in`, `d_out`: - def __init__(self, d_in, d_out, op: Callable, h_init: np.ndarray): + def __init__( + self, + d_in: DeviceArrayLike | IteratorBase, + d_out: DeviceArrayLike, + op: Callable, + h_init: np.ndarray, + ): d_in_cccl = cccl.to_cccl_iter(d_in) self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name( d_in_cccl.value_type.type.value ) - self._ctor_d_out_dtype = d_out.dtype + self._ctor_d_out_dtype = cai.get_dtype(d_out) self._ctor_init_dtype = h_init.dtype cc_major, cc_minor = cuda.get_current_device().compute_capability cub_path, thrust_path, libcudacxx_path, cuda_include_path = get_paths() @@ -120,9 +129,14 @@ def __del__(self): bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result)) -def make_cache_key(d_in, d_out, op, h_init): - d_in_key = d_in.dtype if hasattr(d_in, "__cuda_array_interface__") else d_in - d_out_key = d_out.dtype if hasattr(d_out, "__cuda_array_interface__") else d_out +def make_cache_key( + d_in: DeviceArrayLike | IteratorBase, + d_out: DeviceArrayLike, + op: Callable, + h_init: np.ndarray, +): + d_in_key = d_in if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in) + d_out_key = d_out if isinstance(d_out, IteratorBase) else cai.get_dtype(d_out) op_key = (op.__code__.co_code, op.__code__.co_consts, op.__closure__) h_init_key = h_init.dtype return (d_in_key, d_out_key, op_key, h_init_key) @@ -131,7 +145,12 @@ def make_cache_key(d_in, d_out, op, h_init): # TODO Figure out `sum` without operator and initial value # TODO Accept stream @cache_with_key(make_cache_key) -def reduce_into(d_in, d_out, op: Callable, h_init: np.ndarray): +def reduce_into( + d_in: DeviceArrayLike | IteratorBase, + d_out: DeviceArrayLike, + op: Callable, + h_init: np.ndarray, +): """Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``. Example: diff --git a/python/cuda_parallel/cuda/parallel/experimental/typing.py b/python/cuda_parallel/cuda/parallel/experimental/typing.py new file mode 100644 index 00000000000..87f72999bfe --- /dev/null +++ b/python/cuda_parallel/cuda/parallel/experimental/typing.py @@ -0,0 +1,10 @@ +from typing import Protocol + + +class DeviceArrayLike(Protocol): + """ + Objects representing a device array, having a `.__cuda_array_interface__` + attribute. + """ + + __cuda_array_interface__: dict From f198200c462e8307f63f914ce6c2cd7e51eb3809 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 23 Dec 2024 09:48:57 -0500 Subject: [PATCH 05/12] Updates to support 3.7 --- .../cuda/parallel/experimental/algorithms/reduce.py | 2 ++ python/cuda_parallel/cuda/parallel/experimental/typing.py | 4 +++- python/cuda_parallel/setup.py | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index 7cdde1c9132..605e71b7172 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -3,6 +3,8 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from __future__ import annotations # TODO: required for Python 3.7 docs env + import ctypes import numba import numpy as np diff --git a/python/cuda_parallel/cuda/parallel/experimental/typing.py b/python/cuda_parallel/cuda/parallel/experimental/typing.py index 87f72999bfe..1c4e9c9975f 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/typing.py +++ b/python/cuda_parallel/cuda/parallel/experimental/typing.py @@ -1,4 +1,6 @@ -from typing import Protocol +from typing_extensions import ( + Protocol, +) # TODO: typing_extensions required for Python 3.7 docs env class DeviceArrayLike(Protocol): diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py index 40c998fafee..0db6592bb05 100644 --- a/python/cuda_parallel/setup.py +++ b/python/cuda_parallel/setup.py @@ -100,7 +100,8 @@ def build_extension(self, ext): ], packages=find_namespace_packages(include=["cuda.*"]), python_requires=">=3.9", - install_requires=["numba>=0.60.0", "cuda-python", "jinja2"], + # TODO: typing_extensions required for Python 3.7 docs env + install_requires=["numba>=0.60.0", "cuda-python", "jinja2", "typing_extensions"], extras_require={ "test": [ "pytest", From 56f2c61f774e965df75b55b9c3028711c5983a53 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 23 Dec 2024 10:33:43 -0500 Subject: [PATCH 06/12] Address review feedback --- .../cuda/parallel/experimental/_caching.py | 13 +++++++++---- .../cuda/parallel/experimental/algorithms/reduce.py | 4 ++-- .../parallel/experimental/iterators/_iterators.py | 7 +++---- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py index 5713ae35d3f..0dff034921b 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/_caching.py +++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import functools +from numba import cuda def cache_with_key(key): @@ -11,6 +12,11 @@ def cache_with_key(key): Decorator to cache the result of the decorated function. Uses the provided `key` function to compute the key for cache lookup. `key` receives all arguments passed to the function. + + Notes + ----- + The CUDA compute capability of the current device is appended to + the cache key returned by `key`. """ def deco(func): @@ -18,13 +24,12 @@ def deco(func): @functools.wraps(func) def inner(*args, **kwargs): - cache_key = key(*args, **kwargs) + cc = cuda.get_current_device().compute_capability + cache_key = (key(*args, **kwargs), *cc) if cache_key not in cache: result = func(*args, **kwargs) cache[cache_key] = result - # `cache_key` *must* be in `cache`, use `.get()` - # as it is faster: - return cache.get(cache_key) + return cache[cache_key] return inner diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index 605e71b7172..23b43cd252d 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -17,7 +17,7 @@ from .._caching import cache_with_key from ..typing import DeviceArrayLike from ..iterators._iterators import IteratorBase -from .._utils import cai as cai +from .._utils import cai class _Op: @@ -138,7 +138,7 @@ def make_cache_key( h_init: np.ndarray, ): d_in_key = d_in if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in) - d_out_key = d_out if isinstance(d_out, IteratorBase) else cai.get_dtype(d_out) + d_out_key = cai.get_dtype(d_out) op_key = (op.__code__.co_code, op.__code__.co_consts, op.__closure__) h_init_key = h_init.dtype return (d_in_key, d_out_key, op_key, h_init_key) diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py index 6be218bc95d..c49eee008bf 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py +++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py @@ -160,8 +160,7 @@ def impl(ptr, offset): class RawPointer(IteratorBase): - def __init__(self, ptr: int, ntype: types.Type): - value_type = ntype + def __init__(self, ptr: int, value_type: types.Type): cvalue = ctypes.c_void_p(ptr) numba_type = types.CPointer(types.CPointer(value_type)) abi_name = f"{self.__class__.__name__}_{str(value_type)}" @@ -181,8 +180,8 @@ def dereference(state): return state[0][0] -def pointer(container, ntype: types.Type) -> RawPointer: - return RawPointer(container.__cuda_array_interface__["data"][0], ntype) +def pointer(container, value_type: types.Type) -> RawPointer: + return RawPointer(container.__cuda_array_interface__["data"][0], value_type) @intrinsic From c822da7e100c8b9b255379dd736779055d922b3a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 23 Dec 2024 12:28:32 -0500 Subject: [PATCH 07/12] Introduce IteratorKind to hold iterator type information --- .../cuda/parallel/experimental/_caching.py | 40 +++++++++ .../experimental/algorithms/reduce.py | 6 +- .../experimental/iterators/_iterators.py | 83 ++++++++++++++----- python/cuda_parallel/tests/test_iterators.py | 78 ++++++++++++----- python/cuda_parallel/tests/test_reduce.py | 65 +++++++++++++++ 5 files changed, 229 insertions(+), 43 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py index 0dff034921b..fa9165b7ad8 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/_caching.py +++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py @@ -34,3 +34,43 @@ def inner(*args, **kwargs): return inner return deco + + +class CachableFunction: + """ + A type that wraps a function and provides custom comparison + (__eq__) and hash (__hash__) implementations. + + The purpose of this class is to enable caching and comparison of + functions based on their bytecode, constants, and closures, while + ignoring other attributes such as their names or docstrings. + """ + + def __init__(self, func): + self._func = func + + def __eq__(self, other): + func1, func2 = self._func, other._func + + # return True if the functions compare equal for + # caching purposes, False otherwise + code1 = func1.__code__ + code2 = func2.__code__ + + return ( + code1.co_code == code2.co_code + and code1.co_consts == code2.co_consts + and func1.__closure__ == func2.__closure__ + ) + + def __hash__(self): + return hash( + ( + self._func.__code__.co_code, + self._func.__code__.co_consts, + self._func.__closure__, + ) + ) + + def __repr__(self): + return str(self._func) diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py index 23b43cd252d..41843742827 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py +++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py @@ -14,7 +14,7 @@ from .. import _cccl as cccl from .._bindings import get_paths, get_bindings -from .._caching import cache_with_key +from .._caching import CachableFunction, cache_with_key from ..typing import DeviceArrayLike from ..iterators._iterators import IteratorBase from .._utils import cai @@ -137,9 +137,9 @@ def make_cache_key( op: Callable, h_init: np.ndarray, ): - d_in_key = d_in if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in) + d_in_key = d_in.kind if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in) d_out_key = cai.get_dtype(d_out) - op_key = (op.__code__.co_code, op.__code__.co_consts, op.__closure__) + op_key = CachableFunction(op) h_init_key = h_init.dtype return (d_in_key, d_out_key, op_key, h_init_key) diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py index c49eee008bf..8ed4174da07 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py +++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py @@ -11,29 +11,32 @@ import numba import numpy as np +from .._caching import CachableFunction + _DEVICE_POINTER_SIZE = 8 _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8 -def _compare_funcs(func1, func2): - # return True if the functions compare equal for - # caching purposes, False otherwise - code1 = func1.__code__ - code2 = func2.__code__ - - return ( - code1.co_code == code2.co_code - and code1.co_consts == code2.co_consts - and func1.__closure__ == func2.__closure__ - ) - - @lru_cache(maxsize=256) # TODO: what's a reasonable value? def cached_compile(func, sig, abi_name=None, **kwargs): return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs) +class IteratorKind: + def __init__(self, value_type): + self.value_type = value_type + + def __repr__(self): + return f"{self.__class__.__name__}[{str(self.value_type)}]" + + def __eq__(self, other): + return type(self) is type(other) and self.value_type == other.value_type + + def __hash__(self): + return hash(self.value_type) + + class IteratorBase: """ An Iterator is a wrapper around a pointer, and must define the following: @@ -50,6 +53,8 @@ class IteratorBase: The `advance` and `dereference` must be compilable to device code by numba. """ + iterator_kind_type: type # must be a subclass of IteratorKind + def __init__( self, cvalue: ctypes.c_void_p, @@ -76,6 +81,10 @@ def __init__( self.value_type = value_type self.abi_name = abi_name + @property + def kind(self): + return self.__class__.iterator_kind_type(self.value_type) + # TODO: should we cache this? Current docs environment doesn't allow # using Python > 3.7. We could use a hand-rolled cached_property if # needed. @@ -159,7 +168,13 @@ def impl(ptr, offset): return impl +class RawPointerType(IteratorKind): + pass + + class RawPointer(IteratorBase): + iterator_kind_type = RawPointerType + def __init__(self, ptr: int, value_type: types.Type): cvalue = ctypes.c_void_p(ptr) numba_type = types.CPointer(types.CPointer(value_type)) @@ -205,7 +220,13 @@ def codegen(context, builder, sig, args): return base.dtype(base), codegen +class CacheModifiedPointerType(IteratorKind): + pass + + class CacheModifiedPointer(IteratorBase): + iterator_kind_type = CacheModifiedPointerType + def __init__(self, ptr: int, ntype: types.Type): cvalue = ctypes.c_void_p(ptr) value_type = ntype @@ -227,7 +248,13 @@ def dereference(state): return load_cs(state[0]) +class ConstantIteratorKind(IteratorKind): + pass + + class ConstantIterator(IteratorBase): + iterator_kind_type = ConstantIteratorKind + def __init__(self, value: np.number): value_type = numba.from_dtype(value.dtype) cvalue = to_ctypes(value_type)(value) @@ -249,7 +276,13 @@ def dereference(state): return state[0] +class CountingIteratorKind(IteratorKind): + pass + + class CountingIterator(IteratorBase): + iterator_kind_type = CountingIteratorKind + def __init__(self, value: np.number): value_type = numba.from_dtype(value.dtype) cvalue = to_ctypes(value_type)(value) @@ -271,6 +304,14 @@ def dereference(state): return state[0] +class TransformIteratorKind(IteratorKind): + def __eq__(self, other): + return type(self) is type(other) and self.value_type == other.value_type + + def __hash__(self): + return hash(self.value_type) + + def make_transform_iterator(it, op: Callable): if hasattr(it, "__cuda_array_interface__"): it = pointer(it, numba.from_dtype(it.dtype)) @@ -280,9 +321,11 @@ def make_transform_iterator(it, op: Callable): op = cuda.jit(op, device=True) class TransformIterator(IteratorBase): + iterator_kind_type = TransformIteratorKind + def __init__(self, it: IteratorBase, op: CUDADispatcher): self._it = it - self._op = op + self._op = CachableFunction(op.py_func) numba_type = it.numba_type # TODO: the abi name below isn't unique enough when we have e.g., # two identically named `op` functions with different @@ -307,6 +350,10 @@ def __init__(self, it: IteratorBase, op: CUDADispatcher): abi_name=abi_name, ) + @property + def kind(self): + return self.__class__.iterator_kind_type((self._it.kind, self._op)) + @staticmethod def advance(state, distance): return it_advance(state, distance) @@ -319,16 +366,14 @@ def __hash__(self): return hash( ( self._it, - self._op.py_func.__code__.co_code, - self._op.py_func.__closure__, + self._op._func.py_func.__code__.co_code, + self._op._func.py_func.__closure__, ) ) def __eq__(self, other): if not isinstance(other, IteratorBase): return NotImplemented - return self._it == other._it and _compare_funcs( - self._op.py_func, other._op.py_func - ) + return self._it == other._it and self._op == other._op return TransformIterator(it, op) diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py index d5d15e8174b..89ae076cb4c 100644 --- a/python/cuda_parallel/tests/test_iterators.py +++ b/python/cuda_parallel/tests/test_iterators.py @@ -9,26 +9,47 @@ def test_constant_iterator_equality(): - assert ConstantIterator(np.int32(0)) == ConstantIterator(np.int32(0)) - assert ConstantIterator(np.int32(0)) != ConstantIterator(np.int32(1)) - assert ConstantIterator(np.int32(0)) != ConstantIterator(np.int64(0)) + it1 = ConstantIterator(np.int32(0)) + it2 = ConstantIterator(np.int32(0)) + it3 = ConstantIterator(np.int32(1)) + it4 = ConstantIterator(np.int64(9)) + + assert it1 == it2 + assert it1 != it3 + assert it1 != it4 + + assert it1.kind == it2.kind == it3.kind + assert it1.kind != it4.kind def test_counting_iterator_equality(): - assert CountingIterator(np.int32(0)) == CountingIterator(np.int32(0)) - assert CountingIterator(np.int32(0)) != CountingIterator(np.int32(1)) - assert CountingIterator(np.int32(0)) != CountingIterator(np.int64(0)) + it1 = CountingIterator(np.int32(0)) + it2 = CountingIterator(np.int32(0)) + it3 = CountingIterator(np.int32(1)) + it4 = CountingIterator(np.int64(9)) + + assert it1 == it2 + assert it1 != it3 + assert it1 != it4 + + assert it1.kind == it2.kind == it3.kind + assert it1.kind != it4.kind def test_cache_modified_input_iterator_equality(): - ary1 = cp.asarray([0, 1, 2]) - ary2 = cp.asarray([3, 4, 5]) - assert CacheModifiedInputIterator(ary1, "stream") == CacheModifiedInputIterator( - ary1, "stream" - ) - assert CacheModifiedInputIterator(ary1, "stream") != CacheModifiedInputIterator( - ary2, "stream" - ) + ary1 = cp.asarray([0, 1, 2], dtype="int32") + ary2 = cp.asarray([3, 4, 5], dtype="int32") + ary3 = cp.asarray([0, 1, 2], dtype="int64") + + it1 = CacheModifiedInputIterator(ary1, "stream") + it2 = CacheModifiedInputIterator(ary1, "stream") + it3 = CacheModifiedInputIterator(ary2, "stream") + it4 = CacheModifiedInputIterator(ary3, "stream") + + assert it1 == it2 + assert it1 != it3 + assert it1.kind == it2.kind == it3.kind + assert it1.kind != it4.kind def test_equality_transform_iterator(): @@ -42,16 +63,31 @@ def op3(x): return x it = CountingIterator(np.int32(0)) - assert TransformIterator(it, op1) == TransformIterator(it, op1) - assert TransformIterator(it, op1) != TransformIterator(it, op2) - assert TransformIterator(it, op1) == TransformIterator(it, op3) + it1 = TransformIterator(it, op1) + it2 = TransformIterator(it, op1) + it3 = TransformIterator(it, op2) + it4 = TransformIterator(it, op3) + + assert it1 == it2 + assert it1 != it3 + assert it1 == it4 + assert it1.kind == it2.kind == it4.kind ary1 = cp.asarray([0, 1, 2]) ary2 = cp.asarray([3, 4, 5]) - assert TransformIterator(ary1, op1) == TransformIterator(ary1, op1) - assert TransformIterator(ary1, op1) != TransformIterator(ary1, op2) - assert TransformIterator(ary1, op1) == TransformIterator(ary1, op3) - assert TransformIterator(ary1, op1) != TransformIterator(ary2, op1) + it5 = TransformIterator(ary1, op1) + it6 = TransformIterator(ary1, op1) + it7 = TransformIterator(ary1, op2) + it8 = TransformIterator(ary1, op3) + it9 = TransformIterator(ary2, op1) + + assert it5 == it6 + assert it5 != it7 + assert it5 == it8 + assert it5 != it9 + + assert it5.kind == it6.kind == it8.kind == it9.kind + assert it5.kind != it7.kind def test_different_iterator_types_equality(): diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py index d890a656f3f..ce5656f635d 100644 --- a/python/cuda_parallel/tests/test_reduce.py +++ b/python/cuda_parallel/tests/test_reduce.py @@ -435,4 +435,69 @@ def op3(x): sum_op, np.zeros([0], dtype="int64"), ) + + # inputs are CountingIterators of same kind + # but different state: + reducer_1 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(0)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.CountingIterator(np.int32(1)), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are TransformIterators of same kind + # but different state: + ary1 = cp.asarray([0, 1, 2], dtype="int64") + ary2 = cp.asarray([0, 1], dtype="int64") + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(ary1, op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(ary2, op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are TransformIterators of same kind + # but different state: + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(1)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is reducer_2 + + # inputs are TransformIterators with different kind: + reducer_1 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + reducer_2 = algorithms.reduce_into( + iterators.TransformIterator(iterators.CountingIterator(np.int64(0)), op1), + cp.zeros(1, dtype="int64"), + sum_op, + np.zeros([0], dtype="int64"), + ) + assert reducer_1 is not reducer_2 From 58b6f695e843af403ccb8ca40f2c50b6ff799eaa Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 24 Dec 2024 09:30:57 -0500 Subject: [PATCH 08/12] Use the .kind to generate an abi_name --- .../cuda/parallel/experimental/_caching.py | 26 +++------ .../experimental/iterators/_iterators.py | 53 ++++++------------- python/cuda_parallel/tests/test_iterators.py | 2 + 3 files changed, 26 insertions(+), 55 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py index fa9165b7ad8..3d79436b1b3 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/_caching.py +++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py @@ -48,29 +48,17 @@ class CachableFunction: def __init__(self, func): self._func = func + self._identity = ( + self._func.__code__.co_code, + self._func.__code__.co_consts, + self._func.__closure__, + ) def __eq__(self, other): - func1, func2 = self._func, other._func - - # return True if the functions compare equal for - # caching purposes, False otherwise - code1 = func1.__code__ - code2 = func2.__code__ - - return ( - code1.co_code == code2.co_code - and code1.co_consts == code2.co_consts - and func1.__closure__ == func2.__closure__ - ) + return self._identity == other._identity def __hash__(self): - return hash( - ( - self._func.__code__.co_code, - self._func.__code__.co_consts, - self._func.__closure__, - ) - ) + return hash(self._identity) def __repr__(self): return str(self._func) diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py index 8ed4174da07..7e7a1652309 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py +++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py @@ -1,5 +1,6 @@ import ctypes import operator +import uuid from functools import lru_cache from typing import Dict, Callable @@ -18,6 +19,14 @@ _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8 +@lru_cache(maxsize=None) +def _get_abi_suffix(kind: "IteratorKind"): + # given an IteratorKind, return a UUID. The value + # is cached so that the same UUID is always returned + # for a given IteratorKind. + return uuid.uuid4().hex + + @lru_cache(maxsize=256) # TODO: what's a reasonable value? def cached_compile(func, sig, abi_name=None, **kwargs): return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs) @@ -60,7 +69,6 @@ def __init__( cvalue: ctypes.c_void_p, numba_type: types.Type, value_type: types.Type, - abi_name: str, ): """ Parameters @@ -72,14 +80,10 @@ def __init__( and dereference functions. value_type The numba type of the value returned by the dereference operation. - abi_name - A unique identifier that will determine the abi_names for the - advance and dereference operations. """ self.cvalue = cvalue self.numba_type = numba_type self.value_type = value_type - self.abi_name = abi_name @property def kind(self): @@ -90,8 +94,8 @@ def kind(self): # needed. @property def ltoirs(self) -> Dict[str, bytes]: - advance_abi_name = self.abi_name + "_advance" - deref_abi_name = self.abi_name + "_dereference" + advance_abi_name = "advance_" + _get_abi_suffix(self.kind) + deref_abi_name = "dereference_" + _get_abi_suffix(self.kind) advance_ltoir, _ = cached_compile( self.__class__.advance, ( @@ -123,18 +127,16 @@ def dereference(state): raise NotImplementedError("Subclasses must override dereference staticmethod") def __hash__(self): - return hash( - (self.cvalue.value, self.numba_type, self.value_type, self.abi_name) - ) + return hash((self.kind, self.cvalue.value, self.numba_type, self.value_type)) def __eq__(self, other): if not isinstance(other, self.__class__): return NotImplemented return ( - self.cvalue.value == other.cvalue.value + self.kind == other.kind + and self.cvalue.value == other.cvalue.value and self.numba_type == other.numba_type and self.value_type == other.value_type - and self.abi_name == other.abi_name ) @@ -178,12 +180,10 @@ class RawPointer(IteratorBase): def __init__(self, ptr: int, value_type: types.Type): cvalue = ctypes.c_void_p(ptr) numba_type = types.CPointer(types.CPointer(value_type)) - abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( cvalue=cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @staticmethod @@ -231,12 +231,10 @@ def __init__(self, ptr: int, ntype: types.Type): cvalue = ctypes.c_void_p(ptr) value_type = ntype numba_type = types.CPointer(types.CPointer(value_type)) - abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( cvalue=cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @staticmethod @@ -259,12 +257,10 @@ def __init__(self, value: np.number): value_type = numba.from_dtype(value.dtype) cvalue = to_ctypes(value_type)(value) numba_type = types.CPointer(value_type) - abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( cvalue=cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @staticmethod @@ -287,12 +283,10 @@ def __init__(self, value: np.number): value_type = numba.from_dtype(value.dtype) cvalue = to_ctypes(value_type)(value) numba_type = types.CPointer(value_type) - abi_name = f"{self.__class__.__name__}_{str(value_type)}" super().__init__( cvalue=cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @staticmethod @@ -327,27 +321,20 @@ def __init__(self, it: IteratorBase, op: CUDADispatcher): self._it = it self._op = CachableFunction(op.py_func) numba_type = it.numba_type - # TODO: the abi name below isn't unique enough when we have e.g., - # two identically named `op` functions with different - # signatures, bytecodes, and/or closure variables. - op_abi_name = f"{self.__class__.__name__}_{op.py_func.__name__}" - # TODO: it would be nice to not need to compile `op` to get # its return type, but there's nothing in the numba API # to do that (yet), _, op_retty = cached_compile( op, (self._it.value_type,), - abi_name=op_abi_name, + abi_name=f"{op.__name__}_{_get_abi_suffix(self.kind)}", output="ltoir", ) value_type = op_retty - abi_name = f"{self.__class__.__name__}_{it.abi_name}_{op_abi_name}" super().__init__( cvalue=it.cvalue, numba_type=numba_type, value_type=value_type, - abi_name=abi_name, ) @property @@ -363,16 +350,10 @@ def dereference(state): return op(it_dereference(state)) def __hash__(self): - return hash( - ( - self._it, - self._op._func.py_func.__code__.co_code, - self._op._func.py_func.__closure__, - ) - ) + return hash((self._it, self._op)) def __eq__(self, other): - if not isinstance(other, IteratorBase): + if not isinstance(other.kind, TransformIteratorKind): return NotImplemented return self._it == other._it and self._op == other._op diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py index 89ae076cb4c..ef94800df43 100644 --- a/python/cuda_parallel/tests/test_iterators.py +++ b/python/cuda_parallel/tests/test_iterators.py @@ -48,6 +48,7 @@ def test_cache_modified_input_iterator_equality(): assert it1 == it2 assert it1 != it3 + assert it1.kind == it2.kind == it3.kind assert it1.kind != it4.kind @@ -71,6 +72,7 @@ def op3(x): assert it1 == it2 assert it1 != it3 assert it1 == it4 + assert it1.kind == it2.kind == it4.kind ary1 = cp.asarray([0, 1, 2]) From d0175721972088ce257f42c94247fc9d095dd1e6 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 30 Dec 2024 07:23:05 -0500 Subject: [PATCH 09/12] Remove __eq__ and __hash__ methods from IteratorBase --- .../experimental/iterators/_iterators.py | 13 ------ python/cuda_parallel/tests/test_iterators.py | 41 ++++--------------- 2 files changed, 8 insertions(+), 46 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py index 7e7a1652309..2d1083208b2 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py +++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py @@ -126,19 +126,6 @@ def advance(state, distance): def dereference(state): raise NotImplementedError("Subclasses must override dereference staticmethod") - def __hash__(self): - return hash((self.kind, self.cvalue.value, self.numba_type, self.value_type)) - - def __eq__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return ( - self.kind == other.kind - and self.cvalue.value == other.cvalue.value - and self.numba_type == other.numba_type - and self.value_type == other.value_type - ) - def sizeof_pointee(context, ptr): size = context.get_abi_sizeof(ptr.type.pointee) diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py index ef94800df43..3b7910d404d 100644 --- a/python/cuda_parallel/tests/test_iterators.py +++ b/python/cuda_parallel/tests/test_iterators.py @@ -14,10 +14,6 @@ def test_constant_iterator_equality(): it3 = ConstantIterator(np.int32(1)) it4 = ConstantIterator(np.int64(9)) - assert it1 == it2 - assert it1 != it3 - assert it1 != it4 - assert it1.kind == it2.kind == it3.kind assert it1.kind != it4.kind @@ -28,10 +24,6 @@ def test_counting_iterator_equality(): it3 = CountingIterator(np.int32(1)) it4 = CountingIterator(np.int64(9)) - assert it1 == it2 - assert it1 != it3 - assert it1 != it4 - assert it1.kind == it2.kind == it3.kind assert it1.kind != it4.kind @@ -46,9 +38,6 @@ def test_cache_modified_input_iterator_equality(): it3 = CacheModifiedInputIterator(ary2, "stream") it4 = CacheModifiedInputIterator(ary3, "stream") - assert it1 == it2 - assert it1 != it3 - assert it1.kind == it2.kind == it3.kind assert it1.kind != it4.kind @@ -66,31 +55,17 @@ def op3(x): it = CountingIterator(np.int32(0)) it1 = TransformIterator(it, op1) it2 = TransformIterator(it, op1) - it3 = TransformIterator(it, op2) - it4 = TransformIterator(it, op3) - - assert it1 == it2 - assert it1 != it3 - assert it1 == it4 + it3 = TransformIterator(it, op3) - assert it1.kind == it2.kind == it4.kind + assert it1.kind == it2.kind == it3.kind ary1 = cp.asarray([0, 1, 2]) ary2 = cp.asarray([3, 4, 5]) + it4 = TransformIterator(ary1, op1) it5 = TransformIterator(ary1, op1) - it6 = TransformIterator(ary1, op1) - it7 = TransformIterator(ary1, op2) - it8 = TransformIterator(ary1, op3) - it9 = TransformIterator(ary2, op1) - - assert it5 == it6 - assert it5 != it7 - assert it5 == it8 - assert it5 != it9 - - assert it5.kind == it6.kind == it8.kind == it9.kind - assert it5.kind != it7.kind - + it6 = TransformIterator(ary1, op2) + it7 = TransformIterator(ary1, op3) + it8 = TransformIterator(ary2, op1) -def test_different_iterator_types_equality(): - assert CountingIterator(np.int32(0)) != ConstantIterator(np.int64(0)) + assert it4.kind == it5.kind == it7.kind == it8.kind + assert it4.kind != it6.kind From 68235eefee4a39f4e7dd29c9a48166ddbff267a9 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 30 Dec 2024 07:32:26 -0500 Subject: [PATCH 10/12] Move helper function --- .../experimental/iterators/_iterators.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py index 2d1083208b2..076a260ca1f 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py +++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py @@ -19,14 +19,6 @@ _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8 -@lru_cache(maxsize=None) -def _get_abi_suffix(kind: "IteratorKind"): - # given an IteratorKind, return a UUID. The value - # is cached so that the same UUID is always returned - # for a given IteratorKind. - return uuid.uuid4().hex - - @lru_cache(maxsize=256) # TODO: what's a reasonable value? def cached_compile(func, sig, abi_name=None, **kwargs): return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs) @@ -46,6 +38,14 @@ def __hash__(self): return hash(self.value_type) +@lru_cache(maxsize=None) +def _get_abi_suffix(kind: IteratorKind): + # given an IteratorKind, return a UUID. The value + # is cached so that the same UUID is always returned + # for a given IteratorKind. + return uuid.uuid4().hex + + class IteratorBase: """ An Iterator is a wrapper around a pointer, and must define the following: From f892265643bef96b71a1727f876108cbd8c9d538 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 30 Dec 2024 09:06:40 -0500 Subject: [PATCH 11/12] Formatting --- .../cuda/parallel/experimental/iterators/_iterators.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py index 076a260ca1f..babc4a12a11 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py +++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py @@ -40,9 +40,8 @@ def __hash__(self): @lru_cache(maxsize=None) def _get_abi_suffix(kind: IteratorKind): - # given an IteratorKind, return a UUID. The value - # is cached so that the same UUID is always returned - # for a given IteratorKind. + # given an IteratorKind, return a UUID. The value is cached so + # that the same UUID is always returned for a given IteratorKind. return uuid.uuid4().hex From 5e3d95aa819e8d7bb161f9b3d183ed70e009437a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 31 Dec 2024 13:57:06 -0500 Subject: [PATCH 12/12] Don't unpack tuple in cache key --- python/cuda_parallel/cuda/parallel/experimental/_caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py index 3d79436b1b3..2647c1835ee 100644 --- a/python/cuda_parallel/cuda/parallel/experimental/_caching.py +++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py @@ -25,7 +25,7 @@ def deco(func): @functools.wraps(func) def inner(*args, **kwargs): cc = cuda.get_current_device().compute_capability - cache_key = (key(*args, **kwargs), *cc) + cache_key = (key(*args, **kwargs), cc) if cache_key not in cache: result = func(*args, **kwargs) cache[cache_key] = result