From cb0eccc89aa5ebd9477c80adf923a233012313cb Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 21 Dec 2024 08:19:41 -0500
Subject: [PATCH 01/12] Define __eq__ and __hash__ for Iterators

---
 .../experimental/iterators/_iterators.py      | 101 ++++++++++++------
 python/cuda_parallel/tests/test_iterators.py  |  58 ++++++++++
 2 files changed, 128 insertions(+), 31 deletions(-)
 create mode 100644 python/cuda_parallel/tests/test_iterators.py

diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
index e92578efbe7..6be218bc95d 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
@@ -6,6 +6,7 @@
 from llvmlite import ir
 from numba.core.extending import intrinsic, overload
 from numba.core.typing.ctypes_utils import to_ctypes
+from numba.cuda.dispatcher import CUDADispatcher
 from numba import cuda, types
 import numba
 import numpy as np
@@ -15,6 +16,19 @@
 _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8
 
 
+def _compare_funcs(func1, func2):
+    # return True if the functions compare equal for
+    # caching purposes, False otherwise
+    code1 = func1.__code__
+    code2 = func2.__code__
+
+    return (
+        code1.co_code == code2.co_code
+        and code1.co_consts == code2.co_consts
+        and func1.__closure__ == func2.__closure__
+    )
+
+
 @lru_cache(maxsize=256)  # TODO: what's a reasonable value?
 def cached_compile(func, sig, abi_name=None, **kwargs):
     return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs)
@@ -24,12 +38,10 @@ class IteratorBase:
     """
     An Iterator is a wrapper around a pointer, and must define the following:
 
-    - a `state` property that returns a `ctypes.c_void_p` object, representing
-      a pointer to some data.
-    - an `advance` (static) method that receives the state pointer and performs
+    - an `advance` (static) method that receives the pointer and performs
       an action that advances the pointer by the offset `distance`
       (returns nothing).
-    - a `dereference` (static) method that dereferences the state pointer
+    - a `dereference` (static) method that dereferences the pointer
       and returns a value.
 
     Iterators are not meant to be used directly. They are constructed and passed
@@ -38,18 +50,28 @@ class IteratorBase:
     The `advance` and `dereference` must be compilable to device code by numba.
     """
 
-    def __init__(self, numba_type: types.Type, value_type: types.Type, abi_name: str):
+    def __init__(
+        self,
+        cvalue: ctypes.c_void_p,
+        numba_type: types.Type,
+        value_type: types.Type,
+        abi_name: str,
+    ):
         """
         Parameters
         ----------
+        cvalue
+          A ctypes type representing the object pointed to by the iterator.
         numba_type
-          A numba type that specifies how to interpret the state pointer.
+          A numba type representing the type of the input to the advance
+          and dereference functions.
         value_type
           The numba type of the value returned by the dereference operation.
         abi_name
           A unique identifier that will determine the abi_names for the
           advance and dereference operations.
         """
+        self.cvalue = cvalue
         self.numba_type = numba_type
         self.value_type = value_type
         self.abi_name = abi_name
@@ -81,7 +103,7 @@ def ltoirs(self) -> Dict[str, bytes]:
 
     @property
     def state(self) -> ctypes.c_void_p:
-        raise NotImplementedError("Subclasses must override advance staticmethod")
+        return ctypes.cast(ctypes.pointer(self.cvalue), ctypes.c_void_p)
 
     @staticmethod
     def advance(state, distance):
@@ -91,6 +113,21 @@ def advance(state, distance):
     def dereference(state):
         raise NotImplementedError("Subclasses must override dereference staticmethod")
 
+    def __hash__(self):
+        return hash(
+            (self.cvalue.value, self.numba_type, self.value_type, self.abi_name)
+        )
+
+    def __eq__(self, other):
+        if not isinstance(other, self.__class__):
+            return NotImplemented
+        return (
+            self.cvalue.value == other.cvalue.value
+            and self.numba_type == other.numba_type
+            and self.value_type == other.value_type
+            and self.abi_name == other.abi_name
+        )
+
 
 def sizeof_pointee(context, ptr):
     size = context.get_abi_sizeof(ptr.type.pointee)
@@ -125,10 +162,11 @@ def impl(ptr, offset):
 class RawPointer(IteratorBase):
     def __init__(self, ptr: int, ntype: types.Type):
         value_type = ntype
-        self._cvalue = ctypes.c_void_p(ptr)
+        cvalue = ctypes.c_void_p(ptr)
         numba_type = types.CPointer(types.CPointer(value_type))
         abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
+            cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
             abi_name=abi_name,
@@ -142,10 +180,6 @@ def advance(state, distance):
     def dereference(state):
         return state[0][0]
 
-    @property
-    def state(self) -> ctypes.c_void_p:
-        return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p)
-
 
 def pointer(container, ntype: types.Type) -> RawPointer:
     return RawPointer(container.__cuda_array_interface__["data"][0], ntype)
@@ -174,11 +208,12 @@ def codegen(context, builder, sig, args):
 
 class CacheModifiedPointer(IteratorBase):
     def __init__(self, ptr: int, ntype: types.Type):
-        self._cvalue = ctypes.c_void_p(ptr)
+        cvalue = ctypes.c_void_p(ptr)
         value_type = ntype
         numba_type = types.CPointer(types.CPointer(value_type))
         abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
+            cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
             abi_name=abi_name,
@@ -192,18 +227,15 @@ def advance(state, distance):
     def dereference(state):
         return load_cs(state[0])
 
-    @property
-    def state(self) -> ctypes.c_void_p:
-        return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p)
-
 
 class ConstantIterator(IteratorBase):
     def __init__(self, value: np.number):
         value_type = numba.from_dtype(value.dtype)
-        self._cvalue = to_ctypes(value_type)(value)
+        cvalue = to_ctypes(value_type)(value)
         numba_type = types.CPointer(value_type)
         abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
+            cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
             abi_name=abi_name,
@@ -217,18 +249,15 @@ def advance(state, distance):
     def dereference(state):
         return state[0]
 
-    @property
-    def state(self) -> ctypes.c_void_p:
-        return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p)
-
 
 class CountingIterator(IteratorBase):
     def __init__(self, value: np.number):
         value_type = numba.from_dtype(value.dtype)
-        self._cvalue = to_ctypes(value_type)(value)
+        cvalue = to_ctypes(value_type)(value)
         numba_type = types.CPointer(value_type)
         abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
+            cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
             abi_name=abi_name,
@@ -242,10 +271,6 @@ def advance(state, distance):
     def dereference(state):
         return state[0]
 
-    @property
-    def state(self) -> ctypes.c_void_p:
-        return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p)
-
 
 def make_transform_iterator(it, op: Callable):
     if hasattr(it, "__cuda_array_interface__"):
@@ -256,8 +281,9 @@ def make_transform_iterator(it, op: Callable):
     op = cuda.jit(op, device=True)
 
     class TransformIterator(IteratorBase):
-        def __init__(self, it: IteratorBase, op):
+        def __init__(self, it: IteratorBase, op: CUDADispatcher):
             self._it = it
+            self._op = op
             numba_type = it.numba_type
             # TODO: the abi name below isn't unique enough when we have e.g.,
             # two identically named `op` functions with different
@@ -276,6 +302,7 @@ def __init__(self, it: IteratorBase, op):
             value_type = op_retty
             abi_name = f"{self.__class__.__name__}_{it.abi_name}_{op_abi_name}"
             super().__init__(
+                cvalue=it.cvalue,
                 numba_type=numba_type,
                 value_type=value_type,
                 abi_name=abi_name,
@@ -289,8 +316,20 @@ def advance(state, distance):
         def dereference(state):
             return op(it_dereference(state))
 
-        @property
-        def state(self) -> ctypes.c_void_p:
-            return it.state
+        def __hash__(self):
+            return hash(
+                (
+                    self._it,
+                    self._op.py_func.__code__.co_code,
+                    self._op.py_func.__closure__,
+                )
+            )
+
+        def __eq__(self, other):
+            if not isinstance(other, IteratorBase):
+                return NotImplemented
+            return self._it == other._it and _compare_funcs(
+                self._op.py_func, other._op.py_func
+            )
 
     return TransformIterator(it, op)
diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py
new file mode 100644
index 00000000000..d5d15e8174b
--- /dev/null
+++ b/python/cuda_parallel/tests/test_iterators.py
@@ -0,0 +1,58 @@
+from cuda.parallel.experimental.iterators import (
+    CacheModifiedInputIterator,
+    ConstantIterator,
+    CountingIterator,
+    TransformIterator,
+)
+import cupy as cp
+import numpy as np
+
+
+def test_constant_iterator_equality():
+    assert ConstantIterator(np.int32(0)) == ConstantIterator(np.int32(0))
+    assert ConstantIterator(np.int32(0)) != ConstantIterator(np.int32(1))
+    assert ConstantIterator(np.int32(0)) != ConstantIterator(np.int64(0))
+
+
+def test_counting_iterator_equality():
+    assert CountingIterator(np.int32(0)) == CountingIterator(np.int32(0))
+    assert CountingIterator(np.int32(0)) != CountingIterator(np.int32(1))
+    assert CountingIterator(np.int32(0)) != CountingIterator(np.int64(0))
+
+
+def test_cache_modified_input_iterator_equality():
+    ary1 = cp.asarray([0, 1, 2])
+    ary2 = cp.asarray([3, 4, 5])
+    assert CacheModifiedInputIterator(ary1, "stream") == CacheModifiedInputIterator(
+        ary1, "stream"
+    )
+    assert CacheModifiedInputIterator(ary1, "stream") != CacheModifiedInputIterator(
+        ary2, "stream"
+    )
+
+
+def test_equality_transform_iterator():
+    def op1(x):
+        return x
+
+    def op2(x):
+        return 2 * x
+
+    def op3(x):
+        return x
+
+    it = CountingIterator(np.int32(0))
+    assert TransformIterator(it, op1) == TransformIterator(it, op1)
+    assert TransformIterator(it, op1) != TransformIterator(it, op2)
+    assert TransformIterator(it, op1) == TransformIterator(it, op3)
+
+    ary1 = cp.asarray([0, 1, 2])
+    ary2 = cp.asarray([3, 4, 5])
+    assert TransformIterator(ary1, op1) == TransformIterator(ary1, op1)
+    assert TransformIterator(ary1, op1) != TransformIterator(ary1, op2)
+    assert TransformIterator(ary1, op1) == TransformIterator(ary1, op3)
+    assert TransformIterator(ary1, op1) != TransformIterator(ary2, op1)
+
+
+def test_different_iterator_types_equality():
+    assert CountingIterator(np.int32(0)) != ConstantIterator(np.int64(0))

From da652e1aecfd8f48740df32f7ef9021288161961 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 21 Dec 2024 08:20:30 -0500
Subject: [PATCH 02/12] Define cache_with_key utility and use it to cache
 Reduce objects

---
 .../cuda/parallel/experimental/_caching.py    | 31 +++++++++++++++++++
 .../experimental/algorithms/reduce.py         | 10 ++++++
 2 files changed, 41 insertions(+)
 create mode 100644 python/cuda_parallel/cuda/parallel/experimental/_caching.py

diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
new file mode 100644
index 00000000000..5713ae35d3f
--- /dev/null
+++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import functools
+
+
+def cache_with_key(key):
+    """
+    Decorator to cache the result of the decorated function.  Uses the
+    provided `key` function to compute the key for cache lookup. `key`
+    receives all arguments passed to the function.
+    """
+
+    def deco(func):
+        cache = {}
+
+        @functools.wraps(func)
+        def inner(*args, **kwargs):
+            cache_key = key(*args, **kwargs)
+            if cache_key not in cache:
+                result = func(*args, **kwargs)
+                cache[cache_key] = result
+            # `cache_key` *must* be in `cache`, use `.get()`
+            # as it is faster:
+            return cache.get(cache_key)
+
+        return inner
+
+    return deco
diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
index 29e7786b5f8..7ea9084ce82 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
@@ -12,6 +12,7 @@
 
 from .. import _cccl as cccl
 from .._bindings import get_paths, get_bindings
+from .._caching import cache_with_key
 
 
 class _Op:
@@ -119,8 +120,17 @@ def __del__(self):
         bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result))
 
 
+def make_cache_key(d_in, d_out, op, h_init):
+    d_in_key = d_in.dtype if hasattr(d_in, "__cuda_array_interface__") else d_in
+    d_out_key = d_out.dtype if hasattr(d_out, "__cuda_array_interface__") else d_out
+    op_key = (op.__code__.co_code, op.__code__.co_consts, op.__closure__)
+    h_init_key = h_init.dtype
+    return (d_in_key, d_out_key, op_key, h_init_key)
+
+
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
+@cache_with_key(make_cache_key)
 def reduce_into(d_in, d_out, op: Callable, h_init: np.ndarray):
     """Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``.
 

From 221af5c5745c1349a872841eb546e6b906e9e07a Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 21 Dec 2024 08:20:54 -0500
Subject: [PATCH 03/12] Add tests for caching Reduce objects

---
 python/cuda_parallel/tests/test_reduce.py | 213 ++++++++++++++++++----
 1 file changed, 182 insertions(+), 31 deletions(-)

diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py
index 0f454e3603b..d890a656f3f 100644
--- a/python/cuda_parallel/tests/test_reduce.py
+++ b/python/cuda_parallel/tests/test_reduce.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import cupy as cp
-import numpy
+import numpy as np
 import pytest
 import random
 import numba.cuda
@@ -14,31 +14,29 @@
 
 
 def random_int(shape, dtype):
-    return numpy.random.randint(0, 5, size=shape).astype(dtype)
+    return np.random.randint(0, 5, size=shape).astype(dtype)
 
 
 def type_to_problem_sizes(dtype):
-    if dtype in [numpy.uint8, numpy.int8]:
+    if dtype in [np.uint8, np.int8]:
         return [2, 4, 5, 6]
-    elif dtype in [numpy.uint16, numpy.int16]:
+    elif dtype in [np.uint16, np.int16]:
         return [4, 8, 12, 14]
-    elif dtype in [numpy.uint32, numpy.int32]:
+    elif dtype in [np.uint32, np.int32]:
         return [16, 20, 24, 28]
-    elif dtype in [numpy.uint64, numpy.int64]:
+    elif dtype in [np.uint64, np.int64]:
         return [16, 20, 24, 28]
     else:
         raise ValueError("Unsupported dtype")
 
 
-@pytest.mark.parametrize(
-    "dtype", [numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64]
-)
+@pytest.mark.parametrize("dtype", [np.uint8, np.uint16, np.uint32, np.uint64])
 def test_device_reduce(dtype):
     def op(a, b):
         return a + b
 
     init_value = 42
-    h_init = numpy.array([init_value], dtype=dtype)
+    h_init = np.array([init_value], dtype=dtype)
     d_output = numba.cuda.device_array(1, dtype=dtype)
     reduce_into = algorithms.reduce_into(d_output, d_output, op, h_init)
 
@@ -47,7 +45,7 @@ def op(a, b):
         h_input = random_int(num_items, dtype)
         d_input = numba.cuda.to_device(h_input)
         temp_storage_size = reduce_into(None, d_input, d_output, None, h_init)
-        d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=numpy.uint8)
+        d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=np.uint8)
         reduce_into(d_temp_storage, d_input, d_output, None, h_init)
         h_output = d_output.copy_to_host()
         assert h_output[0] == sum(h_input) + init_value
@@ -57,19 +55,19 @@ def test_complex_device_reduce():
     def op(a, b):
         return a + b
 
-    h_init = numpy.array([40.0 + 2.0j], dtype=complex)
+    h_init = np.array([40.0 + 2.0j], dtype=complex)
     d_output = numba.cuda.device_array(1, dtype=complex)
     reduce_into = algorithms.reduce_into(d_output, d_output, op, h_init)
 
     for num_items in [42, 420000]:
-        h_input = numpy.random.random(num_items) + 1j * numpy.random.random(num_items)
+        h_input = np.random.random(num_items) + 1j * np.random.random(num_items)
         d_input = numba.cuda.to_device(h_input)
         temp_storage_bytes = reduce_into(None, d_input, d_output, None, h_init)
-        d_temp_storage = numba.cuda.device_array(temp_storage_bytes, numpy.uint8)
+        d_temp_storage = numba.cuda.device_array(temp_storage_bytes, np.uint8)
         reduce_into(d_temp_storage, d_input, d_output, None, h_init)
 
         result = d_output.copy_to_host()[0]
-        expected = numpy.sum(h_input, initial=h_init[0])
+        expected = np.sum(h_input, initial=h_init[0])
         assert result == pytest.approx(expected)
 
 
@@ -77,9 +75,9 @@ def test_device_reduce_dtype_mismatch():
     def min_op(a, b):
         return a if a < b else b
 
-    dtypes = [numpy.int32, numpy.int64]
-    h_inits = [numpy.array([], dt) for dt in dtypes]
-    h_inputs = [numpy.array([], dt) for dt in dtypes]
+    dtypes = [np.int32, np.int64]
+    h_inits = [np.array([], dt) for dt in dtypes]
+    h_inputs = [np.array([], dt) for dt in dtypes]
     d_outputs = [numba.cuda.device_array(1, dt) for dt in dtypes]
     d_inputs = [numba.cuda.to_device(h_inp) for h_inp in h_inputs]
 
@@ -109,14 +107,14 @@ def add_op(a, b):
         expected_result = add_op(expected_result, v)
 
     if use_numpy_array:
-        h_input = numpy.array(l_varr, dtype_inp)
+        h_input = np.array(l_varr, dtype_inp)
         d_input = numba.cuda.to_device(h_input)
     else:
         d_input = i_input
 
     d_output = numba.cuda.device_array(1, dtype_out)  # to store device sum
 
-    h_init = numpy.array([start_sum_with], dtype_out)
+    h_init = np.array([start_sum_with], dtype_out)
 
     reduce_into = algorithms.reduce_into(
         d_in=d_input, d_out=d_output, op=add_op, h_init=h_init
@@ -125,7 +123,7 @@ def add_op(a, b):
     temp_storage_size = reduce_into(
         None, d_in=d_input, d_out=d_output, num_items=len(l_varr), h_init=h_init
     )
-    d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=numpy.uint8)
+    d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=np.uint8)
 
     reduce_into(d_temp_storage, d_input, d_output, len(l_varr), h_init)
 
@@ -168,9 +166,9 @@ def test_device_sum_cache_modified_input_it(
 ):
     rng = random.Random(0)
     l_varr = [rng.randrange(100) for _ in range(num_items)]
-    dtype_inp = numpy.dtype(supported_value_type)
+    dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
-    input_devarr = numba.cuda.to_device(numpy.array(l_varr, dtype=dtype_inp))
+    input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp))
     i_input = iterators.CacheModifiedInputIterator(input_devarr, modifier="stream")
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
@@ -181,7 +179,7 @@ def test_device_sum_constant_it(
     use_numpy_array, supported_value_type, num_items=3, start_sum_with=10
 ):
     l_varr = [42 for distance in range(num_items)]
-    dtype_inp = numpy.dtype(supported_value_type)
+    dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
     i_input = iterators.ConstantIterator(dtype_inp.type(42))
     _test_device_sum_with_iterator(
@@ -193,7 +191,7 @@ def test_device_sum_counting_it(
     use_numpy_array, supported_value_type, num_items=3, start_sum_with=10
 ):
     l_varr = [start_sum_with + distance for distance in range(num_items)]
-    dtype_inp = numpy.dtype(supported_value_type)
+    dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
     i_input = iterators.CountingIterator(dtype_inp.type(start_sum_with))
     _test_device_sum_with_iterator(
@@ -217,8 +215,8 @@ def test_device_sum_map_mul2_count_it(
 ):
     l_varr = [2 * (start_sum_with + distance) for distance in range(num_items)]
     vtn_out, vtn_inp = value_type_name_pair
-    dtype_inp = numpy.dtype(vtn_inp)
-    dtype_out = numpy.dtype(vtn_out)
+    dtype_inp = np.dtype(vtn_inp)
+    dtype_out = np.dtype(vtn_out)
     i_input = iterators.TransformIterator(
         iterators.CountingIterator(dtype_inp.type(start_sum_with)), mul2
     )
@@ -248,8 +246,8 @@ def test_device_sum_map_mul_map_mul_count_it(
         fac_out * (fac_mid * (start_sum_with + distance))
         for distance in range(num_items)
     ]
-    dtype_inp = numpy.dtype(vtn_inp)
-    dtype_out = numpy.dtype(vtn_out)
+    dtype_inp = np.dtype(vtn_inp)
+    dtype_out = np.dtype(vtn_out)
     mul_funcs = {2: mul2, 3: mul3}
     i_input = iterators.TransformIterator(
         iterators.TransformIterator(
@@ -275,8 +273,8 @@ def test_device_sum_map_mul2_cp_array_it(
     use_numpy_array, value_type_name_pair, num_items=3, start_sum_with=10
 ):
     vtn_out, vtn_inp = value_type_name_pair
-    dtype_inp = numpy.dtype(vtn_inp)
-    dtype_out = numpy.dtype(vtn_out)
+    dtype_inp = np.dtype(vtn_inp)
+    dtype_out = np.dtype(vtn_out)
     rng = random.Random(0)
     l_d_in = [rng.randrange(100) for _ in range(num_items)]
     a_d_in = cp.array(l_d_in, dtype_inp)
@@ -285,3 +283,156 @@ def test_device_sum_map_mul2_cp_array_it(
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
     )
+
+
+def test_reducer_caching():
+    def sum_op(x, y):
+        return x + y
+
+    # inputs are device arrays
+    reducer_1 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are device arrays of different dtype:
+    reducer_1 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int32"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2
+
+    # outputs are of different dtype:
+    reducer_1 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int32"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2
+
+    # inputs are of same dtype but different size
+    # (should still use cached reducer):
+    reducer_1 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        cp.zeros(5, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are counting iterators of the
+    # same value type:
+    reducer_1 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are counting iterators of different value type:
+    reducer_1 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int64(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2
+
+    def op1(x):
+        return x
+
+    def op2(x):
+        return 2 * x
+
+    def op3(x):
+        return x
+
+    # inputs are TransformIterators
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are TransformIterators with different
+    # op:
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op2),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2
+
+    # inputs are TransformIterators with same op
+    # but different name:
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op3),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2

From 0eea142ebb72a4ca86bd3fb67a55a3169dcec9b2 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sun, 22 Dec 2024 07:12:53 -0500
Subject: [PATCH 04/12] Tighten up types

---
 .../parallel/experimental/_utils/__init__.py  |  0
 .../cuda/parallel/experimental/_utils/cai.py  | 16 ++++++++++
 .../experimental/algorithms/reduce.py         | 31 +++++++++++++++----
 .../cuda/parallel/experimental/typing.py      | 10 ++++++
 4 files changed, 51 insertions(+), 6 deletions(-)
 create mode 100644 python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py
 create mode 100644 python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py
 create mode 100644 python/cuda_parallel/cuda/parallel/experimental/typing.py

diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py
new file mode 100644
index 00000000000..9c0718e71f0
--- /dev/null
+++ b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Utilities for extracting information from `__cuda_array_interface__`.
+"""
+
+import numpy as np
+
+from ..typing import DeviceArrayLike
+
+
+def get_dtype(arr: DeviceArrayLike) -> np.dtype:
+    return np.dtype(arr.__cuda_array_interface__["typestr"])
diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
index 7ea9084ce82..7cdde1c9132 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
@@ -13,6 +13,9 @@
 from .. import _cccl as cccl
 from .._bindings import get_paths, get_bindings
 from .._caching import cache_with_key
+from ..typing import DeviceArrayLike
+from ..iterators._iterators import IteratorBase
+from .._utils import cai as cai
 
 
 class _Op:
@@ -42,12 +45,18 @@ def _dtype_validation(dt1, dt2):
 
 class _Reduce:
     # TODO: constructor shouldn't require concrete `d_in`, `d_out`:
-    def __init__(self, d_in, d_out, op: Callable, h_init: np.ndarray):
+    def __init__(
+        self,
+        d_in: DeviceArrayLike | IteratorBase,
+        d_out: DeviceArrayLike,
+        op: Callable,
+        h_init: np.ndarray,
+    ):
         d_in_cccl = cccl.to_cccl_iter(d_in)
         self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name(
             d_in_cccl.value_type.type.value
         )
-        self._ctor_d_out_dtype = d_out.dtype
+        self._ctor_d_out_dtype = cai.get_dtype(d_out)
         self._ctor_init_dtype = h_init.dtype
         cc_major, cc_minor = cuda.get_current_device().compute_capability
         cub_path, thrust_path, libcudacxx_path, cuda_include_path = get_paths()
@@ -120,9 +129,14 @@ def __del__(self):
         bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result))
 
 
-def make_cache_key(d_in, d_out, op, h_init):
-    d_in_key = d_in.dtype if hasattr(d_in, "__cuda_array_interface__") else d_in
-    d_out_key = d_out.dtype if hasattr(d_out, "__cuda_array_interface__") else d_out
+def make_cache_key(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_out: DeviceArrayLike,
+    op: Callable,
+    h_init: np.ndarray,
+):
+    d_in_key = d_in if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in)
+    d_out_key = d_out if isinstance(d_out, IteratorBase) else cai.get_dtype(d_out)
     op_key = (op.__code__.co_code, op.__code__.co_consts, op.__closure__)
     h_init_key = h_init.dtype
     return (d_in_key, d_out_key, op_key, h_init_key)
@@ -131,7 +145,12 @@ def make_cache_key(d_in, d_out, op, h_init):
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
 @cache_with_key(make_cache_key)
-def reduce_into(d_in, d_out, op: Callable, h_init: np.ndarray):
+def reduce_into(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_out: DeviceArrayLike,
+    op: Callable,
+    h_init: np.ndarray,
+):
     """Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``.
 
     Example:
diff --git a/python/cuda_parallel/cuda/parallel/experimental/typing.py b/python/cuda_parallel/cuda/parallel/experimental/typing.py
new file mode 100644
index 00000000000..87f72999bfe
--- /dev/null
+++ b/python/cuda_parallel/cuda/parallel/experimental/typing.py
@@ -0,0 +1,10 @@
+from typing import Protocol
+
+
+class DeviceArrayLike(Protocol):
+    """
+    Objects representing a device array, having a `.__cuda_array_interface__`
+    attribute.
+    """
+
+    __cuda_array_interface__: dict

From f198200c462e8307f63f914ce6c2cd7e51eb3809 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 23 Dec 2024 09:48:57 -0500
Subject: [PATCH 05/12] Updates to support 3.7

---
 .../cuda/parallel/experimental/algorithms/reduce.py           | 2 ++
 python/cuda_parallel/cuda/parallel/experimental/typing.py     | 4 +++-
 python/cuda_parallel/setup.py                                 | 3 ++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
index 7cdde1c9132..605e71b7172 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
@@ -3,6 +3,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from __future__ import annotations  # TODO: required for Python 3.7 docs env
+
 import ctypes
 import numba
 import numpy as np
diff --git a/python/cuda_parallel/cuda/parallel/experimental/typing.py b/python/cuda_parallel/cuda/parallel/experimental/typing.py
index 87f72999bfe..1c4e9c9975f 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/typing.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/typing.py
@@ -1,4 +1,6 @@
-from typing import Protocol
+from typing_extensions import (
+    Protocol,
+)  # TODO: typing_extensions required for Python 3.7 docs env
 
 
 class DeviceArrayLike(Protocol):
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 40c998fafee..0db6592bb05 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -100,7 +100,8 @@ def build_extension(self, ext):
     ],
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
-    install_requires=["numba>=0.60.0", "cuda-python", "jinja2"],
+    # TODO: typing_extensions required for Python 3.7 docs env
+    install_requires=["numba>=0.60.0", "cuda-python", "jinja2", "typing_extensions"],
     extras_require={
         "test": [
             "pytest",

From 56f2c61f774e965df75b55b9c3028711c5983a53 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 23 Dec 2024 10:33:43 -0500
Subject: [PATCH 06/12] Address review feedback

---
 .../cuda/parallel/experimental/_caching.py          | 13 +++++++++----
 .../cuda/parallel/experimental/algorithms/reduce.py |  4 ++--
 .../parallel/experimental/iterators/_iterators.py   |  7 +++----
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
index 5713ae35d3f..0dff034921b 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_caching.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import functools
+from numba import cuda
 
 
 def cache_with_key(key):
@@ -11,6 +12,11 @@ def cache_with_key(key):
     Decorator to cache the result of the decorated function.  Uses the
     provided `key` function to compute the key for cache lookup. `key`
     receives all arguments passed to the function.
+
+    Notes
+    -----
+    The CUDA compute capability of the current device is appended to
+    the cache key returned by `key`.
     """
 
     def deco(func):
@@ -18,13 +24,12 @@ def deco(func):
 
         @functools.wraps(func)
         def inner(*args, **kwargs):
-            cache_key = key(*args, **kwargs)
+            cc = cuda.get_current_device().compute_capability
+            cache_key = (key(*args, **kwargs), *cc)
             if cache_key not in cache:
                 result = func(*args, **kwargs)
                 cache[cache_key] = result
-            # `cache_key` *must* be in `cache`, use `.get()`
-            # as it is faster:
-            return cache.get(cache_key)
+            return cache[cache_key]
 
         return inner
 
diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
index 605e71b7172..23b43cd252d 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
@@ -17,7 +17,7 @@
 from .._caching import cache_with_key
 from ..typing import DeviceArrayLike
 from ..iterators._iterators import IteratorBase
-from .._utils import cai as cai
+from .._utils import cai
 
 
 class _Op:
@@ -138,7 +138,7 @@ def make_cache_key(
     h_init: np.ndarray,
 ):
     d_in_key = d_in if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in)
-    d_out_key = d_out if isinstance(d_out, IteratorBase) else cai.get_dtype(d_out)
+    d_out_key = cai.get_dtype(d_out)
     op_key = (op.__code__.co_code, op.__code__.co_consts, op.__closure__)
     h_init_key = h_init.dtype
     return (d_in_key, d_out_key, op_key, h_init_key)
diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
index 6be218bc95d..c49eee008bf 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
@@ -160,8 +160,7 @@ def impl(ptr, offset):
 
 
 class RawPointer(IteratorBase):
-    def __init__(self, ptr: int, ntype: types.Type):
-        value_type = ntype
+    def __init__(self, ptr: int, value_type: types.Type):
         cvalue = ctypes.c_void_p(ptr)
         numba_type = types.CPointer(types.CPointer(value_type))
         abi_name = f"{self.__class__.__name__}_{str(value_type)}"
@@ -181,8 +180,8 @@ def dereference(state):
         return state[0][0]
 
 
-def pointer(container, ntype: types.Type) -> RawPointer:
-    return RawPointer(container.__cuda_array_interface__["data"][0], ntype)
+def pointer(container, value_type: types.Type) -> RawPointer:
+    return RawPointer(container.__cuda_array_interface__["data"][0], value_type)
 
 
 @intrinsic

From c822da7e100c8b9b255379dd736779055d922b3a Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 23 Dec 2024 12:28:32 -0500
Subject: [PATCH 07/12] Introduce IteratorKind to hold iterator type
 information

---
 .../cuda/parallel/experimental/_caching.py    | 40 +++++++++
 .../experimental/algorithms/reduce.py         |  6 +-
 .../experimental/iterators/_iterators.py      | 83 ++++++++++++++-----
 python/cuda_parallel/tests/test_iterators.py  | 78 ++++++++++++-----
 python/cuda_parallel/tests/test_reduce.py     | 65 +++++++++++++++
 5 files changed, 229 insertions(+), 43 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
index 0dff034921b..fa9165b7ad8 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_caching.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
@@ -34,3 +34,43 @@ def inner(*args, **kwargs):
         return inner
 
     return deco
+
+
+class CachableFunction:
+    """
+    A type that wraps a function and provides custom comparison
+    (__eq__) and hash (__hash__) implementations.
+
+    The purpose of this class is to enable caching and comparison of
+    functions based on their bytecode, constants, and closures, while
+    ignoring other attributes such as their names or docstrings.
+    """
+
+    def __init__(self, func):
+        self._func = func
+
+    def __eq__(self, other):
+        func1, func2 = self._func, other._func
+
+        # return True if the functions compare equal for
+        # caching purposes, False otherwise
+        code1 = func1.__code__
+        code2 = func2.__code__
+
+        return (
+            code1.co_code == code2.co_code
+            and code1.co_consts == code2.co_consts
+            and func1.__closure__ == func2.__closure__
+        )
+
+    def __hash__(self):
+        return hash(
+            (
+                self._func.__code__.co_code,
+                self._func.__code__.co_consts,
+                self._func.__closure__,
+            )
+        )
+
+    def __repr__(self):
+        return str(self._func)
diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
index 23b43cd252d..41843742827 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
@@ -14,7 +14,7 @@
 
 from .. import _cccl as cccl
 from .._bindings import get_paths, get_bindings
-from .._caching import cache_with_key
+from .._caching import CachableFunction, cache_with_key
 from ..typing import DeviceArrayLike
 from ..iterators._iterators import IteratorBase
 from .._utils import cai
@@ -137,9 +137,9 @@ def make_cache_key(
     op: Callable,
     h_init: np.ndarray,
 ):
-    d_in_key = d_in if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in)
+    d_in_key = d_in.kind if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in)
     d_out_key = cai.get_dtype(d_out)
-    op_key = (op.__code__.co_code, op.__code__.co_consts, op.__closure__)
+    op_key = CachableFunction(op)
     h_init_key = h_init.dtype
     return (d_in_key, d_out_key, op_key, h_init_key)
 
diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
index c49eee008bf..8ed4174da07 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
@@ -11,29 +11,32 @@
 import numba
 import numpy as np
 
+from .._caching import CachableFunction
+
 
 _DEVICE_POINTER_SIZE = 8
 _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8
 
 
-def _compare_funcs(func1, func2):
-    # return True if the functions compare equal for
-    # caching purposes, False otherwise
-    code1 = func1.__code__
-    code2 = func2.__code__
-
-    return (
-        code1.co_code == code2.co_code
-        and code1.co_consts == code2.co_consts
-        and func1.__closure__ == func2.__closure__
-    )
-
-
 @lru_cache(maxsize=256)  # TODO: what's a reasonable value?
 def cached_compile(func, sig, abi_name=None, **kwargs):
     return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs)
 
 
+class IteratorKind:
+    def __init__(self, value_type):
+        self.value_type = value_type
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}[{str(self.value_type)}]"
+
+    def __eq__(self, other):
+        return type(self) is type(other) and self.value_type == other.value_type
+
+    def __hash__(self):
+        return hash(self.value_type)
+
+
 class IteratorBase:
     """
     An Iterator is a wrapper around a pointer, and must define the following:
@@ -50,6 +53,8 @@ class IteratorBase:
     The `advance` and `dereference` must be compilable to device code by numba.
     """
 
+    iterator_kind_type: type  # must be a subclass of IteratorKind
+
     def __init__(
         self,
         cvalue: ctypes.c_void_p,
@@ -76,6 +81,10 @@ def __init__(
         self.value_type = value_type
         self.abi_name = abi_name
 
+    @property
+    def kind(self):
+        return self.__class__.iterator_kind_type(self.value_type)
+
     # TODO: should we cache this? Current docs environment doesn't allow
     # using Python > 3.7. We could use a hand-rolled cached_property if
     # needed.
@@ -159,7 +168,13 @@ def impl(ptr, offset):
     return impl
 
 
+class RawPointerType(IteratorKind):
+    pass
+
+
 class RawPointer(IteratorBase):
+    iterator_kind_type = RawPointerType
+
     def __init__(self, ptr: int, value_type: types.Type):
         cvalue = ctypes.c_void_p(ptr)
         numba_type = types.CPointer(types.CPointer(value_type))
@@ -205,7 +220,13 @@ def codegen(context, builder, sig, args):
     return base.dtype(base), codegen
 
 
+class CacheModifiedPointerType(IteratorKind):
+    pass
+
+
 class CacheModifiedPointer(IteratorBase):
+    iterator_kind_type = CacheModifiedPointerType
+
     def __init__(self, ptr: int, ntype: types.Type):
         cvalue = ctypes.c_void_p(ptr)
         value_type = ntype
@@ -227,7 +248,13 @@ def dereference(state):
         return load_cs(state[0])
 
 
+class ConstantIteratorKind(IteratorKind):
+    pass
+
+
 class ConstantIterator(IteratorBase):
+    iterator_kind_type = ConstantIteratorKind
+
     def __init__(self, value: np.number):
         value_type = numba.from_dtype(value.dtype)
         cvalue = to_ctypes(value_type)(value)
@@ -249,7 +276,13 @@ def dereference(state):
         return state[0]
 
 
+class CountingIteratorKind(IteratorKind):
+    pass
+
+
 class CountingIterator(IteratorBase):
+    iterator_kind_type = CountingIteratorKind
+
     def __init__(self, value: np.number):
         value_type = numba.from_dtype(value.dtype)
         cvalue = to_ctypes(value_type)(value)
@@ -271,6 +304,14 @@ def dereference(state):
         return state[0]
 
 
+class TransformIteratorKind(IteratorKind):
+    def __eq__(self, other):
+        return type(self) is type(other) and self.value_type == other.value_type
+
+    def __hash__(self):
+        return hash(self.value_type)
+
+
 def make_transform_iterator(it, op: Callable):
     if hasattr(it, "__cuda_array_interface__"):
         it = pointer(it, numba.from_dtype(it.dtype))
@@ -280,9 +321,11 @@ def make_transform_iterator(it, op: Callable):
     op = cuda.jit(op, device=True)
 
     class TransformIterator(IteratorBase):
+        iterator_kind_type = TransformIteratorKind
+
         def __init__(self, it: IteratorBase, op: CUDADispatcher):
             self._it = it
-            self._op = op
+            self._op = CachableFunction(op.py_func)
             numba_type = it.numba_type
             # TODO: the abi name below isn't unique enough when we have e.g.,
             # two identically named `op` functions with different
@@ -307,6 +350,10 @@ def __init__(self, it: IteratorBase, op: CUDADispatcher):
                 abi_name=abi_name,
             )
 
+        @property
+        def kind(self):
+            return self.__class__.iterator_kind_type((self._it.kind, self._op))
+
         @staticmethod
         def advance(state, distance):
             return it_advance(state, distance)
@@ -319,16 +366,14 @@ def __hash__(self):
             return hash(
                 (
                     self._it,
-                    self._op.py_func.__code__.co_code,
-                    self._op.py_func.__closure__,
+                    self._op._func.py_func.__code__.co_code,
+                    self._op._func.py_func.__closure__,
                 )
             )
 
         def __eq__(self, other):
             if not isinstance(other, IteratorBase):
                 return NotImplemented
-            return self._it == other._it and _compare_funcs(
-                self._op.py_func, other._op.py_func
-            )
+            return self._it == other._it and self._op == other._op
 
     return TransformIterator(it, op)
diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py
index d5d15e8174b..89ae076cb4c 100644
--- a/python/cuda_parallel/tests/test_iterators.py
+++ b/python/cuda_parallel/tests/test_iterators.py
@@ -9,26 +9,47 @@
 
 
 def test_constant_iterator_equality():
-    assert ConstantIterator(np.int32(0)) == ConstantIterator(np.int32(0))
-    assert ConstantIterator(np.int32(0)) != ConstantIterator(np.int32(1))
-    assert ConstantIterator(np.int32(0)) != ConstantIterator(np.int64(0))
+    it1 = ConstantIterator(np.int32(0))
+    it2 = ConstantIterator(np.int32(0))
+    it3 = ConstantIterator(np.int32(1))
+    it4 = ConstantIterator(np.int64(9))
+
+    assert it1 == it2
+    assert it1 != it3
+    assert it1 != it4
+
+    assert it1.kind == it2.kind == it3.kind
+    assert it1.kind != it4.kind
 
 
 def test_counting_iterator_equality():
-    assert CountingIterator(np.int32(0)) == CountingIterator(np.int32(0))
-    assert CountingIterator(np.int32(0)) != CountingIterator(np.int32(1))
-    assert CountingIterator(np.int32(0)) != CountingIterator(np.int64(0))
+    it1 = CountingIterator(np.int32(0))
+    it2 = CountingIterator(np.int32(0))
+    it3 = CountingIterator(np.int32(1))
+    it4 = CountingIterator(np.int64(9))
+
+    assert it1 == it2
+    assert it1 != it3
+    assert it1 != it4
+
+    assert it1.kind == it2.kind == it3.kind
+    assert it1.kind != it4.kind
 
 
 def test_cache_modified_input_iterator_equality():
-    ary1 = cp.asarray([0, 1, 2])
-    ary2 = cp.asarray([3, 4, 5])
-    assert CacheModifiedInputIterator(ary1, "stream") == CacheModifiedInputIterator(
-        ary1, "stream"
-    )
-    assert CacheModifiedInputIterator(ary1, "stream") != CacheModifiedInputIterator(
-        ary2, "stream"
-    )
+    ary1 = cp.asarray([0, 1, 2], dtype="int32")
+    ary2 = cp.asarray([3, 4, 5], dtype="int32")
+    ary3 = cp.asarray([0, 1, 2], dtype="int64")
+
+    it1 = CacheModifiedInputIterator(ary1, "stream")
+    it2 = CacheModifiedInputIterator(ary1, "stream")
+    it3 = CacheModifiedInputIterator(ary2, "stream")
+    it4 = CacheModifiedInputIterator(ary3, "stream")
+
+    assert it1 == it2
+    assert it1 != it3
+    assert it1.kind == it2.kind == it3.kind
+    assert it1.kind != it4.kind
 
 
 def test_equality_transform_iterator():
@@ -42,16 +63,31 @@ def op3(x):
         return x
 
     it = CountingIterator(np.int32(0))
-    assert TransformIterator(it, op1) == TransformIterator(it, op1)
-    assert TransformIterator(it, op1) != TransformIterator(it, op2)
-    assert TransformIterator(it, op1) == TransformIterator(it, op3)
+    it1 = TransformIterator(it, op1)
+    it2 = TransformIterator(it, op1)
+    it3 = TransformIterator(it, op2)
+    it4 = TransformIterator(it, op3)
+
+    assert it1 == it2
+    assert it1 != it3
+    assert it1 == it4
+    assert it1.kind == it2.kind == it4.kind
 
     ary1 = cp.asarray([0, 1, 2])
     ary2 = cp.asarray([3, 4, 5])
-    assert TransformIterator(ary1, op1) == TransformIterator(ary1, op1)
-    assert TransformIterator(ary1, op1) != TransformIterator(ary1, op2)
-    assert TransformIterator(ary1, op1) == TransformIterator(ary1, op3)
-    assert TransformIterator(ary1, op1) != TransformIterator(ary2, op1)
+    it5 = TransformIterator(ary1, op1)
+    it6 = TransformIterator(ary1, op1)
+    it7 = TransformIterator(ary1, op2)
+    it8 = TransformIterator(ary1, op3)
+    it9 = TransformIterator(ary2, op1)
+
+    assert it5 == it6
+    assert it5 != it7
+    assert it5 == it8
+    assert it5 != it9
+
+    assert it5.kind == it6.kind == it8.kind == it9.kind
+    assert it5.kind != it7.kind
 
 
 def test_different_iterator_types_equality():
diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py
index d890a656f3f..ce5656f635d 100644
--- a/python/cuda_parallel/tests/test_reduce.py
+++ b/python/cuda_parallel/tests/test_reduce.py
@@ -435,4 +435,69 @@ def op3(x):
         sum_op,
         np.zeros([0], dtype="int64"),
     )
+
+    # inputs are CountingIterators of same kind
+    # but different state:
+    reducer_1 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(1)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+
     assert reducer_1 is reducer_2
+
+    # inputs are TransformIterators of same kind
+    # but different state:
+    ary1 = cp.asarray([0, 1, 2], dtype="int64")
+    ary2 = cp.asarray([0, 1], dtype="int64")
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(ary1, op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(ary2, op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are TransformIterators of same kind
+    # but different state:
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(1)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are TransformIterators with different kind:
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int64(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2

From 58b6f695e843af403ccb8ca40f2c50b6ff799eaa Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 24 Dec 2024 09:30:57 -0500
Subject: [PATCH 08/12] Use the .kind to generate an abi_name

---
 .../cuda/parallel/experimental/_caching.py    | 26 +++------
 .../experimental/iterators/_iterators.py      | 53 ++++++-------------
 python/cuda_parallel/tests/test_iterators.py  |  2 +
 3 files changed, 26 insertions(+), 55 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
index fa9165b7ad8..3d79436b1b3 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_caching.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
@@ -48,29 +48,17 @@ class CachableFunction:
 
     def __init__(self, func):
         self._func = func
+        self._identity = (
+            self._func.__code__.co_code,
+            self._func.__code__.co_consts,
+            self._func.__closure__,
+        )
 
     def __eq__(self, other):
-        func1, func2 = self._func, other._func
-
-        # return True if the functions compare equal for
-        # caching purposes, False otherwise
-        code1 = func1.__code__
-        code2 = func2.__code__
-
-        return (
-            code1.co_code == code2.co_code
-            and code1.co_consts == code2.co_consts
-            and func1.__closure__ == func2.__closure__
-        )
+        return self._identity == other._identity
 
     def __hash__(self):
-        return hash(
-            (
-                self._func.__code__.co_code,
-                self._func.__code__.co_consts,
-                self._func.__closure__,
-            )
-        )
+        return hash(self._identity)
 
     def __repr__(self):
         return str(self._func)
diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
index 8ed4174da07..7e7a1652309 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
@@ -1,5 +1,6 @@
 import ctypes
 import operator
+import uuid
 from functools import lru_cache
 from typing import Dict, Callable
 
@@ -18,6 +19,14 @@
 _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8
 
 
+@lru_cache(maxsize=None)
+def _get_abi_suffix(kind: "IteratorKind"):
+    # given an IteratorKind, return a UUID. The value
+    # is cached so that the same UUID is always returned
+    # for a given IteratorKind.
+    return uuid.uuid4().hex
+
+
 @lru_cache(maxsize=256)  # TODO: what's a reasonable value?
 def cached_compile(func, sig, abi_name=None, **kwargs):
     return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs)
@@ -60,7 +69,6 @@ def __init__(
         cvalue: ctypes.c_void_p,
         numba_type: types.Type,
         value_type: types.Type,
-        abi_name: str,
     ):
         """
         Parameters
@@ -72,14 +80,10 @@ def __init__(
           and dereference functions.
         value_type
           The numba type of the value returned by the dereference operation.
-        abi_name
-          A unique identifier that will determine the abi_names for the
-          advance and dereference operations.
         """
         self.cvalue = cvalue
         self.numba_type = numba_type
         self.value_type = value_type
-        self.abi_name = abi_name
 
     @property
     def kind(self):
@@ -90,8 +94,8 @@ def kind(self):
     # needed.
     @property
     def ltoirs(self) -> Dict[str, bytes]:
-        advance_abi_name = self.abi_name + "_advance"
-        deref_abi_name = self.abi_name + "_dereference"
+        advance_abi_name = "advance_" + _get_abi_suffix(self.kind)
+        deref_abi_name = "dereference_" + _get_abi_suffix(self.kind)
         advance_ltoir, _ = cached_compile(
             self.__class__.advance,
             (
@@ -123,18 +127,16 @@ def dereference(state):
         raise NotImplementedError("Subclasses must override dereference staticmethod")
 
     def __hash__(self):
-        return hash(
-            (self.cvalue.value, self.numba_type, self.value_type, self.abi_name)
-        )
+        return hash((self.kind, self.cvalue.value, self.numba_type, self.value_type))
 
     def __eq__(self, other):
         if not isinstance(other, self.__class__):
             return NotImplemented
         return (
-            self.cvalue.value == other.cvalue.value
+            self.kind == other.kind
+            and self.cvalue.value == other.cvalue.value
             and self.numba_type == other.numba_type
             and self.value_type == other.value_type
-            and self.abi_name == other.abi_name
         )
 
 
@@ -178,12 +180,10 @@ class RawPointer(IteratorBase):
     def __init__(self, ptr: int, value_type: types.Type):
         cvalue = ctypes.c_void_p(ptr)
         numba_type = types.CPointer(types.CPointer(value_type))
-        abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
             cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
-            abi_name=abi_name,
         )
 
     @staticmethod
@@ -231,12 +231,10 @@ def __init__(self, ptr: int, ntype: types.Type):
         cvalue = ctypes.c_void_p(ptr)
         value_type = ntype
         numba_type = types.CPointer(types.CPointer(value_type))
-        abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
             cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
-            abi_name=abi_name,
         )
 
     @staticmethod
@@ -259,12 +257,10 @@ def __init__(self, value: np.number):
         value_type = numba.from_dtype(value.dtype)
         cvalue = to_ctypes(value_type)(value)
         numba_type = types.CPointer(value_type)
-        abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
             cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
-            abi_name=abi_name,
         )
 
     @staticmethod
@@ -287,12 +283,10 @@ def __init__(self, value: np.number):
         value_type = numba.from_dtype(value.dtype)
         cvalue = to_ctypes(value_type)(value)
         numba_type = types.CPointer(value_type)
-        abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
             cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
-            abi_name=abi_name,
         )
 
     @staticmethod
@@ -327,27 +321,20 @@ def __init__(self, it: IteratorBase, op: CUDADispatcher):
             self._it = it
             self._op = CachableFunction(op.py_func)
             numba_type = it.numba_type
-            # TODO: the abi name below isn't unique enough when we have e.g.,
-            # two identically named `op` functions with different
-            # signatures, bytecodes, and/or closure variables.
-            op_abi_name = f"{self.__class__.__name__}_{op.py_func.__name__}"
-
             # TODO: it would be nice to not need to compile `op` to get
             # its return type, but there's nothing in the numba API
             # to do that (yet),
             _, op_retty = cached_compile(
                 op,
                 (self._it.value_type,),
-                abi_name=op_abi_name,
+                abi_name=f"{op.__name__}_{_get_abi_suffix(self.kind)}",
                 output="ltoir",
             )
             value_type = op_retty
-            abi_name = f"{self.__class__.__name__}_{it.abi_name}_{op_abi_name}"
             super().__init__(
                 cvalue=it.cvalue,
                 numba_type=numba_type,
                 value_type=value_type,
-                abi_name=abi_name,
             )
 
         @property
@@ -363,16 +350,10 @@ def dereference(state):
             return op(it_dereference(state))
 
         def __hash__(self):
-            return hash(
-                (
-                    self._it,
-                    self._op._func.py_func.__code__.co_code,
-                    self._op._func.py_func.__closure__,
-                )
-            )
+            return hash((self._it, self._op))
 
         def __eq__(self, other):
-            if not isinstance(other, IteratorBase):
+            if not isinstance(other.kind, TransformIteratorKind):
                 return NotImplemented
             return self._it == other._it and self._op == other._op
 
diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py
index 89ae076cb4c..ef94800df43 100644
--- a/python/cuda_parallel/tests/test_iterators.py
+++ b/python/cuda_parallel/tests/test_iterators.py
@@ -48,6 +48,7 @@ def test_cache_modified_input_iterator_equality():
 
     assert it1 == it2
     assert it1 != it3
+
     assert it1.kind == it2.kind == it3.kind
     assert it1.kind != it4.kind
 
@@ -71,6 +72,7 @@ def op3(x):
     assert it1 == it2
     assert it1 != it3
     assert it1 == it4
+
     assert it1.kind == it2.kind == it4.kind
 
     ary1 = cp.asarray([0, 1, 2])

From d0175721972088ce257f42c94247fc9d095dd1e6 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 30 Dec 2024 07:23:05 -0500
Subject: [PATCH 09/12] Remove __eq__ and __hash__ methods from IteratorBase

---
 .../experimental/iterators/_iterators.py      | 13 ------
 python/cuda_parallel/tests/test_iterators.py  | 41 ++++---------------
 2 files changed, 8 insertions(+), 46 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
index 7e7a1652309..2d1083208b2 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
@@ -126,19 +126,6 @@ def advance(state, distance):
     def dereference(state):
         raise NotImplementedError("Subclasses must override dereference staticmethod")
 
-    def __hash__(self):
-        return hash((self.kind, self.cvalue.value, self.numba_type, self.value_type))
-
-    def __eq__(self, other):
-        if not isinstance(other, self.__class__):
-            return NotImplemented
-        return (
-            self.kind == other.kind
-            and self.cvalue.value == other.cvalue.value
-            and self.numba_type == other.numba_type
-            and self.value_type == other.value_type
-        )
-
 
 def sizeof_pointee(context, ptr):
     size = context.get_abi_sizeof(ptr.type.pointee)
diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py
index ef94800df43..3b7910d404d 100644
--- a/python/cuda_parallel/tests/test_iterators.py
+++ b/python/cuda_parallel/tests/test_iterators.py
@@ -14,10 +14,6 @@ def test_constant_iterator_equality():
     it3 = ConstantIterator(np.int32(1))
     it4 = ConstantIterator(np.int64(9))
 
-    assert it1 == it2
-    assert it1 != it3
-    assert it1 != it4
-
     assert it1.kind == it2.kind == it3.kind
     assert it1.kind != it4.kind
 
@@ -28,10 +24,6 @@ def test_counting_iterator_equality():
     it3 = CountingIterator(np.int32(1))
     it4 = CountingIterator(np.int64(9))
 
-    assert it1 == it2
-    assert it1 != it3
-    assert it1 != it4
-
     assert it1.kind == it2.kind == it3.kind
     assert it1.kind != it4.kind
 
@@ -46,9 +38,6 @@ def test_cache_modified_input_iterator_equality():
     it3 = CacheModifiedInputIterator(ary2, "stream")
     it4 = CacheModifiedInputIterator(ary3, "stream")
 
-    assert it1 == it2
-    assert it1 != it3
-
     assert it1.kind == it2.kind == it3.kind
     assert it1.kind != it4.kind
 
@@ -66,31 +55,17 @@ def op3(x):
     it = CountingIterator(np.int32(0))
     it1 = TransformIterator(it, op1)
     it2 = TransformIterator(it, op1)
-    it3 = TransformIterator(it, op2)
-    it4 = TransformIterator(it, op3)
-
-    assert it1 == it2
-    assert it1 != it3
-    assert it1 == it4
+    it3 = TransformIterator(it, op3)
 
-    assert it1.kind == it2.kind == it4.kind
+    assert it1.kind == it2.kind == it3.kind
 
     ary1 = cp.asarray([0, 1, 2])
     ary2 = cp.asarray([3, 4, 5])
+    it4 = TransformIterator(ary1, op1)
     it5 = TransformIterator(ary1, op1)
-    it6 = TransformIterator(ary1, op1)
-    it7 = TransformIterator(ary1, op2)
-    it8 = TransformIterator(ary1, op3)
-    it9 = TransformIterator(ary2, op1)
-
-    assert it5 == it6
-    assert it5 != it7
-    assert it5 == it8
-    assert it5 != it9
-
-    assert it5.kind == it6.kind == it8.kind == it9.kind
-    assert it5.kind != it7.kind
-
+    it6 = TransformIterator(ary1, op2)
+    it7 = TransformIterator(ary1, op3)
+    it8 = TransformIterator(ary2, op1)
 
-def test_different_iterator_types_equality():
-    assert CountingIterator(np.int32(0)) != ConstantIterator(np.int64(0))
+    assert it4.kind == it5.kind == it7.kind == it8.kind
+    assert it4.kind != it6.kind

From 68235eefee4a39f4e7dd29c9a48166ddbff267a9 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 30 Dec 2024 07:32:26 -0500
Subject: [PATCH 10/12] Move helper function

---
 .../experimental/iterators/_iterators.py         | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
index 2d1083208b2..076a260ca1f 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
@@ -19,14 +19,6 @@
 _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8
 
 
-@lru_cache(maxsize=None)
-def _get_abi_suffix(kind: "IteratorKind"):
-    # given an IteratorKind, return a UUID. The value
-    # is cached so that the same UUID is always returned
-    # for a given IteratorKind.
-    return uuid.uuid4().hex
-
-
 @lru_cache(maxsize=256)  # TODO: what's a reasonable value?
 def cached_compile(func, sig, abi_name=None, **kwargs):
     return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs)
@@ -46,6 +38,14 @@ def __hash__(self):
         return hash(self.value_type)
 
 
+@lru_cache(maxsize=None)
+def _get_abi_suffix(kind: IteratorKind):
+    # given an IteratorKind, return a UUID. The value
+    # is cached so that the same UUID is always returned
+    # for a given IteratorKind.
+    return uuid.uuid4().hex
+
+
 class IteratorBase:
     """
     An Iterator is a wrapper around a pointer, and must define the following:

From f892265643bef96b71a1727f876108cbd8c9d538 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 30 Dec 2024 09:06:40 -0500
Subject: [PATCH 11/12] Formatting

---
 .../cuda/parallel/experimental/iterators/_iterators.py       | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
index 076a260ca1f..babc4a12a11 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
@@ -40,9 +40,8 @@ def __hash__(self):
 
 @lru_cache(maxsize=None)
 def _get_abi_suffix(kind: IteratorKind):
-    # given an IteratorKind, return a UUID. The value
-    # is cached so that the same UUID is always returned
-    # for a given IteratorKind.
+    # given an IteratorKind, return a UUID. The value is cached so
+    # that the same UUID is always returned for a given IteratorKind.
     return uuid.uuid4().hex
 
 

From 5e3d95aa819e8d7bb161f9b3d183ed70e009437a Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 31 Dec 2024 13:57:06 -0500
Subject: [PATCH 12/12] Don't unpack tuple in cache key

---
 python/cuda_parallel/cuda/parallel/experimental/_caching.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
index 3d79436b1b3..2647c1835ee 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/_caching.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
@@ -25,7 +25,7 @@ def deco(func):
         @functools.wraps(func)
         def inner(*args, **kwargs):
             cc = cuda.get_current_device().compute_capability
-            cache_key = (key(*args, **kwargs), *cc)
+            cache_key = (key(*args, **kwargs), cc)
             if cache_key not in cache:
                 result = func(*args, **kwargs)
                 cache[cache_key] = result