diff --git a/python/cuda_parallel/cuda/parallel/experimental/_caching.py b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
new file mode 100644
index 00000000000..2647c1835ee
--- /dev/null
+++ b/python/cuda_parallel/cuda/parallel/experimental/_caching.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import functools
+from numba import cuda
+
+
+def cache_with_key(key):
+    """
+    Decorator to cache the result of the decorated function.  Uses the
+    provided `key` function to compute the key for cache lookup. `key`
+    receives all arguments passed to the function.
+
+    Notes
+    -----
+    The CUDA compute capability of the current device is appended to
+    the cache key returned by `key`.
+    """
+
+    def deco(func):
+        cache = {}
+
+        @functools.wraps(func)
+        def inner(*args, **kwargs):
+            cc = cuda.get_current_device().compute_capability
+            cache_key = (key(*args, **kwargs), cc)
+            if cache_key not in cache:
+                result = func(*args, **kwargs)
+                cache[cache_key] = result
+            return cache[cache_key]
+
+        return inner
+
+    return deco
+
+
+class CachableFunction:
+    """
+    A type that wraps a function and provides custom comparison
+    (__eq__) and hash (__hash__) implementations.
+
+    The purpose of this class is to enable caching and comparison of
+    functions based on their bytecode, constants, and closures, while
+    ignoring other attributes such as their names or docstrings.
+    """
+
+    def __init__(self, func):
+        self._func = func
+        self._identity = (
+            self._func.__code__.co_code,
+            self._func.__code__.co_consts,
+            self._func.__closure__,
+        )
+
+    def __eq__(self, other):
+        return self._identity == other._identity
+
+    def __hash__(self):
+        return hash(self._identity)
+
+    def __repr__(self):
+        return str(self._func)
diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py
new file mode 100644
index 00000000000..9c0718e71f0
--- /dev/null
+++ b/python/cuda_parallel/cuda/parallel/experimental/_utils/cai.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Utilities for extracting information from `__cuda_array_interface__`.
+"""
+
+import numpy as np
+
+from ..typing import DeviceArrayLike
+
+
+def get_dtype(arr: DeviceArrayLike) -> np.dtype:
+    return np.dtype(arr.__cuda_array_interface__["typestr"])
diff --git a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
index 29e7786b5f8..41843742827 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/algorithms/reduce.py
@@ -3,6 +3,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from __future__ import annotations  # TODO: required for Python 3.7 docs env
+
 import ctypes
 import numba
 import numpy as np
@@ -12,6 +14,10 @@
 
 from .. import _cccl as cccl
 from .._bindings import get_paths, get_bindings
+from .._caching import CachableFunction, cache_with_key
+from ..typing import DeviceArrayLike
+from ..iterators._iterators import IteratorBase
+from .._utils import cai
 
 
 class _Op:
@@ -41,12 +47,18 @@ def _dtype_validation(dt1, dt2):
 
 class _Reduce:
     # TODO: constructor shouldn't require concrete `d_in`, `d_out`:
-    def __init__(self, d_in, d_out, op: Callable, h_init: np.ndarray):
+    def __init__(
+        self,
+        d_in: DeviceArrayLike | IteratorBase,
+        d_out: DeviceArrayLike,
+        op: Callable,
+        h_init: np.ndarray,
+    ):
         d_in_cccl = cccl.to_cccl_iter(d_in)
         self._ctor_d_in_cccl_type_enum_name = cccl.type_enum_as_name(
             d_in_cccl.value_type.type.value
         )
-        self._ctor_d_out_dtype = d_out.dtype
+        self._ctor_d_out_dtype = cai.get_dtype(d_out)
         self._ctor_init_dtype = h_init.dtype
         cc_major, cc_minor = cuda.get_current_device().compute_capability
         cub_path, thrust_path, libcudacxx_path, cuda_include_path = get_paths()
@@ -119,9 +131,28 @@ def __del__(self):
         bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result))
 
 
+def make_cache_key(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_out: DeviceArrayLike,
+    op: Callable,
+    h_init: np.ndarray,
+):
+    d_in_key = d_in.kind if isinstance(d_in, IteratorBase) else cai.get_dtype(d_in)
+    d_out_key = cai.get_dtype(d_out)
+    op_key = CachableFunction(op)
+    h_init_key = h_init.dtype
+    return (d_in_key, d_out_key, op_key, h_init_key)
+
+
 # TODO Figure out `sum` without operator and initial value
 # TODO Accept stream
-def reduce_into(d_in, d_out, op: Callable, h_init: np.ndarray):
+@cache_with_key(make_cache_key)
+def reduce_into(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_out: DeviceArrayLike,
+    op: Callable,
+    h_init: np.ndarray,
+):
     """Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``.
 
     Example:
diff --git a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
index e92578efbe7..babc4a12a11 100644
--- a/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
+++ b/python/cuda_parallel/cuda/parallel/experimental/iterators/_iterators.py
@@ -1,15 +1,19 @@
 import ctypes
 import operator
+import uuid
 from functools import lru_cache
 from typing import Dict, Callable
 
 from llvmlite import ir
 from numba.core.extending import intrinsic, overload
 from numba.core.typing.ctypes_utils import to_ctypes
+from numba.cuda.dispatcher import CUDADispatcher
 from numba import cuda, types
 import numba
 import numpy as np
 
+from .._caching import CachableFunction
+
 
 _DEVICE_POINTER_SIZE = 8
 _DEVICE_POINTER_BITWIDTH = _DEVICE_POINTER_SIZE * 8
@@ -20,16 +24,35 @@ def cached_compile(func, sig, abi_name=None, **kwargs):
     return cuda.compile(func, sig, abi_info={"abi_name": abi_name}, **kwargs)
 
 
+class IteratorKind:
+    def __init__(self, value_type):
+        self.value_type = value_type
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}[{str(self.value_type)}]"
+
+    def __eq__(self, other):
+        return type(self) is type(other) and self.value_type == other.value_type
+
+    def __hash__(self):
+        return hash(self.value_type)
+
+
+@lru_cache(maxsize=None)
+def _get_abi_suffix(kind: IteratorKind):
+    # given an IteratorKind, return a UUID. The value is cached so
+    # that the same UUID is always returned for a given IteratorKind.
+    return uuid.uuid4().hex
+
+
 class IteratorBase:
     """
     An Iterator is a wrapper around a pointer, and must define the following:
 
-    - a `state` property that returns a `ctypes.c_void_p` object, representing
-      a pointer to some data.
-    - an `advance` (static) method that receives the state pointer and performs
+    - an `advance` (static) method that receives the pointer and performs
       an action that advances the pointer by the offset `distance`
       (returns nothing).
-    - a `dereference` (static) method that dereferences the state pointer
+    - a `dereference` (static) method that dereferences the pointer
       and returns a value.
 
     Iterators are not meant to be used directly. They are constructed and passed
@@ -38,29 +61,40 @@ class IteratorBase:
     The `advance` and `dereference` must be compilable to device code by numba.
     """
 
-    def __init__(self, numba_type: types.Type, value_type: types.Type, abi_name: str):
+    iterator_kind_type: type  # must be a subclass of IteratorKind
+
+    def __init__(
+        self,
+        cvalue: ctypes.c_void_p,
+        numba_type: types.Type,
+        value_type: types.Type,
+    ):
         """
         Parameters
         ----------
+        cvalue
+          A ctypes type representing the object pointed to by the iterator.
         numba_type
-          A numba type that specifies how to interpret the state pointer.
+          A numba type representing the type of the input to the advance
+          and dereference functions.
         value_type
           The numba type of the value returned by the dereference operation.
-        abi_name
-          A unique identifier that will determine the abi_names for the
-          advance and dereference operations.
         """
+        self.cvalue = cvalue
         self.numba_type = numba_type
         self.value_type = value_type
-        self.abi_name = abi_name
+
+    @property
+    def kind(self):
+        return self.__class__.iterator_kind_type(self.value_type)
 
     # TODO: should we cache this? Current docs environment doesn't allow
     # using Python > 3.7. We could use a hand-rolled cached_property if
     # needed.
     @property
     def ltoirs(self) -> Dict[str, bytes]:
-        advance_abi_name = self.abi_name + "_advance"
-        deref_abi_name = self.abi_name + "_dereference"
+        advance_abi_name = "advance_" + _get_abi_suffix(self.kind)
+        deref_abi_name = "dereference_" + _get_abi_suffix(self.kind)
         advance_ltoir, _ = cached_compile(
             self.__class__.advance,
             (
@@ -81,7 +115,7 @@ def ltoirs(self) -> Dict[str, bytes]:
 
     @property
     def state(self) -> ctypes.c_void_p:
-        raise NotImplementedError("Subclasses must override advance staticmethod")
+        return ctypes.cast(ctypes.pointer(self.cvalue), ctypes.c_void_p)
 
     @staticmethod
     def advance(state, distance):
@@ -122,16 +156,20 @@ def impl(ptr, offset):
     return impl
 
 
+class RawPointerType(IteratorKind):
+    pass
+
+
 class RawPointer(IteratorBase):
-    def __init__(self, ptr: int, ntype: types.Type):
-        value_type = ntype
-        self._cvalue = ctypes.c_void_p(ptr)
+    iterator_kind_type = RawPointerType
+
+    def __init__(self, ptr: int, value_type: types.Type):
+        cvalue = ctypes.c_void_p(ptr)
         numba_type = types.CPointer(types.CPointer(value_type))
-        abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
+            cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
-            abi_name=abi_name,
         )
 
     @staticmethod
@@ -142,13 +180,9 @@ def advance(state, distance):
     def dereference(state):
         return state[0][0]
 
-    @property
-    def state(self) -> ctypes.c_void_p:
-        return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p)
-
 
-def pointer(container, ntype: types.Type) -> RawPointer:
-    return RawPointer(container.__cuda_array_interface__["data"][0], ntype)
+def pointer(container, value_type: types.Type) -> RawPointer:
+    return RawPointer(container.__cuda_array_interface__["data"][0], value_type)
 
 
 @intrinsic
@@ -172,16 +206,21 @@ def codegen(context, builder, sig, args):
     return base.dtype(base), codegen
 
 
+class CacheModifiedPointerType(IteratorKind):
+    pass
+
+
 class CacheModifiedPointer(IteratorBase):
+    iterator_kind_type = CacheModifiedPointerType
+
     def __init__(self, ptr: int, ntype: types.Type):
-        self._cvalue = ctypes.c_void_p(ptr)
+        cvalue = ctypes.c_void_p(ptr)
         value_type = ntype
         numba_type = types.CPointer(types.CPointer(value_type))
-        abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
+            cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
-            abi_name=abi_name,
         )
 
     @staticmethod
@@ -192,21 +231,22 @@ def advance(state, distance):
     def dereference(state):
         return load_cs(state[0])
 
-    @property
-    def state(self) -> ctypes.c_void_p:
-        return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p)
+
+class ConstantIteratorKind(IteratorKind):
+    pass
 
 
 class ConstantIterator(IteratorBase):
+    iterator_kind_type = ConstantIteratorKind
+
     def __init__(self, value: np.number):
         value_type = numba.from_dtype(value.dtype)
-        self._cvalue = to_ctypes(value_type)(value)
+        cvalue = to_ctypes(value_type)(value)
         numba_type = types.CPointer(value_type)
-        abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
+            cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
-            abi_name=abi_name,
         )
 
     @staticmethod
@@ -217,21 +257,22 @@ def advance(state, distance):
     def dereference(state):
         return state[0]
 
-    @property
-    def state(self) -> ctypes.c_void_p:
-        return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p)
+
+class CountingIteratorKind(IteratorKind):
+    pass
 
 
 class CountingIterator(IteratorBase):
+    iterator_kind_type = CountingIteratorKind
+
     def __init__(self, value: np.number):
         value_type = numba.from_dtype(value.dtype)
-        self._cvalue = to_ctypes(value_type)(value)
+        cvalue = to_ctypes(value_type)(value)
         numba_type = types.CPointer(value_type)
-        abi_name = f"{self.__class__.__name__}_{str(value_type)}"
         super().__init__(
+            cvalue=cvalue,
             numba_type=numba_type,
             value_type=value_type,
-            abi_name=abi_name,
         )
 
     @staticmethod
@@ -242,9 +283,13 @@ def advance(state, distance):
     def dereference(state):
         return state[0]
 
-    @property
-    def state(self) -> ctypes.c_void_p:
-        return ctypes.cast(ctypes.pointer(self._cvalue), ctypes.c_void_p)
+
+class TransformIteratorKind(IteratorKind):
+    def __eq__(self, other):
+        return type(self) is type(other) and self.value_type == other.value_type
+
+    def __hash__(self):
+        return hash(self.value_type)
 
 
 def make_transform_iterator(it, op: Callable):
@@ -256,31 +301,32 @@ def make_transform_iterator(it, op: Callable):
     op = cuda.jit(op, device=True)
 
     class TransformIterator(IteratorBase):
-        def __init__(self, it: IteratorBase, op):
+        iterator_kind_type = TransformIteratorKind
+
+        def __init__(self, it: IteratorBase, op: CUDADispatcher):
             self._it = it
+            self._op = CachableFunction(op.py_func)
             numba_type = it.numba_type
-            # TODO: the abi name below isn't unique enough when we have e.g.,
-            # two identically named `op` functions with different
-            # signatures, bytecodes, and/or closure variables.
-            op_abi_name = f"{self.__class__.__name__}_{op.py_func.__name__}"
-
             # TODO: it would be nice to not need to compile `op` to get
             # its return type, but there's nothing in the numba API
             # to do that (yet),
             _, op_retty = cached_compile(
                 op,
                 (self._it.value_type,),
-                abi_name=op_abi_name,
+                abi_name=f"{op.__name__}_{_get_abi_suffix(self.kind)}",
                 output="ltoir",
             )
             value_type = op_retty
-            abi_name = f"{self.__class__.__name__}_{it.abi_name}_{op_abi_name}"
             super().__init__(
+                cvalue=it.cvalue,
                 numba_type=numba_type,
                 value_type=value_type,
-                abi_name=abi_name,
             )
 
+        @property
+        def kind(self):
+            return self.__class__.iterator_kind_type((self._it.kind, self._op))
+
         @staticmethod
         def advance(state, distance):
             return it_advance(state, distance)
@@ -289,8 +335,12 @@ def advance(state, distance):
         def dereference(state):
             return op(it_dereference(state))
 
-        @property
-        def state(self) -> ctypes.c_void_p:
-            return it.state
+        def __hash__(self):
+            return hash((self._it, self._op))
+
+        def __eq__(self, other):
+            if not isinstance(other.kind, TransformIteratorKind):
+                return NotImplemented
+            return self._it == other._it and self._op == other._op
 
     return TransformIterator(it, op)
diff --git a/python/cuda_parallel/cuda/parallel/experimental/typing.py b/python/cuda_parallel/cuda/parallel/experimental/typing.py
new file mode 100644
index 00000000000..1c4e9c9975f
--- /dev/null
+++ b/python/cuda_parallel/cuda/parallel/experimental/typing.py
@@ -0,0 +1,12 @@
+from typing_extensions import (
+    Protocol,
+)  # TODO: typing_extensions required for Python 3.7 docs env
+
+
+class DeviceArrayLike(Protocol):
+    """
+    Objects representing a device array, having a `.__cuda_array_interface__`
+    attribute.
+    """
+
+    __cuda_array_interface__: dict
diff --git a/python/cuda_parallel/setup.py b/python/cuda_parallel/setup.py
index 40c998fafee..0db6592bb05 100644
--- a/python/cuda_parallel/setup.py
+++ b/python/cuda_parallel/setup.py
@@ -100,7 +100,8 @@ def build_extension(self, ext):
     ],
     packages=find_namespace_packages(include=["cuda.*"]),
     python_requires=">=3.9",
-    install_requires=["numba>=0.60.0", "cuda-python", "jinja2"],
+    # TODO: typing_extensions required for Python 3.7 docs env
+    install_requires=["numba>=0.60.0", "cuda-python", "jinja2", "typing_extensions"],
     extras_require={
         "test": [
             "pytest",
diff --git a/python/cuda_parallel/tests/test_iterators.py b/python/cuda_parallel/tests/test_iterators.py
new file mode 100644
index 00000000000..3b7910d404d
--- /dev/null
+++ b/python/cuda_parallel/tests/test_iterators.py
@@ -0,0 +1,71 @@
+from cuda.parallel.experimental.iterators import (
+    CacheModifiedInputIterator,
+    ConstantIterator,
+    CountingIterator,
+    TransformIterator,
+)
+import cupy as cp
+import numpy as np
+
+
+def test_constant_iterator_equality():
+    it1 = ConstantIterator(np.int32(0))
+    it2 = ConstantIterator(np.int32(0))
+    it3 = ConstantIterator(np.int32(1))
+    it4 = ConstantIterator(np.int64(9))
+
+    assert it1.kind == it2.kind == it3.kind
+    assert it1.kind != it4.kind
+
+
+def test_counting_iterator_equality():
+    it1 = CountingIterator(np.int32(0))
+    it2 = CountingIterator(np.int32(0))
+    it3 = CountingIterator(np.int32(1))
+    it4 = CountingIterator(np.int64(9))
+
+    assert it1.kind == it2.kind == it3.kind
+    assert it1.kind != it4.kind
+
+
+def test_cache_modified_input_iterator_equality():
+    ary1 = cp.asarray([0, 1, 2], dtype="int32")
+    ary2 = cp.asarray([3, 4, 5], dtype="int32")
+    ary3 = cp.asarray([0, 1, 2], dtype="int64")
+
+    it1 = CacheModifiedInputIterator(ary1, "stream")
+    it2 = CacheModifiedInputIterator(ary1, "stream")
+    it3 = CacheModifiedInputIterator(ary2, "stream")
+    it4 = CacheModifiedInputIterator(ary3, "stream")
+
+    assert it1.kind == it2.kind == it3.kind
+    assert it1.kind != it4.kind
+
+
+def test_equality_transform_iterator():
+    def op1(x):
+        return x
+
+    def op2(x):
+        return 2 * x
+
+    def op3(x):
+        return x
+
+    it = CountingIterator(np.int32(0))
+    it1 = TransformIterator(it, op1)
+    it2 = TransformIterator(it, op1)
+    it3 = TransformIterator(it, op3)
+
+    assert it1.kind == it2.kind == it3.kind
+
+    ary1 = cp.asarray([0, 1, 2])
+    ary2 = cp.asarray([3, 4, 5])
+    it4 = TransformIterator(ary1, op1)
+    it5 = TransformIterator(ary1, op1)
+    it6 = TransformIterator(ary1, op2)
+    it7 = TransformIterator(ary1, op3)
+    it8 = TransformIterator(ary2, op1)
+
+    assert it4.kind == it5.kind == it7.kind == it8.kind
+    assert it4.kind != it6.kind
diff --git a/python/cuda_parallel/tests/test_reduce.py b/python/cuda_parallel/tests/test_reduce.py
index 0f454e3603b..ce5656f635d 100644
--- a/python/cuda_parallel/tests/test_reduce.py
+++ b/python/cuda_parallel/tests/test_reduce.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import cupy as cp
-import numpy
+import numpy as np
 import pytest
 import random
 import numba.cuda
@@ -14,31 +14,29 @@
 
 
 def random_int(shape, dtype):
-    return numpy.random.randint(0, 5, size=shape).astype(dtype)
+    return np.random.randint(0, 5, size=shape).astype(dtype)
 
 
 def type_to_problem_sizes(dtype):
-    if dtype in [numpy.uint8, numpy.int8]:
+    if dtype in [np.uint8, np.int8]:
         return [2, 4, 5, 6]
-    elif dtype in [numpy.uint16, numpy.int16]:
+    elif dtype in [np.uint16, np.int16]:
         return [4, 8, 12, 14]
-    elif dtype in [numpy.uint32, numpy.int32]:
+    elif dtype in [np.uint32, np.int32]:
         return [16, 20, 24, 28]
-    elif dtype in [numpy.uint64, numpy.int64]:
+    elif dtype in [np.uint64, np.int64]:
         return [16, 20, 24, 28]
     else:
         raise ValueError("Unsupported dtype")
 
 
-@pytest.mark.parametrize(
-    "dtype", [numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64]
-)
+@pytest.mark.parametrize("dtype", [np.uint8, np.uint16, np.uint32, np.uint64])
 def test_device_reduce(dtype):
     def op(a, b):
         return a + b
 
     init_value = 42
-    h_init = numpy.array([init_value], dtype=dtype)
+    h_init = np.array([init_value], dtype=dtype)
     d_output = numba.cuda.device_array(1, dtype=dtype)
     reduce_into = algorithms.reduce_into(d_output, d_output, op, h_init)
 
@@ -47,7 +45,7 @@ def op(a, b):
         h_input = random_int(num_items, dtype)
         d_input = numba.cuda.to_device(h_input)
         temp_storage_size = reduce_into(None, d_input, d_output, None, h_init)
-        d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=numpy.uint8)
+        d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=np.uint8)
         reduce_into(d_temp_storage, d_input, d_output, None, h_init)
         h_output = d_output.copy_to_host()
         assert h_output[0] == sum(h_input) + init_value
@@ -57,19 +55,19 @@ def test_complex_device_reduce():
     def op(a, b):
         return a + b
 
-    h_init = numpy.array([40.0 + 2.0j], dtype=complex)
+    h_init = np.array([40.0 + 2.0j], dtype=complex)
     d_output = numba.cuda.device_array(1, dtype=complex)
     reduce_into = algorithms.reduce_into(d_output, d_output, op, h_init)
 
     for num_items in [42, 420000]:
-        h_input = numpy.random.random(num_items) + 1j * numpy.random.random(num_items)
+        h_input = np.random.random(num_items) + 1j * np.random.random(num_items)
         d_input = numba.cuda.to_device(h_input)
         temp_storage_bytes = reduce_into(None, d_input, d_output, None, h_init)
-        d_temp_storage = numba.cuda.device_array(temp_storage_bytes, numpy.uint8)
+        d_temp_storage = numba.cuda.device_array(temp_storage_bytes, np.uint8)
         reduce_into(d_temp_storage, d_input, d_output, None, h_init)
 
         result = d_output.copy_to_host()[0]
-        expected = numpy.sum(h_input, initial=h_init[0])
+        expected = np.sum(h_input, initial=h_init[0])
         assert result == pytest.approx(expected)
 
 
@@ -77,9 +75,9 @@ def test_device_reduce_dtype_mismatch():
     def min_op(a, b):
         return a if a < b else b
 
-    dtypes = [numpy.int32, numpy.int64]
-    h_inits = [numpy.array([], dt) for dt in dtypes]
-    h_inputs = [numpy.array([], dt) for dt in dtypes]
+    dtypes = [np.int32, np.int64]
+    h_inits = [np.array([], dt) for dt in dtypes]
+    h_inputs = [np.array([], dt) for dt in dtypes]
     d_outputs = [numba.cuda.device_array(1, dt) for dt in dtypes]
     d_inputs = [numba.cuda.to_device(h_inp) for h_inp in h_inputs]
 
@@ -109,14 +107,14 @@ def add_op(a, b):
         expected_result = add_op(expected_result, v)
 
     if use_numpy_array:
-        h_input = numpy.array(l_varr, dtype_inp)
+        h_input = np.array(l_varr, dtype_inp)
         d_input = numba.cuda.to_device(h_input)
     else:
         d_input = i_input
 
     d_output = numba.cuda.device_array(1, dtype_out)  # to store device sum
 
-    h_init = numpy.array([start_sum_with], dtype_out)
+    h_init = np.array([start_sum_with], dtype_out)
 
     reduce_into = algorithms.reduce_into(
         d_in=d_input, d_out=d_output, op=add_op, h_init=h_init
@@ -125,7 +123,7 @@ def add_op(a, b):
     temp_storage_size = reduce_into(
         None, d_in=d_input, d_out=d_output, num_items=len(l_varr), h_init=h_init
     )
-    d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=numpy.uint8)
+    d_temp_storage = numba.cuda.device_array(temp_storage_size, dtype=np.uint8)
 
     reduce_into(d_temp_storage, d_input, d_output, len(l_varr), h_init)
 
@@ -168,9 +166,9 @@ def test_device_sum_cache_modified_input_it(
 ):
     rng = random.Random(0)
     l_varr = [rng.randrange(100) for _ in range(num_items)]
-    dtype_inp = numpy.dtype(supported_value_type)
+    dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
-    input_devarr = numba.cuda.to_device(numpy.array(l_varr, dtype=dtype_inp))
+    input_devarr = numba.cuda.to_device(np.array(l_varr, dtype=dtype_inp))
     i_input = iterators.CacheModifiedInputIterator(input_devarr, modifier="stream")
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
@@ -181,7 +179,7 @@ def test_device_sum_constant_it(
     use_numpy_array, supported_value_type, num_items=3, start_sum_with=10
 ):
     l_varr = [42 for distance in range(num_items)]
-    dtype_inp = numpy.dtype(supported_value_type)
+    dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
     i_input = iterators.ConstantIterator(dtype_inp.type(42))
     _test_device_sum_with_iterator(
@@ -193,7 +191,7 @@ def test_device_sum_counting_it(
     use_numpy_array, supported_value_type, num_items=3, start_sum_with=10
 ):
     l_varr = [start_sum_with + distance for distance in range(num_items)]
-    dtype_inp = numpy.dtype(supported_value_type)
+    dtype_inp = np.dtype(supported_value_type)
     dtype_out = dtype_inp
     i_input = iterators.CountingIterator(dtype_inp.type(start_sum_with))
     _test_device_sum_with_iterator(
@@ -217,8 +215,8 @@ def test_device_sum_map_mul2_count_it(
 ):
     l_varr = [2 * (start_sum_with + distance) for distance in range(num_items)]
     vtn_out, vtn_inp = value_type_name_pair
-    dtype_inp = numpy.dtype(vtn_inp)
-    dtype_out = numpy.dtype(vtn_out)
+    dtype_inp = np.dtype(vtn_inp)
+    dtype_out = np.dtype(vtn_out)
     i_input = iterators.TransformIterator(
         iterators.CountingIterator(dtype_inp.type(start_sum_with)), mul2
     )
@@ -248,8 +246,8 @@ def test_device_sum_map_mul_map_mul_count_it(
         fac_out * (fac_mid * (start_sum_with + distance))
         for distance in range(num_items)
     ]
-    dtype_inp = numpy.dtype(vtn_inp)
-    dtype_out = numpy.dtype(vtn_out)
+    dtype_inp = np.dtype(vtn_inp)
+    dtype_out = np.dtype(vtn_out)
     mul_funcs = {2: mul2, 3: mul3}
     i_input = iterators.TransformIterator(
         iterators.TransformIterator(
@@ -275,8 +273,8 @@ def test_device_sum_map_mul2_cp_array_it(
     use_numpy_array, value_type_name_pair, num_items=3, start_sum_with=10
 ):
     vtn_out, vtn_inp = value_type_name_pair
-    dtype_inp = numpy.dtype(vtn_inp)
-    dtype_out = numpy.dtype(vtn_out)
+    dtype_inp = np.dtype(vtn_inp)
+    dtype_out = np.dtype(vtn_out)
     rng = random.Random(0)
     l_d_in = [rng.randrange(100) for _ in range(num_items)]
     a_d_in = cp.array(l_d_in, dtype_inp)
@@ -285,3 +283,221 @@ def test_device_sum_map_mul2_cp_array_it(
     _test_device_sum_with_iterator(
         l_varr, start_sum_with, i_input, dtype_inp, dtype_out, use_numpy_array
     )
+
+
+def test_reducer_caching():
+    def sum_op(x, y):
+        return x + y
+
+    # inputs are device arrays
+    reducer_1 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are device arrays of different dtype:
+    reducer_1 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int32"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2
+
+    # outputs are of different dtype:
+    reducer_1 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int32"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2
+
+    # inputs are of same dtype but different size
+    # (should still use cached reducer):
+    reducer_1 = algorithms.reduce_into(
+        cp.zeros(3, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        cp.zeros(5, dtype="int64"),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are counting iterators of the
+    # same value type:
+    reducer_1 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are counting iterators of different value type:
+    reducer_1 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int64(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2
+
+    def op1(x):
+        return x
+
+    def op2(x):
+        return 2 * x
+
+    def op3(x):
+        return x
+
+    # inputs are TransformIterators
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are TransformIterators with different
+    # op:
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op2),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2
+
+    # inputs are TransformIterators with same op
+    # but different name:
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op3),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+
+    # inputs are CountingIterators of same kind
+    # but different state:
+    reducer_1 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(0)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.CountingIterator(np.int32(1)),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+
+    assert reducer_1 is reducer_2
+
+    # inputs are TransformIterators of same kind
+    # but different state:
+    ary1 = cp.asarray([0, 1, 2], dtype="int64")
+    ary2 = cp.asarray([0, 1], dtype="int64")
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(ary1, op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(ary2, op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are TransformIterators of same kind
+    # but different state:
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(1)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is reducer_2
+
+    # inputs are TransformIterators with different kind:
+    reducer_1 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int32(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    reducer_2 = algorithms.reduce_into(
+        iterators.TransformIterator(iterators.CountingIterator(np.int64(0)), op1),
+        cp.zeros(1, dtype="int64"),
+        sum_op,
+        np.zeros([0], dtype="int64"),
+    )
+    assert reducer_1 is not reducer_2