avoid reinitializing rmm multiple times to resolve some intermittent memory issues; pin numpy < 2 in readme

Signed-off-by: Erik Ordentlich <[email protected]>
eordentlich committed Nov 20, 2024
1 parent d10e9f0 commit ff25f8f
Showing 3 changed files with 25 additions and 15 deletions.
2 changes: 1 addition & 1 deletion python/README.md
@@ -10,7 +10,7 @@ First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html)
```bash
conda create -n rapids-24.10 \
-c rapidsai -c conda-forge -c nvidia \
- cuml=24.10 cuvs=24.10 python=3.10 cuda-version=11.8
+ cuml=24.10 cuvs=24.10 python=3.10 cuda-version=11.8 numpy~=1.0
```

**Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary.
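
The added `numpy~=1.0` pins NumPy to the 1.x series (the compatible-release spec, roughly `>=1.0,<2.0`). A quick post-install sanity check, illustrative only and not part of the repo:

```python
# Hypothetical check: confirm the environment picked up NumPy 1.x, not 2.x.
import numpy

major = int(numpy.__version__.split(".")[0])
assert major == 1, f"expected numpy 1.x, got {numpy.__version__}"
print(f"numpy {numpy.__version__} OK")
```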
28 changes: 16 additions & 12 deletions python/src/spark_rapids_ml/core.py
@@ -711,11 +711,14 @@ def _train_udf(pdf_iter: Iterator[pd.DataFrame]) -> pd.DataFrame:
import rmm
from rmm.allocators.cupy import rmm_cupy_allocator

- rmm.reinitialize(
-     managed_memory=True,
-     devices=_CumlCommon._get_gpu_device(context, is_local),
- )
- cp.cuda.set_allocator(rmm_cupy_allocator)
+ # avoid initializing these twice to avoid downstream segfaults and other cuda memory errors
+ if not type(rmm.mr.get_current_device_resource()) == type(
+     rmm.mr.ManagedMemoryResource()
+ ):
+     rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
+
+ if not cp.cuda.get_allocator().__name__ == rmm_cupy_allocator.__name__:
+     cp.cuda.set_allocator(rmm_cupy_allocator)

_CumlCommon._initialize_cuml_logging(cuml_verbose)

@@ -1386,13 +1389,14 @@ def _transform_udf(pdf_iter: Iterator[pd.DataFrame]) -> pd.DataFrame:
import rmm
from rmm.allocators.cupy import rmm_cupy_allocator

- rmm.reinitialize(
-     managed_memory=True,
-     devices=_CumlCommon._get_gpu_device(
-         context, is_local, is_transform=True
-     ),
- )
- cp.cuda.set_allocator(rmm_cupy_allocator)
+ # avoid initializing these twice to avoid downstream segfaults and other cuda memory errors
+ if not type(rmm.mr.get_current_device_resource()) == type(
+     rmm.mr.ManagedMemoryResource()
+ ):
+     rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
+
+ if not cp.cuda.get_allocator().__name__ == rmm_cupy_allocator.__name__:
+     cp.cuda.set_allocator(rmm_cupy_allocator)

# Construct the cuml counterpart object
cuml_instance = construct_cuml_object_func()
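
Both hunks in this file make the same change: instead of unconditionally calling `rmm.reinitialize(...)` on every UDF invocation, the current RMM device resource and CuPy allocator are checked first and only set when needed. A minimal standalone sketch of that guarded setup, outside the Spark UDF context (the helper name is illustrative and not part of the repo; it uses `isinstance` where the commit compares exact types, and assumes a CUDA-capable host with `rmm` and `cupy` installed):

```python
import cupy as cp
import rmm
from rmm.allocators.cupy import rmm_cupy_allocator


def ensure_managed_memory() -> None:
    # Swap in a managed-memory resource only if one is not already active, so a
    # second invocation in the same worker process does not reinitialize RMM
    # and invalidate buffers handed out by the earlier resource.
    if not isinstance(
        rmm.mr.get_current_device_resource(), rmm.mr.ManagedMemoryResource
    ):
        rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())

    # Likewise, install the RMM-backed CuPy allocator only once.
    if cp.cuda.get_allocator().__name__ != rmm_cupy_allocator.__name__:
        cp.cuda.set_allocator(rmm_cupy_allocator)
```

Unlike the removed `rmm.reinitialize(..., devices=...)` call, the new code operates only on the worker's currently active device.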
10 changes: 8 additions & 2 deletions python/src/spark_rapids_ml/umap.py
@@ -1114,8 +1114,14 @@ def _train_udf(pdf_iter: Iterable[pd.DataFrame]) -> Iterable[pd.DataFrame]:
import rmm
from rmm.allocators.cupy import rmm_cupy_allocator

- rmm.reinitialize(managed_memory=True)
- cp.cuda.set_allocator(rmm_cupy_allocator)
+ # avoid initializing these twice to avoid downstream segfaults and other cuda memory errors
+ if not type(rmm.mr.get_current_device_resource()) == type(
+     rmm.mr.ManagedMemoryResource()
+ ):
+     rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
+
+ if not cp.cuda.get_allocator().__name__ == rmm_cupy_allocator.__name__:
+     cp.cuda.set_allocator(rmm_cupy_allocator)

_CumlCommon._initialize_cuml_logging(cuml_verbose)

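
With both core.py and umap.py applying the same check-before-set pattern, repeated setup within one worker process is a no-op after the first call. An illustrative way to exercise that with the sketch above (again assuming a CUDA-capable environment):

```python
# The second call should leave the first resource and allocator untouched.
ensure_managed_memory()
first = rmm.mr.get_current_device_resource()

ensure_managed_memory()  # no-op: resource is already a ManagedMemoryResource
assert first is rmm.mr.get_current_device_resource()
assert cp.cuda.get_allocator().__name__ == rmm_cupy_allocator.__name__
```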