From 7c1251a323764239b47770b4c6f2c051b6462ebf Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 6 Aug 2021 09:25:08 -0500 Subject: [PATCH 1/2] Fix concatenation of `cudf.RangeIndex` (#8970) (#8984) This is a backport of #8970 to 21.08 --- python/cudf/cudf/core/index.py | 51 +++++++++++++++++-- python/cudf/cudf/tests/test_index.py | 19 +++++++ python/dask_cudf/dask_cudf/tests/test_core.py | 2 +- 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 691b6ab2e29..c94aa940ec5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,7 @@ import pickle from numbers import Number -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import cupy import numpy as np @@ -588,13 +588,18 @@ def sum(self): @classmethod def _concat(cls, objs): - data = concat_columns([o._values for o in objs]) + if all(isinstance(obj, RangeIndex) for obj in objs): + result = _concat_range_index(objs) + else: + data = concat_columns([o._values for o in objs]) + result = as_index(data) + names = {obj.name for obj in objs} if len(names) == 1: [name] = names else: name = None - result = as_index(data) + result.name = name return result @@ -3043,3 +3048,43 @@ def __new__( ) return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + + +def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: + """ + An internal Utility function to concat RangeIndex objects. + """ + start = step = next_ = None + + # Filter the empty indexes + non_empty_indexes = [obj for obj in indexes if len(obj)] + + if not non_empty_indexes: + # Here all "indexes" had 0 length, i.e. were empty. + # In this case return an empty range index. + return RangeIndex(0, 0) + + for obj in non_empty_indexes: + if start is None: + # This is set by the first non-empty index + start = obj.start + if step is None and len(obj) > 1: + step = obj.step + elif step is None: + # First non-empty index had only one element + if obj.start == start: + result = as_index(concat_columns([x._values for x in indexes])) + return result + step = obj.start - start + + non_consecutive = (step != obj.step and len(obj) > 1) or ( + next_ is not None and obj.start != next_ + ) + if non_consecutive: + result = as_index(concat_columns([x._values for x in indexes])) + return result + if step is not None: + next_ = obj[-1] + step + + stop = non_empty_indexes[-1].stop if next_ is None else next_ + return RangeIndex(start, stop, step) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f03454c479a..3f58eb3d6e7 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2316,3 +2316,22 @@ def test_get_loc_multi_string(idx, key, method): got = gi.get_loc(key, method=method) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "objs", + [ + [pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)], + [pd.RangeIndex(10, 20), pd.RangeIndex(22, 40), pd.RangeIndex(50, 60)], + [pd.RangeIndex(10, 20, 2), pd.RangeIndex(20, 40, 2)], + ], +) +def test_range_index_concat(objs): + cudf_objs = [cudf.from_pandas(obj) for obj in objs] + + actual = cudf.concat(cudf_objs) + + expected = objs[0] + for obj in objs[1:]: + expected = expected.append(obj) + assert_eq(expected, actual) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cf5203a22e5..ace9701b677 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -59,7 +59,7 @@ def test_from_cudf_with_generic_idx(): ddf = dgd.from_cudf(cdf, npartitions=2) - assert isinstance(ddf.index.compute(), cudf.core.index.GenericIndex) + assert isinstance(ddf.index.compute(), cudf.RangeIndex) dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) From 29302e076f3b293bf1b7998fbf71516b97ce879e Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Fri, 6 Aug 2021 09:52:07 -0500 Subject: [PATCH 2/2] Don't unnecessarily read string offsets when doing concatenate overflow checking. (#8968) Fixes: #8960 We were always reading string offsets (device->gpu memcpy) during the concatenation overflow checking which was unnecessary when dealing with an unsliced column, resulting in a performance degradation. This fixes that. Url: https://github.com/rapidsai/cudf/pull/8968 --- cpp/src/copying/concatenate.cu | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index f5f3937089f..f4b6a8bf5fd 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -374,11 +374,18 @@ void traverse_children::operator()(host_span size_t { strings_column_view scv(b); - return a + (b.is_empty() - ? 0 - : cudf::detail::get_value( - scv.offsets(), scv.offset() + b.size(), stream) - - cudf::detail::get_value(scv.offsets(), scv.offset(), stream)); + return a + (scv.is_empty() ? 0 + // if the column is unsliced, skip the offset retrieval. + : scv.offset() > 0 + ? cudf::detail::get_value( + scv.offsets(), scv.offset() + scv.size(), stream) - + cudf::detail::get_value(scv.offsets(), scv.offset(), stream) + // if the offset() is 0, it can still be sliced to a shorter length. in this case + // we only need to read a single offset. otherwise just return the full length + // (chars_size()) + : scv.size() + 1 == scv.offsets().size() + ? scv.chars_size() + : cudf::detail::get_value(scv.offsets(), scv.size(), stream)); }); // note: output text must include "exceeds size_type range" for python error handling CUDF_EXPECTS(total_char_count <= static_cast(std::numeric_limits::max()),