Skip to content

Commit

Permalink
Categorical reverse transform may crash with ValueError for certain…
Browse files Browse the repository at this point in the history
… dtypes (int64) (#755)
  • Loading branch information
R-Palazzo authored Jan 22, 2024
1 parent 71a590b commit deb0f74
Show file tree
Hide file tree
Showing 4 changed files with 208 additions and 10 deletions.
22 changes: 18 additions & 4 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from rdt.errors import TransformerInputError
from rdt.transformers.base import BaseTransformer
from rdt.transformers.utils import fill_nan_with_none
from rdt.transformers.utils import check_nan_in_transform, fill_nan_with_none, try_convert_to_dtype

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -177,6 +177,7 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
check_nan_in_transform(data, self.dtype)
data = data.clip(0, 1)
bins = [0]
labels = []
Expand All @@ -192,7 +193,10 @@ def _reverse_transform(self, data):
labels.append(key)

result = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
return result.replace(nan_name, np.nan).astype(self.dtype)
result = result.replace(nan_name, np.nan)
result = try_convert_to_dtype(result, self.dtype)

return result


class OrderedUniformEncoder(UniformEncoder):
Expand Down Expand Up @@ -333,6 +337,7 @@ def __init__(self, add_noise=False):
)
super().__init__()
self.add_noise = add_noise
self._is_integer = None

@staticmethod
def _get_intervals(data):
Expand Down Expand Up @@ -516,6 +521,7 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
check_nan_in_transform(data, self.dtype)
data = data.clip(0, 1)
num_rows = len(data)
num_categories = len(self.means)
Expand Down Expand Up @@ -545,6 +551,7 @@ class OneHotEncoder(BaseTransformer):
_dummy_encoded = False
_indexer = None
_uniques = None
dtype = None

@staticmethod
def _prepare_data(data):
Expand Down Expand Up @@ -582,6 +589,7 @@ def _fit(self, data):
data (pandas.Series or pandas.DataFrame):
Data to fit the transformer to.
"""
self.dtype = data.dtype
data = self._prepare_data(data)

null = pd.isna(data).to_numpy()
Expand Down Expand Up @@ -657,15 +665,18 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
check_nan_in_transform(data, self.dtype)
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if data.ndim == 1:
data = data.reshape(-1, 1)

indices = np.argmax(data, axis=1)
result = pd.Series(indices).map(self.dummies.__getitem__)
result = try_convert_to_dtype(result, self.dtype)

return pd.Series(indices).map(self.dummies.__getitem__)
return result


class LabelEncoder(BaseTransformer):
Expand Down Expand Up @@ -801,13 +812,15 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
check_nan_in_transform(data, self.dtype)
if self.add_noise:
data = np.floor(data)

data = data.clip(min(self.values_to_categories), max(self.values_to_categories))
data = data.round().map(self.values_to_categories)
data = try_convert_to_dtype(data, self.dtype)

return data.astype(self.dtype)
return data


class OrderedLabelEncoder(LabelEncoder):
Expand Down Expand Up @@ -865,6 +878,7 @@ def _fit(self, data):
data (pandas.Series):
Data to fit the transformer to.
"""
self.dtype = data.dtype
data = data.fillna(np.nan)
missing = list(data[~data.isin(self.order)].unique())
if len(missing) > 0:
Expand Down
50 changes: 50 additions & 0 deletions rdt/transformers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import re
import string
import warnings

import numpy as np
import pandas as pd

import sre_parse # isort:skip

Expand Down Expand Up @@ -184,3 +186,51 @@ def flatten_column_list(column_list):
flattened.append(column)

return flattened


def check_nan_in_transform(data, dtype):
    """Warn when the transformed data contains null values.

    Nothing is returned and the data is not modified; the only effect is a
    warning telling the user that the reverse transformed data will contain
    nulls (and, for integer dtypes, that those values will be floats).

    Args:
        data (pd.Series, pd.DataFrame or numpy.ndarray):
            Data that has been transformed.
        dtype (str or numpy.dtype):
            Data type used to decide the wording of the warning. ``None`` is
            treated as a non-integer dtype.
    """
    # Reduce the null mask through numpy so the check does not depend on what
    # ``.any()`` returns for each container type (Series, DataFrame, ndarray).
    if not np.asarray(pd.isna(data)).any():
        return

    message = (
        'There are null values in the transformed data. The reversed '
        'transformed data will contain null values'
    )
    # Integer columns cannot hold NaN, so the caller falls back to float.
    if pd.api.types.is_integer_dtype(dtype):
        message += " of type 'float'."
    else:
        message += '.'

    # stacklevel=2 attributes the warning to the code that called this helper
    # instead of to this utility function.
    warnings.warn(message, stacklevel=2)


def try_convert_to_dtype(data, dtype):
    """Convert data to the given dtype, falling back to float for integers.

    If the conversion raises a ``ValueError`` and ``dtype`` is an integer
    dtype (for example because the data contains NaN, which integers cannot
    represent), the data is converted to ``float`` instead.

    Args:
        data (pd.Series or numpy.ndarray):
            Data to convert.
        dtype (str or numpy.dtype):
            Data type to convert to.

    Returns:
        pd.Series or numpy.ndarray:
            Data converted to ``dtype``, or to ``float`` when the integer
            conversion failed.

    Raises:
        ValueError:
            If the conversion fails and ``dtype`` is not an integer dtype, or
            if the float fallback fails as well.
    """
    try:
        return data.astype(dtype)
    except ValueError:
        if not pd.api.types.is_integer_dtype(dtype):
            # Bare ``raise`` re-raises the original error with its traceback intact.
            raise

        return data.astype(float)
83 changes: 78 additions & 5 deletions tests/unit/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,9 @@ def test__transform_user_warning(self):
assert transformed.iloc[4] >= 0
assert transformed.iloc[4] < 1

def test__reverse_transform(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
@patch('rdt.transformers.categorical.try_convert_to_dtype')
def test__reverse_transform(self, mock_convert_dtype, mock_check_nan):
"""Test the ``_reverse_transform``."""
# Setup
data = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2])
Expand All @@ -289,12 +291,18 @@ def test__reverse_transform(self):
}

transformed = pd.Series([0.12, 0.254, 0.789, 0.43, 0.56, 0.08, 0.67, 0.98, 0.36])
mock_convert_dtype.return_value = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2])

# Run
output = transformer._reverse_transform(transformed)

# Asserts
pd.testing.assert_series_equal(output, data)
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
pd.testing.assert_series_equal(mock_input_data, transformed)
assert mock_input_dtype == transformer.dtype
mock_convert_dtype.assert_called_once()

def test__reverse_transform_nans(self):
"""Test ``_reverse_transform`` for data with NaNs."""
Expand Down Expand Up @@ -323,6 +331,25 @@ def test__reverse_transform_nans(self):
# Asserts
pd.testing.assert_series_equal(output, data)

def test__reverse_transform_integer_and_nans(self):
    """Test ``_reverse_transform`` with integer categories and nans in the input.

    The encoder is configured by hand with an ``int64`` dtype and the
    transformed series contains a nan. The reversed series is expected to
    keep the nan in place next to the integer categories.
    """
    # Setup
    encoder = UniformEncoder()
    encoder.dtype = np.int64
    encoder.frequencies = {11: 0.2, 12: 0.3, 13: 0.5}
    encoder.intervals = {11: [0, 0.2], 12: [0.2, 0.5], 13: [0.5, 1]}
    transformed = pd.Series([0.1, 0.25, np.nan, 0.65])
    expected = pd.Series([11, 12, np.nan, 13])

    # Run
    reversed_data = encoder._reverse_transform(transformed)

    # Assert
    pd.testing.assert_series_equal(reversed_data, expected)


@pytest.fixture(autouse=True)
def _setup_caplog(caplog):
Expand Down Expand Up @@ -718,7 +745,8 @@ def test__get_value_add_noise_true(self, norm_mock):
# Asserts
assert result == 0.2745

def test__reverse_transform_series(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
def test__reverse_transform_series(self, mock_check_nan):
"""Test reverse_transform a pandas Series"""
# Setup
data = pd.Series(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
Expand All @@ -730,6 +758,10 @@ def test__reverse_transform_series(self):
result = transformer._reverse_transform(rt_data)

# Asserts
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
pd.testing.assert_series_equal(mock_input_data, rt_data)
assert mock_input_dtype == transformer.dtype
expected_intervals = {
'foo': (
0,
Expand Down Expand Up @@ -1123,7 +1155,8 @@ def test__reverse_transform_by_row_called(self):
np.testing.assert_array_equal(reverse_arg, data.clip(0, 1))
assert reverse == categorical_transformer_mock._reverse_transform_by_row.return_value

def test__reverse_transform_by_row(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
def test__reverse_transform_by_row(self, mock_check_nan):
"""Test the _reverse_transform_by_row method with numerical data.
Expect that the transformed data is correctly reverse transformed.
Expand Down Expand Up @@ -1156,6 +1189,10 @@ def test__reverse_transform_by_row(self):
reverse = transformer._reverse_transform(transformed)

# Assert
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
pd.testing.assert_series_equal(mock_input_data, transformed)
assert mock_input_dtype == data.dtype
pd.testing.assert_series_equal(data, reverse)


Expand Down Expand Up @@ -1222,6 +1259,7 @@ def test__fit_dummies_no_nans(self):

# Assert
np.testing.assert_array_equal(ohe.dummies, ['a', 2, 'c'])
assert ohe.dtype == 'object'

def test__fit_dummies_nans(self):
"""Test the ``_fit`` method without nans.
Expand Down Expand Up @@ -1776,11 +1814,14 @@ def test__transform_numeric(self):
assert not ohe._dummy_encoded
np.testing.assert_array_equal(out, expected)

def test__reverse_transform_no_nans(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
@patch('rdt.transformers.categorical.try_convert_to_dtype')
def test__reverse_transform_no_nans(self, mock_convert_dtype, mock_check_nan):
# Setup
ohe = OneHotEncoder()
data = pd.Series(['a', 'b', 'c'])
ohe._fit(data)
mock_convert_dtype.return_value = data

# Run
transformed = np.array([
Expand All @@ -1793,6 +1834,11 @@ def test__reverse_transform_no_nans(self):
# Assert
expected = pd.Series(['a', 'b', 'c'])
pd.testing.assert_series_equal(out, expected)
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
np.testing.assert_array_equal(mock_input_data, transformed)
assert mock_input_dtype == 'O'
mock_convert_dtype.assert_called_once()

def test__reverse_transform_nans(self):
# Setup
Expand Down Expand Up @@ -2168,7 +2214,9 @@ def test__reverse_transform_clips_values(self):
# Assert
pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 'c']))

def test__reverse_transform_add_noise(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
@patch('rdt.transformers.categorical.try_convert_to_dtype')
def test__reverse_transform_add_noise(self, mock_convert_dtype, mock_check_nan):
"""Test the ``_reverse_transform`` method with ``add_noise``.
Test that the method correctly reverse transforms the data
Expand All @@ -2183,12 +2231,36 @@ def test__reverse_transform_add_noise(self):
transformer = LabelEncoder(add_noise=True)
transformer.values_to_categories = {0: 'a', 1: 'b', 2: 'c'}
data = pd.Series([0.5, 1.0, 10.9])
mock_convert_dtype.return_value = pd.Series(['a', 'b', 'c'])

# Run
out = transformer._reverse_transform(data)

# Assert
pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 'c']))
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
pd.testing.assert_series_equal(mock_input_data, data)
assert mock_input_dtype == 'O'
mock_convert_dtype.assert_called_once()

def test__reverse_transform_integer_and_nans(self):
    """Test ``_reverse_transform`` with integer categories and nans in the input.

    The encoder is configured by hand with an ``'int'`` dtype and the
    transformed series ends with a nan. The reversed series is expected to
    keep the nan after the mapped integer categories.
    """
    # Setup
    encoder = LabelEncoder()
    encoder.dtype = 'int'
    encoder.values_to_categories = {0: 11, 1: 12, 2: 13}
    transformed = pd.Series([0, 1, np.nan])
    expected = pd.Series([11, 12, np.nan])

    # Run
    reversed_data = encoder._reverse_transform(transformed)

    # Assert
    pd.testing.assert_series_equal(reversed_data, expected)


class TestOrderedLabelEncoder:
Expand Down Expand Up @@ -2272,6 +2344,7 @@ def test__fit(self):
transformer._fit(data)

# Assert
assert transformer.dtype == 'float'
expected_values_to_categories = {0: 2, 1: 3, 2: np.nan, 3: 1}
expected_categories_to_values = {2: 0, 3: 1, 1: 3, np.nan: 2}
for key, value in transformer.values_to_categories.items():
Expand Down
Loading

0 comments on commit deb0f74

Please sign in to comment.