From 9d65f247af628ee69e3c95062af45e9491291a37 Mon Sep 17 00:00:00 2001 From: caiodallaqua Date: Tue, 6 Feb 2024 10:17:37 -0300 Subject: [PATCH] add contributing --- .github/workflows/release.yaml | 33 +++++ .gitignore | 3 +- CONTRIBUTING.md | 48 ++++++++ README.md | 4 + Taskfile.yaml | 22 ++++ pier_ds_utils/__init__.py | 1 + pier_ds_utils/estimator.py | 14 +-- pier_ds_utils/transformer.py | 90 +++++++------- poetry.lock | 30 ++++- pyproject.toml | 1 + setup.py | 8 +- tests/conftest.py | 2 +- tests/test_estimator.py | 20 +-- tests/test_transformer.py | 214 ++++++++++++++++----------------- 14 files changed, 313 insertions(+), 177 deletions(-) create mode 100644 .github/workflows/release.yaml create mode 100644 CONTRIBUTING.md create mode 100644 Taskfile.yaml diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 0000000..1094149 --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,33 @@ +name: Release +on: + release: + types: + - created + +jobs: + publish: + strategy: + fail-fast: false + matrix: + python-version: [3.11] + poetry-version: [1.4.2] + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout the repository + uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + - name: Run image + uses: abatilo/actions-poetry@v2.0.0 + with: + poetry-version: ${{ matrix.poetry-version }} + - name: Publish + env: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + run: | + poetry config pypi-token.pypi $PYPI_TOKEN + poetry publish --build \ No newline at end of file diff --git a/.gitignore b/.gitignore index 250581e..886aafa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ __pycache__ .pytest_cache - +.ruff_cache .vscode +pytest-coverage.txt \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..fc2498d --- 
/dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,48 @@ +# Contributing + +## Development + +### Setting up a development environment + +If you don't have a local development environment, you can follow these steps to set one up. + +Install [poetry](https://python-poetry.org/) and [task](https://taskfile.dev/). + +Now, initialize the project: + +```bash +task init +``` + +### Running tests + +You can run the tests with: + +```bash +task tests +``` + +This will run the tests with [pytest](https://docs.pytest.org/en/latest/) and show information about the coverage. + +### Formatting the code + +To look for formatting issues: +```bash +task check-formatting +``` + +To format the code, you can use the command: + +```bash +task formatting +``` + +### Releasing a new version + +To release a new version, you need to follow these steps: + +1. Update the version with `poetry version <version>` and commit the changes. This project follows [Semantic Versioning](http://semver.org/), so the version number should follow the format `<major>.<minor>.<patch>`. + +2. Create a GitHub release with the new version number. + +3. (Optional) Publish the new version to PyPI with `poetry publish --build`. \ No newline at end of file diff --git a/README.md b/README.md index 5527fae..2380af7 100644 --- a/README.md +++ b/README.md @@ -33,3 +33,7 @@ pip install git+https://github.com/pier-digital/pier-ds-utils@_version_ ``` Replace `_version_` by the specific version you want to use. You can find them [here](https://github.com/pier-digital/pier-ds-utils/tags). + +## Contributing + +Contributions are welcome! Please read the [contributing guidelines](CONTRIBUTING.md) first. 
\ No newline at end of file diff --git a/Taskfile.yaml b/Taskfile.yaml new file mode 100644 index 0000000..9c1b42c --- /dev/null +++ b/Taskfile.yaml @@ -0,0 +1,22 @@ +version: 3 + +vars: + DECISIONS_DIR: docs/decisions + +tasks: + init: + desc: initializes the project + cmds: + - poetry install -n + tests: + desc: run automated tests + cmds: + - poetry run pytest tests | tee pytest-coverage.txt + check-formatting: + desc: checks formatting + cmds: + - poetry run ruff check . + formatting: + desc: formats the code + cmds: + - poetry run ruff format . diff --git a/pier_ds_utils/__init__.py b/pier_ds_utils/__init__.py index 672f5ca..81083f5 100644 --- a/pier_ds_utils/__init__.py +++ b/pier_ds_utils/__init__.py @@ -1 +1,2 @@ +# flake8: noqa from pier_ds_utils import estimator, transformer diff --git a/pier_ds_utils/estimator.py b/pier_ds_utils/estimator.py index d075270..ce03180 100644 --- a/pier_ds_utils/estimator.py +++ b/pier_ds_utils/estimator.py @@ -19,21 +19,21 @@ def __init__( def get_params(self, deep=True): return { **self.init_params, - **{'add_constant': self._add_constant, 'os_factor': self.os_factor}, + **{"add_constant": self._add_constant, "os_factor": self.os_factor}, } def fit(self, X, y, **fit_params): if self._add_constant: - X['const'] = 1 + X["const"] = 1 self.model_ = sm.GLM(endog=y, exog=X, **self.init_params) - fit_method = fit_params.pop('fit_method', 'fit') + fit_method = fit_params.pop("fit_method", "fit") self.results_ = getattr(self.model_, fit_method)(**fit_params) return self def predict(self, X, **predict_params): if self._add_constant: - X['const'] = 1 + X["const"] = 1 return self.results_.predict(exog=X, **predict_params) * self.os_factor @@ -55,6 +55,6 @@ def __getattr__(self, __name: str) -> Any: def get_params(self, deep: bool = True) -> dict: return { - 'model': self.model.get_params(deep=deep) if deep else self.model, - 'column': self.column, - } \ No newline at end of file + "model": self.model.get_params(deep=deep) if deep else 
self.model, + "column": self.column, + } diff --git a/pier_ds_utils/transformer.py b/pier_ds_utils/transformer.py index 61919b4..df5eb3b 100644 --- a/pier_ds_utils/transformer.py +++ b/pier_ds_utils/transformer.py @@ -6,7 +6,7 @@ class BaseCustomTransformer(BaseEstimator, TransformerMixin): - def set_output(self, transform: str = 'pandas') -> BaseEstimator: + def set_output(self, transform: str = "pandas") -> BaseEstimator: return self @@ -19,7 +19,7 @@ def __init__( default_value: typing.Any = None, output_column: typing.Optional[str] = None, ): - ''' + """ Transformer to categorize a column into custom categories. Parameters @@ -34,15 +34,15 @@ def __init__( Value to be used for missing values. If None, missing values will be kept as NaN. output_column: str Name of the output column. If None, the original column will be overwritten. - ''' + """ if len(categories) != len(labels): raise ValueError( - 'Number of categories must be the same as number of labels' + "Number of categories must be the same as number of labels" ) for category in categories: if not isinstance(category, list): - raise TypeError('Each category must be a list') + raise TypeError("Each category must be a list") self._column = column self._categories = categories @@ -64,10 +64,10 @@ def from_dict(cls, categories: typing.Dict, **kwargs): def get_params(self, deep: bool = True) -> dict: return { - 'categories': self.categories_, - 'labels': self.labels_, - 'default_value': self._default_value, - 'output_column': self._output_column, + "categories": self.categories_, + "labels": self.labels_, + "default_value": self._default_value, + "output_column": self._output_column, } def fit(self, X, y=None): @@ -75,7 +75,7 @@ def fit(self, X, y=None): def transform(self, X): values = X[self._column].copy() - output = pd.Series(np.nan, index=X.index, dtype='object') + output = pd.Series(np.nan, index=X.index, dtype="object") for category, label in zip(self._categories, self._labels): 
output.loc[values.isin(category)] = label @@ -98,7 +98,7 @@ def __init__( default_value: typing.Any = None, output_column: typing.Optional[str] = None, ): - ''' + """ Custom transformer to categorize a numeric column into intervals. Parameters @@ -115,25 +115,25 @@ def __init__( Value to be used for missing values. If None, missing values will be kept as NaN. output_column: str Name of the output column. If None, the original column will be overwritten. - ''' + """ if len(intervals) != len(labels): - raise ValueError('Number of intervals must be the same as number of labels') + raise ValueError("Number of intervals must be the same as number of labels") for interval in intervals: if not isinstance(interval, tuple): - raise TypeError('Each interval must be a tuple') + raise TypeError("Each interval must be a tuple") if len(interval) != 2: - raise ValueError('Each interval must have two elements') + raise ValueError("Each interval must have two elements") if not isinstance(interval[0], (int, float)) or not isinstance( interval[1], (int, float) ): - raise TypeError('Each interval element must be a number') + raise TypeError("Each interval element must be a number") if interval[0] >= interval[1]: raise ValueError( - 'Each interval must have the first element smaller than the second' + "Each interval must have the first element smaller than the second" ) self._column = column @@ -171,11 +171,11 @@ def from_dict(cls, intervals: typing.Dict, **kwargs): def get_params(self, deep: bool = True) -> dict: return { - 'intervals': self.intervals_, - 'labels': self.labels_, - 'default_value': self.default_value_, - 'output_column': self.output_column_, - 'column': self.column_, + "intervals": self.intervals_, + "labels": self.labels_, + "default_value": self.default_value_, + "output_column": self.output_column_, + "column": self.column_, } def fit(self, X, y=None): @@ -183,7 +183,7 @@ def fit(self, X, y=None): def transform(self, X): values = X[self.column_].astype(float).copy() 
- output = pd.Series(np.nan, index=X.index, dtype='object') + output = pd.Series(np.nan, index=X.index, dtype="object") for interval, label in zip(self.intervals_, self.labels_): output.loc[(values >= interval[0]) & (values < interval[1]),] = label @@ -205,7 +205,7 @@ def __init__( default_value: typing.Any = None, output_column: typing.Optional[str] = None, ): - ''' + """ Custom transformer to categorize a numeric column into intervals given a categorical column. Parameters @@ -221,19 +221,19 @@ def __init__( Value to be used for missing values. If None, missing values will be kept as NaN. output_column: str Name of the output column. If None, the original column will be overwritten. - ''' + """ if not isinstance(interval_categorizers, dict): - raise TypeError('interval_categorizers must be a dict') + raise TypeError("interval_categorizers must be a dict") for key, value in interval_categorizers.items(): if not isinstance(key, str): - raise TypeError('Keys of interval_categorizers must be strings') + raise TypeError("Keys of interval_categorizers must be strings") if not isinstance(value, CustomIntervalCategorizer): raise TypeError( - 'Values of interval_categorizers must be CustomIntervalCategorizer' + "Values of interval_categorizers must be CustomIntervalCategorizer" ) - + self._category_column = category_column self._interval_categorizers = interval_categorizers self._default_categorizer = default_categorizer @@ -247,7 +247,7 @@ def category_column_(self) -> str: @property def interval_categorizers_(self) -> typing.Dict[str, CustomIntervalCategorizer]: return self._interval_categorizers - + @property def default_categorizer_(self) -> typing.Optional[CustomIntervalCategorizer]: return self._default_categorizer @@ -258,34 +258,34 @@ def from_dict(cls, **kwargs): def get_params(self, deep: bool = True) -> dict: return { - 'category_column': self.category_column_, - 'interval_categorizers': self.interval_categorizers_, - 'default_categorizer': 
self.default_categorizer_, - 'default_value': self._default_value, - 'output_column': self._output_column, + "category_column": self.category_column_, + "interval_categorizers": self.interval_categorizers_, + "default_categorizer": self.default_categorizer_, + "default_value": self._default_value, + "output_column": self._output_column, } def fit(self, X, y=None): return self def transform(self, X): - output = pd.Series(np.nan, index=X.index, dtype='object') + output = pd.Series(np.nan, index=X.index, dtype="object") for category, interval_categorizer in self.interval_categorizers_.items(): output.loc[ X[self._category_column] == category ] = interval_categorizer.transform( X.loc[X[self._category_column] == category] - )[ - interval_categorizer.get_output_column() - ] + )[interval_categorizer.get_output_column()] if self._default_categorizer is not None: - output.loc[~(X[self.category_column_].isin(self.interval_categorizers_.keys()))] = self._default_categorizer.transform( - X.loc[~(X[self.category_column_].isin(self.interval_categorizers_.keys()))] - )[ - self._default_categorizer.get_output_column() - ] + output.loc[ + ~(X[self.category_column_].isin(self.interval_categorizers_.keys())) + ] = self._default_categorizer.transform( + X.loc[ + ~(X[self.category_column_].isin(self.interval_categorizers_.keys())) + ] + )[self._default_categorizer.get_output_column()] if self._default_value is not None: output.fillna(self._default_value, inplace=True) @@ -293,4 +293,4 @@ def transform(self, X): output_column = self._output_column or self._category_column X.loc[:, output_column] = output - return X \ No newline at end of file + return X diff --git a/poetry.lock b/poetry.lock index 501f6bf..53b3bde 100644 --- a/poetry.lock +++ b/poetry.lock @@ -382,6 +382,32 @@ files = [ {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"}, ] +[[package]] +name = "ruff" +version = "0.2.0" +description = "An extremely fast 
Python linter and code formatter, written in Rust." +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.2.0-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:638ea3294f800d18bae84a492cb5a245c8d29c90d19a91d8e338937a4c27fca0"}, + {file = "ruff-0.2.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:3ff35433fcf4dff6d610738712152df6b7d92351a1bde8e00bd405b08b3d5759"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9faafbdcf4f53917019f2c230766da437d4fd5caecd12ddb68bb6a17d74399"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8153a3e4128ed770871c47545f1ae7b055023e0c222ff72a759f5a341ee06483"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8a75a98ae989a27090e9c51f763990ad5bbc92d20626d54e9701c7fe597f399"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:87057dd2fdde297130ff99553be8549ca38a2965871462a97394c22ed2dfc19d"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6d232f99d3ab00094ebaf88e0fb7a8ccacaa54cc7fa3b8993d9627a11e6aed7a"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d3c641f95f435fc6754b05591774a17df41648f0daf3de0d75ad3d9f099ab92"}, + {file = "ruff-0.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3826fb34c144ef1e171b323ed6ae9146ab76d109960addca730756dc19dc7b22"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:eceab7d85d09321b4de18b62d38710cf296cb49e98979960a59c6b9307c18cfe"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:30ad74687e1f4a9ff8e513b20b82ccadb6bd796fe5697f1e417189c5cde6be3e"}, + {file = "ruff-0.2.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a7e3818698f8460bd0f8d4322bbe99db8327e9bc2c93c789d3159f5b335f47da"}, + {file = 
"ruff-0.2.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:edf23041242c48b0d8295214783ef543847ef29e8226d9f69bf96592dba82a83"}, + {file = "ruff-0.2.0-py3-none-win32.whl", hash = "sha256:e155147199c2714ff52385b760fe242bb99ea64b240a9ffbd6a5918eb1268843"}, + {file = "ruff-0.2.0-py3-none-win_amd64.whl", hash = "sha256:ba918e01cdd21e81b07555564f40d307b0caafa9a7a65742e98ff244f5035c59"}, + {file = "ruff-0.2.0-py3-none-win_arm64.whl", hash = "sha256:3fbaff1ba9564a2c5943f8f38bc221f04bac687cc7485e45237579fee7ccda79"}, + {file = "ruff-0.2.0.tar.gz", hash = "sha256:63856b91837606c673537d2889989733d7dffde553828d3b0f0bacfa6def54be"}, +] + [[package]] name = "s3transfer" version = "0.10.0" @@ -643,5 +669,5 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" -python-versions = ">=3.8,<=3.11" -content-hash = "e2ce2862df77037a1a5d070d57edc294965cded6ed23cc282962c479ac389843" \ No newline at end of file +python-versions = ">=3.8,<3.12" +content-hash = "ab6c6b02595c16981d9be67804208434c6714605a88613920f34c88578bdcaff" diff --git a/pyproject.toml b/pyproject.toml index b9557dd..c572b17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ statsmodels = "*" pytest = "^7.4.4" boto3 = "^1.34.19" awswrangler = "^3.5.1" +ruff = "^0.2.0" [build-system] requires = ["poetry-core"] diff --git a/setup.py b/setup.py index ba65068..6f9226e 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ from setuptools import setup, find_packages from setuptools.config import read_configuration -if __name__ == '__main__': - config = read_configuration('pyproject.toml') - poetry_dependencies = config['tool']['poetry']['dependencies'] +if __name__ == "__main__": + config = read_configuration("pyproject.toml") + poetry_dependencies = config["tool"]["poetry"]["dependencies"] setup( packages=find_packages(), - install_requires = [str(dep) for dep in poetry_dependencies], + install_requires=[str(dep) for dep in poetry_dependencies], ) diff --git a/tests/conftest.py 
b/tests/conftest.py index cba5045..d07c5dc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,5 +2,5 @@ import os # Add the project root directory to the Python path. -project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) sys.path.insert(0, project_root) diff --git a/tests/test_estimator.py b/tests/test_estimator.py index cd4ef19..f8d7628 100644 --- a/tests/test_estimator.py +++ b/tests/test_estimator.py @@ -8,13 +8,13 @@ def test_glm_wrapper(): assert wrapper is not None # Check attributes - assert hasattr(wrapper, 'os_factor') - assert hasattr(wrapper, 'init_params') + assert hasattr(wrapper, "os_factor") + assert hasattr(wrapper, "init_params") # Check methods - assert hasattr(wrapper, 'fit') - assert hasattr(wrapper, 'predict') - assert hasattr(wrapper, 'get_params') + assert hasattr(wrapper, "fit") + assert hasattr(wrapper, "predict") + assert hasattr(wrapper, "get_params") def test_predict_proba_selector(): @@ -25,10 +25,10 @@ def test_predict_proba_selector(): assert selector is not None # Check attributes - assert hasattr(selector, 'model') - assert hasattr(selector, 'column') + assert hasattr(selector, "model") + assert hasattr(selector, "column") # Check methods - assert hasattr(selector, 'fit') - assert hasattr(selector, 'predict_proba') - assert hasattr(selector, 'get_params') + assert hasattr(selector, "fit") + assert hasattr(selector, "predict_proba") + assert hasattr(selector, "get_params") diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 202e03b..2fac9d3 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -4,28 +4,28 @@ def test_custom_discrete_categorizer(): categorizer = ds.transformer.CustomDiscreteCategorizer( - column='gender', + column="gender", categories=[ - ['M', 'm', 'Masculino', 'masculino'], - ['F', 'f', 'Feminino', 'feminino'], + ["M", "m", "Masculino", "masculino"], + ["F", "f", 
"Feminino", "feminino"], ], - labels=['M', 'F'], - default_value='M', + labels=["M", "F"], + default_value="M", ) X = pd.DataFrame( { - 'gender': [ - 'M', - 'm', - 'Masculino', - 'masculino', - 'F', - 'f', - 'Feminino', - 'feminino', - '', - 'non-sense', + "gender": [ + "M", + "m", + "Masculino", + "masculino", + "F", + "f", + "Feminino", + "feminino", + "", + "non-sense", None, 42, 42.42, @@ -35,33 +35,33 @@ def test_custom_discrete_categorizer(): X_transformed = categorizer.fit_transform(X) - assert X_transformed['gender'].tolist() == [ - 'M', - 'M', - 'M', - 'M', - 'F', - 'F', - 'F', - 'F', - 'M', - 'M', - 'M', - 'M', - 'M' + assert X_transformed["gender"].tolist() == [ + "M", + "M", + "M", + "M", + "F", + "F", + "F", + "F", + "M", + "M", + "M", + "M", + "M", ] def test_custom_discrete_categorizer_get_params(): categories = [ - ['M', 'm', 'Masculino', 'masculino'], - ['F', 'f', 'Feminino', 'feminino'], + ["M", "m", "Masculino", "masculino"], + ["F", "f", "Feminino", "feminino"], ] - labels = ['M', 'F'] - default_value = 'M' + labels = ["M", "F"] + default_value = "M" categorizer = ds.transformer.CustomDiscreteCategorizer( - column='gender', + column="gender", categories=categories, labels=labels, default_value=default_value, @@ -69,28 +69,28 @@ def test_custom_discrete_categorizer_get_params(): params = categorizer.get_params() - assert params['categories'] == categories - assert params['labels'] == labels - assert params['default_value'] == default_value + assert params["categories"] == categories + assert params["labels"] == labels + assert params["default_value"] == default_value def test_custom_interval_categorizer(): categorizer = ds.transformer.CustomIntervalCategorizer( - column='price', + column="price", intervals=[ (498, 2700), (2700, 3447.6), (3447.6, 5592), (5592, 13950), ], - labels=['fx1_apple', 'fx2_apple', 'fx3_apple', 'fx4_apple'], - default_value='fx_outras_marcas', - output_column='price_fx', + labels=["fx1_apple", "fx2_apple", "fx3_apple", 
"fx4_apple"], + default_value="fx_outras_marcas", + output_column="price_fx", ) X = pd.DataFrame( { - 'price': [ + "price": [ 498, 2699, 2700, @@ -107,32 +107,32 @@ def test_custom_interval_categorizer(): X = categorizer.fit_transform(X) - assert X['price_fx'].tolist() == [ - 'fx1_apple', - 'fx1_apple', - 'fx2_apple', - 'fx2_apple', - 'fx3_apple', - 'fx3_apple', - 'fx4_apple', - 'fx4_apple', - 'fx_outras_marcas', - 'fx_outras_marcas', + assert X["price_fx"].tolist() == [ + "fx1_apple", + "fx1_apple", + "fx2_apple", + "fx2_apple", + "fx3_apple", + "fx3_apple", + "fx4_apple", + "fx4_apple", + "fx_outras_marcas", + "fx_outras_marcas", ] def test_custom_interval_categorizer_get_params(): - column = 'price' + column = "price" intervals = [ (498, 2700), (2700, 3447.6), (3447.6, 5592), (5592, 13950), ] - labels = ['fx1_apple', 'fx2_apple', 'fx3_apple', 'fx4_apple'] - default_value = 'fx_outras_marcas' - output_column = 'price_fx' - + labels = ["fx1_apple", "fx2_apple", "fx3_apple", "fx4_apple"] + default_value = "fx_outras_marcas" + output_column = "price_fx" + categorizer = ds.transformer.CustomIntervalCategorizer( column=column, intervals=intervals, @@ -143,63 +143,63 @@ def test_custom_interval_categorizer_get_params(): params = categorizer.get_params() - assert params['column'] == column - assert params['intervals'] == intervals - assert params['labels'] == labels - assert params['default_value'] == default_value - assert params['output_column'] == output_column + assert params["column"] == column + assert params["intervals"] == intervals + assert params["labels"] == labels + assert params["default_value"] == default_value + assert params["output_column"] == output_column def test_custom_interval_categorizer_by_category(): categorizer = ds.transformer.CustomIntervalCategorizerByCategory( - category_column='brand', + category_column="brand", interval_categorizers={ - 'apple': ds.transformer.CustomIntervalCategorizer( - column='price', + "apple": 
ds.transformer.CustomIntervalCategorizer( + column="price", intervals=[ (498, 2700), (2700, 3447.6), (3447.6, 5592), (5592, 13950), ], - labels=['fx1_apple', 'fx2_apple', 'fx3_apple', 'fx4_apple'], + labels=["fx1_apple", "fx2_apple", "fx3_apple", "fx4_apple"], ), - 'samsung': ds.transformer.CustomIntervalCategorizer( - column='price', + "samsung": ds.transformer.CustomIntervalCategorizer( + column="price", intervals=[ (189, 1500), (1500, 11340), ], - labels=['fx1_samsung', 'fx2_samsung'], - ) + labels=["fx1_samsung", "fx2_samsung"], + ), }, default_categorizer=ds.transformer.CustomIntervalCategorizer( - column='price', - intervals=[(240, 5260)], - labels=['fx_outras_marcas'], + column="price", + intervals=[(240, 5260)], + labels=["fx_outras_marcas"], ), - output_column='price_fx', + output_column="price_fx", ) X = pd.DataFrame( { - 'brand': [ - 'apple', - 'apple', - 'apple', - 'apple', - 'apple', - 'apple', - 'apple', - 'apple', - 'samsung', - 'samsung', - 'samsung', - 'samsung', - 'outras_marcas', - 'outras_marcas', + "brand": [ + "apple", + "apple", + "apple", + "apple", + "apple", + "apple", + "apple", + "apple", + "samsung", + "samsung", + "samsung", + "samsung", + "outras_marcas", + "outras_marcas", ], - 'price': [ + "price": [ 498, 2699, 2700, @@ -220,19 +220,19 @@ def test_custom_interval_categorizer_by_category(): X = categorizer.fit_transform(X) - assert X['price_fx'].tolist() == [ - 'fx1_apple', - 'fx1_apple', - 'fx2_apple', - 'fx2_apple', - 'fx3_apple', - 'fx3_apple', - 'fx4_apple', - 'fx4_apple', - 'fx1_samsung', - 'fx1_samsung', - 'fx2_samsung', - 'fx2_samsung', - 'fx_outras_marcas', - 'fx_outras_marcas', + assert X["price_fx"].tolist() == [ + "fx1_apple", + "fx1_apple", + "fx2_apple", + "fx2_apple", + "fx3_apple", + "fx3_apple", + "fx4_apple", + "fx4_apple", + "fx1_samsung", + "fx1_samsung", + "fx2_samsung", + "fx2_samsung", + "fx_outras_marcas", + "fx_outras_marcas", ]