From ac98b23d2602e3b1502053db07e05290ef437412 Mon Sep 17 00:00:00 2001 From: Vadim Smirnov Date: Tue, 10 Dec 2024 13:33:55 +0000 Subject: [PATCH] Add tests from demos, add xgb in configs and sklearn ohe sparse parameter handling --- .../tabular_configs/conf_0_sel_type_0.yml | 5 + .../tabular_configs/conf_1_sel_type_1.yml | 5 + .../conf_2_select_mode_1_no_typ.yml | 6 + .../conf_3_sel_type_1_no_inter_lgbm.yml | 5 + .../conf_4_sel_type_0_no_int.yml | 5 + .../conf_5_sel_type_1_tuning_full.yml | 5 + ...f_6_sel_type_1_tuning_full_no_int_lgbm.yml | 5 + lightautoml/transformers/categorical.py | 9 +- tests/conftest.py | 13 ++ tests/integration/test_demo0.py | 166 ++++++++++++++++++ tests/integration/test_demo1.py | 117 ++++++++++++ tests/integration/test_demo10.py | 96 ++++++++++ tests/integration/test_demo11.py | 36 ++++ tests/integration/test_demo12.py | 46 +++++ tests/integration/test_demo13.py | 49 ++++++ tests/integration/test_demo15.py | 101 +++++++++++ tests/integration/test_demo2.py | 114 ++++++++++++ tests/integration/test_demo3.py | 120 +++++++++++++ tests/integration/test_demo4.py | 71 ++++++++ tests/integration/test_demo5.py | 136 ++++++++++++++ tests/integration/test_demo6.py | 56 ++++++ tests/integration/test_demo7.py | 40 +++++ tests/integration/test_demo8.py | 94 ++++++++++ .../test_tabularutilizedautoml.py | 27 +++ .../optuna/test_optuna_tuner.py | 2 +- 25 files changed, 1327 insertions(+), 2 deletions(-) create mode 100644 tests/integration/test_demo0.py create mode 100644 tests/integration/test_demo1.py create mode 100644 tests/integration/test_demo10.py create mode 100644 tests/integration/test_demo11.py create mode 100644 tests/integration/test_demo12.py create mode 100644 tests/integration/test_demo13.py create mode 100644 tests/integration/test_demo15.py create mode 100644 tests/integration/test_demo2.py create mode 100644 tests/integration/test_demo3.py create mode 100644 tests/integration/test_demo4.py create mode 100644 
tests/integration/test_demo5.py create mode 100644 tests/integration/test_demo6.py create mode 100644 tests/integration/test_demo7.py create mode 100644 tests/integration/test_demo8.py create mode 100644 tests/unit/test_automl/test_presets/test_tabularutilizedautoml.py diff --git a/lightautoml/automl/presets/tabular_configs/conf_0_sel_type_0.yml b/lightautoml/automl/presets/tabular_configs/conf_0_sel_type_0.yml index bb36fcf0..4b8d6bde 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_0_sel_type_0.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_0_sel_type_0.yml @@ -78,6 +78,11 @@ lgb_params: num_threads: 100 freeze_defaults: False +xgb_params: + default_params: + nthread: 100 + freeze_defaults: False + cb_params: default_params: task_type: 'CPU' diff --git a/lightautoml/automl/presets/tabular_configs/conf_1_sel_type_1.yml b/lightautoml/automl/presets/tabular_configs/conf_1_sel_type_1.yml index d0eede50..3da1f009 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_1_sel_type_1.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_1_sel_type_1.yml @@ -78,6 +78,11 @@ lgb_params: num_threads: 100 freeze_defaults: False +xgb_params: + default_params: + nthread: 100 + freeze_defaults: False + # params for BoostCB MLAlgo cb_params: default_params: diff --git a/lightautoml/automl/presets/tabular_configs/conf_2_select_mode_1_no_typ.yml b/lightautoml/automl/presets/tabular_configs/conf_2_select_mode_1_no_typ.yml index e9b02dd9..92ded29d 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_2_select_mode_1_no_typ.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_2_select_mode_1_no_typ.yml @@ -78,6 +78,12 @@ lgb_params: num_threads: 100 freeze_defaults: False + +xgb_params: + default_params: + nthread: 100 + freeze_defaults: False + # params for BoostCB MLAlgo cb_params: default_params: diff --git a/lightautoml/automl/presets/tabular_configs/conf_3_sel_type_1_no_inter_lgbm.yml 
b/lightautoml/automl/presets/tabular_configs/conf_3_sel_type_1_no_inter_lgbm.yml index 1d5827c0..5d1d1b12 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_3_sel_type_1_no_inter_lgbm.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_3_sel_type_1_no_inter_lgbm.yml @@ -78,6 +78,11 @@ lgb_params: num_threads: 100 freeze_defaults: False +xgb_params: + default_params: + nthread: 100 + freeze_defaults: False + # params for BoostCB MLAlgo cb_params: default_params: diff --git a/lightautoml/automl/presets/tabular_configs/conf_4_sel_type_0_no_int.yml b/lightautoml/automl/presets/tabular_configs/conf_4_sel_type_0_no_int.yml index bfd12818..4de6384d 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_4_sel_type_0_no_int.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_4_sel_type_0_no_int.yml @@ -78,6 +78,11 @@ lgb_params: num_threads: 100 freeze_defaults: False +xgb_params: + default_params: + nthread: 100 + freeze_defaults: False + # params for BoostCB MLAlgo cb_params: default_params: diff --git a/lightautoml/automl/presets/tabular_configs/conf_5_sel_type_1_tuning_full.yml b/lightautoml/automl/presets/tabular_configs/conf_5_sel_type_1_tuning_full.yml index c273da60..f8e2b110 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_5_sel_type_1_tuning_full.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_5_sel_type_1_tuning_full.yml @@ -78,6 +78,11 @@ lgb_params: num_threads: 100 freeze_defaults: False +xgb_params: + default_params: + nthread: 100 + freeze_defaults: False + # params for BoostCB MLAlgo cb_params: default_params: diff --git a/lightautoml/automl/presets/tabular_configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml b/lightautoml/automl/presets/tabular_configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml index 1d5827c0..5d1d1b12 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml +++ 
b/lightautoml/automl/presets/tabular_configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml @@ -78,6 +78,11 @@ lgb_params: num_threads: 100 freeze_defaults: False +xgb_params: + default_params: + nthread: 100 + freeze_defaults: False + # params for BoostCB MLAlgo cb_params: default_params: diff --git a/lightautoml/transformers/categorical.py b/lightautoml/transformers/categorical.py index 6d3a8b33..131339f6 100644 --- a/lightautoml/transformers/categorical.py +++ b/lightautoml/transformers/categorical.py @@ -15,6 +15,7 @@ from pandas import concat from sklearn.preprocessing import OneHotEncoder from sklearn.utils.murmurhash import murmurhash3_32 +from sklearn import __version__ as sklearn_version from ..dataset.base import LAMLDataset from ..dataset.np_pd_dataset import CSRSparseDataset @@ -266,13 +267,19 @@ def fit(self, dataset: NumpyOrPandas): fill_rate = self.total_feats_cnt / (self.total_feats_cnt - max_idx.shape[0] + max_idx.sum()) self.make_sparse = fill_rate < 0.2 + # sklearn 1.2 renamed "sparse" to "sparse_output"; compare numerically ("1.10" < "1.2" as strings) + if tuple(int(p) for p in sklearn_version.split(".")[:2]) >= (1, 2): + sparse_ohe = {"sparse_output": self.make_sparse} + else: + sparse_ohe = {"sparse": self.make_sparse} + # create ohe self.ohe = OneHotEncoder( categories=[np.arange(x, y + 1, dtype=np.int32) for (x, y) in zip(min_idx, max_idx)], # drop=np.ones(max_idx.shape[0], dtype=np.int32), dtype=self.dtype, - sparse=self.make_sparse, handle_unknown="ignore", + **sparse_ohe, ) self.ohe.fit(data) diff --git a/tests/conftest.py b/tests/conftest.py index b79dd9f3..100ec43d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -83,6 +83,19 @@ def sampled_app_roles(): } +@pytest.fixture() +def ai92_value_77_train_test(): + data = pd.read_csv( + "./examples/data/ai92_value_77.csv", + ) + + horizon = 30 + + train = data[:-horizon] + test = data[-horizon:] + return train, test, horizon + + @pytest.fixture() def binary_task(): return Task("binary") diff --git a/tests/integration/test_demo0.py b/tests/integration/test_demo0.py new file mode 
100644 index 00000000..1ec6bbe7 --- /dev/null +++ b/tests/integration/test_demo0.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python +# coding: utf-8 + +"""Building ML pipeline from blocks and fit + predict the pipeline itself.""" + +import os +import pickle +import time + +import numpy as np +import pandas as pd + +from lightautoml.dataset.np_pd_dataset import PandasDataset +from lightautoml.dataset.utils import roles_parser +from lightautoml.ml_algo.boost_lgbm import BoostLGBM +from lightautoml.ml_algo.tuning.optuna import OptunaTuner +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector +from lightautoml.pipelines.selection.importance_based import ( + ModelBasedImportanceEstimator, +) +from lightautoml.validation.np_iterators import FoldsIterator + +MAX_SELECTOR_FIT_TIME = 0.5 +MAX_PD_DATASET_CREATING_TIME = 0.2 +MAX_MLPIPELINE_FIT_PREDICT_TIME = 200 +MAX_PREDICT_TIME = 1 + +FILE_PATH = "examples/data/sampled_app_train.csv" + + +def test_simple_pipeline(sampled_app_roles, binary_task): + data = pd.read_csv( + FILE_PATH, + usecols=[ + "TARGET", + "NAME_CONTRACT_TYPE", + "AMT_CREDIT", + "NAME_TYPE_SUITE", + "AMT_GOODS_PRICE", + "DAYS_BIRTH", + "DAYS_EMPLOYED", + ], + ) + + assert isinstance(data, pd.DataFrame) + assert "TARGET" in data.columns and "AMT_GOODS_PRICE" in data.columns + + # Fix dates and convert to date type + data["BIRTH_DATE"] = np.datetime64("2018-01-01") + data["DAYS_BIRTH"].astype(np.dtype("timedelta64[D]")) + data["EMP_DATE"] = np.datetime64("2018-01-01") + np.clip(data["DAYS_EMPLOYED"], None, 0).astype( + np.dtype("timedelta64[D]") + ) + data.drop(["DAYS_BIRTH", "DAYS_EMPLOYED"], axis=1, inplace=True) + assert "DAYS_BIRTH" not in data.columns + + # Create folds + data["__fold__"] = np.random.randint(0, 5, len(data)) + + assert isinstance(data.head(), pd.DataFrame) + + # Set roles for columns + 
check_roles = sampled_app_roles + + # create Task + task = binary_task + + assert task.metric_name == "auc" + + # Creating PandasDataSet + + pd_dataset_timing_list = [] + for _ in range(30): + start_time = time.time() + pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task) + pd_dataset_timing_list.append(time.time() - start_time) + + assert np.mean(pd_dataset_timing_list) < MAX_PD_DATASET_CREATING_TIME, np.mean(pd_dataset_timing_list) + + roles_classes = [object, str, np.float32, np.float32, str, np.datetime64, np.datetime64, object] + assert all([roles_classes[i] == pd_dataset.roles[role].dtype for i, role in enumerate(pd_dataset.roles)]) + + # Feature selection part + model = BoostLGBM() + + assert not model.is_fitted and model._name == "LightGBM" + + pipe = LGBSimpleFeatures() + + model0 = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 64, + "seed": 0, + "num_threads": 5, + } + ) + + selector_timing_list = [] + for _ in range(30): + selector_iterator = FoldsIterator(pd_dataset, 1) + + mbie = ModelBasedImportanceEstimator() + + selector = ImportanceCutoffSelector(pipe, model0, mbie, cutoff=10) + + start_time = time.time() + selector.fit(selector_iterator) + + selector_timing_list.append(time.time() - start_time) + + assert np.mean(selector_timing_list) < MAX_SELECTOR_FIT_TIME, np.mean(selector_timing_list) + + assert isinstance(selector.get_features_score(), pd.Series) + + # Build AutoML pipeline + pipe = LGBSimpleFeatures() + + params_tuner1 = OptunaTuner(n_trials=10, timeout=300) + model1 = BoostLGBM(default_params={"learning_rate": 0.05, "num_leaves": 128}) + + params_tuner2 = OptunaTuner(n_trials=20, timeout=300) + model2 = BoostLGBM(default_params={"learning_rate": 0.025, "num_leaves": 64}) + + total = MLPipeline( + [(model1, params_tuner1), (model2, params_tuner2)], + pre_selection=selector, + features_pipeline=pipe, + post_selection=None, + ) + + assert total._ml_algos[0]._name == "Mod_0_LightGBM" + assert 
total._ml_algos[1]._name == "Mod_1_LightGBM" + + train_valid = FoldsIterator(pd_dataset) + + # Fit predict using pipeline + start_time = time.time() + pred = total.fit_predict(train_valid) + + assert time.time() - start_time < MAX_MLPIPELINE_FIT_PREDICT_TIME + + # Check preds + assert pred.shape == (10000, 2) + + start_time = time.time() + train_pred = total.predict(pd_dataset) + assert time.time() - start_time < MAX_PREDICT_TIME + + # Pickle automl + with open("automl.pickle", "wb") as f: + pickle.dump(total, f) + + with open("automl.pickle", "rb") as f: + total = pickle.load(f) + + train_pred = total.predict(pd_dataset) + os.remove("automl.pickle") + + assert train_pred.shape == (10000, 2) + + assert isinstance(model1.get_features_score(), pd.Series) + assert isinstance(model2.get_features_score(), pd.Series) + + assert ((0 <= train_pred.data[:, 1]) & (train_pred.data[:, 1] <= 1)).all() diff --git a/tests/integration/test_demo1.py b/tests/integration/test_demo1.py new file mode 100644 index 00000000..927426ca --- /dev/null +++ b/tests/integration/test_demo1.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# coding: utf-8 + +import os +import pickle + +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.base import AutoML +from lightautoml.ml_algo.boost_lgbm import BoostLGBM +from lightautoml.ml_algo.tuning.optuna import OptunaTuner +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector +from lightautoml.pipelines.selection.importance_based import ( + ModelBasedImportanceEstimator, +) +from lightautoml.reader.base import PandasToPandasReader + + +def test_cutoff_selector_in_pipeline(sampled_app_train_test, binary_task): + + train_data, test_data = sampled_app_train_test + + task = binary_task + + reader = PandasToPandasReader(task, cv=5, random_state=1) + + # selector parts + model0 = 
BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 64, + "seed": 42, + "num_threads": 5, + } + ) + pipe0 = LGBSimpleFeatures() + mbie = ModelBasedImportanceEstimator() + selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=10) + + # pipeline 1 level parts + pipe = LGBSimpleFeatures() + + params_tuner1 = OptunaTuner(n_trials=100, timeout=300) + model1 = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 128, + "seed": 1, + "num_threads": 5, + } + ) + model2 = BoostLGBM( + default_params={ + "learning_rate": 0.025, + "num_leaves": 64, + "seed": 2, + "num_threads": 5, + } + ) + + pipeline_lvl1 = MLPipeline( + [(model1, params_tuner1), model2], + pre_selection=selector, + features_pipeline=pipe, + post_selection=None, + ) + + # pipeline 2 level parts + pipe1 = LGBSimpleFeatures() + + model = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 64, + "max_bin": 1024, + "seed": 3, + "num_threads": 5, + }, + freeze_defaults=True, + ) + + pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None) + + automl = AutoML( + reader, + [ + [pipeline_lvl1], + [pipeline_lvl2], + ], + skip_conn=False, + debug=True, + ) + + automl.fit_predict(train_data, roles={"target": "TARGET"}, verbose=5) + + # just checking if methods can be called + selector.get_features_score() + automl.levels[-1][0].ml_algos[0].get_features_score() + automl.levels[0][0].ml_algos[0].get_features_score() + automl.levels[0][0].ml_algos[1].get_features_score() + + test_pred = automl.predict(test_data) + + with open("automl.pickle", "wb") as f: + pickle.dump(automl, f) + + with open("automl.pickle", "rb") as f: + automl = pickle.load(f) + + test_pred = automl.predict(test_data) + test_score = roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0]) + + assert test_score > 0.65 + + os.remove("automl.pickle") diff --git a/tests/integration/test_demo10.py b/tests/integration/test_demo10.py new file 
mode 100644 index 00000000..a68af490 --- /dev/null +++ b/tests/integration/test_demo10.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# coding: utf-8 + +import numpy as np + +from sklearn.metrics import log_loss + +from lightautoml.automl.base import AutoML +from lightautoml.automl.blend import WeightedBlender +from lightautoml.ml_algo.boost_cb import BoostCB +from lightautoml.ml_algo.linear_sklearn import LinearLBFGS +from lightautoml.ml_algo.tuning.optuna import OptunaTuner +from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.features.linear_pipeline import LinearFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.pipelines.selection.importance_based import ( + ImportanceCutoffSelector, + ModelBasedImportanceEstimator, +) +from lightautoml.reader.base import PandasToPandasReader +from lightautoml.utils.timer import PipelineTimer + +# demo of timer, blender and multiclass + +np.random.seed(42) + + +def test_some_pipeline(sampled_app_train_test, multiclass_task): + + train, test = sampled_app_train_test + + timer = PipelineTimer(600, mode=2) + + timer_gbm = timer.get_task_timer("gbm") + feat_sel_0 = LGBSimpleFeatures() + mod_sel_0 = BoostCB(timer=timer_gbm) + imp_sel_0 = ModelBasedImportanceEstimator() + selector_0 = ImportanceCutoffSelector( + feat_sel_0, + mod_sel_0, + imp_sel_0, + cutoff=0, + ) + + feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, feats_imp=imp_sel_0) + timer_gbm_0 = timer.get_task_timer("gbm") + timer_gbm_1 = timer.get_task_timer("gbm") + + gbm_0 = BoostCB(timer=timer_gbm_0, default_params={"devices": "0"}) + gbm_1 = BoostCB(timer=timer_gbm_1, default_params={"devices": "0"}) + + tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True) + gbm_lvl0 = MLPipeline( + [(gbm_0, tuner_0), gbm_1], + pre_selection=selector_0, + features_pipeline=feats_gbm_0, + post_selection=None, + ) + + 
feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe="auto") + + timer_reg = timer.get_task_timer("reg") + reg_0 = LinearLBFGS(timer=timer_reg) + + reg_lvl0 = MLPipeline([reg_0], pre_selection=None, features_pipeline=feats_reg_0, post_selection=None) + + reader = PandasToPandasReader( + multiclass_task, + samples=None, + max_nan_rate=1, + max_constant_rate=1, + advanced_roles=True, + drop_score_co=-1, + n_jobs=1, + ) + blender = WeightedBlender() + + automl = AutoML( + reader=reader, + levels=[[gbm_lvl0, reg_lvl0]], + timer=timer, + blender=blender, + skip_conn=False, + ) + + oof_pred = automl.fit_predict(train, roles={"target": "TARGET"}) + test_pred = automl.predict(test) + + not_nan = np.any(~np.isnan(oof_pred.data), axis=1) + + oof_score = log_loss(train["TARGET"].values[not_nan], oof_pred.data[not_nan]) + assert oof_score < 1 + + test_score = log_loss(test["TARGET"].values, test_pred.data) + assert test_score < 1 diff --git a/tests/integration/test_demo11.py b/tests/integration/test_demo11.py new file mode 100644 index 00000000..4c17a808 --- /dev/null +++ b/tests/integration/test_demo11.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# coding: utf-8 + +import shutil + +import numpy as np + +from sklearn.metrics import mean_squared_error + +from lightautoml.automl.presets.text_presets import TabularNLPAutoML + + +np.random.seed(42) + + +def test_tabularnlp(avito1k_train_test, avito1k_roles, regression_task): + train, test = avito1k_train_test + + roles = avito1k_roles + + task = regression_task + + automl = TabularNLPAutoML(task=task, timeout=600) + oof_pred = automl.fit_predict(train, roles=roles) + test_pred = automl.predict(test) + not_nan = np.any(~np.isnan(oof_pred.data), axis=1) + target = roles["target"] + + oof_score = mean_squared_error(train[target].values[not_nan], oof_pred.data[not_nan][:, 0]) + + assert oof_score < 0.2 + + test_score = mean_squared_error(test[target].values, test_pred.data[:, 0]) + assert test_score < 0.2 + + 
shutil.rmtree("./models", ignore_errors=True) diff --git a/tests/integration/test_demo12.py b/tests/integration/test_demo12.py new file mode 100644 index 00000000..215c44dc --- /dev/null +++ b/tests/integration/test_demo12.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# coding: utf-8 + +import numpy as np + +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.presets.tabular_presets import TabularAutoML +from lightautoml.validation.np_iterators import TimeSeriesIterator + + +np.random.seed(42) + + +def test_tabular_with_dates(sampled_app_train_test, binary_task): + + train, test = sampled_app_train_test + + # create time series iterator that is passed as cv_func + cv_iter = TimeSeriesIterator(train["EMP_DATE"].astype("datetime64[ns]"), n_splits=5, sorted_kfold=False) + + # train dataset may be passed as dict of np.ndarray + train = { + "data": train[["AMT_CREDIT", "AMT_ANNUITY"]].values, + "target": train["TARGET"].values, + } + + task = binary_task + + automl = TabularAutoML( + task=task, + timeout=200, + ) + oof_pred = automl.fit_predict(train, train_features=["AMT_CREDIT", "AMT_ANNUITY"], cv_iter=cv_iter) + # prediction can be made on file by + test.to_csv("temp_test_data.csv", index=False) + test_pred = automl.predict("temp_test_data.csv", batch_size=100, n_jobs=4) + + oof_prediction = oof_pred.data[:, 0] + not_empty = np.logical_not(np.isnan(oof_prediction)) + + oof_score = roc_auc_score(train["target"][not_empty], oof_prediction[not_empty]) + assert oof_score > 0.52 + + test_score = roc_auc_score(test["TARGET"].values, test_pred.data[:, 0]) + assert test_score > 0.51 diff --git a/tests/integration/test_demo13.py b/tests/integration/test_demo13.py new file mode 100644 index 00000000..f5ac2d0a --- /dev/null +++ b/tests/integration/test_demo13.py @@ -0,0 +1,49 @@ +import numpy as np + +from sklearn.metrics import mean_absolute_error + +from lightautoml.addons.autots.base import AutoTS +from lightautoml.tasks import Task + + +np.random.seed(42) + + 
+def test_autots(ai92_value_77_train_test): + + train, test, horizon = ai92_value_77_train_test + roles = {"target": "value", "datetime": "date"} + + seq_params = { + "seq0": { + "case": "next_values", + "params": {"n_target": horizon, "history": np.maximum(7, horizon), "step": 1, "test_last": True}, + }, + } + + # True (then set default values) / False; int, list or np.array + # default: lag_features=30, diff_features=7 + transformers_params = { + "lag_features": [0, 1, 2, 3, 5, 10], + "lag_time_features": [0, 1, 2], + "diff_features": [0, 1, 3, 4], + } + + task = Task("multi:reg", greater_is_better=False, metric="mae", loss="mae") + + reader_params = { + "seq_params": seq_params, + "transformers_params": transformers_params, + } + automl = AutoTS( + task, + reader_params=reader_params, + time_series_trend_params={ + "trend": False, + }, + ) + automl.fit_predict(train, roles, verbose=4) + forecast, _ = automl.predict(train) + + test_score = mean_absolute_error(test[roles["target"]].values, forecast) + assert test_score < 2e5 diff --git a/tests/integration/test_demo15.py b/tests/integration/test_demo15.py new file mode 100644 index 00000000..a1d2db8b --- /dev/null +++ b/tests/integration/test_demo15.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# coding: utf-8 + +import numpy as np + +from lightautoml.automl.base import AutoML +from lightautoml.automl.presets.tabular_presets import TabularAutoML +from lightautoml.dataset.roles import CategoryRole +from lightautoml.dataset.roles import NumericRole +from lightautoml.ml_algo.boost_lgbm import BoostLGBM +from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector +from lightautoml.pipelines.selection.importance_based import ( + ModelBasedImportanceEstimator, +) +from lightautoml.reader.base 
import PandasToPandasReader +from lightautoml.tasks import Task + + +################################ +# Features: +# - group_by transformer +################################ + +N_FOLDS = 3 # number of folds for cross-validation inside AutoML +RANDOM_STATE = 42 # fixed random state for various reasons +N_THREADS = 4 # threads cnt for lgbm and linear models +TIMEOUT = 100 +USED_COLS = ["SK_ID_CURR", "TARGET", "NAME_CONTRACT_TYPE", "CODE_GENDER", "AMT_INCOME_TOTAL", "DAYS_BIRTH"] +TARGET = "TARGET" + + +def test_groupby_features(sampled_app_train_test, binary_task): + + train, _ = sampled_app_train_test + + # Using TabularAutoML preset + task = binary_task + roles = { + "target": TARGET, + CategoryRole(dtype=str): ["NAME_CONTRACT_TYPE", "CODE_GENDER"], + NumericRole(np.float32): ["AMT_INCOME_TOTAL"], + } + + # specify groupby triplets: [("group_col", "feature", "transform_type"),] + groupby_triplets = [ + ("CODE_GENDER", "AMT_INCOME_TOTAL", "max"), + ("NAME_CONTRACT_TYPE", "CODE_GENDER", "mode"), + ("NAME_CONTRACT_TYPE", "AMT_INCOME_TOTAL", "delta_mean"), + ] + + automl = TabularAutoML( + task=task, + timeout=TIMEOUT, + cpu_limit=N_THREADS, + reader_params={"n_jobs": N_THREADS, "cv": N_FOLDS, "random_state": RANDOM_STATE}, + general_params={"use_algos": [["lgb"]]}, + gbm_pipeline_params={"use_groupby": True, "groupby_triplets": groupby_triplets}, + ) + automl.fit_predict(train, roles=roles) + + automl.levels[0][0].ml_algos[0].get_features_score() + + task = Task("binary") + reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE) + model0 = BoostLGBM(default_params={"learning_rate": 0.1, "num_leaves": 64, "seed": 42, "num_threads": N_THREADS}) + pie = ModelBasedImportanceEstimator() + selector = ImportanceCutoffSelector(LGBSimpleFeatures(), model0, pie, cutoff=-9999) + + pipe = LGBAdvancedPipeline( + use_groupby=True, + pre_selector=selector, + groupby_types=["delta_median", "std"], + groupby_top_based_on="importance", + ) + + model = BoostLGBM( + 
default_params={ + "learning_rate": 0.05, + "num_leaves": 128, + "seed": 1, + "num_threads": N_THREADS, + } + ) + + pipeline = MLPipeline([model], pre_selection=selector, features_pipeline=pipe, post_selection=None) + + automl = AutoML( + reader, + [[pipeline]], + skip_conn=False, + ) + + automl.fit_predict( + train, + roles={"target": TARGET}, + ) + + assert len(pipe.output_features) > 0 diff --git a/tests/integration/test_demo2.py b/tests/integration/test_demo2.py new file mode 100644 index 00000000..79e184e9 --- /dev/null +++ b/tests/integration/test_demo2.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# coding: utf-8 + +import os +import pickle + + +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.base import AutoML +from lightautoml.ml_algo.boost_lgbm import BoostLGBM +from lightautoml.ml_algo.tuning.optuna import OptunaTuner +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.pipelines.selection.permutation_importance_based import ( + NpIterativeFeatureSelector, +) +from lightautoml.pipelines.selection.permutation_importance_based import ( + NpPermutationImportanceEstimator, +) +from lightautoml.reader.base import PandasToPandasReader + + +def test_permutation_importance_based_iterative_selector(sampled_app_train_test, binary_task): + + train_data, test_data = sampled_app_train_test + + task = binary_task + + reader = PandasToPandasReader(task, cv=5, random_state=1) + + # selector parts + model0 = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 64, + "seed": 42, + "num_threads": 5, + } + ) + pipe0 = LGBSimpleFeatures() + pie = NpPermutationImportanceEstimator() + selector = NpIterativeFeatureSelector(pipe0, model0, pie, feature_group_size=1, max_features_cnt_in_result=15) + + # pipeline 1 level parts + pipe = LGBSimpleFeatures() + + model1 = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 128, + "seed": 
1, + "num_threads": 5, + } + ) + + params_tuner2 = OptunaTuner(n_trials=100, timeout=100) + model2 = BoostLGBM( + default_params={ + "learning_rate": 0.025, + "num_leaves": 64, + "seed": 2, + "num_threads": 5, + } + ) + + pipeline_lvl1 = MLPipeline( + [model1, (model2, params_tuner2)], + pre_selection=selector, + features_pipeline=pipe, + post_selection=None, + ) + + # pipeline 2 level parts + pipe1 = LGBSimpleFeatures() + + model = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 64, + "max_bin": 1024, + "seed": 3, + "num_threads": 5, + }, + freeze_defaults=True, + ) + + pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None) + + automl = AutoML( + reader, + [ + [pipeline_lvl1], + [pipeline_lvl2], + ], + skip_conn=False, + debug=True, + ) + + automl.fit_predict(train_data, roles={"target": "TARGET"}, verbose=0) + + test_pred = automl.predict(test_data) + test_score = roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0]) + assert test_score > 0.55 + + with open("automl.pickle", "wb") as f: + pickle.dump(automl, f) + + with open("automl.pickle", "rb") as f: + automl = pickle.load(f) + + test_pred = automl.predict(test_data) + + os.remove("automl.pickle") diff --git a/tests/integration/test_demo3.py b/tests/integration/test_demo3.py new file mode 100644 index 00000000..9f56cca1 --- /dev/null +++ b/tests/integration/test_demo3.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# coding: utf-8 + +import numpy as np + +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.base import AutoML +from lightautoml.ml_algo.boost_lgbm import BoostLGBM +from lightautoml.ml_algo.tuning.optuna import OptunaTuner +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.pipelines.selection.base import ComposedSelector +from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector +from 
lightautoml.pipelines.selection.importance_based import ( + ModelBasedImportanceEstimator, +) +from lightautoml.pipelines.selection.permutation_importance_based import ( + NpIterativeFeatureSelector, +) +from lightautoml.pipelines.selection.permutation_importance_based import ( + NpPermutationImportanceEstimator, +) +from lightautoml.reader.base import PandasToPandasReader + + +def test_pipeline_with_selectors(sampled_app_train_test, binary_task): + np.random.seed(42) + + train_data, test_data = sampled_app_train_test + task = binary_task + + reader = PandasToPandasReader(task, cv=5, random_state=1) + + # selector parts + model01 = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 64, + "seed": 42, + "num_threads": 5, + } + ) + + model02 = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 64, + "seed": 42, + "num_threads": 5, + } + ) + pipe0 = LGBSimpleFeatures() + pie = NpPermutationImportanceEstimator() + pie1 = ModelBasedImportanceEstimator() + sel1 = ImportanceCutoffSelector(pipe0, model01, pie1, cutoff=0) + sel2 = NpIterativeFeatureSelector(pipe0, model02, pie, feature_group_size=1, max_features_cnt_in_result=15) + selector = ComposedSelector([sel1, sel2]) + + # pipeline 1 level parts + pipe = LGBSimpleFeatures() + + params_tuner1 = OptunaTuner(n_trials=100, timeout=100) + model1 = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 128, + "seed": 1, + "num_threads": 5, + } + ) + + model2 = BoostLGBM( + default_params={ + "learning_rate": 0.025, + "num_leaves": 64, + "seed": 2, + "num_threads": 5, + } + ) + + pipeline_lvl1 = MLPipeline( + [(model1, params_tuner1), model2], + pre_selection=selector, + features_pipeline=pipe, + post_selection=None, + ) + + # pipeline 2 level parts + pipe1 = LGBSimpleFeatures() + + model = BoostLGBM( + default_params={ + "learning_rate": 0.05, + "num_leaves": 64, + "max_bin": 1024, + "seed": 3, + "num_threads": 5, + } + ) + pipeline_lvl2 = MLPipeline([model], 
pre_selection=None, features_pipeline=pipe1, post_selection=None) + + automl = AutoML( + reader, + [ + [pipeline_lvl1], + [pipeline_lvl2], + ], + skip_conn=False, + debug=True, + ) + + oof_pred = automl.fit_predict(train_data, roles={"target": "TARGET"}, verbose=5) + + test_pred = automl.predict(test_data) + oof_score = roc_auc_score(train_data["TARGET"].values, oof_pred.data[:, 0]) + test_score = roc_auc_score(test_data["TARGET"].values, test_pred.data[:, 0]) + + assert oof_score > 0.57 + assert test_score > 0.55 diff --git a/tests/integration/test_demo4.py b/tests/integration/test_demo4.py new file mode 100644 index 00000000..ef3ee1bb --- /dev/null +++ b/tests/integration/test_demo4.py @@ -0,0 +1,71 @@ +import numpy as np + +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.base import AutoML +from lightautoml.ml_algo.boost_lgbm import BoostLGBM +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.reader.base import PandasToPandasReader +from lightautoml.tasks import Task + +np.random.seed(42) + + +def test_different_task_params(sampled_app_train_test): + + train_data, test_data = sampled_app_train_test + + for task_params, target in zip( + [ + {"name": "binary"}, + {"name": "binary", "metric": roc_auc_score}, + {"name": "reg", "loss": "mse", "metric": "r2"}, + {"name": "reg", "loss": "rmsle", "metric": "rmsle"}, + { + "name": "reg", + "loss": "quantile", + "loss_params": {"q": 0.9}, + "metric": "quantile", + "metric_params": {"q": 0.9}, + }, + ], + ["TARGET", "TARGET", "AMT_CREDIT", "AMT_CREDIT", "AMT_CREDIT"], + ): + + task = Task(**task_params) + + reader = PandasToPandasReader(task, cv=5, random_state=1) + + # pipeline 1 level parts + pipe = LGBSimpleFeatures() + + model2 = BoostLGBM( + default_params={ + "learning_rate": 0.025, + "num_leaves": 64, + "seed": 2, + "num_threads": 5, + } + ) + + pipeline_lvl1 = MLPipeline( + [model2], + 
pre_selection=None, # selector, + features_pipeline=pipe, + post_selection=None, + ) + + automl = AutoML( + reader, + [ + [pipeline_lvl1], + ], + skip_conn=False, + # debug=True, + ) + + oof_pred = automl.fit_predict(train_data, roles={"target": target}, verbose=1) + # assert for last oof score + assert task.metric_func(train_data[target].values, oof_pred.data[:, 0]) < 10 ** 5 + assert task.metric_func(test_data[target].values, automl.predict(test_data).data[:, 0]) < 10 ** 5 diff --git a/tests/integration/test_demo5.py b/tests/integration/test_demo5.py new file mode 100644 index 00000000..9b103d5b --- /dev/null +++ b/tests/integration/test_demo5.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# coding: utf-8 + +"""2 level stacking using AutoML class with different algos on first level including LGBM, Linear and LinearL1.""" + +import os +import pickle + +import numpy as np + +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.base import AutoML +from lightautoml.automl.blend import MeanBlender +from lightautoml.dataset.roles import DatetimeRole +from lightautoml.ml_algo.boost_lgbm import BoostLGBM +from lightautoml.ml_algo.linear_sklearn import LinearL1CD +from lightautoml.ml_algo.linear_sklearn import LinearLBFGS +from lightautoml.ml_algo.tuning.optuna import OptunaTuner +from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.features.linear_pipeline import LinearFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector +from lightautoml.pipelines.selection.importance_based import ( + ModelBasedImportanceEstimator, +) +from lightautoml.pipelines.selection.linear_selector import HighCorrRemoval +from lightautoml.pipelines.selection.permutation_importance_based import ( + NpIterativeFeatureSelector, +) +from 
lightautoml.pipelines.selection.permutation_importance_based import ( + NpPermutationImportanceEstimator, +) +from lightautoml.reader.base import PandasToPandasReader + + +np.random.seed(42) + + +def test_blending(sampled_app_train_test, binary_task): + + train, test = sampled_app_train_test + + feat_sel_0 = LGBSimpleFeatures() + mod_sel_0 = BoostLGBM() + imp_sel_0 = ModelBasedImportanceEstimator() + selector_0 = ImportanceCutoffSelector(feat_sel_0, mod_sel_0, imp_sel_0, cutoff=0) + + feats_gbm_0 = LGBAdvancedPipeline() + gbm_0 = BoostLGBM() + gbm_1 = BoostLGBM() + tuner_0 = OptunaTuner(n_trials=100, timeout=30, fit_on_holdout=True) + gbm_lvl0 = MLPipeline( + [(gbm_0, tuner_0), gbm_1], + pre_selection=selector_0, + features_pipeline=feats_gbm_0, + post_selection=None, + ) + + feats_reg_0 = LinearFeatures(output_categories=True) + reg_0 = LinearLBFGS() + reg_lvl0 = MLPipeline( + [reg_0], + pre_selection=None, + features_pipeline=feats_reg_0, + post_selection=HighCorrRemoval(corr_co=1), + ) + + feat_sel_1 = LGBSimpleFeatures() + mod_sel_1 = BoostLGBM() + imp_sel_1 = NpPermutationImportanceEstimator() + selector_1 = NpIterativeFeatureSelector(feat_sel_1, mod_sel_1, imp_sel_1, feature_group_size=1) + + feats_reg_1 = LinearFeatures(output_categories=False) + reg_1 = LinearL1CD() + reg_l1_lvl0 = MLPipeline( + [reg_1], + pre_selection=selector_1, + features_pipeline=feats_reg_1, + post_selection=HighCorrRemoval(), + ) + + feats_reg_2 = LinearFeatures(output_categories=True) + reg_2 = LinearLBFGS() + reg_lvl1 = MLPipeline( + [reg_2], + pre_selection=None, + features_pipeline=feats_reg_2, + post_selection=HighCorrRemoval(corr_co=1), + ) + + reader = PandasToPandasReader( + binary_task, + samples=None, + max_nan_rate=1, + max_constant_rate=1, + ) + + automl = AutoML( + reader, + [ + [gbm_lvl0, reg_lvl0, reg_l1_lvl0], + [reg_lvl1], + ], + skip_conn=False, + blender=MeanBlender(), + debug=True, + ) + + roles = { + "target": "TARGET", + DatetimeRole(base_date=True, 
seasonality=(), base_feats=False): "report_dt", + } + + oof_pred = automl.fit_predict(train, roles=roles, verbose=2) + + test_pred = automl.predict(test) + + not_nan = np.any(~np.isnan(oof_pred.data), axis=1) + + oof_score = roc_auc_score(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0]) + assert oof_score > 0.7 + + test_score = roc_auc_score(test[roles["target"]].values, test_pred.data[:, 0]) + assert test_score > 0.7 + + with open("automl.pickle", "wb") as f: + pickle.dump(automl, f) + + with open("automl.pickle", "rb") as f: + automl = pickle.load(f) + + test_pred = automl.predict(test) + + os.remove("automl.pickle") diff --git a/tests/integration/test_demo6.py b/tests/integration/test_demo6.py new file mode 100644 index 00000000..5409c9ac --- /dev/null +++ b/tests/integration/test_demo6.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# coding: utf-8 + + +"""AutoML with nested CV usage.""" + +import numpy as np + +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.presets.tabular_presets import TabularAutoML +from lightautoml.dataset.roles import DatetimeRole + + +np.random.seed(42) + + +def test_tabularautoml_2lvl(sampled_app_train_test, binary_task): + + train, test = sampled_app_train_test + + roles = { + "target": "TARGET", + DatetimeRole(base_date=True, seasonality=(), base_feats=False): "report_dt", + } + + task = binary_task + + automl = TabularAutoML( + task=task, + timeout=600, + general_params={ + "use_algos": [ + [ + "linear_l2", + "lgb", + ], + ["linear_l2", "lgb"], + ], + "nested_cv": True, + "skip_conn": True, + }, + nested_cv_params={"cv": 5, "n_folds": None}, + debug=True, + ) + + oof_pred = automl.fit_predict(train, roles=roles, verbose=5) + test_pred = automl.predict(test) + + not_nan = np.any(~np.isnan(oof_pred.data), axis=1) + + oof_score = roc_auc_score(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0]) + assert oof_score > 0.75 + + test_score = roc_auc_score(test[roles["target"]].values, 
test_pred.data[:, 0]) + assert test_score > 0.7 diff --git a/tests/integration/test_demo7.py b/tests/integration/test_demo7.py new file mode 100644 index 00000000..d2cd874f --- /dev/null +++ b/tests/integration/test_demo7.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# coding: utf-8 + +import numpy as np + +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.presets.tabular_presets import TabularAutoML +from lightautoml.dataset.roles import DatetimeRole + + +np.random.seed(42) + + +def test_classic_tabularautoml(sampled_app_train_test, binary_task): + + train, test = sampled_app_train_test + + roles = { + "target": "TARGET", + DatetimeRole(base_date=True, seasonality=(), base_feats=False): "report_dt", + } + + task = binary_task + + automl = TabularAutoML( + task=task, + timeout=3600, + debug=True, + ) + oof_pred = automl.fit_predict(train, roles=roles, verbose=5) + test_pred = automl.predict(test) + + not_nan = np.any(~np.isnan(oof_pred.data), axis=1) + + oof_score = roc_auc_score(train[roles["target"]].values[not_nan], oof_pred.data[not_nan][:, 0]) + assert oof_score > 0.7 + + test_score = roc_auc_score(test[roles["target"]].values, test_pred.data[:, 0]) + assert test_score > 0.7 diff --git a/tests/integration/test_demo8.py b/tests/integration/test_demo8.py new file mode 100644 index 00000000..7f361014 --- /dev/null +++ b/tests/integration/test_demo8.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# coding: utf-8 + +import numpy as np + +from sklearn.metrics import log_loss + +from lightautoml.automl.base import AutoML +from lightautoml.automl.blend import WeightedBlender +from lightautoml.ml_algo.boost_lgbm import BoostLGBM +from lightautoml.ml_algo.linear_sklearn import LinearLBFGS +from lightautoml.ml_algo.tuning.optuna import OptunaTuner +from lightautoml.pipelines.features.lgb_pipeline import LGBAdvancedPipeline +from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures +from lightautoml.pipelines.features.linear_pipeline import 
LinearFeatures +from lightautoml.pipelines.ml.base import MLPipeline +from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector +from lightautoml.pipelines.selection.importance_based import ( + ModelBasedImportanceEstimator, +) +from lightautoml.reader.base import PandasToPandasReader +from lightautoml.utils.timer import PipelineTimer + + +def test_lgbm_linear_pipeline(sampled_app_train_test, multiclass_task): + + # demo of timer, blender and multiclass + np.random.seed(42) + train, test = sampled_app_train_test + timer = PipelineTimer(600, mode=2) + + timer_gbm = timer.get_task_timer("gbm") + feat_sel_0 = LGBSimpleFeatures() + mod_sel_0 = BoostLGBM(timer=timer_gbm) + imp_sel_0 = ModelBasedImportanceEstimator() + selector_0 = ImportanceCutoffSelector( + feat_sel_0, + mod_sel_0, + imp_sel_0, + cutoff=0, + ) + + feats_gbm_0 = LGBAdvancedPipeline(top_intersections=4, output_categories=True, feats_imp=imp_sel_0) + timer_gbm_0 = timer.get_task_timer("gbm") + timer_gbm_1 = timer.get_task_timer("gbm") + + gbm_0 = BoostLGBM(timer=timer_gbm_0) + gbm_1 = BoostLGBM(timer=timer_gbm_1) + + tuner_0 = OptunaTuner(n_trials=10, timeout=10, fit_on_holdout=True) + gbm_lvl0 = MLPipeline( + [(gbm_0, tuner_0), gbm_1], + pre_selection=selector_0, + features_pipeline=feats_gbm_0, + post_selection=None, + ) + + feats_reg_0 = LinearFeatures(output_categories=True, sparse_ohe="auto") + + timer_reg = timer.get_task_timer("reg") + reg_0 = LinearLBFGS(timer=timer_reg) + + reg_lvl0 = MLPipeline([reg_0], pre_selection=None, features_pipeline=feats_reg_0, post_selection=None) + + reader = PandasToPandasReader( + multiclass_task, + samples=None, + max_nan_rate=1, + max_constant_rate=1, + advanced_roles=True, + drop_score_co=-1, + n_jobs=1, + ) + + blender = WeightedBlender() + + automl = AutoML( + reader=reader, + levels=[[gbm_lvl0, reg_lvl0]], + timer=timer, + blender=blender, + debug=True, + skip_conn=False, + ) + oof_pred = automl.fit_predict(train, roles={"target": 
"TARGET"}, verbose=5) + test_pred = automl.predict(test) + + not_nan = np.any(~np.isnan(oof_pred.data), axis=1) + + oof_score = log_loss(train["TARGET"].values[not_nan], oof_pred.data[not_nan, :]) + assert oof_score < 1 + + test_score = log_loss(test["TARGET"].values, test_pred.data) + assert test_score < 1 diff --git a/tests/unit/test_automl/test_presets/test_tabularutilizedautoml.py b/tests/unit/test_automl/test_presets/test_tabularutilizedautoml.py new file mode 100644 index 00000000..28516dc4 --- /dev/null +++ b/tests/unit/test_automl/test_presets/test_tabularutilizedautoml.py @@ -0,0 +1,27 @@ +from sklearn.metrics import roc_auc_score + +from lightautoml.automl.presets.tabular_presets import TabularAutoML +from tests.unit.test_automl.test_presets.presets_utils import check_pickling +from tests.unit.test_automl.test_presets.presets_utils import get_target_name + + +class TabularUtilizedAutoML: + def test_fit_predict(self, sampled_app_train_test, sampled_app_roles, binary_task): + # load and prepare data + train, test = sampled_app_train_test + + # run automl + automl = TabularAutoML(task=binary_task) + oof_predictions = automl.fit_predict(train, roles=sampled_app_roles, verbose=10) + ho_predictions = automl.predict(test) + + # calculate scores + target_name = get_target_name(sampled_app_roles) + oof_score = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0]) + ho_score = roc_auc_score(test[target_name].values, ho_predictions.data[:, 0]) + + # checks + assert oof_score > 0.73 + assert ho_score > 0.72 + + check_pickling(automl, ho_score, binary_task, test, target_name) diff --git a/tests/unit/test_ml_algo/test_optimization/optuna/test_optuna_tuner.py b/tests/unit/test_ml_algo/test_optimization/optuna/test_optuna_tuner.py index 95a6f5cf..398ca5fd 100644 --- a/tests/unit/test_ml_algo/test_optimization/optuna/test_optuna_tuner.py +++ b/tests/unit/test_ml_algo/test_optimization/optuna/test_optuna_tuner.py @@ -82,7 +82,7 @@ def 
test_invalid_distributions(): params_tuner = OptunaTuner(n_trials=10, timeout=300) - with pytest.raises(ValueError): + with pytest.raises(Exception): params_tuner.fit( ml_algo=model, train_valid_iterator=iterator_mock,