Commit 6b15688 (1 parent: 7c7e3ce)
Showing 4 changed files with 348 additions and 0 deletions.
@@ -0,0 +1,105 @@
"""Run tabular automl using ClearML logging.""" | ||
import argparse | ||
import os | ||
import pandas as pd | ||
import clearml | ||
import numpy as np | ||
|
||
|
||
def main( # noqa D103 | ||
task_name: str, | ||
dataset_name: str, | ||
queue: str, | ||
image: str, | ||
project: str, | ||
cpu_limit: int, | ||
min_num_obs: int, | ||
memory_limit: int, | ||
tags: list, | ||
dataset_project: str = None, | ||
dataset_partial_name: str = None, | ||
n_datasets: int = -1, | ||
save_model: bool = False, | ||
): | ||
if dataset_name is not None: | ||
dataset_list = [dataset_name] | ||
else: | ||
dataset_list = pd.DataFrame( | ||
clearml.Dataset.list_datasets( | ||
dataset_project=dataset_project, | ||
partial_name=dataset_partial_name, | ||
tags=tags, | ||
ids=None, | ||
only_completed=True, | ||
recursive_project_search=True, | ||
include_archived=False, | ||
) | ||
) | ||
dataset_list = ( | ||
dataset_list.sort_values("version", ascending=False).drop_duplicates(subset=["name"]).to_dict("records") | ||
) | ||
|
||
if min_num_obs is not None: | ||
for indx, dataset in enumerate(dataset_list): | ||
metadata = clearml.Dataset.get(dataset_id=None, dataset_name=dataset["name"]).get_metadata() | ||
if metadata["num_obs"].iloc[0] < min_num_obs: | ||
dataset_list.pop(indx) | ||
|
||
if len(dataset_list) <= 0: | ||
raise ValueError("No one dataset was found with passed parameters.") | ||
|
||
np.random.shuffle(dataset_list) | ||
dataset_list = dataset_list[:n_datasets] | ||
|
||
print(f"Running {len(dataset_list)} datasets:") | ||
|
||
for dataset in dataset_list: | ||
if isinstance(dataset, str): | ||
dataset_name = dataset | ||
tags = [""] | ||
else: | ||
dataset_name = dataset["name"] | ||
tags = dataset["tags"] | ||
|
||
curr_task_name = f"{task_name}@{dataset_name}" if task_name is not None else f"{dataset_name}" | ||
|
||
tags.append(queue) | ||
tags = f"--tags {' '.join(tags)}" if len(tags) else "" | ||
|
||
os.system( | ||
f'clearml-task --project {project} --name {curr_task_name} --script scripts/experiments/run_tabular.py --queue {queue} {tags} --docker {image} --docker_args "--cpus={cpu_limit} --memory={memory_limit}g" --args dataset={dataset_name} save_model={save_model}' | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description="") | ||
parser.add_argument("--name", type=str, help="name for task", default=None) | ||
parser.add_argument("--dataset", type=str, help="dataset name or id", default=None) | ||
parser.add_argument("--dataset_project", type=str, help="dataset_project", default="Datasets_with_metadata") | ||
parser.add_argument("--dataset_partial_name", type=str, help="dataset_partial_name", default=None) | ||
parser.add_argument("--tags", nargs="+", default=[], help="tags") | ||
parser.add_argument("--cpu_limit", type=int, help="cpu limit in n threads", default=8) | ||
parser.add_argument("--memory_limit", type=int, help="mem limit in GBs", default=16) | ||
parser.add_argument("--queue", type=str, help="clearml workers queue", default="cpu_queue") | ||
parser.add_argument("--project", type=str, help="clearml project", default="junk") | ||
parser.add_argument("--image", type=str, help="docker image", default="for_clearml:latest") | ||
parser.add_argument("--n_datasets", type=int, help="number of datasets", default=-1) | ||
parser.add_argument("--min_num_obs", type=int, help="min number of samples", default=None) | ||
parser.add_argument("--save_model", action="store_true") | ||
args = parser.parse_args() | ||
|
||
main( | ||
task_name=args.name, | ||
dataset_name=args.dataset, | ||
cpu_limit=args.cpu_limit, | ||
memory_limit=args.memory_limit, | ||
dataset_partial_name=args.dataset_partial_name, | ||
dataset_project=args.dataset_project, | ||
tags=args.tags, | ||
queue=args.queue, | ||
project=args.project, | ||
image=args.image, | ||
n_datasets=args.n_datasets, | ||
min_num_obs=args.min_num_obs, | ||
save_model=args.save_model, | ||
) |
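For reference, here is a minimal sketch, not part of the commit, of the kind of `clearml-task` command the launcher above assembles for each dataset. The helper name `build_clearml_task_command` and the example argument values are illustrative assumptions; only the flag layout mirrors the f-string in the loop above.

# Hypothetical helper mirroring the command string built in the launcher above.
def build_clearml_task_command(
    project, task_name, queue, image, cpu_limit, memory_limit, dataset, tags=(), save_model=False
):
    tags_arg = f"--tags {' '.join(tags)}" if tags else ""
    return (
        f"clearml-task --project {project} --name {task_name} "
        f"--script scripts/experiments/run_tabular.py --queue {queue} {tags_arg} "
        f"--docker {image} "
        f'--docker_args "--cpus={cpu_limit} --memory={memory_limit}g" '
        f"--args dataset={dataset} save_model={save_model}"
    )


# Example values only (they follow the launcher's argparse defaults).
print(
    build_clearml_task_command(
        project="junk",
        task_name="baseline@sampled_app_train",
        queue="cpu_queue",
        image="for_clearml:latest",
        cpu_limit=8,
        memory_limit=16,
        dataset="sampled_app_train",
        tags=["cpu_queue"],
    )
)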
@@ -0,0 +1,171 @@
"""Run tabular automl using ClearML logging.""" | ||
|
||
from utils import Timer | ||
from utils import install_lightautoml | ||
|
||
|
||
install_lightautoml() | ||
|
||
import argparse | ||
import os | ||
|
||
import clearml | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from sklearn.metrics import log_loss | ||
from sklearn.metrics import roc_auc_score | ||
|
||
from lightautoml.automl.presets.tabular_presets import TabularAutoML | ||
from lightautoml.tasks import Task | ||
|
||
RANDOM_STATE = 1234 | ||
|
||
|
||
def map_to_corect_order_of_classes(values, targets_order): # noqa D103 | ||
target_mapping = {n: x for (x, n) in enumerate(targets_order)} | ||
mapped = list(map(target_mapping.get, values)) | ||
|
||
return mapped | ||
|
||
|
||
def main(dataset_name: str, cpu_limit: int, memory_limit: int, save_model: bool): # noqa D103 | ||
cml_task = clearml.Task.get_task(clearml.config.get_remote_task_id()) | ||
logger = cml_task.get_logger() | ||
|
||
dataset = clearml.Dataset.get(dataset_id=None, dataset_name=dataset_name) | ||
dataset_local_path = dataset.get_local_copy() | ||
|
||
with open(os.path.join(dataset_local_path, "task_type.txt"), "r") as f: | ||
task_type = f.readline() | ||
train = pd.read_csv(os.path.join(dataset_local_path, "train.csv")) | ||
test = pd.read_csv(os.path.join(dataset_local_path, "test.csv")) | ||
|
||
if task_type == "multilabel": | ||
target_name = [x for x in test.columns if x.startswith("target")] | ||
else: | ||
target_name = test.columns[-1] | ||
|
||
if task_type in ["binary", "multiclass", "multilabel"]: | ||
assert ( | ||
train[target_name].nunique() == test[target_name].nunique() | ||
), "train and test has different unique values." | ||
|
||
is_train_unique_ok = train[target_name].nunique() > 1 | ||
is_test_unique_ok = test[target_name].nunique() > 1 | ||
|
||
if isinstance(is_train_unique_ok, bool): | ||
assert is_train_unique_ok, "Only one class present in train target." | ||
else: | ||
(is_train_unique_ok).all(), "Only one class present in train target." | ||
|
||
if isinstance(is_test_unique_ok, bool): | ||
assert is_test_unique_ok, "Only one class present in test target." | ||
else: | ||
(is_test_unique_ok).all(), "Only one class present in test target." | ||
|
||
assert train[target_name].isnull().values.any() is np.False_, "train has nans in target." | ||
assert test[target_name].isnull().values.any() is np.False_, "test has nans in target." | ||
|
||
task = Task(task_type) | ||
|
||
# =================================== automl config: | ||
automl = TabularAutoML( | ||
debug=True, | ||
task=task, | ||
cpu_limit=cpu_limit, | ||
memory_limit=memory_limit, | ||
timeout=15 * 60, | ||
# general_params={ | ||
# "use_algos": [["mlp"]] | ||
# }, # ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint', 'fttransformer'] or custom torch model | ||
# nn_params={"n_epochs": 10, "bs": 512, "num_workers": 0, "path_to_save": None, "freeze_defaults": True}, | ||
# nn_pipeline_params={"use_qnt": True, "use_te": False}, | ||
reader_params={ | ||
# # 'n_jobs': N_THREADS, | ||
# "cv": 5, | ||
"random_state": RANDOM_STATE, | ||
}, | ||
) | ||
# =================================== | ||
|
||
cml_task.connect(automl) | ||
|
||
kwargs = {} | ||
if save_model: | ||
kwargs["path_to_save"] = "model" | ||
|
||
with Timer() as timer_training: | ||
oof_predictions = automl.fit_predict(train, roles={"target": target_name}, verbose=10, **kwargs) | ||
|
||
# add and upload local file artifact | ||
cml_task.upload_artifact( | ||
name="model.joblib", | ||
artifact_object=os.path.join( | ||
"model.joblib", | ||
), | ||
) | ||
|
||
with Timer() as timer_predict: | ||
test_predictions = automl.predict(test) | ||
|
||
if task_type == "binary": | ||
print(f"OOF: {oof_predictions.data[:, 0].unique()}") | ||
metric_oof = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0]) | ||
metric_ho = roc_auc_score(test[target_name].values, test_predictions.data[:, 0]) | ||
|
||
elif task_type == "multiclass": | ||
not_nan = np.any(~np.isnan(oof_predictions.data), axis=1) | ||
try: | ||
metric_oof = log_loss(train[target_name].values[not_nan], oof_predictions.data[not_nan, :]) | ||
metric_ho = log_loss(test[target_name], test_predictions.data) | ||
except: | ||
if np.unique(train[target_name].values[not_nan]).shape != np.unique(oof_predictions.data[not_nan, :]).shape: | ||
raise ValueError(f"Vectors have different number of classes: {np.unique(train[target_name].values[not_nan])} and {np.unique(oof_predictions.data[not_nan, :])}") | ||
# Some datasets can have dtype=float of target, | ||
# so we must map this target for correct log_loss calculating (if we didn't calсulate it in the try block) | ||
# and this mapping must be in the correct order so we extract automl.targets_order and map values | ||
y_true = map_to_corect_order_of_classes( | ||
values=train[target_name].values[not_nan], targets_order=automl.targets_order | ||
) | ||
metric_oof = log_loss(y_true, oof_predictions.data[not_nan, :]) | ||
|
||
y_true = map_to_corect_order_of_classes(values=test[target_name], targets_order=automl.targets_order) | ||
|
||
metric_ho = log_loss(y_true, test_predictions.data) | ||
|
||
elif task_type == "reg": | ||
metric_oof = task.metric_func(train[target_name].values, oof_predictions.data[:, 0]) | ||
metric_ho = task.metric_func(test[target_name].values, test_predictions.data[:, 0]) | ||
|
||
elif task_type == "multilabel": | ||
metric_oof = task.metric_func(train[target_name].values, oof_predictions.data) | ||
metric_ho = task.metric_func(test[target_name].values, test_predictions.data) | ||
else: | ||
raise ValueError("Bad task type.") | ||
|
||
print(f"Score for out-of-fold predictions: {metric_oof}") | ||
print(f"Score for hold-out: {metric_ho}") | ||
print(f"Train duration: {timer_training.duration}") | ||
print(f"Predict duration: {timer_predict.duration}") | ||
|
||
logger.report_single_value("Metric OOF", metric_oof) | ||
logger.report_single_value("Metric HO", metric_ho) | ||
|
||
logger.report_single_value("Train duration", timer_training.duration) | ||
logger.report_single_value("Predict duration", timer_predict.duration) | ||
|
||
logger.flush() | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description="") | ||
parser.add_argument("--dataset", type=str, help="dataset name or id", default="sampled_app_train") | ||
parser.add_argument("--cpu_limit", type=int, help="", default=8) | ||
parser.add_argument("--memory_limit", type=int, help="", default=16) | ||
parser.add_argument("--save_model", action="store_true") | ||
args = parser.parse_args() | ||
|
||
main( | ||
dataset_name=args.dataset, cpu_limit=args.cpu_limit, memory_limit=args.memory_limit, save_model=args.save_model | ||
) |
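As an aside, a small self-contained sketch of the label-to-column mapping that `map_to_corect_order_of_classes` performs before `log_loss` is computed in the multiclass fallback above. The `targets_order`, labels, and probabilities below are made-up example values, not data from the script.

# Standalone illustration of mapping raw (possibly float) class labels onto the
# column order of the prediction matrix; all values here are made-up examples.
import numpy as np
from sklearn.metrics import log_loss

targets_order = [2.0, 0.0, 1.0]  # assumed column order of the OOF/holdout prediction matrix
y_raw = np.array([0.0, 2.0, 1.0, 2.0])  # raw float labels as stored in the dataset

# Map each raw label to the index of its column in the prediction matrix.
target_mapping = {label: idx for idx, label in enumerate(targets_order)}
y_mapped = [target_mapping[v] for v in y_raw]  # -> [1, 0, 2, 0]

proba = np.array(
    [
        [0.1, 0.8, 0.1],
        [0.7, 0.2, 0.1],
        [0.2, 0.2, 0.6],
        [0.6, 0.3, 0.1],
    ]
)
print(log_loss(y_mapped, proba, labels=[0, 1, 2]))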
@@ -0,0 +1,60 @@
"""Utils for running experiments.""" | ||
|
||
import os | ||
import time | ||
|
||
|
||
class Timer: # noqa: D101 | ||
@staticmethod | ||
def _zero(): | ||
return 0 | ||
|
||
def __init__(self, clock=time.time, enabled=True): | ||
self.start = 0 | ||
self.stop = 0 | ||
self._time = clock if enabled else Timer._zero | ||
self._tick = 0 | ||
|
||
def __enter__(self): | ||
self.start = self._tick = self._time() | ||
return self | ||
|
||
def __exit__(self, *args): | ||
self.stop = self._tick = self._time() | ||
|
||
@property | ||
def tick(self): | ||
"""Make one tick.""" | ||
if self.stop > 0: | ||
return -1 | ||
now = self._time() | ||
tick = now - self._tick | ||
self._tick = now | ||
return tick | ||
|
||
@property | ||
def duration(self): | ||
"""Get dureation in seconds.""" | ||
if self.stop > 0: | ||
return self.stop - self.start | ||
return self._time() - self.start | ||
|
||
|
||
def install_lightautoml(): | ||
"""Install lightautoml using pip.""" | ||
# os.system("curl -sSL https://install.python-poetry.org | ../../bin/python -vvv -") | ||
# os.system("/root/.local/bin/poetry build") | ||
# os.system("ls -la ./dist/") | ||
os.system("pip install packaging==22.0") | ||
os.system("python scripts/poetry_fix.py -f") | ||
os.system("../../bin/pip install .") # ./dist/*.whl | ||
|
||
|
||
# .pip install --upgrade pip | ||
# poetry config virtualenvs.create false --local | ||
# poetry run python ./scripts/poetry_fix.py -c | ||
# ls -la | ||
# poetry run pip install pillow==9.2.0 | ||
# poetry install | ||
# poetry run pip freeze | ||
# poetry run python -c "import sys; print(sys.path)" |
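A short usage sketch, not from the commit, for the `Timer` context manager defined above: `duration` keeps increasing while the block runs and is frozen once `__exit__` records the stop time.

import time

from utils import Timer  # assumes this file is importable as ``utils``, as run_tabular.py does

with Timer() as timer:
    time.sleep(0.2)
    print(f"elapsed so far: {timer.duration:.2f} s")  # still counting
    print(f"since last tick: {timer.tick:.2f} s")     # time since __enter__ or the previous tick

print(f"total: {timer.duration:.2f} s")  # fixed after the block exits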