Commit
add clearml exps
acc-to-learn committed Dec 17, 2024
1 parent 7c7e3ce commit 6b15688
Showing 4 changed files with 348 additions and 0 deletions.
105 changes: 105 additions & 0 deletions scripts/experiments/run.py
@@ -0,0 +1,105 @@
"""Run tabular automl using ClearML logging."""
import argparse
import os
import pandas as pd
import clearml
import numpy as np


def main( # noqa D103
task_name: str,
dataset_name: str,
queue: str,
image: str,
project: str,
cpu_limit: int,
min_num_obs: int,
memory_limit: int,
tags: list,
dataset_project: str = None,
dataset_partial_name: str = None,
n_datasets: int = -1,
save_model: bool = False,
):
if dataset_name is not None:
dataset_list = [dataset_name]
else:
dataset_list = pd.DataFrame(
clearml.Dataset.list_datasets(
dataset_project=dataset_project,
partial_name=dataset_partial_name,
tags=tags,
ids=None,
only_completed=True,
recursive_project_search=True,
include_archived=False,
)
)
dataset_list = (
dataset_list.sort_values("version", ascending=False).drop_duplicates(subset=["name"]).to_dict("records")
)

    if min_num_obs is not None:
        # Keep only datasets with at least min_num_obs observations
        # (filter into a new list instead of popping items while iterating).
        filtered_dataset_list = []
        for dataset in dataset_list:
            metadata = clearml.Dataset.get(dataset_id=None, dataset_name=dataset["name"]).get_metadata()
            if metadata["num_obs"].iloc[0] >= min_num_obs:
                filtered_dataset_list.append(dataset)
        dataset_list = filtered_dataset_list

    if len(dataset_list) == 0:
        raise ValueError("No datasets were found matching the passed parameters.")

    np.random.shuffle(dataset_list)
    if n_datasets > 0:  # n_datasets <= 0 (the default -1) means "run all found datasets"
        dataset_list = dataset_list[:n_datasets]

print(f"Running {len(dataset_list)} datasets:")

for dataset in dataset_list:
if isinstance(dataset, str):
dataset_name = dataset
tags = [""]
else:
dataset_name = dataset["name"]
tags = dataset["tags"]

curr_task_name = f"{task_name}@{dataset_name}" if task_name is not None else f"{dataset_name}"

tags.append(queue)
tags = f"--tags {' '.join(tags)}" if len(tags) else ""

        # Enqueue run_tabular.py as a remote ClearML task, pinning the docker image and resource limits.
        os.system(
            f'clearml-task --project {project} --name {curr_task_name} --script scripts/experiments/run_tabular.py --queue {queue} {tags} --docker {image} --docker_args "--cpus={cpu_limit} --memory={memory_limit}g" --args dataset={dataset_name} save_model={save_model}'
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Launch tabular AutoML experiments as ClearML tasks.")
parser.add_argument("--name", type=str, help="name for task", default=None)
parser.add_argument("--dataset", type=str, help="dataset name or id", default=None)
parser.add_argument("--dataset_project", type=str, help="dataset_project", default="Datasets_with_metadata")
parser.add_argument("--dataset_partial_name", type=str, help="dataset_partial_name", default=None)
parser.add_argument("--tags", nargs="+", default=[], help="tags")
parser.add_argument("--cpu_limit", type=int, help="cpu limit in n threads", default=8)
parser.add_argument("--memory_limit", type=int, help="mem limit in GBs", default=16)
parser.add_argument("--queue", type=str, help="clearml workers queue", default="cpu_queue")
parser.add_argument("--project", type=str, help="clearml project", default="junk")
parser.add_argument("--image", type=str, help="docker image", default="for_clearml:latest")
parser.add_argument("--n_datasets", type=int, help="number of datasets", default=-1)
parser.add_argument("--min_num_obs", type=int, help="min number of samples", default=None)
parser.add_argument("--save_model", action="store_true")
args = parser.parse_args()

main(
task_name=args.name,
dataset_name=args.dataset,
cpu_limit=args.cpu_limit,
memory_limit=args.memory_limit,
dataset_partial_name=args.dataset_partial_name,
dataset_project=args.dataset_project,
tags=args.tags,
queue=args.queue,
project=args.project,
image=args.image,
n_datasets=args.n_datasets,
min_num_obs=args.min_num_obs,
save_model=args.save_model,
)
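
A minimal launch sketch for this script (the values below are just the argparse defaults plus an
illustrative tag, not a prescribed configuration):

    python scripts/experiments/run.py --dataset_project Datasets_with_metadata --tags binary --queue cpu_queue --project junk --image for_clearml:latest --n_datasets 5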
171 changes: 171 additions & 0 deletions scripts/experiments/run_tabular.py
@@ -0,0 +1,171 @@
"""Run tabular automl using ClearML logging."""

from utils import Timer
from utils import install_lightautoml


install_lightautoml()

import argparse
import os

import clearml
import numpy as np
import pandas as pd

from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

RANDOM_STATE = 1234


def map_to_corect_order_of_classes(values, targets_order): # noqa D103
target_mapping = {n: x for (x, n) in enumerate(targets_order)}
mapped = list(map(target_mapping.get, values))

return mapped
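
# Example with hypothetical values: if automl.targets_order == ["cat", "dog", "fish"], then
# map_to_corect_order_of_classes(["dog", "cat", "dog"], ["cat", "dog", "fish"]) returns [1, 0, 1],
# i.e. each label is replaced by its positional index in targets_order before log_loss is computed.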


def main(dataset_name: str, cpu_limit: int, memory_limit: int, save_model: bool): # noqa D103
cml_task = clearml.Task.get_task(clearml.config.get_remote_task_id())
logger = cml_task.get_logger()

dataset = clearml.Dataset.get(dataset_id=None, dataset_name=dataset_name)
dataset_local_path = dataset.get_local_copy()

    with open(os.path.join(dataset_local_path, "task_type.txt"), "r") as f:
        task_type = f.readline().strip()  # strip a possible trailing newline
train = pd.read_csv(os.path.join(dataset_local_path, "train.csv"))
test = pd.read_csv(os.path.join(dataset_local_path, "test.csv"))

if task_type == "multilabel":
target_name = [x for x in test.columns if x.startswith("target")]
else:
target_name = test.columns[-1]

if task_type in ["binary", "multiclass", "multilabel"]:
        assert np.all(
            train[target_name].nunique() == test[target_name].nunique()
        ), "train and test have different numbers of unique target values."

is_train_unique_ok = train[target_name].nunique() > 1
is_test_unique_ok = test[target_name].nunique() > 1

        if isinstance(is_train_unique_ok, bool):
            assert is_train_unique_ok, "Only one class present in train target."
        else:
            assert is_train_unique_ok.all(), "Only one class present in train target."

        if isinstance(is_test_unique_ok, bool):
            assert is_test_unique_ok, "Only one class present in test target."
        else:
            assert is_test_unique_ok.all(), "Only one class present in test target."

        assert not train[target_name].isnull().values.any(), "train has nans in target."
        assert not test[target_name].isnull().values.any(), "test has nans in target."

task = Task(task_type)

# =================================== automl config:
automl = TabularAutoML(
debug=True,
task=task,
cpu_limit=cpu_limit,
memory_limit=memory_limit,
timeout=15 * 60,
# general_params={
# "use_algos": [["mlp"]]
# }, # ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint', 'fttransformer'] or custom torch model
# nn_params={"n_epochs": 10, "bs": 512, "num_workers": 0, "path_to_save": None, "freeze_defaults": True},
# nn_pipeline_params={"use_qnt": True, "use_te": False},
reader_params={
# # 'n_jobs': N_THREADS,
# "cv": 5,
"random_state": RANDOM_STATE,
},
)
# ===================================

    # Log the AutoML configuration to the ClearML task.
    cml_task.connect(automl)

kwargs = {}
if save_model:
kwargs["path_to_save"] = "model"

with Timer() as timer_training:
oof_predictions = automl.fit_predict(train, roles={"target": target_name}, verbose=10, **kwargs)

    # Upload the locally saved model file as a task artifact (it only exists when --save_model is set).
    if save_model:
        cml_task.upload_artifact(name="model.joblib", artifact_object="model.joblib")

with Timer() as timer_predict:
test_predictions = automl.predict(test)

if task_type == "binary":
print(f"OOF: {oof_predictions.data[:, 0].unique()}")
metric_oof = roc_auc_score(train[target_name].values, oof_predictions.data[:, 0])
metric_ho = roc_auc_score(test[target_name].values, test_predictions.data[:, 0])

elif task_type == "multiclass":
not_nan = np.any(~np.isnan(oof_predictions.data), axis=1)
        try:
            metric_oof = log_loss(train[target_name].values[not_nan], oof_predictions.data[not_nan, :])
            metric_ho = log_loss(test[target_name], test_predictions.data)
        except Exception:
            if np.unique(train[target_name].values[not_nan]).shape != np.unique(oof_predictions.data[not_nan, :]).shape:
                raise ValueError(
                    "Vectors have different numbers of classes: "
                    f"{np.unique(train[target_name].values[not_nan])} and {np.unique(oof_predictions.data[not_nan, :])}"
                )
            # Some datasets have a float-typed target, so it must be remapped before computing
            # log_loss (when the try block above fails). The mapping must follow the class order
            # used by the model, so we take automl.targets_order and map the values accordingly.
            y_true = map_to_corect_order_of_classes(
                values=train[target_name].values[not_nan], targets_order=automl.targets_order
            )
            metric_oof = log_loss(y_true, oof_predictions.data[not_nan, :])

            y_true = map_to_corect_order_of_classes(values=test[target_name], targets_order=automl.targets_order)
            metric_ho = log_loss(y_true, test_predictions.data)

elif task_type == "reg":
metric_oof = task.metric_func(train[target_name].values, oof_predictions.data[:, 0])
metric_ho = task.metric_func(test[target_name].values, test_predictions.data[:, 0])

elif task_type == "multilabel":
metric_oof = task.metric_func(train[target_name].values, oof_predictions.data)
metric_ho = task.metric_func(test[target_name].values, test_predictions.data)
else:
raise ValueError("Bad task type.")

print(f"Score for out-of-fold predictions: {metric_oof}")
print(f"Score for hold-out: {metric_ho}")
print(f"Train duration: {timer_training.duration}")
print(f"Predict duration: {timer_predict.duration}")

logger.report_single_value("Metric OOF", metric_oof)
logger.report_single_value("Metric HO", metric_ho)

logger.report_single_value("Train duration", timer_training.duration)
logger.report_single_value("Predict duration", timer_predict.duration)

logger.flush()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run a single tabular AutoML experiment with ClearML logging.")
    parser.add_argument("--dataset", type=str, help="dataset name or id", default="sampled_app_train")
    parser.add_argument("--cpu_limit", type=int, help="cpu limit in n threads", default=8)
    parser.add_argument("--memory_limit", type=int, help="mem limit in GBs", default=16)
parser.add_argument("--save_model", action="store_true")
args = parser.parse_args()

main(
dataset_name=args.dataset, cpu_limit=args.cpu_limit, memory_limit=args.memory_limit, save_model=args.save_model
)
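
run_tabular.py expects the ClearML dataset to contain train.csv, test.csv and a task_type.txt
holding one of "binary", "multiclass", "multilabel" or "reg". A rough registration sketch follows
(the dataset name, local folder, and the exact set_metadata payload are illustrative assumptions,
not something this commit prescribes):

    import clearml
    import pandas as pd

    # Hypothetical local folder containing train.csv, test.csv and task_type.txt.
    ds = clearml.Dataset.create(dataset_name="my_dataset", dataset_project="Datasets_with_metadata")
    ds.add_files("path/to/local_folder")
    # num_obs is what run.py reads when --min_num_obs is passed.
    ds.set_metadata(pd.DataFrame({"num_obs": [10000]}))
    ds.upload()
    ds.finalize()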
60 changes: 60 additions & 0 deletions scripts/experiments/utils.py
@@ -0,0 +1,60 @@
"""Utils for running experiments."""

import os
import time


class Timer: # noqa: D101
@staticmethod
def _zero():
return 0

def __init__(self, clock=time.time, enabled=True):
self.start = 0
self.stop = 0
self._time = clock if enabled else Timer._zero
self._tick = 0

def __enter__(self):
self.start = self._tick = self._time()
return self

def __exit__(self, *args):
self.stop = self._tick = self._time()

@property
def tick(self):
"""Make one tick."""
if self.stop > 0:
return -1
now = self._time()
tick = now - self._tick
self._tick = now
return tick

@property
def duration(self):
"""Get dureation in seconds."""
if self.stop > 0:
return self.stop - self.start
return self._time() - self.start


def install_lightautoml():
"""Install lightautoml using pip."""
# os.system("curl -sSL https://install.python-poetry.org | ../../bin/python -vvv -")
# os.system("/root/.local/bin/poetry build")
# os.system("ls -la ./dist/")
os.system("pip install packaging==22.0")
os.system("python scripts/poetry_fix.py -f")
os.system("../../bin/pip install .") # ./dist/*.whl


# .pip install --upgrade pip
# poetry config virtualenvs.create false --local
# poetry run python ./scripts/poetry_fix.py -c
# ls -la
# poetry run pip install pillow==9.2.0
# poetry install
# poetry run pip freeze
# poetry run python -c "import sys; print(sys.path)"
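
A standalone usage sketch for Timer (no ClearML needed; the sleep is just a stand-in for real work):

    import time
    from utils import Timer

    with Timer() as timer:
        time.sleep(0.1)        # some work
        print(timer.tick)      # seconds since the previous tick (or since the start)
    print(timer.duration)      # total seconds between __enter__ and __exit__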
12 changes: 12 additions & 0 deletions tox.ini
@@ -84,3 +84,15 @@ deps =
commands =
poetry run python scripts/poetry_fix.py -f
poetry build

# example:
# tox -e exp -- --dataset_project=Datasets_with_metadata --tags=binary openml
# tox -e exp -- --dataset_project=Datasets_with_metadata --dataset=CIFAR_10_openml --queue=gpu_queue
# tox -e exp -- --dataset_project=Datasets_with_metadata --tags=multiclass --queue=gpu_queue --n_datasets=5 --name=mlp --min_num_obs=100000
# Note: the args [--tags=binary openml] mean tag is binary OR tag is openml
[testenv:exp]
requires = python == 3.8
deps =
clearml
commands =
python scripts/experiments/run.py {posargs}
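# For reference, {posargs} are forwarded verbatim to the launcher, so the second example above
# is equivalent to (assuming clearml is installed in the active environment):
#   python scripts/experiments/run.py --dataset_project=Datasets_with_metadata --dataset=CIFAR_10_openml --queue=gpu_queue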
