From 8a115dab08383424ca5eb928384b2fe35408f3e4 Mon Sep 17 00:00:00 2001 From: Taekyung Heo Date: Tue, 14 Jan 2025 17:35:41 -0500 Subject: [PATCH 1/2] Fix a typo in src/cloudai/report_generator/report_generator.py --- src/cloudai/report_generator/report_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/report_generator/report_generator.py b/src/cloudai/report_generator/report_generator.py index 9d7ef5631..e47bef40a 100644 --- a/src/cloudai/report_generator/report_generator.py +++ b/src/cloudai/report_generator/report_generator.py @@ -75,7 +75,7 @@ def _generate_test_report(self, directory_path: Path, tr: TestRun) -> None: continue if not tr.test.test_template.can_handle_directory(subdir): logging.warning( - f"Skipping '{subdir}', can't hande with " + f"Skipping '{subdir}', can't handle with " f"strategy={tr.test.test_template.report_generation_strategy}." ) continue From c819d1ec23098cbe7fa35aa15a0173989d050056 Mon Sep 17 00:00:00 2001 From: Taekyung Heo Date: Tue, 14 Jan 2025 19:26:43 -0500 Subject: [PATCH 2/2] Bug fix in job completion check --- src/cloudai/systems/slurm/slurm_system.py | 3 +++ tests/test_slurm_system.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index 51388ac82..2b74bfac2 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -244,6 +244,9 @@ def is_job_completed(self, job: BaseJob, retry_threshold: int = 3) -> bool: raise RuntimeError(error_message) job_states = stdout.strip().split() + if "RUNNING" in job_states: + return False + if any(state in ["COMPLETED", "FAILED", "CANCELLED", "TIMEOUT"] for state in job_states): return True diff --git a/tests/test_slurm_system.py b/tests/test_slurm_system.py index 028fb89e6..c89ca1ada 100644 --- a/tests/test_slurm_system.py +++ b/tests/test_slurm_system.py @@ -241,6 +241,9 @@ def test_allocate_nodes_exceeding_limit( ("TIMEOUT", "", True), ("RUNNING", "", False), ("PENDING", "", False), + ("COMPLETED RUNNING", "", False), + ("RUNNING COMPLETED", "", False), + ("COMPLETED COMPLETED", "", True), ("", "error", False), ], )