-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #53 from TaekyungHeo/failure-nccl-test
Implement NcclTestJobStatusRetrievalStrategy and add corresponding tests
- Loading branch information
Showing
3 changed files
with
119 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
60 changes: 60 additions & 0 deletions
60
src/cloudai/schema/test_template/nccl_test/job_status_retrieval_strategy.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import os | ||
|
||
from cloudai._core.job_status_result import JobStatusResult | ||
from cloudai._core.job_status_retrieval_strategy import JobStatusRetrievalStrategy | ||
|
||
|
||
class NcclTestJobStatusRetrievalStrategy(JobStatusRetrievalStrategy): | ||
"""Strategy to retrieve job status for NCCL tests by checking the contents of 'stdout.txt'.""" | ||
|
||
def get_job_status(self, output_path: str) -> JobStatusResult: | ||
""" | ||
Determine the job status by examining 'stdout.txt' in the output directory. | ||
Args: | ||
output_path (str): Path to the directory containing 'stdout.txt'. | ||
Returns: | ||
JobStatusResult: The result containing the job status and an optional error message. | ||
""" | ||
stdout_path = os.path.join(output_path, "stdout.txt") | ||
if os.path.isfile(stdout_path): | ||
with open(stdout_path, "r") as file: | ||
content = file.read() | ||
if "# Out of bounds values" in content and "# Avg bus bandwidth" in content: | ||
return JobStatusResult(is_successful=True) | ||
missing_indicators = [] | ||
if "# Out of bounds values" not in content: | ||
missing_indicators.append("'# Out of bounds values'") | ||
if "# Avg bus bandwidth" not in content: | ||
missing_indicators.append("'# Avg bus bandwidth'") | ||
error_message = ( | ||
f"Missing success indicators in {stdout_path}: {', '.join(missing_indicators)}. " | ||
"These keywords are expected to be present in stdout.txt, usually towards the end of the file. " | ||
f"Please ensure the NCCL test ran to completion. You can run the generated sbatch script manually " | ||
f"and check if {stdout_path} is created and contains the expected keywords." | ||
) | ||
return JobStatusResult(is_successful=False, error_message=error_message) | ||
return JobStatusResult( | ||
is_successful=False, | ||
error_message=( | ||
f"stdout.txt file not found in the specified output directory {output_path}. " | ||
"This file is expected to be created as a result of the NCCL test run. " | ||
"Please ensure the NCCL test was executed properly and that stdout.txt is generated. " | ||
f"You can run the generated NCCL test command manually and verify the creation of {stdout_path}." | ||
), | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
from pathlib import Path | ||
|
||
from cloudai.schema.test_template.nccl_test.job_status_retrieval_strategy import NcclTestJobStatusRetrievalStrategy | ||
|
||
|
||
class TestNcclTestJobStatusRetrievalStrategy: | ||
"""Tests for the NcclTestJobStatusRetrievalStrategy class.""" | ||
|
||
def setup_method(self) -> None: | ||
"""Setup method for initializing NcclTestJobStatusRetrievalStrategy.""" | ||
self.js = NcclTestJobStatusRetrievalStrategy() | ||
|
||
def test_no_stdout_file(self, tmp_path: Path) -> None: | ||
"""Test that job status is False when no stdout.txt file is present.""" | ||
result = self.js.get_job_status(str(tmp_path)) | ||
assert not result.is_successful | ||
assert result.error_message == ( | ||
f"stdout.txt file not found in the specified output directory {tmp_path}. " | ||
"This file is expected to be created as a result of the NCCL test run. " | ||
"Please ensure the NCCL test was executed properly and that stdout.txt is generated. " | ||
f"You can run the generated NCCL test command manually and verify the creation of " | ||
f"{tmp_path / 'stdout.txt'}." | ||
) | ||
|
||
def test_successful_job(self, tmp_path: Path) -> None: | ||
"""Test that job status is True when stdout.txt contains success indicators.""" | ||
stdout_file = tmp_path / "stdout.txt" | ||
stdout_content = """ | ||
# Some initialization output | ||
# More output | ||
# Out of bounds values : 0 OK | ||
# Avg bus bandwidth : 100.00 | ||
# Some final output | ||
""" | ||
stdout_file.write_text(stdout_content) | ||
result = self.js.get_job_status(str(tmp_path)) | ||
assert result.is_successful | ||
assert result.error_message == "" | ||
|
||
def test_failed_job(self, tmp_path: Path) -> None: | ||
"""Test that job status is False when stdout.txt does not contain success indicators.""" | ||
stdout_file = tmp_path / "stdout.txt" | ||
stdout_content = """ | ||
# Some initialization output | ||
# More output | ||
# Some final output without success indicators | ||
""" | ||
stdout_file.write_text(stdout_content) | ||
result = self.js.get_job_status(str(tmp_path)) | ||
assert not result.is_successful | ||
assert result.error_message == ( | ||
f"Missing success indicators in {stdout_file}: '# Out of bounds values', '# Avg bus bandwidth'. " | ||
"These keywords are expected to be present in stdout.txt, usually towards the end of the file. " | ||
f"Please ensure the NCCL test ran to completion. You can run the generated sbatch script manually " | ||
f"and check if {stdout_file} is created and contains the expected keywords." | ||
) |