From 7f17e2fd8629450da40249ec4cc4f8b2017fc91c Mon Sep 17 00:00:00 2001 From: Taekyung Heo Date: Wed, 8 Jan 2025 06:23:22 -0500 Subject: [PATCH] Support custom recipes in NeMoRun --- conf/common/test/nemo_run_llama3_8b.toml | 1 + .../nemo_run/slurm_command_gen_strategy.py | 6 +++++ src/cloudai/test_definitions/nemo_run.py | 1 + ...est_nemo_run_slurm_command_gen_strategy.py | 25 +++++++++++++++++-- tests/test_acceptance.py | 5 +++- 5 files changed, 35 insertions(+), 3 deletions(-) diff --git a/conf/common/test/nemo_run_llama3_8b.toml b/conf/common/test/nemo_run_llama3_8b.toml index 462a57e61..ea8e8895f 100644 --- a/conf/common/test/nemo_run_llama3_8b.toml +++ b/conf/common/test/nemo_run_llama3_8b.toml @@ -21,6 +21,7 @@ test_template_name = "NeMoRun" [cmd_args] "docker_image_url" = "nvcr.io/nvidia/nemo:24.09" "task" = "pretrain" +"recipe_path" = "" "recipe_name" = "llama3_8b" [extra_cmd_args] diff --git a/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py index ebbf13a59..62db838c7 100644 --- a/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. +import os from typing import Any, Dict, List, cast from cloudai import TestRun @@ -32,6 +33,11 @@ def _parse_slurm_args( tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition) base_args.update({"image_path": tdef.docker_image.installed_path}) + if tdef.cmd_args.recipe_path: + # TODO: update /opt/NeMo/nemo/collections/llm/recipes/__init__.py + target_recipe_path = f"/opt/NeMo/nemo/collections/llm/recipes/{os.path.basename(tdef.cmd_args.recipe_path)}" + container_mounts = f"{tdef.cmd_args.recipe_path}:{target_recipe_path}" + base_args["container_mounts"] = container_mounts return base_args diff --git a/src/cloudai/test_definitions/nemo_run.py b/src/cloudai/test_definitions/nemo_run.py index fa424672f..3123e2ea1 100644 --- a/src/cloudai/test_definitions/nemo_run.py +++ b/src/cloudai/test_definitions/nemo_run.py @@ -25,6 +25,7 @@ class NeMoRunCmdArgs(CmdArgs): docker_image_url: str task: str + recipe_path: Optional[str] recipe_name: str diff --git a/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py index 57ed454d7..22539accf 100644 --- a/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py @@ -34,7 +34,7 @@ def test_run(self, tmp_path: Path) -> TestRun: description="desc1", test_template_name="tt", cmd_args=NeMoRunCmdArgs( - docker_image_url="nvcr.io/nvidia/nemo:24.09", task="pretrain", recipe_name="llama_3b" + docker_image_url="nvcr.io/nvidia/nemo:24.09", task="pretrain", recipe_path=None, recipe_name="llama_3b" ), extra_env_vars={"TEST_VAR_1": "value1"}, extra_cmd_args={"extra_args": ""}, @@ -59,7 +59,12 @@ def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NeMoRunSlurmCommandGenS "cmd_args, expected_cmd", [ ( - {"docker_image_url": "nvcr.io/nvidia/nemo:24.09", "task": "fine_tune", "recipe_name": "llama7_13b"}, + { + "docker_image_url": "nvcr.io/nvidia/nemo:24.09", + "task": "fine_tune", + "recipe_path": None, + "recipe_name": "llama7_13b", + }, ["nemo", "llm", "fine_tune", "--factory", "llama7_13b", "-y", "trainer.num_nodes=2", "extra_args"], ), ], @@ -90,3 +95,19 @@ def test_num_nodes(self, cmd_gen_strategy: NeMoRunSlurmCommandGenStrategy, test_ num_nodes_param = next(p for p in cmd if "trainer.num_nodes" in p) assert num_nodes_param == "trainer.num_nodes=3" + + def test_parse_slurm_args_without_recipe_path( + self, cmd_gen_strategy: NeMoRunSlurmCommandGenStrategy, test_run: TestRun + ) -> None: + test_run.test.test_definition.cmd_args.recipe_path = None + base_args = cmd_gen_strategy._parse_slurm_args("test_job", {}, {}, test_run) + assert "container_mounts" not in base_args + + def test_parse_slurm_args_with_recipe_path( + self, cmd_gen_strategy: NeMoRunSlurmCommandGenStrategy, test_run: TestRun + ) -> None: + recipe_path = "/some/recipe/path/model.py" + test_run.test.test_definition.cmd_args.recipe_path = recipe_path + base_args = cmd_gen_strategy._parse_slurm_args("test_job", {}, {}, test_run) + assert "container_mounts" in base_args + assert base_args["container_mounts"] == f"{recipe_path}:/opt/NeMo/nemo/collections/llm/recipes/model.py" diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 53a22f502..f8f064639 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -211,7 +211,10 @@ def create_test_run(name, test_definition, command_gen_strategy): description=test_type, test_template_name=test_type, cmd_args=NeMoRunCmdArgs( - docker_image_url="nvcr.io/nvidia/nemo:24.09", task="pretrain", recipe_name="llama_3b" + docker_image_url="nvcr.io/nvidia/nemo:24.09", + task="pretrain", + recipe_path=None, + recipe_name="llama_3b", ), ), NeMoRunSlurmCommandGenStrategy,