NVIDIA · TaekyungHeo · Nov 20, 2024 · Oct 29, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/conf/common/test/nemo_run_llama3_8b.toml b/conf/common/test/nemo_run_llama3_8b.toml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nemo_run_llama3_8b"
+description = "nemo_run_llama3_8b"
+test_template_name = "NeMoRun"
+
+[cmd_args]
+"docker_image_url" = "nvcr.io/nvidia/nemo:24.09"
+"task" = "pretrain"
+"recipe_name" = "llama3_8b"
+
+[extra_cmd_args]
+"trainer.max_steps" = "5"
+"trainer.val_check_interval" = "1000"
+"log.ckpt.save_on_train_epoch_end" = "False"
+"log.ckpt.save_last" = "False"
diff --git a/conf/common/test_scenario/nemo_run_llama3_8b.toml b/conf/common/test_scenario/nemo_run_llama3_8b.toml
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nemo_run_llama3_8b"
+
+[[Tests]]
+id = "nemo_run_llama3_8b_1"
+test_name = "nemo_run_llama3_8b"
+num_nodes = "1"
+time_limit = "00:30:00"
+
+[[Tests]]
+id = "nemo_run_llama3_8b_2"
+test_name = "nemo_run_llama3_8b"
+num_nodes = "2"
+time_limit = "00:30:00"
+  [[Tests.dependencies]]
+  type = "start_post_comp"
+  id = "nemo_run_llama3_8b_1"
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
@@ -75,6 +75,8 @@
     NeMoLauncherSlurmJobIdRetrievalStrategy,
 )
 from .schema.test_template.nemo_launcher.template import NeMoLauncher
+from .schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
+from .schema.test_template.nemo_run.template import NeMoRun
 from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
 from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
 from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
@@ -101,6 +103,7 @@
     GrokTestDefinition,
     NCCLTestDefinition,
     NeMoLauncherTestDefinition,
+    NeMoRunTestDefinition,
     NemotronTestDefinition,
     SleepTestDefinition,
     UCCTestDefinition,
@@ -128,6 +131,7 @@
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy)
 Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy)
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy)
+Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoRun], NeMoRunSlurmCommandGenStrategy)
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy)
 Registry().add_strategy(
     ReportGenerationStrategy,
@@ -143,7 +147,7 @@
 Registry().add_strategy(
     JobIdRetrievalStrategy,
     [SlurmSystem],
-    [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, SlurmContainer],
+    [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, NeMoRun, SlurmContainer],
     SlurmJobIdRetrievalStrategy,
 )
 Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy)
@@ -156,7 +160,7 @@
 Registry().add_strategy(
     JobStatusRetrievalStrategy,
     [SlurmSystem],
-    [ChakraReplay, UCCTest, NeMoLauncher, Sleep, SlurmContainer],
+    [ChakraReplay, UCCTest, NeMoLauncher, Sleep, NeMoRun, SlurmContainer],
     DefaultJobStatusRetrievalStrategy,
 )
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy)
@@ -178,6 +182,7 @@
 Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition)
 Registry().add_test_definition("Sleep", SleepTestDefinition)
 Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition)
+Registry().add_test_definition("NeMoRun", NeMoRunTestDefinition)
 Registry().add_test_definition("JaxToolboxGPT", GPTTestDefinition)
 Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition)
 Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
@@ -186,6 +191,7 @@
 Registry().add_test_template("ChakraReplay", ChakraReplay)
 Registry().add_test_template("NcclTest", NcclTest)
 Registry().add_test_template("NeMoLauncher", NeMoLauncher)
+Registry().add_test_template("NeMoRun", NeMoRun)
 Registry().add_test_template("Sleep", Sleep)
 Registry().add_test_template("UCCTest", UCCTest)
 Registry().add_test_template("JaxToolboxGPT", JaxToolbox)

diff --git a/src/cloudai/schema/test_template/nemo_run/__init__.py b/src/cloudai/schema/test_template/nemo_run/__init__.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
+from .template import NeMoRun
+
+__all__ = [
+    "NeMoRun",
+    "NeMoRunSlurmCommandGenStrategy",
+]
diff --git a/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Dict, List, cast
+
+from cloudai import TestRun
+from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy
+from cloudai.test_definitions.nemo_run import NeMoRunTestDefinition
+
+
+class NeMoRunSlurmCommandGenStrategy(SlurmCommandGenStrategy):
+    """Command generation strategy for NeMo 2.0 on Slurm systems."""
+
+    def _parse_slurm_args(
+        self, job_name_prefix: str, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun
+    ) -> Dict[str, Any]:
+        base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr)
+
+        tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition)
+        base_args.update({"image_path": tdef.docker_image.installed_path})  # installed path of the test's image
+
+        return base_args
+
+    def generate_test_command(self, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun) -> List[str]:
+        tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition)
+
+        command = ["nemo", "llm", tdef.cmd_args.task, "--factory", tdef.cmd_args.recipe_name, "-y"]
+
+        if tr.nodes:
+            command.append(f"trainer.num_nodes={len(tr.nodes)}")  # NOTE(review): one entry per node? confirm
+        elif tr.num_nodes > 0:
+            command.append(f"trainer.num_nodes={tr.num_nodes}")
+
+        if tr.test.extra_cmd_args:
+            command.append(tr.test.extra_cmd_args)  # appended as a single list element
+
+        return command
diff --git a/src/cloudai/schema/test_template/nemo_run/template.py b/src/cloudai/schema/test_template/nemo_run/template.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cloudai import TestTemplate
+
+
+class NeMoRun(TestTemplate):
+    """Test template for NeMo-Run."""
diff --git a/src/cloudai/test_definitions/__init__.py b/src/cloudai/test_definitions/__init__.py
@@ -20,6 +20,7 @@
 from .jax_toolbox import JaxToolboxCmdArgs, JaxToolboxTestDefinition
 from .nccl import NCCLCmdArgs, NCCLTestDefinition
 from .nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
+from .nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition
 from .nemotron import NemotronCmdArgs, NemotronTestDefinition
 from .sleep import SleepCmdArgs, SleepTestDefinition
 from .ucc import UCCCmdArgs, UCCTestDefinition
@@ -31,6 +32,8 @@
     "NCCLTestDefinition",
     "NeMoLauncherCmdArgs",
     "NeMoLauncherTestDefinition",
+    "NeMoRunCmdArgs",
+    "NeMoRunTestDefinition",
     "SleepCmdArgs",
     "SleepTestDefinition",
     "UCCCmdArgs",

diff --git a/src/cloudai/test_definitions/nemo_run.py b/src/cloudai/test_definitions/nemo_run.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from cloudai import CmdArgs, TestDefinition
+from cloudai.installer.installables import DockerImage, Installable
+
+
+class NeMoRunCmdArgs(CmdArgs):
+    """NeMoRun test command arguments."""
+
+    docker_image_url: str  # URL used to build the test's DockerImage
+    task: str  # sub-command placed after "nemo llm" in the generated command
+    recipe_name: str  # value passed to "--factory" in the generated command
+
+
+class NeMoRunTestDefinition(TestDefinition):
+    """Test object for NeMoRun."""
+
+    cmd_args: NeMoRunCmdArgs
+    _docker_image: Optional[DockerImage] = None  # cache filled on first docker_image access
+
+    @property
+    def docker_image(self) -> DockerImage:  # built once from cmd_args.docker_image_url, then reused
+        if not self._docker_image:
+            self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
+        return self._docker_image
+
+    @property
+    def installables(self) -> list[Installable]:
+        """Get list of installable objects."""
+        return [self.docker_image]
diff --git a/tests/ref_data/nemo-run.sbatch b/tests/ref_data/nemo-run.sbatch
@@ -0,0 +1,11 @@
+#!/bin/bash
+#SBATCH --job-name=__JOB_NAME__
+#SBATCH -N 1
+#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
+#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
+#SBATCH --partition=main
+
+export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+
+
+srun --mpi=pmix --container-image=nvcr.io/nvidia/nemo:24.09 nemo llm pretrain --factory llama_3b -y trainer.num_nodes=1
diff --git a/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py
@@ -0,0 +1,81 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from unittest.mock import Mock
+
+import pytest
+
+from cloudai._core.test import Test
+from cloudai._core.test_scenario import TestRun
+from cloudai.schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
+from cloudai.systems import SlurmSystem
+from cloudai.test_definitions.nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition
+
+
+class TestNeMoRunSlurmCommandGenStrategy:
+    @pytest.fixture
+    def test_run(self, tmp_path: Path) -> TestRun:
+        tdef = NeMoRunTestDefinition(
+            name="t1",
+            description="desc1",
+            test_template_name="tt",
+            cmd_args=NeMoRunCmdArgs(
+                docker_image_url="nvcr.io/nvidia/nemo:24.09", task="pretrain", recipe_name="llama_3b"
+            ),
+            extra_env_vars={"TEST_VAR_1": "value1"},
+            extra_cmd_args={"extra_args": ""},  # expected to appear as the trailing "extra_args" element
+        )
+
+        test = Test(test_definition=tdef, test_template=Mock())
+        tr = TestRun(
+            test=test,
+            num_nodes=2,  # drives the expected "trainer.num_nodes=2"
+            nodes=[],  # empty, so the strategy falls back to num_nodes
+            output_path=tmp_path / "output",
+            name="test-job",
+        )
+
+        return tr
+
+    @pytest.fixture
+    def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NeMoRunSlurmCommandGenStrategy:
+        return NeMoRunSlurmCommandGenStrategy(slurm_system, {})
+
+    @pytest.mark.parametrize(
+        "cmd_args, expected_cmd",
+        [
+            (
+                {"docker_image_url": "nvcr.io/nvidia/nemo:24.09", "task": "fine_tune", "recipe_name": "llama7_13b"},
+                ["nemo", "llm", "fine_tune", "--factory", "llama7_13b", "-y", "trainer.num_nodes=2", "extra_args"],
+            ),
+        ],
+    )
+    def test_generate_test_command(
+        self,
+        cmd_gen_strategy: NeMoRunSlurmCommandGenStrategy,
+        test_run: TestRun,
+        cmd_args: dict,
+        expected_cmd: list,
+    ) -> None:
+        test_run.test.test_definition.cmd_args = NeMoRunCmdArgs(**cmd_args)  # override the fixture's cmd_args
+
+        cmd = cmd_gen_strategy.generate_test_command(
+            test_run.test.test_definition.extra_env_vars,
+            test_run.test.test_definition.cmd_args.model_dump(),
+            test_run,
+        )
+        assert cmd == expected_cmd, f"Expected command {expected_cmd}, but got {cmd}"