NVIDIA · TaekyungHeo · Sep 30, 2024 · Sep 30, 2024 · Sep 30, 2024 · Sep 30, 2024
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
@@ -68,6 +68,8 @@
 from .schema.test_template.nccl_test.slurm_install_strategy import NcclTestSlurmInstallStrategy
 from .schema.test_template.nccl_test.template import NcclTest
 from .schema.test_template.nemo_launcher.grading_strategy import NeMoLauncherGradingStrategy
+from .schema.test_template.nemo_launcher.kubernetes_command_gen_strategy import NeMoLauncherKubernetesCommandGenStrategy
+from .schema.test_template.nemo_launcher.kubernetes_install_strategy import NeMoLauncherKubernetesInstallStrategy
 from .schema.test_template.nemo_launcher.report_generation_strategy import NeMoLauncherReportGenerationStrategy
 from .schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
 from .schema.test_template.nemo_launcher.slurm_install_strategy import NeMoLauncherSlurmInstallStrategy
@@ -105,6 +107,7 @@
 
 Registry().add_strategy(InstallStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmInstallStrategy)
 Registry().add_strategy(InstallStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmInstallStrategy)
+Registry().add_strategy(InstallStrategy, [KubernetesSystem], [NeMoLauncher], NeMoLauncherKubernetesInstallStrategy)
 Registry().add_strategy(
     ReportGenerationStrategy, [SlurmSystem, KubernetesSystem], [NcclTest], NcclTestReportGenerationStrategy
 )
@@ -126,6 +129,9 @@
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy)
 Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy)
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy)
+Registry().add_strategy(
+    CommandGenStrategy, [KubernetesSystem], [NeMoLauncher], NeMoLauncherKubernetesCommandGenStrategy
+)
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy)

diff --git a/src/cloudai/installer/kubernetes_installer.py b/src/cloudai/installer/kubernetes_installer.py
@@ -16,8 +16,7 @@
 
 import logging
 
-from kubernetes import client, config
-from kubernetes.client.rest import ApiException
+from kubernetes import config
 
 from cloudai import BaseInstaller, InstallStatusResult
 
@@ -57,67 +56,5 @@ def _check_prerequisites(self) -> InstallStatusResult:
             logging.error(message)
             return InstallStatusResult(False, message)
 
-        # Check MPIJob-related prerequisites
-        mpi_job_result = self._check_mpi_job_prerequisites()
-        if not mpi_job_result.success:
-            return mpi_job_result
-
         logging.info("All prerequisites are met. Proceeding with installation.")
         return InstallStatusResult(True)
-
-    def _check_mpi_job_prerequisites(self) -> InstallStatusResult:
-        """
-        Check if the MPIJob CRD is installed and if MPIJob kind is supported in the Kubernetes cluster.
-
-        This ensures that the system is ready for MPI-based operations.
-
-        Returns
-            InstallStatusResult: Result containing the status of the MPIJob prerequisite check and any error message.
-        """
-        # Check if MPIJob CRD is installed
-        try:
-            custom_api = client.CustomObjectsApi()
-            custom_api.get_cluster_custom_object(group="kubeflow.org", version="v1", plural="mpijobs", name="mpijobs")
-        except ApiException as e:
-            if e.status == 404:
-                message = (
-                    "Installation failed during prerequisite checking stage because MPIJob CRD is not installed on "
-                    "this Kubernetes cluster. Please ensure that the MPI Operator is installed and MPIJob kind is "
-                    "supported. You can follow the instructions in the MPI Operator repository to install it: "
-                    "https://github.com/kubeflow/mpi-operator"
-                )
-                logging.error(message)
-                return InstallStatusResult(False, message)
-            else:
-                message = (
-                    f"Installation failed during prerequisite checking stage due to an error while checking for MPIJob "
-                    f"CRD. Original error: {str(e)}. Please ensure that the Kubernetes cluster is accessible and the "
-                    f"MPI Operator is correctly installed."
-                )
-                logging.error(message)
-                return InstallStatusResult(False, message)
-
-        # Check if MPIJob kind is supported
-        try:
-            api_resources = client.ApiextensionsV1Api().list_custom_resource_definition()
-            mpi_job_supported = any(item.metadata.name == "mpijobs.kubeflow.org" for item in api_resources.items)
-        except ApiException as e:
-            message = (
-                f"Installation failed during prerequisite checking stage due to an error while checking for MPIJob "
-                f"kind support. Original error: {str(e)}. Please ensure that the Kubernetes cluster is accessible and "
-                f"the MPI Operator is correctly installed."
-            )
-            logging.error(message)
-            return InstallStatusResult(False, message)
-
-        if not mpi_job_supported:
-            message = (
-                "Installation failed during prerequisite checking stage because MPIJob kind is not supported on this "
-                "Kubernetes cluster. Please ensure that the MPI Operator is installed and MPIJob kind is supported. "
-                "You can follow the instructions in the MPI Operator repository to install it: "
-                "https://github.com/kubeflow/mpi-operator"
-            )
-            logging.error(message)
-            return InstallStatusResult(False, message)
-
-        return InstallStatusResult(True)
diff --git a/src/cloudai/runner/kubernetes/kubernetes_runner.py b/src/cloudai/runner/kubernetes/kubernetes_runner.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 import logging
+from pathlib import Path
 from typing import cast
 
 from cloudai import BaseJob, BaseRunner, TestRun
@@ -36,19 +37,28 @@ def _submit_test(self, tr: TestRun) -> KubernetesJob:
         Returns:
             KubernetesJob: A KubernetesJob object containing job details.
         """
+        if tr.test.test_template.json_gen_strategy is None:
+            # This template has no JSON generation strategy, so the JSON job-spec path below is
+            # skipped: nothing is created on the cluster and a placeholder job is returned.
+            logging.warning(
+                f"Test {tr.test.section_name} has no JSON generation strategy; "
+                "no Kubernetes job is created and a placeholder job is returned."
+            )
+            return KubernetesJob(self.mode, self.system, tr, "", "", "", Path(""))
+
         logging.info(f"Running test: {tr.test.section_name}")
         job_output_path = self.get_job_output_path(tr.test)
         job_name = tr.test.section_name.replace(".", "-").lower()
         job_spec = tr.test.gen_json(job_output_path, job_name, tr.time_limit, tr.num_nodes, tr.nodes)
         job_kind = job_spec.get("kind", "").lower()
         logging.info(f"Generated JSON string for test {tr.test.section_name}: {job_spec}")
         job_namespace = ""
 
         if self.mode == "run":
             k8s_system: KubernetesSystem = cast(KubernetesSystem, self.system)
             job_name, job_namespace = k8s_system.create_job(job_spec)
 
         return KubernetesJob(self.mode, self.system, tr, job_namespace, job_name, job_kind, job_output_path)
 
     async def job_completion_callback(self, job: BaseJob) -> None:
         """

diff --git a/src/cloudai/schema/test_template/nemo_launcher/kubernetes_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/kubernetes_command_gen_strategy.py
@@ -0,0 +1,120 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Dict, List
+
+from cloudai import CommandGenStrategy
+
+from .kubernetes_install_strategy import NeMoLauncherKubernetesInstallStrategy
+
+
+class NeMoLauncherKubernetesCommandGenStrategy(CommandGenStrategy):
+    """Command generation strategy for NeMo Megatron Launcher on Kubernetes systems."""
+
+    def gen_exec_command(
+        self,
+        cmd_args: Dict[str, str],
+        extra_env_vars: Dict[str, str],
+        extra_cmd_args: str,
+        output_path: Path,
+        num_nodes: int,
+        nodes: List[str],
+    ) -> str:
+        """
+        Build the shell command that runs the NeMo launcher's main.py with the merged arguments.
+
+        The merged arguments are also stored on the instance as ``final_cmd_args``.
+
+        Args:
+            cmd_args (Dict[str, str]): Test arguments; these override ``default_cmd_args``.
+            extra_env_vars (Dict[str, str]): Environment variables; these override the system's global ones.
+            extra_cmd_args (str): Text appended verbatim to the end of the command when non-empty.
+            output_path (Path): Not used by this strategy.
+            num_nodes (int): Not used by this strategy.
+            nodes (List[str]): Not used by this strategy.
+
+        Returns:
+            str: The launcher command, prefixed with ``KEY=VALUE`` environment assignments when there are any.
+
+        Raises:
+            ValueError: If ``data_dir`` still contains the ``DATA_DIR`` placeholder.
+        """
+        final_env_vars = {**self.system.global_env_vars, **extra_env_vars}
+
+        launcher_path = (
+            self.system.install_path
+            / NeMoLauncherKubernetesInstallStrategy.SUBDIR_PATH
+            / NeMoLauncherKubernetesInstallStrategy.REPOSITORY_NAME
+        )
+
+        self.final_cmd_args = {**self.default_cmd_args, **cmd_args}
+        self.final_cmd_args["launcher_scripts_path"] = str(launcher_path / "launcher_scripts")
+
+        # Every environment variable is also passed to the launcher as an "env_vars."-prefixed argument.
+        self.final_cmd_args.update({f"env_vars.{key}": value for key, value in final_env_vars.items()})
+
+        # The launcher receives plain "cluster" and "training" keys; a missing source key yields "".
+        self.final_cmd_args["cluster"] = self.final_cmd_args.pop("cluster.value", "")
+        self.final_cmd_args["training"] = self.final_cmd_args.pop("training.values", "")
+
+        # These keys are removed so that they are not forwarded to main.py.
+        for key in ("repository_url", "repository_commit_hash", "docker_image_url"):
+            self.final_cmd_args.pop(key, None)
+
+        if self.final_cmd_args.get("data_dir") == "DATA_DIR":
+            raise ValueError(
+                "The 'data_dir' field of the NeMo launcher test contains the placeholder 'DATA_DIR'. "
+                "Please update the test schema TOML file with a valid path to the dataset."
+            )
+
+        cmd_args_str = self._generate_cmd_args_str(self.final_cmd_args)
+        full_cmd = f"python {launcher_path}/launcher_scripts/main.py {cmd_args_str}"
+        if extra_cmd_args:
+            full_cmd += f" {extra_cmd_args}"
+
+        env_vars_str = " ".join(f"{key}={value}" for key, value in final_env_vars.items())
+        return f"{env_vars_str} {full_cmd}".strip()
+
+    def _generate_cmd_args_str(self, args: Dict[str, str]) -> str:
+        """
+        Generate a string of command-line arguments, wrapping values in quotes when necessary.
+
+        Values containing "[", "]" or a backslash are wrapped in double quotes. Keys that start with
+        "env_vars." are emitted with a leading "+" and placed after all other arguments.
+
+        Args:
+            args (Dict[str, str]): The command-line arguments. Non-string values are converted with str().
+
+        Returns:
+            str: A string of space-separated ``key=value`` command-line arguments.
+        """
+        special_chars = ("[", "]", "\\")
+        cmd_arg_str_parts: List[str] = []
+        env_var_str_parts: List[str] = []
+
+        for key, value in args.items():
+            # Coerce first: the membership test below raises TypeError for non-string values.
+            text = str(value)
+            if any(char in text for char in special_chars):
+                text = f'"{text}"'
+
+            if key.startswith("env_vars."):
+                env_var_str_parts.append(f"+{key}={text}")
+            else:
+                cmd_arg_str_parts.append(f"{key}={text}")
+
+        return " ".join(cmd_arg_str_parts + env_var_str_parts)