Skip to content

Commit

Permalink
Add NeMo 2.0 (NeMo-run)
Browse files Browse the repository at this point in the history
  • Loading branch information
TaekyungHeo committed Nov 15, 2024
1 parent def60ed commit 15b2e87
Show file tree
Hide file tree
Showing 12 changed files with 371 additions and 4 deletions.
30 changes: 30 additions & 0 deletions conf/common/test/nemo_run_llama3_8b.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "nemo_run_llama3_8b"
description = "nemo_run_llama3_8b"
# Name of the test template this test is bound to.
test_template_name = "NeMoRun"

# Command arguments of the NeMoRun test: container image, task and recipe.
[cmd_args]
"docker_image_url" = "nvcr.io/nvidia/nemo:24.09"
"task" = "pretrain"
"recipe_name" = "llama3_8b"

# Additional key/value settings for this test; every value is given as a quoted string.
[extra_cmd_args]
"trainer.max_steps" = "5"
"trainer.val_check_interval" = "1000"
"log.ckpt.save_on_train_epoch_end" = "False"
"log.ckpt.save_last" = "False"
32 changes: 32 additions & 0 deletions conf/common/test_scenario/nemo_run_llama3_8b.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "nemo_run_llama3_8b"

# First run of the nemo_run_llama3_8b test: 1 node, 30-minute time limit.
[[Tests]]
id = "nemo_run_llama3_8b_1"
test_name = "nemo_run_llama3_8b"
num_nodes = "1"
time_limit = "00:30:00"

# Second run of the same test: 2 nodes, 30-minute time limit.
[[Tests]]
id = "nemo_run_llama3_8b_2"
test_name = "nemo_run_llama3_8b"
num_nodes = "2"
time_limit = "00:30:00"
# Dependency on the first run (referenced by its id).
# NOTE(review): "start_post_comp" presumably means "start after the referenced test completes" - confirm.
[[Tests.dependencies]]
type = "start_post_comp"
id = "nemo_run_llama3_8b_1"
14 changes: 12 additions & 2 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@
NeMoLauncherSlurmJobIdRetrievalStrategy,
)
from .schema.test_template.nemo_launcher.template import NeMoLauncher
from .schema.test_template.nemo_run.grading_strategy import NeMoRunGradingStrategy
from .schema.test_template.nemo_run.report_generation_strategy import NeMoRunReportGenerationStrategy
from .schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
from .schema.test_template.nemo_run.template import NeMoRun
from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
Expand All @@ -94,6 +98,7 @@
GrokTestDefinition,
NCCLTestDefinition,
NeMoLauncherTestDefinition,
NeMoRunTestDefinition,
NemotronTestDefinition,
SleepTestDefinition,
UCCTestDefinition,
Expand All @@ -115,20 +120,23 @@
ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy
)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherReportGenerationStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NeMoRun], NeMoRunReportGenerationStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmCommandGenStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [Sleep], SleepGradingStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy)
Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoRun], NeMoRunSlurmCommandGenStrategy)
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoRun], NeMoRunGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy)
Registry().add_strategy(GradingStrategy, [SlurmSystem], [UCCTest], UCCTestGradingStrategy)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmCommandGenStrategy)
Registry().add_strategy(
JobIdRetrievalStrategy,
[SlurmSystem],
[ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep],
[ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, NeMoRun],
SlurmJobIdRetrievalStrategy,
)
Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy)
Expand All @@ -141,7 +149,7 @@
Registry().add_strategy(
JobStatusRetrievalStrategy,
[SlurmSystem],
[ChakraReplay, UCCTest, NeMoLauncher, Sleep],
[ChakraReplay, UCCTest, NeMoLauncher, NeMoRun, Sleep],
DefaultJobStatusRetrievalStrategy,
)
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy)
Expand All @@ -162,13 +170,15 @@
Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition)
Registry().add_test_definition("Sleep", SleepTestDefinition)
Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition)
Registry().add_test_definition("NeMoRun", NeMoRunTestDefinition)
Registry().add_test_definition("JaxToolboxGPT", GPTTestDefinition)
Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition)
Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)

Registry().add_test_template("ChakraReplay", ChakraReplay)
Registry().add_test_template("NcclTest", NcclTest)
Registry().add_test_template("NeMoLauncher", NeMoLauncher)
Registry().add_test_template("NeMoRun", NeMoRun)
Registry().add_test_template("Sleep", Sleep)
Registry().add_test_template("UCCTest", UCCTest)
Registry().add_test_template("JaxToolboxGPT", JaxToolbox)
Expand Down
27 changes: 27 additions & 0 deletions src/cloudai/schema/test_template/nemo_run/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .grading_strategy import NeMoRunGradingStrategy
from .report_generation_strategy import NeMoRunReportGenerationStrategy
from .slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
from .template import NeMoRun

__all__ = [
"NeMoRun",
"NeMoRunGradingStrategy",
"NeMoRunReportGenerationStrategy",
"NeMoRunSlurmCommandGenStrategy",
]
26 changes: 26 additions & 0 deletions src/cloudai/schema/test_template/nemo_run/grading_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from cloudai import GradingStrategy


class NeMoRunGradingStrategy(GradingStrategy):
    """Performance grading strategy for NeMoRun test templates on Slurm systems."""

    def grade(self, directory_path: Path, ideal_perf: float) -> float:
        """
        Grade a NeMoRun test run.

        Placeholder implementation: both arguments are ignored and the grade is always 0.0.

        Args:
            directory_path (Path): Path supplied by the caller (currently unused).
            ideal_perf (float): Ideal performance value supplied by the caller (currently unused).

        Returns:
            float: Always 0.0.
        """
        return 0.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from cloudai import ReportGenerationStrategy


class NeMoRunReportGenerationStrategy(ReportGenerationStrategy):
    """Strategy for generating reports from NeMo run directories."""

    def can_handle_directory(self, directory_path: Path) -> bool:
        """
        Tell whether this strategy can handle the given directory.

        Placeholder implementation: the directory is not inspected and the answer is always False.

        Args:
            directory_path (Path): Path supplied by the caller (currently unused).

        Returns:
            bool: Always False.
        """
        return False
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Any, Dict, List, cast

from cloudai import TestRun
from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy
from cloudai.test_definitions.nemo_run import NeMoRunTestDefinition


class NeMoRunSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """Command generation strategy for NeMo 2.0 on Slurm systems."""

    def _parse_slurm_args(
        self, job_name_prefix: str, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun
    ) -> Dict[str, Any]:
        """
        Extend the base Slurm arguments with the container image of the test.

        Args:
            job_name_prefix (str): Forwarded unchanged to the base implementation.
            env_vars (Dict[str, str]): Forwarded unchanged to the base implementation.
            cmd_args (Dict[str, str]): Forwarded unchanged to the base implementation.
            tr (TestRun): Test run whose test definition provides the docker image.

        Returns:
            Dict[str, Any]: The base arguments plus an "image_path" entry set to the
                installed path of the test definition's docker image.
        """
        base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr)

        # The cast only informs the type checker; it performs no runtime check.
        tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition)
        base_args.update({"image_path": tdef.docker_image.installed_path})

        return base_args

    def generate_test_command(self, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun) -> List[str]:
        """
        Build the ``nemo llm`` command line for a test run.

        The command has the shape
        ``nemo llm <task> --factory <recipe_name> -y [trainer.num_nodes=N] [extra args]``.

        Args:
            env_vars (Dict[str, str]): Environment variables (not used by this method).
            cmd_args (Dict[str, str]): Command arguments; "task" and "recipe_name" are read.
            tr (TestRun): Test run providing the node information and the extra arguments.

        Returns:
            List[str]: The command as a list of parts.
        """
        # `dict.get(key, default)` only applies the default when the key is absent.
        # NeMoRunCmdArgs defaults both fields to None, so a key may be present with a
        # None (or empty) value; fall back in that case as well so that every element
        # of the returned command is a non-empty string.
        task = cmd_args.get("task") or "pretrain"
        recipe_name = cmd_args.get("recipe_name") or "llama3_8b"

        command = ["nemo", "llm", task, "--factory", recipe_name, "-y"]

        # An explicit node list takes precedence over a plain node count.
        if tr.nodes:
            command.append(f"trainer.num_nodes={len(tr.nodes)}")
        elif tr.num_nodes > 0:
            command.append(f"trainer.num_nodes={tr.num_nodes}")

        # Appended as a single element, exactly as provided by the test.
        if tr.test.extra_cmd_args:
            command.append(tr.test.extra_cmd_args)

        return command
21 changes: 21 additions & 0 deletions src/cloudai/schema/test_template/nemo_run/template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cloudai import TestTemplate


class NeMoRun(TestTemplate):
    """Test template for NeMo-Run (NeMo 2.0)."""
3 changes: 3 additions & 0 deletions src/cloudai/test_definitions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from .jax_toolbox import JaxToolboxCmdArgs, JaxToolboxTestDefinition
from .nccl import NCCLCmdArgs, NCCLTestDefinition
from .nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
from .nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition
from .nemotron import NemotronCmdArgs, NemotronTestDefinition
from .sleep import SleepCmdArgs, SleepTestDefinition
from .ucc import UCCCmdArgs, UCCTestDefinition
Expand All @@ -31,6 +32,8 @@
"NCCLTestDefinition",
"NeMoLauncherCmdArgs",
"NeMoLauncherTestDefinition",
"NeMoRunCmdArgs",
"NeMoRunTestDefinition",
"SleepCmdArgs",
"SleepTestDefinition",
"UCCCmdArgs",
Expand Down
46 changes: 46 additions & 0 deletions src/cloudai/test_definitions/nemo_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from cloudai import CmdArgs, TestDefinition
from cloudai.installer.installables import DockerImage, Installable


class NeMoRunCmdArgs(CmdArgs):
    """NeMoRun test command arguments."""

    # URL of the container image; NeMoRunTestDefinition builds its DockerImage from it.
    docker_image_url: str = "nvcr.io/nvidia/nemo:24.09"
    # Task name; optional, None when not provided.
    task: Optional[str] = None
    # Recipe name; optional, None when not provided.
    recipe_name: Optional[str] = None


class NeMoRunTestDefinition(TestDefinition):
    """Test object for NeMoRun."""

    cmd_args: NeMoRunCmdArgs
    # Cache for the DockerImage; filled on first access of `docker_image`.
    _docker_image: Optional[DockerImage] = None

    @property
    def docker_image(self) -> DockerImage:
        """Return the DockerImage for `cmd_args.docker_image_url`, creating and caching it on first access."""
        if not self._docker_image:
            self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
        return self._docker_image

    @property
    def installables(self) -> list[Installable]:
        """Get list of installable objects (only the docker image)."""
        return [self.docker_image]
Loading

0 comments on commit 15b2e87

Please sign in to comment.