NVIDIA · jeffnvidia · May 22, 2024 · May 23, 2024 · May 19, 2024 · May 19, 2024
diff --git a/conf/v0.6/general/test_template/nemo_launcher.toml b/conf/v0.6/general/test_template/nemo_launcher.toml
@@ -3,15 +3,15 @@ name = "NeMoLauncher"
 [cmd_args]
   [cmd_args.repository_url]
   type = "str"
-  default = "NEMO_REPOSITORY_URL"
+  default = "https://github.com/NVIDIA/NeMo-Framework-Launcher.git"
 
   [cmd_args.repository_commit_hash]
   type = "str"
-  default = "6528780fba8185bf61e7c2396fdd2331ee5933a1"
+  default = "cf411a9ede3b466677df8ee672bcc6c396e71e1a"
 
   [cmd_args.docker_image_url]
   type = "str"
-  default = "DOCKER_IMAGE_URL"
+  default = "nvcr.io/nvidian/nemofw-training:24.01.01"
 
   [cmd_args.stages]
   type = "str"
@@ -33,8 +33,8 @@ name = "NeMoLauncher"
     default = "8"
 
   [cmd_args.training]
-  values = ["gpt3/43b"]
-  default = "gpt3/43b"
+  values = ["gpt3/40b_improved"]
+  default = "gpt3/40b_improved"
     [cmd_args.training.exp_manager]
       [cmd_args.training.exp_manager.create_checkpoint_callback]
       type = "bool"

diff --git a/src/cloudai/__main__.py b/src/cloudai/__main__.py
@@ -17,6 +17,7 @@
 import logging
 import os
 import sys
+import traceback
 
 from cloudai import Installer, Parser, ReportGenerator, Runner, SystemObjectUpdater
 
@@ -178,7 +179,11 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> None:
     test_scenario.pretty_print()
 
     runner = Runner(args.mode, system, test_scenario)
-    asyncio.run(runner.run())
+    try:
+        asyncio.run(runner.run())
+    except RuntimeError as e:
+        logging.error(traceback.format_exc())
+        logging.error(f"Error running asyncio loop: {e}")
 
     print(f"All test scenario results stored at: {runner.runner.output_path}")
 

diff --git a/src/cloudai/runner/slurm/slurm_runner.py b/src/cloudai/runner/slurm/slurm_runner.py
@@ -72,6 +72,8 @@ def _submit_test(self, test: Test) -> Optional[SlurmJob]:
         job_id = None
         if self.mode == "run":
             stdout, stderr = self.cmd_shell.execute(exec_cmd).communicate()
+            self.logger.info(f"\tstdout: {stdout}")
+            self.logger.info(f"\tstderr: {stderr}")
             job_id = test.get_job_id(stdout, stderr)
         else:
             job_id = 0

diff --git a/src/cloudai/runner/standalone/standalone_runner.py b/src/cloudai/runner/standalone/standalone_runner.py
@@ -104,7 +104,9 @@ def is_job_completed(self, job: BaseJob) -> bool:
         s_job = cast(StandaloneJob, job)
         command = f"ps -p {s_job.id}"
         self.logger.debug(f"Checking job status with command: {command}")
-        stdout = self.cmd_shell.execute(command).communicate()[0]
+        stdout, stderr = self.cmd_shell.execute(command).communicate()
+        self.logger.info(f"\tstdout: {stdout}")
+        self.logger.info(f"\tstderr: {stderr}")
         return str(s_job.id) not in stdout
 
     def kill_job(self, job: BaseJob):

diff --git a/src/cloudai/schema/system/slurm/slurm_system.py b/src/cloudai/schema/system/slurm/slurm_system.py
@@ -589,20 +589,18 @@ def parse_sinfo_output(self, sinfo_output: str, node_user_map: Dict[str, str]) -
             parts = line.split()
             partition, _, _, _, state, nodelist = parts[:6]
             partition = partition.rstrip("*")
-
-            node_groups = nodelist.split(",")
-            for node_group in node_groups:
-                node_names = self.parse_node_list([node_group.strip()])
-                state_enum = self.convert_state_to_enum(state)
-
-                for node_name in node_names:
-                    for part_name, nodes in self.partitions.items():
-                        if part_name != partition:
-                            continue
-                        for node in nodes:
-                            if node.name == node_name:
-                                node.state = state_enum
-                                node.user = node_user_map.get(node_name, "N/A")
+            node_names = self.parse_node_list([nodelist])
+
+            # Convert state to enum, handling states with suffixes
+            state_enum = self.convert_state_to_enum(state)
+            for node_name in node_names:
+                for part_name, nodes in self.partitions.items():
+                    if part_name != partition:
+                        continue
+                    for node in nodes:
+                        if node.name == node_name:
+                            node.state = state_enum
+                            node.user = node_user_map.get(node_name, "N/A")
 
     def convert_state_to_enum(self, state_str: str) -> SlurmNodeState:
         """

diff --git a/tests/test_slurm_system.py b/tests/test_slurm_system.py
@@ -8,8 +8,14 @@
 @pytest.fixture
 def slurm_system():
     nodes = [
-        SlurmNode(name="nodeA001", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
-        SlurmNode(name="nodeB001", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-115", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-116", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-117", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-118", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-119", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-120", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-121", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
+        SlurmNode(name="node-122", partition="main", state=SlurmNodeState.UNKNOWN_STATE),
     ]
     system = SlurmSystem(
         name="test_system",
@@ -48,6 +54,20 @@ def test_parse_sinfo_output(slurm_system):
     slurm_system.parse_sinfo_output(sinfo_output, node_user_map)
     assert slurm_system.partitions["main"][0].state == SlurmNodeState.IDLE
     assert slurm_system.partitions["main"][1].state == SlurmNodeState.IDLE
+
+def test_parse_sinfo_output2(slurm_system):  # Check node states after parsing sinfo output whose NODELIST uses bracketed, comma-separated ranges.
+    sinfo_output = """
+    PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
+    main    up    3:00:00      1  inval node-081
+    main    up    3:00:00      5  drain node-[065-066,114,124-125]
+    main    up    3:00:00      2   resv node-[034-035]
+    main    up    3:00:00     88  alloc node-[033,036-064,067-080,082-113,115-123,126-128]
+    backup    up   12:00:00     16  alloc node-[01-16]
+    """
+    node_user_map = {'': 'user1', 'node-033': 'user2', 'node-[036-064': 'user3', '067-080': 'user3', '082-113': 'user3', '115-118]': 'user3', 'node-[119-123': 'user4', '126-128]': 'user4', 'node-01': 'user5', 'node-02': 'user5', 'node-03': 'user5', 'node-04': 'user5', 'node-05': 'user5', 'node-06': 'user5', 'node-07': 'user5', 'node-08': 'user5', 'node-09': 'user5', 'node-10': 'user5', 'node-11': 'user5', 'node-12': 'user5', 'node-13': 'user5', 'node-14': 'user5', 'node-15': 'user5', 'node-16': 'user5'}  # NOTE(review): several keys are unsplit range fragments (e.g. 'node-[036-064'); presumably deliberate raw input - confirm
+    slurm_system.parse_sinfo_output(sinfo_output, node_user_map)  # sets state and user on the matching nodes in place
+    assert slurm_system.partitions["main"][0].state == SlurmNodeState.ALLOCATED  # NOTE(review): presumably node-115, which lies in the alloc range 115-123 - confirm
+    assert slurm_system.partitions["main"][1].state == SlurmNodeState.ALLOCATED  # NOTE(review): presumably node-116, same alloc range - confirm
 
 
 @patch("cloudai.schema.system.SlurmSystem.get_squeue")