From d945201972c520442f191fd76d6c994a34270a51 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 02:17:42 -0800
Subject: [PATCH 01/21] agent environment integration with runner

---
 src/cloudai/_core/configurator/cloudai_gym.py |  4 +--
 src/cloudai/cli/handlers.py                   | 36 +++++++++++++------
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/src/cloudai/_core/configurator/cloudai_gym.py b/src/cloudai/_core/configurator/cloudai_gym.py
index 6e4e2b84..b3b87d75 100644
--- a/src/cloudai/_core/configurator/cloudai_gym.py
+++ b/src/cloudai/_core/configurator/cloudai_gym.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 
 import gymnasium as gym
 import numpy as np
@@ -46,7 +46,7 @@ def __init__(self, test_run: TestRun, system: SlurmSystem, test_scenario: TestSc
         self.action_space = self.extract_action_space(self.test_run.test.cmd_args)
         self.observation_space = self.define_observation_space()
 
-    def extract_action_space(self, cmd_args: dict) -> spaces.Dict:
+    def extract_action_space(self, cmd_args: dict) -> Dict[str, Any]:
         """
         Extract the action space from the cmd_args dictionary.
 
diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
index 49814c21..8ea3d481 100644
--- a/src/cloudai/cli/handlers.py
+++ b/src/cloudai/cli/handlers.py
@@ -22,6 +22,9 @@
 from unittest.mock import Mock
 
 from cloudai import Installable, Parser, Registry, ReportGenerator, Runner, System
+from cloudai._core.configurator.agents.grid_search import GridSearchAgent
+from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
+from cloudai.systems.slurm.slurm_system import SlurmSystem
 
 from ..parser import HOOK_ROOT
 
@@ -93,6 +96,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
     """
     parser = Parser(args.system_config)
     system, tests, test_scenario = parser.parse(args.tests_dir, args.test_scenario)
+
     assert test_scenario is not None
 
     if args.output_dir:
@@ -125,19 +129,29 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
 
     logging.info(test_scenario.pretty_print())
 
-    runner = Runner(args.mode, system, test_scenario)
-    asyncio.run(runner.run())
+    tr = test_scenario.test_runs[0]
 
-    logging.info(f"All test scenario results stored at: {runner.runner.output_path}")
+    agent = GridSearchAgent(tr)
+    env = CloudAIGymEnv(test_run=tr, system=SlurmSystem(system), test_scenario=test_scenario)
 
-    if args.mode == "run":
-        generator = ReportGenerator(runner.runner.output_path)
-        generator.generate_report(test_scenario)
-        logging.info(
-            "All test scenario execution attempts are complete. Please review"
-            f" the '{args.log_file}' file to confirm successful completion or to"
-            " identify any issues."
-        )
+    agent.configure(env.action_space)
+
+    for action in agent.get_all_combinations():
+        for key, value in action.items():
+            tr.test.test_definition.cmd_args_dict[key] = value
+        runner = Runner(args.mode, system, test_scenario)
+        asyncio.run(runner.run())
+
+        logging.info(f"All test scenario results stored at: {runner.runner.output_path}")
+
+        if args.mode == "run":
+            generator = ReportGenerator(runner.runner.output_path)
+            generator.generate_report(test_scenario)
+            logging.info(
+                "All test scenario execution attempts are complete. Please review"
+                f" the '{args.log_file}' file to confirm successful completion or to"
+                " identify any issues."
+            )
 
     return 0
 

From 1be4398c7db97629cf310c136447cc6e5b6b5bb8 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 03:10:43 -0800
Subject: [PATCH 02/21] more fixes

---
 src/cloudai/_core/configurator/cloudai_gym.py | 24 ++++++++++++++++++-
 src/cloudai/cli/handlers.py                   |  8 ++++---
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/cloudai/_core/configurator/cloudai_gym.py b/src/cloudai/_core/configurator/cloudai_gym.py
index b3b87d75..c012ae79 100644
--- a/src/cloudai/_core/configurator/cloudai_gym.py
+++ b/src/cloudai/_core/configurator/cloudai_gym.py
@@ -19,11 +19,33 @@
 import gymnasium as gym
 import numpy as np
 from gymnasium import spaces
+from gymnasium.spaces import Space
 
 from cloudai._core.test_scenario import TestRun, TestScenario
 from cloudai.systems import SlurmSystem
 
 
+class DictSpace(Space):
+    """
+    A custom space that wraps a dictionary of spaces.
+
+    Args:
+        space_dict (Dict[str, Any]): A dictionary of spaces.
+    """
+
+    def __init__(self, space_dict: Dict[str, Any]):
+        self.space_dict = space_dict
+        super().__init__((), None)
+
+    def sample(self, mask: Optional[np.ndarray] = None):
+        # Implement sampling logic if needed
+        pass
+
+    def contains(self, x) -> bool:
+        # Implement containment logic if needed
+        return True
+
+
 class CloudAIGymEnv(gym.Env):
     """
     Custom Gym environment for CloudAI integration.
@@ -46,7 +68,7 @@ def __init__(self, test_run: TestRun, system: SlurmSystem, test_scenario: TestSc
         self.action_space = self.extract_action_space(self.test_run.test.cmd_args)
         self.observation_space = self.define_observation_space()
 
-    def extract_action_space(self, cmd_args: dict) -> Dict[str, Any]:
+    def extract_action_space(self, cmd_args: dict):
         """
         Extract the action space from the cmd_args dictionary.
 
diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
index 8ea3d481..89a3e605 100644
--- a/src/cloudai/cli/handlers.py
+++ b/src/cloudai/cli/handlers.py
@@ -24,7 +24,6 @@
 from cloudai import Installable, Parser, Registry, ReportGenerator, Runner, System
 from cloudai._core.configurator.agents.grid_search import GridSearchAgent
 from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
-from cloudai.systems.slurm.slurm_system import SlurmSystem
 
 from ..parser import HOOK_ROOT
 
@@ -132,9 +131,12 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
     tr = test_scenario.test_runs[0]
 
     agent = GridSearchAgent(tr)
-    env = CloudAIGymEnv(test_run=tr, system=SlurmSystem(system), test_scenario=test_scenario)
+    env = CloudAIGymEnv(test_run=tr, system=system, test_scenario=test_scenario)
 
-    agent.configure(env.action_space)
+    # Convert env.action_space to a dictionary
+    action_space_dict = {key: space for key, space in env.action_space.spaces.items()}
+
+    agent.configure(action_space_dict)
 
     for action in agent.get_all_combinations():
         for key, value in action.items():

From 4df4ab974ac22ad0aaf04355213e41a2fefb20ef Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 11:04:20 -0800
Subject: [PATCH 03/21] Remove Farama gym dependencies for more control over types
 + other fixes in pyright

---
 src/cloudai/_core/configurator/base_gym.py    | 102 +++++++++++++++++
 src/cloudai/_core/configurator/cloudai_gym.py | 107 ++++++++----------
 src/cloudai/cli/handlers.py                   |   5 +-
 3 files changed, 148 insertions(+), 66 deletions(-)
 create mode 100644 src/cloudai/_core/configurator/base_gym.py

diff --git a/src/cloudai/_core/configurator/base_gym.py b/src/cloudai/_core/configurator/base_gym.py
new file mode 100644
index 00000000..b985f46c
--- /dev/null
+++ b/src/cloudai/_core/configurator/base_gym.py
@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Tuple
+
+
+class BaseGym(ABC):
+    """Base class for CloudAI Gym environments."""
+
+    def __init__(self):
+        """Initialize the CloudAIGym environment."""
+        self.action_space = self.define_action_space()
+        self.observation_space = self.define_observation_space()
+
+    @abstractmethod
+    def define_action_space(self) -> Dict[str, Any]:
+        """
+        Define the action space for the environment.
+
+        Returns:
+            Dict[str, Any]: The action space.
+        """
+        pass
+
+    @abstractmethod
+    def define_observation_space(self) -> list:
+        """
+        Define the observation space for the environment.
+
+        Returns:
+            list: The observation space.
+        """
+        pass
+
+    @abstractmethod
+    def reset(
+        self, seed: Optional[int] = None, options: Optional[dict[str, Any]] = None
+    ) -> Tuple[list, dict[str, Any]]:
+        """
+        Reset the environment.
+
+        Args:
+            seed (Optional[int]): Seed for the environment's random number generator.
+            options (Optional[dict]): Additional options for reset.
+
+        Returns:
+            Tuple: A tuple containing:
+                - observation (list): Initial observation.
+                - info (dict): Additional info for debugging.
+        """
+        pass
+
+    @abstractmethod
+    def step(self, action: Any) -> Tuple[list, float, bool, dict]:
+        """
+        Execute one step in the environment.
+
+        Args:
+            action (Any): Action chosen by the agent.
+
+        Returns:
+            Tuple: A tuple containing:
+                - observation (list): Updated system state.
+                - reward (float): Reward for the action taken.
+                - done (bool): Whether the episode is done.
+                - info (dict): Additional info for debugging.
+        """
+        pass
+
+    @abstractmethod
+    def render(self, mode: str = "human"):
+        """
+        Render the current state of the environment.
+
+        Args:
+            mode (str): The mode to render with. Default is "human".
+        """
+        pass
+
+    @abstractmethod
+    def seed(self, seed: Optional[int] = None):
+        """
+        Set the seed for the environment's random number generator.
+
+        Args:
+            seed (Optional[int]): Seed for the environment's random number generator.
+        """
+        pass
diff --git a/src/cloudai/_core/configurator/cloudai_gym.py b/src/cloudai/_core/configurator/cloudai_gym.py
index c012ae79..4205e3f9 100644
--- a/src/cloudai/_core/configurator/cloudai_gym.py
+++ b/src/cloudai/_core/configurator/cloudai_gym.py
@@ -16,44 +16,21 @@
 
 from typing import Any, Dict, Optional, Tuple
 
-import gymnasium as gym
 import numpy as np
-from gymnasium import spaces
-from gymnasium.spaces import Space
 
+from cloudai import System
+from cloudai._core.configurator.base_gym import BaseGym
 from cloudai._core.test_scenario import TestRun, TestScenario
-from cloudai.systems import SlurmSystem
 
 
-class DictSpace(Space):
-    """
-    A custom space that wraps a dictionary of spaces.
-
-    Args:
-        space_dict (Dict[str, Any]): A dictionary of spaces.
-    """
-
-    def __init__(self, space_dict: Dict[str, Any]):
-        self.space_dict = space_dict
-        super().__init__((), None)
-
-    def sample(self, mask: Optional[np.ndarray] = None):
-        # Implement sampling logic if needed
-        pass
-
-    def contains(self, x) -> bool:
-        # Implement containment logic if needed
-        return True
-
-
-class CloudAIGymEnv(gym.Env):
+class CloudAIGymEnv(BaseGym):
     """
     Custom Gym environment for CloudAI integration.
 
     Uses the TestRun object and actual runner methods to execute jobs.
     """
 
-    def __init__(self, test_run: TestRun, system: SlurmSystem, test_scenario: TestScenario):
+    def __init__(self, test_run: TestRun, system: System, test_scenario: TestScenario):
         """
         Initialize the Gym environment using the TestRun object.
 
@@ -62,47 +39,40 @@ def __init__(self, test_run: TestRun, system: SlurmSystem, test_scenario: TestSc
             system (SlurmSystem): The system configuration for running the tests.
             test_scenario (TestScenario): The test scenario configuration.
         """
-        super(CloudAIGymEnv, self).__init__()
         self.test_run = test_run
+        self.system = system
+        self.test_scenario = test_scenario
+        super().__init__()
 
-        self.action_space = self.extract_action_space(self.test_run.test.cmd_args)
-        self.observation_space = self.define_observation_space()
-
-    def extract_action_space(self, cmd_args: dict):
+    def define_action_space(self) -> Dict[str, Any]:
         """
-        Extract the action space from the cmd_args dictionary.
-
-        Args:
-            cmd_args (dict): The command arguments dictionary from the TestRun object.
+        Define the action space for the environment.
 
         Returns:
-            spaces.Dict: A dictionary containing the action space variables and their feasible values.
+            Dict[str, Any]: The action space.
         """
         action_space = {}
-        for key, value in cmd_args.items():
+        for key, value in self.test_run.test.cmd_args.items():
             if isinstance(value, list):
-                action_space[key] = spaces.Discrete(len(value))
+                action_space[key] = len(value)
             elif isinstance(value, dict):
                 for sub_key, sub_value in value.items():
                     if isinstance(sub_value, list):
-                        action_space[f"{key}.{sub_key}"] = spaces.Discrete(len(sub_value))
-        return spaces.Dict(action_space)
+                        action_space[f"{key}.{sub_key}"] = len(sub_value)
+        return action_space
 
-    def define_observation_space(self) -> spaces.Space:
+    def define_observation_space(self) -> list:
         """
         Define the observation space for the environment.
 
         Returns:
-            spaces.Space: The observation space.
+            list: The observation space.
         """
-        return spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
+        return [0.0]
 
     def reset(
-        self,
-        *,
-        seed: Optional[int] = None,
-        options: Optional[dict[str, Any]] = None,
-    ) -> Tuple[np.ndarray, dict[str, Any]]:
+        self, seed: Optional[int] = None, options: Optional[dict[str, Any]] = None
+    ) -> Tuple[list, dict[str, Any]]:
         """
         Reset the environment and reinitialize the TestRun.
 
@@ -112,37 +82,37 @@ def reset(
 
         Returns:
             Tuple: A tuple containing:
-                - observation (np.ndarray): Initial observation.
+                - observation (list): Initial observation.
                 - info (dict): Additional info for debugging.
         """
-        super().reset(seed=seed, options=options)
+        if seed is not None:
+            np.random.seed(seed)
         self.test_run.current_iteration = 0
-        observation = np.array([0.0], dtype=np.float32)
+        observation = [0.0]
         info = {}
         return observation, info
 
-    def step(self, action: np.ndarray) -> tuple:
+    def step(self, action: Any) -> Tuple[list, float, bool, dict]:
         """
         Execute one step in the environment.
 
         Args:
-            action (np.ndarray): Action chosen by the agent.
+            action (Any): Action chosen by the agent.
 
         Returns:
-            tuple: A tuple containing:
-                - observation (np.ndarray): Updated system state.
+            Tuple: A tuple containing:
+                - observation (list): Updated system state.
                 - reward (float): Reward for the action taken.
                 - done (bool): Whether the episode is done.
                 - info (dict): Additional info for debugging.
         """
         observation = self.get_observation(action)
-        reward = 0.0
+        reward = self.compute_reward()
         done = False
         info = {}
-
         return observation, reward, done, info
 
-    def render(self, mode="human"):
+    def render(self, mode: str = "human"):
         """
         Render the current state of the TestRun.
 
@@ -151,6 +121,16 @@ def render(self, mode="human"):
         """
         print(f"Step {self.test_run.current_iteration}: Parameters {self.test_run.test.cmd_args}")
 
+    def seed(self, seed: Optional[int] = None):
+        """
+        Set the seed for the environment's random number generator.
+
+        Args:
+            seed (Optional[int]): Seed for the environment's random number generator.
+        """
+        if seed is not None:
+            np.random.seed(seed)
+
     def compute_reward(self) -> float:
         """
         Compute a reward based on the TestRun result.
@@ -160,12 +140,15 @@ def compute_reward(self) -> float:
         """
         return 0.0
 
-    def get_observation(self, action) -> np.ndarray:
+    def get_observation(self, action: Any) -> list:
         """
         Get the observation from the TestRun object.
 
+        Args:
+            action (Any): Action taken by the agent.
+
         Returns:
-            np.ndarray: A scalar value representing the observation.
+            list: The observation.
         """
         obs = action * 0.5
-        return np.array([obs], dtype=np.float32)
+        return [obs]
diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
index 89a3e605..c4663975 100644
--- a/src/cloudai/cli/handlers.py
+++ b/src/cloudai/cli/handlers.py
@@ -133,10 +133,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
     agent = GridSearchAgent(tr)
     env = CloudAIGymEnv(test_run=tr, system=system, test_scenario=test_scenario)
 
-    # Convert env.action_space to a dictionary
-    action_space_dict = {key: space for key, space in env.action_space.spaces.items()}
-
-    agent.configure(action_space_dict)
+    agent.configure(env.action_space)
 
     for action in agent.get_all_combinations():
         for key, value in action.items():

From e6905f72b3ec862930599267053fb9fa11a38554 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 11:06:23 -0800
Subject: [PATCH 04/21] vulture fix

---
 src/cloudai/_core/configurator/base_gym.py    | 2 +-
 src/cloudai/_core/configurator/cloudai_gym.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cloudai/_core/configurator/base_gym.py b/src/cloudai/_core/configurator/base_gym.py
index b985f46c..fff4ffe5 100644
--- a/src/cloudai/_core/configurator/base_gym.py
+++ b/src/cloudai/_core/configurator/base_gym.py
@@ -48,7 +48,7 @@ def define_observation_space(self) -> list:
 
     @abstractmethod
     def reset(
-        self, seed: Optional[int] = None, options: Optional[dict[str, Any]] = None
+        self, seed: Optional[int] = None, _options: Optional[dict[str, Any]] = None
     ) -> Tuple[list, dict[str, Any]]:
         """
         Reset the environment.
diff --git a/src/cloudai/_core/configurator/cloudai_gym.py b/src/cloudai/_core/configurator/cloudai_gym.py
index 4205e3f9..7569e899 100644
--- a/src/cloudai/_core/configurator/cloudai_gym.py
+++ b/src/cloudai/_core/configurator/cloudai_gym.py
@@ -71,7 +71,7 @@ def define_observation_space(self) -> list:
         return [0.0]
 
     def reset(
-        self, seed: Optional[int] = None, options: Optional[dict[str, Any]] = None
+        self, seed: Optional[int] = None, _options: Optional[dict[str, Any]] = None
     ) -> Tuple[list, dict[str, Any]]:
         """
         Reset the environment and reinitialize the TestRun.

From 177694f693e1fa9a102fc264802c7f0131868e2d Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 11:18:32 -0800
Subject: [PATCH 05/21] remove Farama gym dependencies + update the pytest for
 cloudai_gym

---
 requirements.txt         |  1 -
 tests/test_cloudaigym.py | 51 ++++++++++------------------------------
 2 files changed, 13 insertions(+), 39 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index d9ec54c0..ddaf06e2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-gymnasium @ git+https://github.com/Farama-Foundation/Gymnasium/@v1.0.0a2
 bokeh==3.4.1
 pandas==2.2.1
 tbparse==0.0.8
diff --git a/tests/test_cloudaigym.py b/tests/test_cloudaigym.py
index 2d1e22bc..5049863e 100644
--- a/tests/test_cloudaigym.py
+++ b/tests/test_cloudaigym.py
@@ -1,24 +1,6 @@
-# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 from unittest.mock import MagicMock
 
-import numpy as np
 import pytest
-from gymnasium.spaces import Box, Dict, Discrete
 
 from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
 from cloudai._core.test_scenario import TestRun, TestScenario
@@ -48,32 +30,25 @@ def setup_env():
 def test_action_space_nccl(setup_env):
     test_run, system, test_scenario = setup_env
     env = CloudAIGymEnv(test_run=test_run, system=system, test_scenario=test_scenario)
-    assert isinstance(env.action_space, Dict)
+    action_space = env.define_action_space()
 
-    expected_action_space = Dict(
-        {
-            "iters": Discrete(2),
-            "maxbytes": Discrete(2),
-            "minbytes": Discrete(4),
-            "ngpus": Discrete(1),
-        }
-    )
+    expected_action_space = {
+        "iters": 2,
+        "maxbytes": 2,
+        "minbytes": 4,
+        "ngpus": 1,
+    }
 
-    assert env.action_space.spaces.keys() == expected_action_space.spaces.keys()
-    for key in expected_action_space.spaces:
-        assert isinstance(env.action_space.spaces[key], Discrete)
-        assert isinstance(expected_action_space.spaces[key], Discrete)
-        assert env.action_space.spaces[key].__dict__ == expected_action_space.spaces[key].__dict__
+    assert action_space.keys() == expected_action_space.keys()
+    for key in expected_action_space:
+        assert action_space[key] == expected_action_space[key]
 
 
 def test_observation_space(setup_env):
     test_run, system, test_scenario = setup_env
     env = CloudAIGymEnv(test_run=test_run, system=system, test_scenario=test_scenario)
-    assert isinstance(env.observation_space, Box)
+    observation_space = env.define_observation_space()
 
-    expected_observation_space = Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
+    expected_observation_space = [0.0]
 
-    assert env.observation_space.shape == expected_observation_space.shape
-    assert env.observation_space.dtype == expected_observation_space.dtype
-    assert np.all(env.observation_space.low == expected_observation_space.low)
-    assert np.all(env.observation_space.high == expected_observation_space.high)
+    assert observation_space == expected_observation_space

From b10dbfb98a61b7ce32e512f3beb0685c9fe0cfaa Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 11:20:14 -0800
Subject: [PATCH 06/21] remove Farama gym from pyproject

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2321a82c..5f5a5d77 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,6 @@
 name = "cloudai"
 dynamic = ["version"]
 dependencies = [
-  "gymnasium @ git+https://github.com/Farama-Foundation/Gymnasium/@v1.0.0a2",
   "bokeh==3.4.1",
   "pandas==2.2.1",
   "tbparse==0.0.8",

From 15be693be356c0bfb8a74198cd179d75e3f73060 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 11:22:48 -0800
Subject: [PATCH 07/21] fix the copyright headers checks

---
 tests/test_cloudaigym.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/test_cloudaigym.py b/tests/test_cloudaigym.py
index 5049863e..9585f3f6 100644
--- a/tests/test_cloudaigym.py
+++ b/tests/test_cloudaigym.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from unittest.mock import MagicMock
 
 import pytest

From d5d1e14babc68320dcb00edff44f265ec1f1c18f Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 11:35:51 -0800
Subject: [PATCH 08/21] use iterators to avoid indexing errors.

---
 src/cloudai/cli/handlers.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
index c4663975..ebb5fdd5 100644
--- a/src/cloudai/cli/handlers.py
+++ b/src/cloudai/cli/handlers.py
@@ -128,7 +128,10 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
 
     logging.info(test_scenario.pretty_print())
 
-    tr = test_scenario.test_runs[0]
+    tr = next(iter(test_scenario.test_runs), None)
+    if tr is None:
+        logging.error("No test runs found in the test scenario.")
+        return 1
 
     agent = GridSearchAgent(tr)
     env = CloudAIGymEnv(test_run=tr, system=system, test_scenario=test_scenario)

From 96ab05537d65be12b72980399b759d27c129316a Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 12:13:47 -0800
Subject: [PATCH 09/21] helper method for manipulating the TestRun object
 directly

---
 src/cloudai/cli/handlers.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
index ebb5fdd5..aedcde0f 100644
--- a/src/cloudai/cli/handlers.py
+++ b/src/cloudai/cli/handlers.py
@@ -140,7 +140,7 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
 
     for action in agent.get_all_combinations():
         for key, value in action.items():
-            tr.test.test_definition.cmd_args_dict[key] = value
+            update_nested_attr(tr.test.test_definition.cmd_args, key, value)
         runner = Runner(args.mode, system, test_scenario)
         asyncio.run(runner.run())
 
@@ -158,6 +158,21 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
     return 0
 
 
+def update_nested_attr(obj, attr_path, value):
+    """Update a nested attribute of an object."""
+    attrs = attr_path.split(".")
+    # hot fix. Will be removed after the issue is fixed in the codebase
+    prefix = "Grok"
+    if attrs[0] == prefix:
+        attrs = attrs[1:]
+    for attr in attrs[:-1]:
+        if hasattr(obj, attr):
+            obj = getattr(obj, attr)
+        else:
+            raise AttributeError(f"{type(obj).__name__!r} object has no attribute {attr!r}")
+    setattr(obj, attrs[-1], value)
+
+
 def handle_generate_report(args: argparse.Namespace) -> int:
     """
     Generate a report based on the existing configuration and test results.

From 8cab45096c22a1ec6fdecffff98fe897baf069b5 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 14:56:52 -0800
Subject: [PATCH 10/21] Modifications for storing dse results

---
 src/cloudai/_core/base_runner.py | 23 ++++++++++++-----------
 src/cloudai/cli/handlers.py      |  3 ++-
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/cloudai/_core/base_runner.py b/src/cloudai/_core/base_runner.py
index 6eba2d69..72aeb6d9 100644
--- a/src/cloudai/_core/base_runner.py
+++ b/src/cloudai/_core/base_runner.py
@@ -74,20 +74,19 @@ def __init__(self, mode: str, system: System, test_scenario: TestScenario):
 
     def setup_output_directory(self, base_output_path: Path) -> Path:
         """
-        Set up and return the output directory path for the runner instance.
+        Set up and return the base output directory path for the runner instance.
 
         Args:
             base_output_path (Path): The base output directory.
 
         Returns:
-            Path: The path to the output directory.
+            Path: The path to the base output directory.
         """
-        if not base_output_path.exists():
-            base_output_path.mkdir()
-        current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-        output_subpath = base_output_path / f"{self.test_scenario.name}_{current_time}"
-        output_subpath.mkdir()
-        return output_subpath
+        base_output__path_with_name = base_output_path / self.test_scenario.name
+        if not base_output__path_with_name.exists():
+            base_output__path_with_name = base_output__path_with_name
+            base_output__path_with_name.mkdir(parents=True, exist_ok=True)
+        return base_output__path_with_name
 
     def register_signal_handlers(self):
         """Register signal handlers for handling termination-related signals."""
@@ -264,10 +263,12 @@ def get_job_output_path(self, tr: TestRun) -> Path:
         job_output_path = Path()  # avoid reportPossiblyUnboundVariable from pyright
 
         try:
-            test_output_path = self.output_path / tr.name
-            test_output_path.mkdir()
+            iteration_path = self.output_path / f"iteration_{tr.dse_iteration}"
+            iteration_path.mkdir(parents=True, exist_ok=True)
+            test_output_path = iteration_path / f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" / tr.name
+            test_output_path.mkdir(parents=True, exist_ok=True)
             job_output_path = test_output_path / str(tr.current_iteration)
-            job_output_path.mkdir()
+            job_output_path.mkdir(parents=True, exist_ok=True)
         except PermissionError as e:
             raise PermissionError(f"Cannot create directory {job_output_path}: {e}") from e
 
diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
index aedcde0f..e17596dc 100644
--- a/src/cloudai/cli/handlers.py
+++ b/src/cloudai/cli/handlers.py
@@ -138,7 +138,8 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
 
     agent.configure(env.action_space)
 
-    for action in agent.get_all_combinations():
+    for dse_iteration, action in enumerate(agent.get_all_combinations(), start=1):
+        tr.dse_iteration = dse_iteration
         for key, value in action.items():
             update_nested_attr(tr.test.test_definition.cmd_args, key, value)
         runner = Runner(args.mode, system, test_scenario)

From 0acf43ec89619f472ff8b768cafdda85981195bc Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 15:07:01 -0800
Subject: [PATCH 11/21] add dse_iteration to TestRun object

---
 src/cloudai/_core/test_scenario.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cloudai/_core/test_scenario.py b/src/cloudai/_core/test_scenario.py
index 39f1bd21..99264ade 100644
--- a/src/cloudai/_core/test_scenario.py
+++ b/src/cloudai/_core/test_scenario.py
@@ -53,6 +53,7 @@ class TestRun:
     output_path: Path = Path("")
     iterations: int = 1
     current_iteration: int = 0
+    dse_iteration: int = 0
     time_limit: Optional[str] = None
     sol: Optional[float] = None
     weight: float = 0.0

From 55c203adcb572f0f7b0f20fca22ff8fc6d9943d5 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 19:03:01 -0800
Subject: [PATCH 12/21] safety valve to separate the dse execution from
 benchmarking execution

---
 src/cloudai/cli/handlers.py    | 83 +++++++++++++++++++++++++---------
 tests/test_job_type_handler.py | 39 ++++++++++++++++
 2 files changed, 100 insertions(+), 22 deletions(-)
 create mode 100644 tests/test_job_type_handler.py

diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
index e17596dc..baef723e 100644
--- a/src/cloudai/cli/handlers.py
+++ b/src/cloudai/cli/handlers.py
@@ -84,6 +84,63 @@ def handle_install_and_uninstall(args: argparse.Namespace) -> int:
     return rc
 
 
+def is_dse_job(cmd_args):
+    """
+    Recursively check if any value in cmd_args is a list.
+
+    Args:
+        cmd_args (dict): The command arguments to check.
+
+    Returns:
+        bool: True if any value is a list, False otherwise.
+    """
+    if isinstance(cmd_args, dict):
+        for _key, value in cmd_args.items():
+            if isinstance(value, list) or (isinstance(value, dict) and is_dse_job(value)):
+                return True
+    return False
+
+
+def handle_dse_job(tr, system, test_scenario, args):
+    agent = GridSearchAgent(tr)
+    env = CloudAIGymEnv(test_run=tr, system=system, test_scenario=test_scenario)
+    agent.configure(env.action_space)
+
+    for dse_iteration, action in enumerate(agent.get_all_combinations(), start=1):
+        tr.dse_iteration = dse_iteration
+        for key, value in action.items():
+            update_nested_attr(tr.test.test_definition.cmd_args, key, value)
+        runner = Runner(args.mode, system, test_scenario)
+        asyncio.run(runner.run())
+
+        logging.info(f"All test scenario results stored at: {runner.runner.output_path}")
+
+        if args.mode == "run":
+            generator = ReportGenerator(runner.runner.output_path)
+            generator.generate_report(test_scenario)
+            logging.info(
+                "All test scenario execution attempts are complete. Please review"
+                f" the '{args.log_file}' file to confirm successful completion or to"
+                " identify any issues."
+            )
+
+
+def handle_non_dse_job(tr, system, test_scenario, args):
+    runner = Runner(args.mode, system, test_scenario)
+    asyncio.run(runner.run())
+
+    logging.info(f"All test scenario results stored at: {runner.runner.output_path}")
+
+    if args.mode == "run":
+        generator = ReportGenerator(runner.runner.output_path)
+        generator.generate_report(test_scenario)
+        logging.info(
+            "All test scenario execution attempts are complete. Please review"
+            f" the '{args.log_file}' file to confirm successful completion or to"
+            " identify any issues."
+        )
+
+
 def handle_dry_run_and_run(args: argparse.Namespace) -> int:
     """
     Execute the dry-run or run modes for CloudAI.
@@ -133,28 +190,10 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
         logging.error("No test runs found in the test scenario.")
         return 1
 
-    agent = GridSearchAgent(tr)
-    env = CloudAIGymEnv(test_run=tr, system=system, test_scenario=test_scenario)
-
-    agent.configure(env.action_space)
-
-    for dse_iteration, action in enumerate(agent.get_all_combinations(), start=1):
-        tr.dse_iteration = dse_iteration
-        for key, value in action.items():
-            update_nested_attr(tr.test.test_definition.cmd_args, key, value)
-        runner = Runner(args.mode, system, test_scenario)
-        asyncio.run(runner.run())
-
-        logging.info(f"All test scenario results stored at: {runner.runner.output_path}")
-
-        if args.mode == "run":
-            generator = ReportGenerator(runner.runner.output_path)
-            generator.generate_report(test_scenario)
-            logging.info(
-                "All test scenario execution attempts are complete. Please review"
-                f" the '{args.log_file}' file to confirm successful completion or to"
-                " identify any issues."
-            )
+    if is_dse_job(tr.test.cmd_args):
+        handle_dse_job(tr, system, test_scenario, args)
+    else:
+        handle_non_dse_job(tr, system, test_scenario, args)
 
     return 0
 
diff --git a/tests/test_job_type_handler.py b/tests/test_job_type_handler.py
new file mode 100644
index 00000000..1053ed14
--- /dev/null
+++ b/tests/test_job_type_handler.py
@@ -0,0 +1,39 @@
+from cloudai.cli.handlers import is_dse_job
+
+# Mock data for testing
+mock_toml_dse = {
+    "test": {
+        "cmd_args": {
+            "docker_image_url": "https://docker/fake_url",
+            "load_container": True,
+            "FakeConfig": {
+                "policy": ["option1", "option2"],
+                "shape": "[1, 2, 3, 4]",
+                "dtype": "fake_type",
+                "mesh_shape": "[4, 3, 2, 1]",
+            },
+        }
+    }
+}
+
+mock_toml_non_dse = {
+    "test": {
+        "cmd_args": {
+            "docker_image_url": "https://docker/fake_url",
+            "load_container": True,
+            "FakeConfig": {
+                "policy": "option1",
+                "shape": "[1, 2, 3, 4]",
+                "dtype": "fake_type",
+            },
+        }
+    }
+}
+
+
+def test_is_dse_job_dse():
+    assert is_dse_job(mock_toml_dse["test"]["cmd_args"])
+
+
+def test_is_dse_job_non_dse():
+    assert not is_dse_job(mock_toml_non_dse["test"]["cmd_args"])

From be56e31d70d23eafe5a7739bd5b617e0b8891bc7 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 19:07:06 -0800
Subject: [PATCH 13/21] add copyright headers

---
 tests/test_job_type_handler.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_job_type_handler.py b/tests/test_job_type_handler.py
index 1053ed14..11c1fcc4 100644
--- a/tests/test_job_type_handler.py
+++ b/tests/test_job_type_handler.py
@@ -1,3 +1,19 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from cloudai.cli.handlers import is_dse_job
 
 # Mock data for testing

From 71a12c48bee25adfaae95829ffbff9da48deb3f3 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 19:07:27 -0800
Subject: [PATCH 14/21] remove redundant mock-data comment

---
 tests/test_job_type_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_job_type_handler.py b/tests/test_job_type_handler.py
index 11c1fcc4..5e29d30b 100644
--- a/tests/test_job_type_handler.py
+++ b/tests/test_job_type_handler.py
@@ -16,7 +16,7 @@
 
 from cloudai.cli.handlers import is_dse_job
 
-# Mock data for testing
+
 mock_toml_dse = {
     "test": {
         "cmd_args": {

From 669bd8df4c5a659b2bacf8f47bb8bf1dfac9a704 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Sat, 11 Jan 2025 19:09:56 -0800
Subject: [PATCH 15/21] remove empty line

---
 tests/test_job_type_handler.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_job_type_handler.py b/tests/test_job_type_handler.py
index 5e29d30b..61eef1c0 100644
--- a/tests/test_job_type_handler.py
+++ b/tests/test_job_type_handler.py
@@ -16,7 +16,6 @@
 
 from cloudai.cli.handlers import is_dse_job
 
-
 mock_toml_dse = {
     "test": {
         "cmd_args": {

From 02bb3e0d5ec57859bbd57cca462a7b4052f96662 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Mon, 13 Jan 2025 23:02:35 -0800
Subject: [PATCH 16/21] Remove the agent's configuration and instead query it
 from the environment.

---
 .../_core/configurator/agents/base_agent.py   | 42 ++-----------------
 .../_core/configurator/agents/grid_search.py  | 10 +++--
 2 files changed, 9 insertions(+), 43 deletions(-)

diff --git a/src/cloudai/_core/configurator/agents/base_agent.py b/src/cloudai/_core/configurator/agents/base_agent.py
index cdb555ad..a62717c1 100644
--- a/src/cloudai/_core/configurator/agents/base_agent.py
+++ b/src/cloudai/_core/configurator/agents/base_agent.py
@@ -17,8 +17,6 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict
 
-from cloudai._core.test_scenario import TestRun
-
 
 class BaseAgent(ABC):
     """
@@ -28,48 +26,14 @@ class BaseAgent(ABC):
     Automatically infers parameter types from TestRun's cmd_args.
     """
 
-    def __init__(self, test_run: TestRun):
+    def __init__(self, action_space: Dict[str, Any]):
         """
         Initialize the agent with the TestRun object.
 
         Args:
-            test_run (TestRun): The TestRun object containing cmd_args and test state.
-        """
-        self.test_run = test_run
-        self.action_space = self.extract_action_space(test_run.test.cmd_args)
-
-    def extract_action_space(self, cmd_args: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Extract the action space from cmd_args by inferring parameter types.
-
-        Args:
-            cmd_args (Dict[str, Any]): The command arguments from TestRun.
-
-        Returns:
-            Dict[str, Any]: Action space defined with inferred parameter types.
+            action_space (Dict[str, Any]): The action space for the agent.
         """
-        action_space = {}
-
-        for key, value in cmd_args.items():
-            self._process_value(action_space, key, value)
-
-        return action_space
-
-    def _process_value(self, action_space: Dict[str, Any], key: str, value: Any) -> None:
-        if isinstance(value, list):
-            self._process_list(action_space, key, value)
-        elif isinstance(value, dict):
-            for sub_key, sub_value in value.items():
-                full_key = f"{key}.{sub_key}"
-                self._process_value(action_space, full_key, sub_value)
-
-    def _process_list(self, action_space: Dict[str, Any], key: str, value: list) -> None:
-        if all(isinstance(v, int) for v in value):
-            action_space[key] = {"type": "int", "values": value}
-        elif all(isinstance(v, float) for v in value):
-            action_space[key] = {"type": "float", "values": value}
-        else:
-            action_space[key] = {"type": "categorical", "categories": value}
+        self.action_space = action_space
 
     @abstractmethod
     def configure(self, config: Dict[str, Any]) -> None:
diff --git a/src/cloudai/_core/configurator/agents/grid_search.py b/src/cloudai/_core/configurator/agents/grid_search.py
index 2ab1b36b..6775948d 100644
--- a/src/cloudai/_core/configurator/agents/grid_search.py
+++ b/src/cloudai/_core/configurator/agents/grid_search.py
@@ -18,7 +18,7 @@
 from typing import Any, Dict, List
 
 from cloudai._core.configurator.agents.base_agent import BaseAgent
-from cloudai._core.test_scenario import TestRun
+from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
 
 
 class GridSearchAgent(BaseAgent):
@@ -28,14 +28,16 @@ class GridSearchAgent(BaseAgent):
     Iterates through all possible parameter combinations.
     """
 
-    def __init__(self, test_run: TestRun):
+    def __init__(self, env: CloudAIGymEnv):
         """
         Initialize the GridSearchAgent with the TestRun object.
 
         Args:
-            test_run (TestRun): The TestRun object containing cmd_args and test state.
+             env (CloudAIGymEnv): The environment instance to query the action space from.
         """
-        super().__init__(test_run)
+        self.action_space = env.define_action_space()
+        super().__init__(self.action_space)
+        self.env = env
         self.action_combinations = []
         self.index = 0
 

From 382e424c1aec849b42fa338092544cb2b7a7a09d Mon Sep 17 00:00:00 2001
From: itamar-rauch <irauch@nvidia.com>
Date: Mon, 13 Jan 2025 15:43:06 +0200
Subject: [PATCH 17/21] Fixed typo in NeMoRunTestDefinition docstr (#336)

---
 src/cloudai/test_definitions/nemo_run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cloudai/test_definitions/nemo_run.py b/src/cloudai/test_definitions/nemo_run.py
index fa424672..b5db1a03 100644
--- a/src/cloudai/test_definitions/nemo_run.py
+++ b/src/cloudai/test_definitions/nemo_run.py
@@ -29,7 +29,7 @@ class NeMoRunCmdArgs(CmdArgs):
 
 
 class NeMoRunTestDefinition(TestDefinition):
-    """Test object for NeMoLauncher."""
+    """Test object for NeMoRun."""
 
     cmd_args: NeMoRunCmdArgs
     _docker_image: Optional[DockerImage] = None

From e16873de7be223979f1c6502f4793fea57f63d72 Mon Sep 17 00:00:00 2001
From: Taekyung Heo <theo@nvidia.com>
Date: Mon, 13 Jan 2025 22:07:45 -0500
Subject: [PATCH 18/21] Mount NCCL_TOPO_FILE in NCCL test (#337)

---
 .../test_template/nccl_test/slurm_command_gen_strategy.py  | 7 +++----
 .../test_nccl_slurm_command_gen_strategy.py                | 6 +++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py
index 3157c5d8..f26f214c 100644
--- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py
+++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py
@@ -31,10 +31,9 @@ def _parse_slurm_args(
         base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr)
 
         container_mounts = ""
-        if "NCCL_TOPO_FILE" in env_vars and "DOCKER_NCCL_TOPO_FILE" in env_vars:
-            nccl_graph_path = Path(env_vars["NCCL_TOPO_FILE"]).resolve()
-            nccl_graph_file = env_vars["DOCKER_NCCL_TOPO_FILE"]
-            container_mounts = f"{nccl_graph_path}:{nccl_graph_file}"
+        if "NCCL_TOPO_FILE" in env_vars:
+            nccl_topo_file = Path(env_vars["NCCL_TOPO_FILE"]).resolve()
+            container_mounts = f"{nccl_topo_file}:{nccl_topo_file}"
         elif "NCCL_TOPO_FILE" in env_vars:
             del env_vars["NCCL_TOPO_FILE"]
 
diff --git a/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py
index c32ede71..33dd60c7 100644
--- a/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py
+++ b/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py
@@ -35,12 +35,12 @@ def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NcclTestSlurmCommandGen
         [
             (
                 "nccl_test",
-                {"NCCL_TOPO_FILE": "/path/to/topo", "DOCKER_NCCL_TOPO_FILE": "/docker/topo"},
+                {"NCCL_TOPO_FILE": "/path/to/topo"},
                 {"subtest_name": "all_reduce_perf", "docker_image_url": "fake_image_url"},
                 2,
                 ["node1", "node2"],
                 {
-                    "container_mounts": "/path/to/topo:/docker/topo",
+                    "container_mounts": "/path/to/topo:/path/to/topo",
                 },
             ),
             (
@@ -50,7 +50,7 @@ def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NcclTestSlurmCommandGen
                 1,
                 ["node1"],
                 {
-                    "container_mounts": "",
+                    "container_mounts": "/path/to/topo:/path/to/topo",
                 },
             ),
         ],

From fd3b6c90fcf34491737e5f1da938339be9873421 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Mon, 13 Jan 2025 23:09:23 -0800
Subject: [PATCH 19/21] Fix the testing code

---
 tests/test_agents.py | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/tests/test_agents.py b/tests/test_agents.py
index 075664bd..71963040 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -13,39 +13,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from unittest.mock import MagicMock
 
 import pytest
 
 from cloudai._core.configurator.agents.grid_search import GridSearchAgent
-from cloudai._core.test_scenario import TestRun
+from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
 
 
 @pytest.fixture
-def mock_test_run():
+def mock_env():
     """
-    Fixture to provide a mock TestRun object for testing.
+    Fixture to provide a mock CloudAIGymEnv object for testing.
     """
-    test_run = MagicMock(spec=TestRun)
-    test_run.test = MagicMock()
-    test_run.test.cmd_args = {
-        "docker_image_url": "https://docker/url",
-        "iters": [10, 100],
-        "maxbytes": [1024, 2048],
-        "minbytes": [512, 1024, 2048, 4096],
-        "ngpus": [4],
-        "subtest_name": "nccl_test",
-        "warmup_iters": 5,
+    env = MagicMock(spec=CloudAIGymEnv)
+    env.define_action_space.return_value = {
+        "iters": {"type": "int", "values": [10, 100]},
+        "maxbytes": {"type": "int", "values": [1024, 2048]},
+        "minbytes": {"type": "int", "values": [512, 1024, 2048, 4096]},
+        "ngpus": {"type": "int", "values": [4]},
     }
-    return test_run
+    return env
 
 
-def test_grid_search_agent(mock_test_run):
+def test_grid_search_agent(mock_env):
     """
     Test the GridSearchAgent's ability to traverse the action space.
     """
-    agent = GridSearchAgent(mock_test_run)
+    agent = GridSearchAgent(mock_env)
     agent.configure(config={})
 
     combinations = agent.get_all_combinations()

From 9fadfce6b3fe16fd8455d4dd54ff8b8f05eb138c Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Tue, 14 Jan 2025 09:55:05 -0800
Subject: [PATCH 20/21] fix the configurator structure

---
 src/cloudai/_core/configurator/{agents => }/base_agent.py  | 0
 src/cloudai/_core/configurator/{agents => }/grid_search.py | 2 +-
 src/cloudai/cli/handlers.py                                | 2 +-
 tests/test_agents.py                                       | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename src/cloudai/_core/configurator/{agents => }/base_agent.py (100%)
 rename src/cloudai/_core/configurator/{agents => }/grid_search.py (98%)

diff --git a/src/cloudai/_core/configurator/agents/base_agent.py b/src/cloudai/_core/configurator/base_agent.py
similarity index 100%
rename from src/cloudai/_core/configurator/agents/base_agent.py
rename to src/cloudai/_core/configurator/base_agent.py
diff --git a/src/cloudai/_core/configurator/agents/grid_search.py b/src/cloudai/_core/configurator/grid_search.py
similarity index 98%
rename from src/cloudai/_core/configurator/agents/grid_search.py
rename to src/cloudai/_core/configurator/grid_search.py
index 6775948d..8b47d00c 100644
--- a/src/cloudai/_core/configurator/agents/grid_search.py
+++ b/src/cloudai/_core/configurator/grid_search.py
@@ -17,7 +17,7 @@
 import itertools
 from typing import Any, Dict, List
 
-from cloudai._core.configurator.agents.base_agent import BaseAgent
+from cloudai._core.configurator.base_agent import BaseAgent
 from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
 
 
diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py
index baef723e..382cc410 100644
--- a/src/cloudai/cli/handlers.py
+++ b/src/cloudai/cli/handlers.py
@@ -22,8 +22,8 @@
 from unittest.mock import Mock
 
 from cloudai import Installable, Parser, Registry, ReportGenerator, Runner, System
-from cloudai._core.configurator.agents.grid_search import GridSearchAgent
 from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
+from cloudai._core.configurator.grid_search import GridSearchAgent
 
 from ..parser import HOOK_ROOT
 
diff --git a/tests/test_agents.py b/tests/test_agents.py
index 71963040..ce920a10 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -17,7 +17,7 @@
 
 import pytest
 
-from cloudai._core.configurator.agents.grid_search import GridSearchAgent
+from cloudai._core.configurator.grid_search import GridSearchAgent
 from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
 
 

From 4cb3f0248c6142433a64d53d9d34f92208398d76 Mon Sep 17 00:00:00 2001
From: Srivatsan Krishnan <srivatsank@nvidia.com>
Date: Tue, 14 Jan 2025 16:45:39 -0800
Subject: [PATCH 21/21] update the test_agent

---
 tests/test_agents.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_agents.py b/tests/test_agents.py
index ce920a10..2e13bc6e 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -17,8 +17,8 @@
 
 import pytest
 
-from cloudai._core.configurator.grid_search import GridSearchAgent
 from cloudai._core.configurator.cloudai_gym import CloudAIGymEnv
+from cloudai._core.configurator.grid_search import GridSearchAgent
 
 
 @pytest.fixture