Double cartpole #93

Open
wants to merge 9 commits into base: double-cartpole
28 changes: 28 additions & 0 deletions examples/double_cartpole/README.md
@@ -0,0 +1,28 @@
# DoubleCartPole

This example extends the classic [CartPole](https://gymnasium.farama.org/environments/classic_control/cart_pole/) problem by simulating a Double CartPole environment in [Webots](https://cyberbotics.com): two cart-pole robots interact in a shared world, showcasing a multi-agent reinforcement learning approach.

[PyTorch](https://pytorch.org/) is used as the backend neural network library.

The solution uses discrete action spaces, with each cart-pole agent applying discrete forces to balance its pole. The provided implementation uses a custom [Proximal Policy Optimization (PPO) reinforcement learning (RL) algorithm](https://openai.com/blog/openai-baselines-ppo/) and the emitter-receiver scheme ([supervisor script](./controllers/supervisor_manager/supervisor_controller.py), [robot script](./controllers/robot_controller/robot_controller.py)).
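
At its core, PPO constrains each policy update with a clipped surrogate objective, so the updated policy cannot move too far from the one that collected the data. The snippet below is a minimal, illustrative PyTorch sketch of that objective only, not the code from the [custom PPO agent](./controllers/supervisor_manager/agent/PPO_agent.py); the function name and its arguments (`new_log_probs`, `old_log_probs`, `advantages`, `clip_param`) are placeholders chosen for this sketch.

```python
import torch


def ppo_actor_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    # Probability ratio between the updated policy and the policy that collected the data
    ratio = torch.exp(new_log_probs - old_log_probs)
    # Unclipped and clipped surrogate objectives
    surrogate = ratio * advantages
    clipped_surrogate = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # PPO maximizes the minimum of the two, so the loss to minimize is its negation
    return -torch.min(surrogate, clipped_surrogate).mean()
```

The `clip_param=0.2` default above matches the value passed to the agents in [PPO_runner.py](./controllers/supervisor_manager/PPO_runner.py).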

This setup highlights the flexibility of the deepbots framework in multi-agent environments.
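
To make the emitter-receiver scheme more concrete, the sketch below shows a hypothetical supervisor-side exchange: the supervisor sends each cart's discrete action on that robot's own channel and reads back the pole angle the robot reports. The device names (`emitter`, `receiver`), the 32 ms time step, and the helper functions are assumptions made for illustration; the actual logic lives in the [supervisor script](./controllers/supervisor_manager/supervisor_controller.py), and the robot side is shown in the [robot script](./controllers/robot_controller/robot_controller.py).

```python
from controller import Supervisor

TIME_STEP = 32  # assumed basic time step in milliseconds

supervisor = Supervisor()
emitter = supervisor.getDevice("emitter")
receiver = supervisor.getDevice("receiver")
receiver.enable(TIME_STEP)


def send_action(robot_index, action):
    # Each robot listens on the channel matching the last digit of its name,
    # so the supervisor selects that channel before sending the discrete action.
    emitter.setChannel(robot_index)
    emitter.send(str(action))  # newer Webots versions accept a str; older ones need bytes


def read_pole_angle(robot_index):
    # The robot replies with its pole angle (in radians) as a plain string.
    receiver.setChannel(robot_index)
    if receiver.getQueueLength() > 0:
        angle = float(receiver.getString())
        receiver.nextPacket()
        return angle
    return None
```

Using one channel per robot keeps the two cart-pole agents' messages from interfering with each other.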

You can find the corresponding .wbt world files to open in Webots [here](./worlds/).

----

### Contents
- [supervisor](./controllers/supervisor_manager/supervisor_controller.py), [robot](./controllers/robot_controller/), [custom PPO](./controllers/supervisor_manager/agent/PPO_agent.py)

----

### Showcase of trained PPO agents

Trained agents in action:

![image](./doc/double_cartpole_trained.gif)

Reward per episode plot:

![image](./doc/reward.png)
Empty file.
99 changes: 99 additions & 0 deletions examples/double_cartpole/controllers/robot_controller/robot_controller.py
@@ -0,0 +1,99 @@
from deepbots.robots.controllers.csv_robot import CSVRobot


class CartPoleRobot(CSVRobot):
"""
    The CartPole robot has 4 wheels and a pole connected to its body by an unactuated hinge.
    The hinge holds a Position Sensor device that measures the pole's angle from the vertical, which is needed for the observation.
Hinge: https://cyberbotics.com/doc/reference/hingejoint
Position Sensor: https://cyberbotics.com/doc/reference/positionsensor
"""
def __init__(self):
"""
        The constructor gets the Position Sensor reference, enables it, and initializes the wheels.
"""
super().__init__()

self.pole_position = self.getDevice("polePosSensor")
self.pole_position.enable(self.timestep)

self.wheels = [None for _ in range(4)]
self.setup_motors()

    def initialize_comms(self,
                         emitter_name="emitter",
                         receiver_name="receiver"):
        """
        Sets up the emitter and receiver devices so that each robot communicates
        with the supervisor on its own channel, derived from the last character
        of the robot's name.
        :return: The initialized emitter and receiver devices.
        :rtype: tuple
        """
        emitter = self.getDevice(emitter_name)
        receiver = self.getDevice(receiver_name)
        receiver.enable(self.timestep)
        emitter.setChannel(int(self.getName()[-1]))
        receiver.setChannel(int(self.getName()[-1]))

        return emitter, receiver

def setup_motors(self):
"""
This method initializes the four wheels, storing the references inside a list and setting the starting
positions and velocities.
"""
self.wheels[0] = self.getDevice('wheel1')
self.wheels[1] = self.getDevice('wheel2')
self.wheels[2] = self.getDevice('wheel3')
self.wheels[3] = self.getDevice('wheel4')
for i in range(len(self.wheels)):
self.wheels[i].setPosition(float('inf'))
self.wheels[i].setVelocity(0.0)

def create_message(self):
"""
        This method packs the robot's observation, i.e. the Position Sensor value (the pole's angle from the
        vertical in radians), into a string to be sent to the supervisor.
        From Webots documentation:
        'The getValue function returns the most recent value measured by the specified position sensor. Depending on
        the type, it will return a value in radians (angular position sensor) or in meters (linear position sensor).'
        :return: A string containing the Position Sensor value.
        :rtype: str
"""
return str(self.pole_position.getValue())

def handle_receiver(self):
"""
        Modified handle_receiver from the base deepbots implementation.
        This version consumes all messages available in the queue during the step in which it is called.
"""
while self.receiver.getQueueLength() > 0:
# Receive and decode message from supervisor
message = self.receiver.getString()
# Convert string message into a list
message = message.split(",")

self.use_message_data(message)

self.receiver.nextPacket()

def use_message_data(self, message):
"""
This method unpacks the supervisor's message, which contains the next action to be executed by the robot.
In this case it contains an integer denoting the action, either 0 or 1, with 0 being forward and
1 being backward movement. The corresponding motor_speed value is applied to the wheels.
:param message: The message the supervisor sent containing the next action.
:type message: list of strings
"""
action = int(message[0])

assert action == 0 or action == 1, "CartPoleRobot controller got incorrect action value: " + str(
action)

if action == 0:
motor_speed = 5.0
else:
motor_speed = -5.0

for i in range(len(self.wheels)):
self.wheels[i].setPosition(float('inf'))
self.wheels[i].setVelocity(motor_speed)


# Create the robot controller object and run it
robot_controller = CartPoleRobot()
robot_controller.run()
177 changes: 177 additions & 0 deletions examples/double_cartpole/controllers/supervisor_manager/PPO_runner.py
@@ -0,0 +1,177 @@
import numpy as np

from agent.PPO_agent import PPOAgent, Transition
from supervisor_controller import DoubleCartPoleSupervisor
from utilities import plot_data

# Change these variables if needed
EPISODE_LIMIT = 10000
STEPS_PER_EPISODE = 200
NUM_ROBOTS = 2

def run(full_space=True):
"""
    Performs the training of the PPO agents and then deploys the trained agents to run in an infinite loop.

Also plots the training results and prints progress during training.

:param full_space: Toggle between providing each agent with the full observation space or only its own cart's data.

- When True, each agent receives the full observation space, including the other cart's data:
[x_cart1, v_cart1, theta_pole1, v_pole1, x_cart2, v_cart2, theta_pole2, v_pole2].
    - When False, each agent receives only its own cart's data: [x_cart, v_cart, theta_pole, v_pole]. Defaults to True.
:type full_space: bool
"""
# Initialize supervisor object
supervisor = DoubleCartPoleSupervisor(num_robots=NUM_ROBOTS)
    # Determine the dimensionality of the observation space each agent is fed, based on the `full_space` parameter
agent_obs_dim = (
supervisor.observation_space.shape[0] if full_space
else supervisor.observation_space.shape[0] // supervisor.num_robots
)
# The agents used here are trained with the PPO algorithm (https://arxiv.org/abs/1707.06347).
agent_1 = PPOAgent(
agent_obs_dim,
supervisor.action_space.n,
clip_param=0.2,
max_grad_norm=0.5,
ppo_update_iters=5,
batch_size=8,
gamma=0.99,
use_cuda=False,
actor_lr=0.001,
critic_lr=0.003,
)
agent_2 = PPOAgent(
agent_obs_dim,
supervisor.action_space.n,
clip_param=0.2,
max_grad_norm=0.5,
ppo_update_iters=5,
batch_size=8,
gamma=0.99,
use_cuda=False,
actor_lr=0.001,
critic_lr=0.003,
)
agents = [agent_1, agent_2]

episode_count = 0
solved = False # Whether the solved requirement is met

# Run outer loop until the episodes limit is reached or the task is solved
while not solved and episode_count < EPISODE_LIMIT:
state = supervisor.reset() # Reset robots and get starting observation
supervisor.episode_score = 0
action_probs = []
# Inner loop is the episode loop
for step in range(STEPS_PER_EPISODE):
# In training mode the agent samples from the probability distribution, naturally implementing exploration
selected_actions, action_probs = [], []
for i in range(NUM_ROBOTS):
if full_space:
agent_state = state
else:
agent_state = state[(i * agent_obs_dim): ((i + 1) * agent_obs_dim)]
selected_action, action_prob = agents[i].work(
agent_state,
type_="selectAction"
)
action_probs.append(action_prob)
selected_actions.append(selected_action)

# Step the supervisor to get the current selected_action reward, the new state and whether we reached the
# done condition
new_state, reward, done, info = supervisor.step(
[*selected_actions]
)

# Save the current state transitions from all robots in agent's memory
for i in range(NUM_ROBOTS):
if full_space:
agent_state = state
agent_new_state = new_state
else:
agent_state = state[(i * agent_obs_dim): ((i + 1) * agent_obs_dim)]
agent_new_state = new_state[(i * agent_obs_dim): ((i + 1) * agent_obs_dim)]
agents[i].store_transition(
Transition(
agent_state,
selected_actions[i],
action_probs[i],
reward[i],
agent_new_state,
)
)
# Accumulate episode reward
supervisor.episode_score += np.array(reward)

if done:
# Save the episode's score
supervisor.episode_score_list.append(
supervisor.episode_score
)
# Perform a training step
for i in range(NUM_ROBOTS):
agents[i].train_step(batch_size=step + 1)
# Check whether the task is solved
solved = supervisor.solved()
break

state = new_state # state for next step is current step's new_state

# The average action probability tells us how confident the agent was of its actions.
# By looking at this we can check whether the agent is converging to a certain policy.
avg_action_prob = [
round(np.mean(action_probs[i]), 4) for i in range(NUM_ROBOTS)
]
print(
f"Episode: {episode_count} Score = {supervisor.episode_score} | Average Action Probabilities = {avg_action_prob}"
)

episode_count += 1 # Increment episode counter

    # Plot the moving average (window of 10 episodes) of the first robot's episode scores
    moving_avg_n = 10
    plot_data(
np.convolve(
np.array(supervisor.episode_score_list).T[0],
np.ones((moving_avg_n,)) / moving_avg_n,
mode='valid',
),
"episode",
"episode score",
"Episode scores over episodes", save=True, save_name="reward.png"
)

    if not solved:
        print("Reached episode limit and task was not solved, deploying agents for testing...")
    else:
        print("Task is solved, deploying agents for testing...")

state = supervisor.reset()
supervisor.episode_score = 0
while True:
selected_actions = []
action_probs = []
for i in range(NUM_ROBOTS):
if full_space:
agent_state = state
else:
agent_state = state[(i * agent_obs_dim): ((i + 1) * agent_obs_dim)]
selected_action, action_prob = agents[i].work(
agent_state, # state[0:4] for 1st robot, state[4:8] for 2nd robot
type_="selectAction"
)
action_probs.append(action_prob)
selected_actions.append(selected_action)

state, reward, done, _ = supervisor.step(selected_actions)
supervisor.episode_score += np.array(reward) # Accumulate episode reward

if done:
print(f"Reward accumulated = {supervisor.episode_score}")
supervisor.episode_score = 0
state = supervisor.reset()