From 5568db5129eefe7b7abe0209c984868b33a34476 Mon Sep 17 00:00:00 2001
From: johnjim0816
Date: Tue, 11 Jun 2024 23:43:09 +0800
Subject: [PATCH] [feat_dev] try to use independ_actor

---
 joyrl/algos/PPO/config.py                     |   5 +-
 joyrl/algos/PPO/data_handler.py               |  14 +-
 joyrl/algos/PPO/policy.py                     |  22 ++--
 joyrl/algos/base/data_handler.py              |  11 +-
 joyrl/algos/base/network.py                   | 123 ++++++++----------
 joyrl/framework/collector.py                  |   4 +-
 .../CartPole-v1/CartPole-v1_PPO-KL_Test.yaml  |  32 -----
 .../CartPole-v1/CartPole-v1_PPO-KL_Train.yaml |  32 -----
 .../CartPole-v1/CartPole-v1_PPO.yaml          |  20 +--
 .../CartPole-v1_PPO_off_policy.yaml           |  10 +-
 10 files changed, 112 insertions(+), 161 deletions(-)
 delete mode 100644 presets/ClassControl/CartPole-v1/CartPole-v1_PPO-KL_Test.yaml
 delete mode 100644 presets/ClassControl/CartPole-v1/CartPole-v1_PPO-KL_Train.yaml

diff --git a/joyrl/algos/PPO/config.py b/joyrl/algos/PPO/config.py
index e5686ac..584c2d6 100644
--- a/joyrl/algos/PPO/config.py
+++ b/joyrl/algos/PPO/config.py
@@ -5,12 +5,12 @@
 Email: johnjim0816@gmail.com
 Date: 2023-02-20 21:53:39
 LastEditor: JiangJi
-LastEditTime: 2024-06-03 13:38:24
+LastEditTime: 2024-06-11 23:34:11
 Discription: 
 '''
 class AlgoConfig(object):
     def __init__(self):
-        self.independ_actor = True # whether to use independent actor
+        self.independ_actor = False # whether to use independent actor
         # whether actor and critic share the same optimizer
         self.ppo_type = 'clip' # clip or kl
         self.eps_clip = 0.2 # clip parameter for PPO
@@ -21,6 +21,7 @@ def __init__(self):
         self.kl_beta = 1.5 # beta for KL penalty, 1.5 is the default value in the paper
         self.kl_alpha = 2 # alpha for KL penalty, 2 is the default value in the paper
         self.action_type_list = "continuous" # continuous action space
+        self.return_form = 'mc' # 'mc' or 'td' or 'gae'
         self.gamma = 0.99 # discount factor
         self.k_epochs = 4 # update policy for K epochs
         self.lr = 0.0001 # for shared optimizer
diff --git a/joyrl/algos/PPO/data_handler.py b/joyrl/algos/PPO/data_handler.py
index f4ea780..bf40b2d 100644
--- a/joyrl/algos/PPO/data_handler.py
+++ b/joyrl/algos/PPO/data_handler.py
@@ -5,7 +5,7 @@
 Email: johnjim0816@gmail.com
 Date: 2023-05-17 01:08:36
 LastEditor: JiangJi
-LastEditTime: 2024-06-05 14:32:55
+LastEditTime: 2024-06-11 19:59:16
 Discription: 
 '''
 import numpy as np
@@ -18,7 +18,7 @@ def __init__(self, cfg):
         self.gae_lambda = getattr(self.cfg, 'gae_lambda', 0.95)
         self.gamma = getattr(self.cfg, 'gamma', 0.95)
         self.batch_exps = []
-        
+
     def handle_exps_after_interact(self, exps):
         exp_len = self._get_exp_len(exps)
         next_value = exps[-1].value
@@ -65,8 +65,14 @@ def _handle_exps_before_train(self, exps: list):
         log_probs = [exp.log_prob.detach().cpu().numpy().item() for exp in exps]
         # log_probs = torch.cat(log_probs, dim=0).detach() # [batch_size,1]
         # log_probs = torch.tensor(log_probs, dtype = torch.float32, device = self.cfg.device).unsqueeze(dim=1)
-
-        returns = np.array([exp.return_mc_normed for exp in exps])
+        if self.cfg.return_form.lower() == 'mc':
+            returns = np.array([exp.return_mc_normed for exp in exps])
+        elif self.cfg.return_form.lower() == 'td':
+            returns = np.array([exp.normed_return_td for exp in exps])
+        elif self.cfg.return_form.lower() == 'gae':
+            returns = np.array([exp.normed_return_gae for exp in exps])
+        else:
+            raise NotImplementedError("return_form not implemented")
         # returns = torch.tensor(returns, dtype = torch.float32, device = self.cfg.device).unsqueeze(dim=1)
         self.data_after_train.update({'log_probs': log_probs, 'returns': returns})
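Note on the new `return_form` switch: `_handle_exps_before_train` only chooses which precomputed field to read (`return_mc_normed`, `normed_return_td`, `normed_return_gae`); the fields themselves are filled in `handle_exps_after_interact`. A standalone sketch of what the three options typically compute, where the formulas and the z-score normalization are assumptions rather than code copied from joyrl:

```python
import numpy as np

def compute_returns(rewards, values, next_value, dones,
                    gamma=0.95, gae_lambda=0.95, form='gae'):
    # Sketch of the three return_form options; not joyrl's exact implementation.
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)
    if form == 'mc':
        # Monte Carlo return: discounted sum of future rewards
        returns, running = np.zeros_like(rewards), 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running * (1.0 - dones[t])
            returns[t] = running
    elif form == 'td':
        # one-step TD target: r + gamma * V(s')
        next_values = np.append(values[1:], next_value)
        returns = rewards + gamma * next_values * (1.0 - dones)
    elif form == 'gae':
        # GAE advantage plus the value baseline
        next_values = np.append(values[1:], next_value)
        deltas = rewards + gamma * next_values * (1.0 - dones) - values
        advantages, running = np.zeros_like(rewards), 0.0
        for t in reversed(range(len(rewards))):
            running = deltas[t] + gamma * gae_lambda * running * (1.0 - dones[t])
            advantages[t] = running
        returns = advantages + values
    else:
        raise NotImplementedError("return_form not implemented")
    return (returns - returns.mean()) / (returns.std() + 1e-8)  # assumed normalization
```

Since `learn()` computes advantages as `returns - values.detach()`, the choice of `return_form` also decides whether that difference behaves like a Monte Carlo advantage, a TD error, or a GAE estimate.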
diff --git a/joyrl/algos/PPO/policy.py b/joyrl/algos/PPO/policy.py
index 1e829c5..5900ec7 100644
--- a/joyrl/algos/PPO/policy.py
+++ b/joyrl/algos/PPO/policy.py
@@ -5,7 +5,7 @@
 Email: johnjim0816@gmail.com
 Date: 2023-12-22 23:02:13
 LastEditor: JiangJi
-LastEditTime: 2024-06-05 14:33:20
+LastEditTime: 2024-06-11 23:38:05
 Discription: 
 '''
 import torch
@@ -63,7 +63,11 @@ def create_model(self):
         self.model = ActorCriticNetwork(self.cfg, self.state_size_list).to(self.device)
 
     def create_optimizer(self):
-        self.optimizer = optim.Adam(self.model.parameters(), lr = self.cfg.lr)
+        if getattr(self.cfg, 'independ_actor', False):
+            self.optimizer = optim.Adam([{'params': self.model.actor.parameters(), 'lr': self.cfg.actor_lr},
+                                         {'params': self.model.critic.parameters(), 'lr': self.cfg.critic_lr}])
+        else:
+            self.optimizer = optim.Adam(self.model.parameters(), lr = self.cfg.lr)
 
     def update_policy_transition(self):
         self.policy_transition = {'value': self.value.detach().cpu().numpy().item(), 'log_prob': self.log_prob}
@@ -71,12 +75,11 @@ def sample_action(self, state, **kwargs):
         state = torch.tensor(np.array(state), device=self.device, dtype=torch.float32)
         # single state shape must be [batch_size, state_dim]
-        if state.dim() == 1:
-            state = state.unsqueeze(dim=0)
+        if state.dim() == 1: state = state.unsqueeze(dim=0)
         model_outputs = self.model(state)
         self.value = model_outputs['value']
         actor_outputs = model_outputs['actor_outputs']
-        actions, self.log_prob = self.model.action_layers.get_actions_and_log_probs(mode = 'sample', actor_outputs = actor_outputs)
+        actions, self.log_prob = self.model.get_actions_and_log_probs(mode = 'sample', actor_outputs = actor_outputs)
         self.update_policy_transition()
         return actions
 
@@ -84,11 +87,10 @@ def predict_action(self, state, **kwargs):
         state = torch.tensor(np.array(state), device=self.device, dtype=torch.float32)
         # single state shape must be [batch_size, state_dim]
-        if state.dim() == 1:
-            state = state.unsqueeze(dim=0)
+        if state.dim() == 1: state = state.unsqueeze(dim=0)
         model_outputs = self.model(state)
         actor_outputs = model_outputs['actor_outputs']
-        actions = self.model.action_layers.get_actions(mode = 'predict', actor_outputs = actor_outputs)
+        actions = self.model.get_actions(mode = 'predict', actor_outputs = actor_outputs)
         return actions
 
     def prepare_data_before_learn(self, **kwargs):
@@ -120,9 +122,9 @@ def learn(self, **kwargs):
         model_outputs = self.model(old_states)
         values = model_outputs['value']
         actor_outputs = model_outputs['actor_outputs']
-        new_log_probs = self.model.action_layers.get_log_probs_action(actor_outputs, old_actions)
+        new_log_probs = self.model.get_log_probs_action(actor_outputs, old_actions)
         # new_log_probs = self.model.action_layers.get_log_probs_action(old_actions)
-        entropy_mean = self.model.action_layers.get_mean_entropy(actor_outputs)
+        entropy_mean = self.model.get_mean_entropy(actor_outputs)
         advantages = returns - values.detach() # shape:[batch_size,1]
         # get action probabilities
         # compute ratio (pi_theta / pi_theta__old):
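The `independ_actor` branch of `create_optimizer` relies on torch's per-parameter-group options, so a single `Adam` instance carries two learning rates. A self-contained sketch of the same pattern, with stand-in modules and the preset's default values for `actor_lr` / `critic_lr`:

```python
import torch.nn as nn
import torch.optim as optim

# Stand-in actor/critic; in the patch these come from ActorCriticNetwork.
actor = nn.Sequential(nn.Linear(4, 256), nn.ReLU(), nn.Linear(256, 2))
critic = nn.Sequential(nn.Linear(4, 256), nn.ReLU(), nn.Linear(256, 1))

# One Adam instance, two parameter groups, two learning rates.
optimizer = optim.Adam([
    {'params': actor.parameters(), 'lr': 3e-4},   # actor_lr
    {'params': critic.parameters(), 'lr': 1e-3},  # critic_lr
])
```

A single `optimizer.step()` then updates both networks, each with its own learning rate, which is why the rest of `learn()` does not need to change.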
diff --git a/joyrl/algos/base/data_handler.py b/joyrl/algos/base/data_handler.py
index 77f8255..57ebad8 100644
--- a/joyrl/algos/base/data_handler.py
+++ b/joyrl/algos/base/data_handler.py
@@ -5,7 +5,7 @@
 Email: johnjim0816@gmail.com
 Date: 2023-12-02 15:02:30
 LastEditor: JiangJi
-LastEditTime: 2024-06-05 14:17:47
+LastEditTime: 2024-06-11 20:12:47
 Discription: 
 '''
 import torch
@@ -32,6 +32,14 @@ def _get_exp_len(self, exps, max_step: int = 1):
             exp_len = exp_len - max_step
         return exp_len
 
+    def get_training_data(self):
+        ''' get training data
+        '''
+        exps = self.buffer.sample(sequential=True)
+        if exps is not None:
+            self._handle_exps_before_train(exps)
+        return self.data_after_train
+
     def handle_exps_after_interact(self, exps: list) -> list:
         ''' handle exps after interact
         '''
@@ -40,7 +48,6 @@ def handle_exps_after_interact(self, exps: list) -> list:
     def add_exps(self, exps):
         exps = self.handle_exps_after_interact(exps)
         self.buffer.push(exps)
-
     def get_training_data(self):
         ''' get training data
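`get_training_data` depends on the buffer handing back `None` when there is nothing to train on, so `data_after_train` is only refreshed when a batch exists. A rough sketch of that contract (assumed, not joyrl's actual `ONPOLICY_QUE` implementation):

```python
class MinimalOnPolicyQueue:
    ''' Assumed buffer contract: push() appends experiences,
        sample(sequential=True) drains them in insertion order,
        and returns None when the queue is empty. '''
    def __init__(self):
        self._exps = []

    def push(self, exps):
        self._exps.extend(exps)

    def sample(self, sequential: bool = True):
        if not self._exps:
            return None
        exps, self._exps = self._exps, []
        return exps
```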
diff --git a/joyrl/algos/base/network.py b/joyrl/algos/base/network.py
index 783a064..1be2b43 100644
--- a/joyrl/algos/base/network.py
+++ b/joyrl/algos/base/network.py
@@ -5,7 +5,7 @@
 Email: johnjim0816@gmail.com
 Date: 2023-12-22 23:02:13
 LastEditor: JiangJi
-LastEditTime: 2024-06-02 10:51:15
+LastEditTime: 2024-06-11 23:41:41
 Discription: 
 '''
 import copy
@@ -218,26 +218,7 @@ def reset_noise(self):
         self.branch_layers.reset_noise()
         self.merge_layer.reset_noise()
 
-class ActorCriticNetwork(BaseNework):
-    ''' Value network, for policy-based methods, in which the branch_layers and critic share the same network
-    '''
-    def __init__(self, cfg: MergedConfig, input_size_list: list) -> None:
-        super(ActorCriticNetwork, self).__init__(cfg, input_size_list)
-        self.action_type_list = self.cfg.action_type_list
-        self.create_graph()
-
-    def create_graph(self):
-        self.branch_layers = BranchLayers(self.cfg.branch_layers, self.input_size_list)
-        self.merge_layer = MergeLayer(self.cfg.merge_layers, self.branch_layers.output_size_list)
-        self.value_layer, _ = create_layer(self.merge_layer.output_size, LayerConfig(layer_type='linear', layer_size=[1], activation='none'))
-        self.action_layers = ActionLayers(self.cfg, self.merge_layer.output_size,)
-
-    def forward(self, x, pre_legal_actions=None):
-        x = self.branch_layers(x)
-        x = self.merge_layer(x)
-        value = self.value_layer(x)
-        actor_outputs = self.action_layers(x, pre_legal_actions = pre_legal_actions)
-        return {'value': value, 'actor_outputs': actor_outputs}
+
 
 class ActorNetwork(BaseNework):
     def __init__(self, cfg: MergedConfig, input_size_list) -> None:
@@ -272,48 +253,58 @@ def forward(self, x):
         x = self.merge_layer(x)
         value = self.value_layer(x)
         return value
-
-if __name__ == "__main__":
-    # test:export PYTHONPATH=./:$PYTHONPATH
-    import torch
-    from joyrl.framework.config import MergedConfig
-    import gymnasium as gym
-    cfg = MergedConfig()
-    state_size = [[None, 4], [None, 4]]
-    cfg.n_actions = 2
-    cfg.continuous = False
-    cfg.min_policy = 0
-    cfg.branch_layers = [
-
-        [
-            {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
-            {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
-        ],
-        # [
-        #     {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
-        #     {'layer_type': 'linear', 'layer_size': [64], 'activation': 'ReLU'},
-        # ],
-    ]
-    cfg.merge_layers = [
-        {'layer_type': 'linear', 'layer_size': [2], 'activation': 'ReLU'},
-        {'layer_type': 'linear', 'layer_size': [2], 'activation': 'ReLU'},
-    ]
-    cfg.value_layers = [
-        {'layer_type': 'embed', 'n_embeddings': 10, 'embedding_dim': 32, 'activation': 'none'},
-        {'layer_type': 'Linear', 'layer_size': [64], 'activation': 'ReLU'},
-        {'layer_type': 'Linear', 'layer_size': [64], 'activation': 'ReLU'},
-    ]
-    cfg.actor_layers = [
-        {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
-        {'layer_type': 'linear', 'layer_size': [256], 'activation': 'ReLU'},
-    ]
-    action_space = gym.spaces.Discrete(2)
-    model = QNetwork(cfg, state_size, [action_space.n])
-    x = [torch.tensor([[ 0.0012, 0.0450, -0.0356, 0.0449]]), torch.tensor([[ 0.0012, 0.0450, -0.0356, 0.0449]])]
-    x = model(x)
-    print(x)
-    # value_net = QNetwork(cfg, state_dim, cfg.n_actions)
-    # print(value_net)
-    # x = torch.tensor([36])
-    # print(x.shape)
-    # print(value_net(x))
\ No newline at end of file
+
+class ActorCriticNetwork(BaseNework):
+    ''' Value network, for policy-based methods, in which the branch_layers and critic share the same network
+    '''
+    def __init__(self, cfg: MergedConfig, input_size_list: list) -> None:
+        super(ActorCriticNetwork, self).__init__(cfg, input_size_list)
+        self.action_type_list = self.cfg.action_type_list
+        self.create_graph()
+
+    def create_graph(self):
+        if getattr(self.cfg, 'independ_actor', False):
+            self.actor = ActorNetwork(self.cfg, self.input_size_list)
+            self.critic = CriticNetwork(self.cfg, self.input_size_list)
+        else:
+            self.branch_layers = BranchLayers(self.cfg.branch_layers, self.input_size_list)
+            self.merge_layer = MergeLayer(self.cfg.merge_layers, self.branch_layers.output_size_list)
+            self.value_layer, _ = create_layer(self.merge_layer.output_size, LayerConfig(layer_type='linear', layer_size=[1], activation='none'))
+            self.action_layers = ActionLayers(self.cfg, self.merge_layer.output_size,)
+
+    def forward(self, x, pre_legal_actions=None):
+        if getattr(self.cfg, 'independ_actor', False):
+            actor_outputs = self.actor(x, pre_legal_actions)
+            value = self.critic(x)
+            return {'value': value, 'actor_outputs': actor_outputs}
+        else:
+            x = self.branch_layers(x)
+            x = self.merge_layer(x)
+            value = self.value_layer(x)
+            actor_outputs = self.action_layers(x, pre_legal_actions = pre_legal_actions)
+            return {'value': value, 'actor_outputs': actor_outputs}
+
+    def get_actions_and_log_probs(self, **kwargs):
+        if getattr(self.cfg, 'independ_actor', False):
+            return self.actor.action_layers.get_actions_and_log_probs(**kwargs)
+        else:
+            return self.action_layers.get_actions_and_log_probs(**kwargs)
+
+    def get_log_probs_action(self, actor_outputs, actions):
+        if getattr(self.cfg, 'independ_actor', False):
+            return self.actor.action_layers.get_log_probs_action(actor_outputs, actions)
+        else:
+            return self.action_layers.get_log_probs_action(actor_outputs, actions)
+
+    def get_mean_entropy(self, actor_outputs):
+        if getattr(self.cfg, 'independ_actor', False):
+            return self.actor.action_layers.get_mean_entropy(actor_outputs)
+        else:
+            return self.action_layers.get_mean_entropy(actor_outputs)
+
+    def get_actions(self, **kwargs):
+        if getattr(self.cfg, 'independ_actor', False):
+            return self.actor.action_layers.get_actions(**kwargs)
+        else:
+            return self.action_layers.get_actions(**kwargs)
+    
\ No newline at end of file
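The rebuilt `ActorCriticNetwork` is a thin facade: with `independ_actor` it instantiates separate `ActorNetwork`/`CriticNetwork` trunks, otherwise it keeps the shared branch/merge trunk, and the new `get_*` helpers hide the difference from `policy.py`. A compact sketch of the same pattern with plain `nn.Module`s (the layer builders are replaced by stand-ins, so this is illustrative only):

```python
import torch
import torch.nn as nn

class TinyActorCritic(nn.Module):
    ''' Minimal facade sketch of the independ_actor switch; stand-in modules,
        not joyrl's BranchLayers/MergeLayer/ActionLayers builders. '''
    def __init__(self, obs_dim: int, n_actions: int, independ_actor: bool = False):
        super().__init__()
        self.independ_actor = independ_actor
        if independ_actor:
            # two fully separate trunks, as ActorNetwork/CriticNetwork would be
            self.actor = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, n_actions))
            self.critic = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(), nn.Linear(64, 1))
        else:
            # shared trunk with a policy head and a value head
            self.trunk = nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU())
            self.policy_head = nn.Linear(64, n_actions)
            self.value_head = nn.Linear(64, 1)

    def forward(self, x):
        # callers always get the same dict, whichever layout is active
        if self.independ_actor:
            return {'value': self.critic(x), 'actor_outputs': self.actor(x)}
        feat = self.trunk(x)
        return {'value': self.value_head(feat), 'actor_outputs': self.policy_head(feat)}

obs = torch.randn(8, 4)
out = TinyActorCritic(4, 2, independ_actor=True)(obs)
print(out['value'].shape, out['actor_outputs'].shape)  # torch.Size([8, 1]) torch.Size([8, 2])
```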
diff --git a/joyrl/framework/collector.py b/joyrl/framework/collector.py
index 93e9794..1269454 100644
--- a/joyrl/framework/collector.py
+++ b/joyrl/framework/collector.py
@@ -5,7 +5,7 @@
 Email: johnjim0816@gmail.com
 Date: 2023-12-22 23:02:13
 LastEditor: JiangJi
-LastEditTime: 2024-06-11 13:46:02
+LastEditTime: 2024-06-11 19:52:29
 Discription: 
 '''
 import time
@@ -84,7 +84,7 @@ def _get_training_data(self):
                 break
             if self.cfg.is_learner_async:
                 break
-            if time.time() - get_training_data_time >= 0.05:
+            if time.time() - get_training_data_time >= 0.02:
                 # exec_method(self.logger, 'warning', 'remote', "[Collector._get_training_data] get training data timeout!")
                 get_training_data_time = time.time()
                 break
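The collector change only tightens the polling timeout from 0.05 s to 0.02 s: the loop keeps asking for a batch and gives up once the deadline passes so the learner is never blocked for long. A simplified sketch of that control flow (an approximation, not the actual collector code; `get_batch` is a hypothetical callable):

```python
import time

def poll_training_data(get_batch, timeout: float = 0.02):
    ''' Keep polling for a batch; bail out with None after `timeout` seconds. '''
    start = time.time()
    while True:
        batch = get_batch()
        if batch is not None:
            return batch
        if time.time() - start >= timeout:
            return None
```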
diff --git a/presets/ClassControl/CartPole-v1/CartPole-v1_PPO-KL_Test.yaml b/presets/ClassControl/CartPole-v1/CartPole-v1_PPO-KL_Test.yaml
deleted file mode 100644
index d37e16f..0000000
--- a/presets/ClassControl/CartPole-v1/CartPole-v1_PPO-KL_Test.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-general_cfg:
-  algo_name: PPO
-  device: cuda
-  env_name: CartPole-v1
-  mode: test
-  eval_eps: 10
-  eval_per_episode: 5
-  load_checkpoint: true
-  load_path: Train_CartPole-v1_PPO_20221217-204003
-  max_steps: 200
-  new_step_api: true
-  render: false
-  save_fig: true
-  seed: 1
-  show_fig: false
-  test_eps: 10
-  train_eps: 200
-  wrapper: null
-algo_cfg:
-  ppo_type: kl
-  actor_hidden_dim: 256
-  actor_lr: 0.0003
-  critic_hidden_dim: 256
-  critic_lr: 0.001
-  entropy_coef: 0.01
-  gamma: 0.99
-  k_epochs: 4
-  kl_alpha: 2
-  kl_beta: 1.5
-  kl_lambda: 0.5
-  kl_target: 0.01
-  train_batch_size: 100
diff --git a/presets/ClassControl/CartPole-v1/CartPole-v1_PPO-KL_Train.yaml b/presets/ClassControl/CartPole-v1/CartPole-v1_PPO-KL_Train.yaml
deleted file mode 100644
index a70549c..0000000
--- a/presets/ClassControl/CartPole-v1/CartPole-v1_PPO-KL_Train.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-general_cfg:
-  algo_name: PPO
-  device: cuda
-  env_name: CartPole-v1
-  eval_eps: 10
-  eval_per_episode: 5
-  load_checkpoint: false
-  load_path: Train_CartPole-v1_PPO-KL_20221206-054757
-  max_steps: 200
-  mode: train
-  new_step_api: true
-  render: false
-  save_fig: true
-  seed: 1
-  show_fig: false
-  test_eps: 10
-  train_eps: 200
-  wrapper: null
-algo_cfg:
-  ppo_type: kl
-  actor_hidden_dim: 256
-  actor_lr: 0.0003
-  critic_hidden_dim: 256
-  critic_lr: 0.001
-  entropy_coef: 0.01
-  gamma: 0.99
-  k_epochs: 4
-  kl_alpha: 2
-  kl_beta: 1.5
-  kl_lambda: 0.5
-  kl_target: 0.01
-  train_batch_size: 100
diff --git a/presets/ClassControl/CartPole-v1/CartPole-v1_PPO.yaml b/presets/ClassControl/CartPole-v1/CartPole-v1_PPO.yaml
index a8119c2..42aaf74 100644
--- a/presets/ClassControl/CartPole-v1/CartPole-v1_PPO.yaml
+++ b/presets/ClassControl/CartPole-v1/CartPole-v1_PPO.yaml
@@ -4,7 +4,7 @@ general_cfg:
   env_name: gym
   device: cpu
   mode: train
-  exps_trucation_size: 20
+  exps_trucation_size: 200
   is_learner_async: false
   load_checkpoint: false
   load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
@@ -14,11 +14,13 @@ general_cfg:
   max_step: 200
   seed: 1
   online_eval: true
-  online_eval_episode: 10
+  online_eval_episode: 15
   model_save_fre: 10
-  policy_summary_fre: 5
+  policy_summary_fre: 10
   interact_summary_fre: 100
 algo_cfg:
+  independ_actor: false
+  return_form: td
   actor_branch_layers:
     - name: feature_1
       layers:
@@ -47,16 +49,18 @@ algo_cfg:
         layer_size: [256]
         activation: relu
   buffer_type: ONPOLICY_QUE
-  lr: 0.0001
+  lr: 0.0003
   actor_lr: 0.0003
   critic_lr: 0.001
-  entropy_coef: 0.01
+  entropy_coef: 0.001
   critic_loss_coef: 0.001
-  eps_clip: 0.2
-  gamma: 0.99
+  eps_clip: 0.1
+  gamma: 0.95
+  gae_lambda: 0.95
   k_epochs: 4
-  batch_size: 200
+  batch_size: 2000
   sgd_batch_size: 200
+  # min_policy: 0.001
 env_cfg:
   id: CartPole-v1
   render_mode: null
diff --git a/presets/ClassControl/CartPole-v1/CartPole-v1_PPO_off_policy.yaml b/presets/ClassControl/CartPole-v1/CartPole-v1_PPO_off_policy.yaml
index c9bf1b7..2340ff1 100644
--- a/presets/ClassControl/CartPole-v1/CartPole-v1_PPO_off_policy.yaml
+++ b/presets/ClassControl/CartPole-v1/CartPole-v1_PPO_off_policy.yaml
@@ -4,6 +4,7 @@ general_cfg:
   device: cpu
   mode: train
   load_checkpoint: false
+  is_learner_async: false
   load_path: Train_CartPole-v1_PPO_20231225-124842 # if load checkpoint, then config path in 'tasks' dir
   load_model_step: best
   n_interactors: 1
@@ -11,15 +12,15 @@ general_cfg:
   max_step: 200
   seed: 1
   online_eval: true
-  online_eval_episode: 10
+  online_eval_episode: 1
   model_save_fre: 100
-  policy_summary_fre: 2
+  policy_summary_fre: 100
+  interact_summary_fre: 100
   exps_trucation_size: 20
 algo_cfg:
   ppo_type: clip
   learn_frequency: 20
   independ_actor: true
-  share_optimizer: false
   actor_merge_layers:
     - layer_type: linear
       layer_size: [256]
       activation: relu
@@ -36,6 +37,7 @@ algo_cfg:
       activation: relu
   buffer_type: REPLAY_QUE
   max_buffer_size: 2000
+  lr: 0.0003
   actor_lr: 0.0003
   critic_lr: 0.001
   entropy_coef: 0.01
@@ -47,4 +49,6 @@
 env_cfg:
   id: CartPole-v1
   render_mode: null
+  wrappers:
+    - wrapper_name: CartPoleActionWrapper
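To exercise the updated on-policy preset, the usual joyrl entry point should be enough; the path below assumes the repository root as the working directory, and `joyrl.run(yaml_path=...)` is the assumed top-level API:

```python
# Assumed usage of joyrl's top-level API with the preset touched in this patch.
import joyrl

if __name__ == "__main__":
    yaml_path = "./presets/ClassControl/CartPole-v1/CartPole-v1_PPO.yaml"
    joyrl.run(yaml_path=yaml_path)
```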