From 5f0f054bdbddbe3c1afdf0663cf573a94510b7ba Mon Sep 17 00:00:00 2001
From: johnjim0816 <39483938+johnjim0816@users.noreply.github.com>
Date: Sun, 24 Dec 2023 19:05:24 +0800
Subject: [PATCH] [0.4.5] update DDPG

---
 joyrl/__init__.py                             |   4 +-
 joyrl/algos/DDPG/__init__.py                  |   0
 joyrl/algos/DDPG/config.py                    |  32 ++++
 joyrl/algos/DDPG/data_handler.py              |   4 +
 joyrl/algos/DDPG/ddpg.py                      |  96 -----------
 joyrl/algos/DDPG/env.py                       |  56 -------
 joyrl/algos/DDPG/main.py                      | 152 ------------------
 joyrl/algos/DDPG/policy.py                    | 119 ++++++++++++++
 joyrl/run.py                                  |  11 +-
 .../Pendulum-v1/Pendulum-v1_DDPG.yaml         |  28 ++--
 10 files changed, 177 insertions(+), 325 deletions(-)
 create mode 100644 joyrl/algos/DDPG/__init__.py
 create mode 100644 joyrl/algos/DDPG/config.py
 create mode 100644 joyrl/algos/DDPG/data_handler.py
 delete mode 100644 joyrl/algos/DDPG/ddpg.py
 delete mode 100644 joyrl/algos/DDPG/env.py
 delete mode 100644 joyrl/algos/DDPG/main.py
 create mode 100644 joyrl/algos/DDPG/policy.py

diff --git a/joyrl/__init__.py b/joyrl/__init__.py
index b145de8..f3d24c3 100644
--- a/joyrl/__init__.py
+++ b/joyrl/__init__.py
@@ -5,13 +5,13 @@
 Email: johnjim0816@gmail.com
 Date: 2023-01-01 16:20:49
 LastEditor: JiangJi
-LastEditTime: 2023-12-24 17:49:27
+LastEditTime: 2023-12-24 19:04:37
 Discription: 
 '''
 from joyrl import algos, framework, envs, utils
 from joyrl.run import run
 
-__version__ = "0.4.4.1"
+__version__ = "0.4.5"
 
 __all__ = [
     "algos",
diff --git a/joyrl/algos/DDPG/__init__.py b/joyrl/algos/DDPG/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/joyrl/algos/DDPG/config.py b/joyrl/algos/DDPG/config.py
new file mode 100644
index 0000000..f9dd93a
--- /dev/null
+++ b/joyrl/algos/DDPG/config.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: JiangJi
+Email: johnjim0816@gmail.com
+Date: 2023-12-15 13:16:24
+LastEditor: JiangJi
+LastEditTime: 2023-12-19 13:17:37
+Discription: 
+'''
+import numpy as np
+class AlgoConfig:
+    def __init__(self):
+        self.action_type = 'dpg' # action type, dpg: deterministic policy gradient
+        self.buffer_type = 'REPLAY_QUE' # replay buffer type
+        self.buffer_size = 100000 # replay buffer size
+        self.batch_size = 128 # batch size
+        self.gamma = 0.99 # discount factor
+        self.policy_loss_weight = 0.002 # policy loss weight
+        self.critic_lr = 1e-3 # learning rate of critic
+        self.actor_lr = 1e-4 # learning rate of actor
+        self.tau = 0.001 # soft update parameter
+        self.value_min = -np.inf # clip min critic value
+        self.value_max = np.inf # clip max critic value
+        self.actor_layers = [
+            {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'},
+            {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'},
+        ]
+        self.critic_layers = [
+            {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'},
+            {'layer_type': 'Linear', 'layer_size': [256], 'activation': 'ReLU'},
+        ]
\ No newline at end of file
diff --git a/joyrl/algos/DDPG/data_handler.py b/joyrl/algos/DDPG/data_handler.py
new file mode 100644
index 0000000..93ddf6b
--- /dev/null
+++ b/joyrl/algos/DDPG/data_handler.py
@@ -0,0 +1,4 @@
+from joyrl.algos.base.data_handler import BaseDataHandler
+class DataHandler(BaseDataHandler):
+    def __init__(self, cfg):
+        super().__init__(cfg)
\ No newline at end of file
diff --git a/joyrl/algos/DDPG/ddpg.py b/joyrl/algos/DDPG/ddpg.py
deleted file mode 100644
index 246966b..0000000
--- a/joyrl/algos/DDPG/ddpg.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-@Author: John
-@Email: johnjim0816@gmail.com
-@Date: 2020-06-09 20:25:52
-@LastEditor: John
-LastEditTime: 2022-09-27 15:43:21
-@Discription: 
-@Environment: python 3.7.7
-'''
-import random
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.optim as optim
-
-class DDPG:
-    def __init__(self, models,memories,cfg):
-        self.device = torch.device(cfg['device'])
-        self.critic = models['critic'].to(self.device)
-        self.target_critic = models['critic'].to(self.device)
-        self.actor = models['actor'].to(self.device)
-        self.target_actor = models['actor'].to(self.device)
-        # copy weights from critic to target_critic
-        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
-            target_param.data.copy_(param.data)
-        # copy weights from actor to target_actor
-        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
-            target_param.data.copy_(param.data)
-        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg['critic_lr'])
-        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg['actor_lr'])
-        self.memory = memories['memory']
-        self.batch_size = cfg['batch_size']
-        self.gamma = cfg['gamma']
-        self.tau = cfg['tau']
-
-    def sample_action(self, state):
-        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
-        action = self.actor(state)
-        return action.detach().cpu().numpy()[0, 0]
-    @torch.no_grad()
-    def predict_action(self, state):
-        ''' predict action
-        '''
-        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
-        action = self.actor(state)
-        return action.cpu().numpy()[0, 0]
-
-    def update(self):
-        if len(self.memory) < self.batch_size: # when memory size is less than batch size, return
-            return
-        # sample a random minibatch of N transitions from R
-        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
-        # convert to tensor
-        state = torch.FloatTensor(np.array(state)).to(self.device)
-        next_state = torch.FloatTensor(np.array(next_state)).to(self.device)
-        action = torch.FloatTensor(np.array(action)).to(self.device)
-        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
-        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
-
-        policy_loss = self.critic(state, self.actor(state))
-        policy_loss = -policy_loss.mean()
-        next_action = self.target_actor(next_state)
-        target_value = self.target_critic(next_state, next_action.detach())
-        expected_value = reward + (1.0 - done) * self.gamma * target_value
-        expected_value = torch.clamp(expected_value, -np.inf, np.inf)
-
-        value = self.critic(state, action)
-        value_loss = nn.MSELoss()(value, expected_value.detach())
-
-        self.actor_optimizer.zero_grad()
-        policy_loss.backward()
-        self.actor_optimizer.step()
-        self.critic_optimizer.zero_grad()
-        value_loss.backward()
-        self.critic_optimizer.step()
-        # soft update
-        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
-            target_param.data.copy_(
-                target_param.data * (1.0 - self.tau) +
-                param.data * self.tau
-            )
-        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
-            target_param.data.copy_(
-                target_param.data * (1.0 - self.tau) +
-                param.data * self.tau
-            )
-    def save_model(self,path):
-        from pathlib import Path
-        # create path
-        Path(path).mkdir(parents=True, exist_ok=True)
-        torch.save(self.actor.state_dict(), f"{path}/actor_checkpoint.pt")
-
-    def load_model(self,path):
-        self.actor.load_state_dict(torch.load(f"{path}/actor_checkpoint.pt"))
\ No newline at end of file
diff --git a/joyrl/algos/DDPG/env.py b/joyrl/algos/DDPG/env.py
deleted file mode 100644
index 89445cf..0000000
--- a/joyrl/algos/DDPG/env.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-@Author: John
-@Email: johnjim0816@gmail.com
-@Date: 2020-06-10 15:28:30
-@LastEditor: John
-LastEditTime: 2021-09-16 00:52:30
-@Discription: 
-@Environment: python 3.7.7
-'''
-import gym
-import numpy as np
-
-class NormalizedActions(gym.ActionWrapper):
-    ''' rescale actions from [-1, 1] to the environment's action bounds
-    '''
-    def action(self, action):
-        low_bound = self.action_space.low
-        upper_bound = self.action_space.high
-        action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
-        action = np.clip(action, low_bound, upper_bound)
-        return action
-
-    def reverse_action(self, action):
-        low_bound = self.action_space.low
-        upper_bound = self.action_space.high
-        action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
-        action = np.clip(action, low_bound, upper_bound)
-        return action
-
-class OUNoise(object):
-    ''' Ornstein-Uhlenbeck noise
-    '''
-    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
-        self.mu = mu # OU noise parameter
-        self.theta = theta # OU noise parameter
-        self.sigma = max_sigma # OU noise parameter
-        self.max_sigma = max_sigma
-        self.min_sigma = min_sigma
-        self.decay_period = decay_period
-        self.n_actions = action_space.shape[0]
-        self.low = action_space.low
-        self.high = action_space.high
-        self.reset()
-    def reset(self):
-        self.obs = np.ones(self.n_actions) * self.mu
-    def evolve_obs(self):
-        x = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
-        self.obs = x + dx
-        return self.obs
-    def get_action(self, action, t=0):
-        ou_obs = self.evolve_obs()
-        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) # sigma decays gradually
-        return np.clip(action + ou_obs, self.low, self.high) # clip the action after adding noise
\ No newline at end of file
diff --git a/joyrl/algos/DDPG/main.py b/joyrl/algos/DDPG/main.py
deleted file mode 100644
index 8da5d29..0000000
--- a/joyrl/algos/DDPG/main.py
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-@Author: John
-@Email: johnjim0816@gmail.com
-@Date: 2020-06-11 20:58:21
-@LastEditor: John
-LastEditTime: 2022-09-27 15:50:12
-@Discription: 
-@Environment: python 3.7.7
-'''
-import sys,os
-curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
-parent_path = os.path.dirname(curr_path) # parent path
-sys.path.append(parent_path) # add to system path
-
-import datetime
-import gym
-import torch
-import argparse
-import torch.nn as nn
-import torch.nn.functional as F
-from env import NormalizedActions,OUNoise
-from ddpg import DDPG
-from common.utils import all_seed
-from common.memories import ReplayBufferQue
-from common.launcher import Launcher
-from envs.register import register_env
-
-class Actor(nn.Module):
-    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
-        super(Actor, self).__init__()
-        self.linear1 = nn.Linear(n_states, hidden_dim)
-        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
-        self.linear3 = nn.Linear(hidden_dim, n_actions)
-
-        self.linear3.weight.data.uniform_(-init_w, init_w)
-        self.linear3.bias.data.uniform_(-init_w, init_w)
-
-    def forward(self, x):
-        x = F.relu(self.linear1(x))
-        x = F.relu(self.linear2(x))
-        x = torch.tanh(self.linear3(x))
-        return x
-class Critic(nn.Module):
-    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
-        super(Critic, self).__init__()
-
-        self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
-        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
-        self.linear3 = nn.Linear(hidden_dim, 1)
-        # randomly initialize the last layer to small values
-        self.linear3.weight.data.uniform_(-init_w, init_w)
-        self.linear3.bias.data.uniform_(-init_w, init_w)
-
-    def forward(self, state, action):
-        # concatenate along dimension 1
-        x = torch.cat([state, action], 1)
-        x = F.relu(self.linear1(x))
-        x = F.relu(self.linear2(x))
-        x = self.linear3(x)
-        return x
-class Main(Launcher):
-    def get_args(self):
-        """ hyperparameters
-        """
-        curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
-        parser = argparse.ArgumentParser(description="hyperparameters")
-        parser.add_argument('--algo_name',default='DDPG',type=str,help="name of algorithm")
-        parser.add_argument('--env_name',default='Pendulum-v1',type=str,help="name of environment")
-        parser.add_argument('--train_eps',default=300,type=int,help="episodes of training")
-        parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
-        parser.add_argument('--max_steps',default=100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
-        parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
-        parser.add_argument('--critic_lr',default=1e-3,type=float,help="learning rate of critic")
-        parser.add_argument('--actor_lr',default=1e-4,type=float,help="learning rate of actor")
-        parser.add_argument('--memory_capacity',default=8000,type=int,help="memory capacity")
-        parser.add_argument('--batch_size',default=128,type=int)
-        parser.add_argument('--target_update',default=2,type=int)
-        parser.add_argument('--tau',default=1e-2,type=float)
-        parser.add_argument('--critic_hidden_dim',default=256,type=int)
-        parser.add_argument('--actor_hidden_dim',default=256,type=int)
-        parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
-        parser.add_argument('--seed',default=1,type=int,help="random seed")
-        parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
-        parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
-        args = parser.parse_args()
-        default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
-                        'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
-        }
-        args = {**vars(args),**default_args} # type(dict)
-        return args
-
-    def env_agent_config(self,cfg):
-        register_env(cfg['env_name'])
-        env = gym.make(cfg['env_name'])
-        env = NormalizedActions(env) # decorate with action noise
-        if cfg['seed'] !=0: # set random seed
-            all_seed(env,seed=cfg["seed"])
-        n_states = env.observation_space.shape[0]
-        n_actions = env.action_space.shape[0]
-        print(f"n_states: {n_states}, n_actions: {n_actions}")
-        cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
-        models = {"actor":Actor(n_states,n_actions,hidden_dim=cfg['actor_hidden_dim']),"critic":Critic(n_states,n_actions,hidden_dim=cfg['critic_hidden_dim'])}
-        memories = {"memory":ReplayBufferQue(cfg['memory_capacity'])}
-        agent = DDPG(models,memories,cfg)
-        return env,agent
-    def train(self,cfg, env, agent):
-        print('Start training!')
-        ou_noise = OUNoise(env.action_space) # noise of action
-        rewards = [] # record rewards for all episodes
-        for i_ep in range(cfg['train_eps']):
-            state = env.reset()
-            ou_noise.reset()
-            ep_reward = 0
-            for i_step in range(cfg['max_steps']):
-                action = agent.sample_action(state)
-                action = ou_noise.get_action(action, i_step+1)
-                next_state, reward, done, _ = env.step(action)
-                ep_reward += reward
-                agent.memory.push((state, action, reward, next_state, done))
-                agent.update()
-                state = next_state
-                if done:
-                    break
-            if (i_ep+1)%10 == 0:
-                print(f"Env:{i_ep+1}/{cfg['train_eps']}, Reward:{ep_reward:.2f}")
-            rewards.append(ep_reward)
-        print('Finish training!')
-        return {'rewards':rewards}
-
-    def test(self,cfg, env, agent):
-        print('Start testing!')
-        rewards = [] # record rewards for all episodes
-        for i_ep in range(cfg['test_eps']):
-            state = env.reset()
-            ep_reward = 0
-            for i_step in range(cfg['max_steps']):
-                action = agent.predict_action(state)
-                next_state, reward, done, _ = env.step(action)
-                ep_reward += reward
-                state = next_state
-                if done:
-                    break
-            rewards.append(ep_reward)
-            print(f"Episode:{i_ep+1}/{cfg['test_eps']}, Reward:{ep_reward:.1f}")
-        print('Finish testing!')
-        return {'rewards':rewards}
-if __name__ == "__main__":
-    main = Main()
-    main.run()
-
diff --git a/joyrl/algos/DDPG/policy.py b/joyrl/algos/DDPG/policy.py
new file mode 100644
index 0000000..056024b
--- /dev/null
+++ b/joyrl/algos/DDPG/policy.py
@@ -0,0 +1,119 @@
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+from joyrl.algos.base.policy import BasePolicy
+from joyrl.algos.base.networks import CriticNetwork, ActorNetwork
+from joyrl.algos.base.noises import OUNoise
+
+class Policy(BasePolicy):
+    def __init__(self,cfg) -> None:
+        super(Policy, self).__init__(cfg)
+        self.cfg = cfg
+        self.action_type = cfg.action_type
+        self.ou_noise = OUNoise(self.action_space)
+        self.gamma = cfg.gamma
+        self.tau = cfg.tau
+        self.device = torch.device(cfg.device)
+        self.action_scale = torch.FloatTensor((self.action_space.high - self.action_space.low) / 2.).to(self.device)
+        self.action_bias = torch.FloatTensor((self.action_space.high + self.action_space.low) / 2.).to(self.device)
+        self.create_graph() # create graph and optimizer
+        self.create_summary() # create summary
+        self.to(self.device)
+        self.sample_count = 0 # sample count
+
+    def create_graph(self):
+        self.state_size, self.action_size = self.get_state_action_size()
+        self.input_head_size = [None, self.state_size[-1]+self.action_size[-1]]
+        self.actor = ActorNetwork(self.cfg, self.state_size, self.action_space)
+        self.critic = CriticNetwork(self.cfg, self.input_head_size)
+        self.target_actor = ActorNetwork(self.cfg, self.state_size, self.action_space)
+        self.target_critic = CriticNetwork(self.cfg, self.input_head_size)
+        self.target_actor.load_state_dict(self.actor.state_dict())
+        self.target_critic.load_state_dict(self.critic.state_dict())
+        self.create_optimizer()
+
+    def create_optimizer(self):
+        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.cfg.actor_lr)
+        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.cfg.critic_lr)
+
+    def create_summary(self):
+        '''
+        create tensorboard summary data
+        '''
+        self.summary = {
+            'scalar': {
+                'tot_loss': 0.0,
+                'policy_loss': 0.0,
+                'value_loss': 0.0,
+            },
+        }
+
+    def update_summary(self):
+        ''' update tensorboard summary data
+        '''
+        if hasattr(self, 'tot_loss'):
+            self.summary['scalar']['tot_loss'] = self.tot_loss.item()
+            self.summary['scalar']['policy_loss'] = self.policy_loss.item()
+            self.summary['scalar']['value_loss'] = self.value_loss.item()
+
+    def sample_action(self, state, **kwargs):
+        ''' sample action
+        '''
+        self.sample_count += 1
+        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
+        mu = self.actor(state) # mu is in [-1, 1]
+        action = self.action_scale * mu + self.action_bias
+        action = action.cpu().detach().numpy()[0]
+        action = self.ou_noise.get_action(action, self.sample_count) # add noise to action
+        return action
+
+    @torch.no_grad()
+    def predict_action(self, state, **kwargs):
+        ''' predict action
+        '''
+        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
+        mu = self.actor(state) # mu is in [-1, 1]
+        action = self.action_scale * mu + self.action_bias
+        action = action.cpu().detach().numpy()[0]
+        return action
+
+    def learn(self, **kwargs):
+        ''' train policy
+        '''
+        states, actions, next_states, rewards, dones = kwargs.get('states'), kwargs.get('actions'), kwargs.get('next_states'), kwargs.get('rewards'), kwargs.get('dones')
+        # convert numpy to tensor
+        states = torch.tensor(states, device=self.device, dtype=torch.float32)
+        actions = torch.tensor(actions, device=self.device, dtype=torch.float32)
+        next_states = torch.tensor(next_states, device=self.device, dtype=torch.float32)
+        rewards = torch.tensor(rewards, device=self.device, dtype=torch.float32).unsqueeze(dim=1)
+        dones = torch.tensor(dones, device=self.device, dtype=torch.float32).unsqueeze(dim=1)
+        # calculate policy loss
+        state_actions = torch.cat([states, self.actor(states)], dim=1)
+        self.policy_loss = -self.critic(state_actions).mean() * self.cfg.policy_loss_weight
+        # calculate value loss
+        next_actions = self.target_actor(next_states).detach()
+        next_state_actions = torch.cat([next_states, next_actions], dim=1)
+        target_values = self.target_critic(next_state_actions)
+        expected_values = rewards + self.gamma * target_values * (1.0 - dones)
+        expected_values = torch.clamp(expected_values, self.cfg.value_min, self.cfg.value_max) # clip value
+        values = self.critic(torch.cat([states, actions], dim=1))
+        self.value_loss = F.mse_loss(values, expected_values.detach())
+        self.tot_loss = self.policy_loss + self.value_loss
+        # actor and critic update, the order is important
+        self.actor_optimizer.zero_grad()
+        self.policy_loss.backward()
+        self.actor_optimizer.step()
+        self.critic_optimizer.zero_grad()
+        self.value_loss.backward()
+        self.critic_optimizer.step()
+        # soft update target network
+        self.soft_update(self.actor, self.target_actor, self.tau)
+        self.soft_update(self.critic, self.target_critic, self.tau)
+        self.update_summary() # update summary
+
+    def soft_update(self, curr_model, target_model, tau):
+        ''' soft update model parameters
+        θ_target = τ*θ_local + (1 - τ)*θ_target
+        '''
+        for target_param, curr_param in zip(target_model.parameters(), curr_model.parameters()):
+            target_param.data.copy_(tau*curr_param.data + (1.0-tau)*target_param.data)
+    
\ No newline at end of file
diff --git a/joyrl/run.py b/joyrl/run.py
index 7d06be8..97ddaf9 100644
--- a/joyrl/run.py
+++ b/joyrl/run.py
@@ -5,7 +5,7 @@
 Email: johnjim0816@gmail.com
 Date: 2023-12-22 13:16:59
 LastEditor: JiangJi
-LastEditTime: 2023-12-23 10:54:12
+LastEditTime: 2023-12-24 19:00:32
 Discription: 
 '''
 import sys,os
@@ -23,6 +23,7 @@
 from joyrl.framework.trainer import Trainer
 from joyrl.framework.model_mgr import ModelMgr
 from joyrl.utils.utils import merge_class_attrs, all_seed,save_frames_as_gif
+from joyrl.envs.register import register_env
 
 class Launcher(object):
     def __init__(self, **kwargs):
@@ -171,7 +172,6 @@ def policy_config(self):
             policy.load_model(f"tasks/{self.cfg.load_path}/models/{self.cfg.load_model_step}")
         data_handler = data_handler_mod.DataHandler(self.cfg)
         return policy, data_handler
-    
     def _start(self, **kwargs):
         ''' start serial training
        '''
@@ -191,7 +191,9 @@ def _start(self, **kwargs):
             policy = policy,
             logger = logger
         )
-        model_mgr = ModelMgr(self.cfg, model_params = policy.get_model_params(),logger = logger)
+        model_mgr = ModelMgr(self.cfg,
+            policy = policy,
+            logger = logger)
         trainer = Trainer(self.cfg,
             tracker = tracker,
             model_mgr = model_mgr,
@@ -216,7 +218,7 @@ def _ray_start(self, **kwargs):
         collector = ray.remote(Collector).options(num_cpus = 1).remote(self.cfg, data_handler = data_handler, logger = logger)
         interactor_mgr = ray.remote(InteractorMgr).options(num_cpus = 0).remote(self.cfg, env = env, policy = policy, logger = logger)
         learner_mgr = ray.remote(LearnerMgr).options(num_cpus = 0).remote(self.cfg, policy = policy, logger = logger)
-        model_mgr = ray.remote(ModelMgr).options(num_cpus = 0).remote(self.cfg, model_params = policy.get_model_params(),logger = logger)
+        model_mgr = ray.remote(ModelMgr).options(num_cpus = 0).remote(self.cfg, policy = policy,logger = logger)
         trainer = ray.remote(Trainer).options(num_cpus = 0).remote(self.cfg,
             tracker = tracker,
             model_mgr = model_mgr,
@@ -229,6 +231,7 @@ def _ray_start(self, **kwargs):
         ray.get(trainer.ray_run.remote())
 
     def run(self) -> None:
+        register_env(self.env_cfg.id) # register env
         env = self.env_config() # create single env
         policy, data_handler = self.policy_config() # configure policy and data_handler
         if self.cfg.learner_mode == 'serial':
diff --git a/presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG.yaml b/presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG.yaml
index 52d228e..1106ffa 100644
--- a/presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG.yaml
+++ b/presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG.yaml
@@ -1,20 +1,17 @@
 general_cfg:
   algo_name: DDPG
-  env_name: gym # env name, differ from env_id in env_cfgs
-  device: cuda # device, cpu or cuda
-  mode: train # run mode: train, test
-  collect_traj: false # if collect trajectories or not
-  mp_backend: single # multi-processing mode: single(default), ray
-  n_workers: 4 # number of workers if using multi-processing, default 1
-  load_checkpoint: true # if load checkpoint or not
-  load_path: Train_ray_Pendulum-v1_DDPG_20230527-001715 # if load checkpoint, then config path in 'tasks' dir
-  load_model_step: best # load model step
-  max_episode: 400 # max episodes, set -1 to keep running
-  max_step: 200 # max steps per episode, set -1 means unlimited steps
-  seed: 10 # random seed, set 0 not to use seed
-  online_eval: true # if online eval or not
-  online_eval_episode: 20 # online eval episodes
-  model_save_fre: 2000 # update step frequency of saving model
+  env_name: gym
+  device: cpu
+  mode: train
+  load_checkpoint: false
+  load_path: Train_ray_Pendulum-v1_DDPG_20230527-001715
+  load_model_step: best
+  max_episode: 400
+  max_step: 200
+  seed: 10
+  online_eval: true
+  online_eval_episode: 20
+  model_save_fre: 2000
 algo_cfg:
   action_type: dpg
   actor_layers:
@@ -31,6 +28,7 @@ algo_cfg:
     - layer_type: linear
       layer_size: [256]
      activation: relu
+  n_steps_per_learn: 1
   batch_size: 128
   buffer_type: REPLAY_QUE
   buffer_size: 8000
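
Note: with this patch the standalone `main.py` entry point is removed, so the DDPG preset is launched through the package entry point instead. A minimal sketch, assuming `joyrl.run` accepts a `yaml_path` keyword as in the project README (not shown in this patch):

    import joyrl

    if __name__ == "__main__":
        # path to the updated preset from this patch; adjust to your checkout
        yaml_path = "./presets/ClassControl/Pendulum-v1/Pendulum-v1_DDPG.yaml"
        # joyrl reads general_cfg/algo_cfg from the YAML and builds the DDPG Policy,
        # DataHandler, and trainer internally (serial or ray, per learner_mode)
        joyrl.run(yaml_path=yaml_path)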