From 5563c4764f997d224f52e1edb5a26119ec51eba2 Mon Sep 17 00:00:00 2001 From: nicolengsy Date: Wed, 14 Oct 2020 00:35:32 -0700 Subject: [PATCH 1/7] Init TNPG doc --- docs/index.md | 1 + docs/user/algo_tnpg.md | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 docs/user/algo_tnpg.md diff --git a/docs/index.md b/docs/index.md index a817ab5649..5a9357bc9a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,6 +60,7 @@ and how to implement new MDPs and new algorithms. user/algo_vpg user/algo_td3 user/algo_ddpg + user/algo_tnpg .. toctree:: :maxdepth: 2 diff --git a/docs/user/algo_tnpg.md b/docs/user/algo_tnpg.md new file mode 100644 index 0000000000..e7bb429f23 --- /dev/null +++ b/docs/user/algo_tnpg.md @@ -0,0 +1,41 @@ +# Truncated Natural Policy Gradient + +```eval_rst ++-------------------+--------------------------------------------------------------------------------------------------------------+ +| **Paper** | Model-Free Imitation Learning with Policy Optimization :cite:`ho2016model` | ++-------------------+--------------------------------------------------------------------------------------------------------------+ +| **Framework(s)** | .. figure:: ./images/tf.png | +| | :scale: 10% | +| | :class: no-scaled-link | +| | | +| | Tensorflow | ++-------------------+--------------------------------------------------------------------------------------------------------------+ +| **API Reference** | `garage.tf.algos.TNPG <../_autoapi/garage/tf/algos/index.html#garage.tf.algos.TNPG>`_ | ++-------------------+--------------------------------------------------------------------------------------------------------------+ +| **Code** | `garage/tf/algos/tnpg.py `_ | ++-------------------+--------------------------------------------------------------------------------------------------------------+ +| **Examples** | | ++-------------------+--------------------------------------------------------------------------------------------------------------+ +``` + +## Default Parameters + +```py +discount=0.99, +gae_lambda=0.98, +lr_clip_range=0.01, +max_kl_step=0.01, +policy_ent_coeff=0.0, +entropy_method='no_entropy', +``` + +## References + +```eval_rst +.. bibliography:: references.bib + :style: unsrt + :filter: docname in docnames +``` +---- + +*This page was authored by Nicole Shin Ying Ng ([@nicolengsy](https://github.com/nicolengsy)).* From 89b8316f2beaf9f8648b1771c19ac648542a7da2 Mon Sep 17 00:00:00 2001 From: Nicole Ng Date: Thu, 29 Oct 2020 02:26:37 -0700 Subject: [PATCH 2/7] Add tnpg docs --- docs/user/algo_tnpg.md | 9 ++++++--- docs/user/references.bib | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/docs/user/algo_tnpg.md b/docs/user/algo_tnpg.md index e7bb429f23..a05a293f05 100644 --- a/docs/user/algo_tnpg.md +++ b/docs/user/algo_tnpg.md @@ -2,7 +2,8 @@ ```eval_rst +-------------------+--------------------------------------------------------------------------------------------------------------+ -| **Paper** | Model-Free Imitation Learning with Policy Optimization :cite:`ho2016model` | +| **Paper** | Benchmarking Deep Reinforcement Learning for Continuous Control :cite:`duan2016benchmarking`, A Natural | +| | Policy Gradient :cite:`10.5555/2980539.2980738` | +-------------------+--------------------------------------------------------------------------------------------------------------+ | **Framework(s)** | .. 
figure:: ./images/tf.png | | | :scale: 10% | @@ -14,8 +15,10 @@ +-------------------+--------------------------------------------------------------------------------------------------------------+ | **Code** | `garage/tf/algos/tnpg.py `_ | +-------------------+--------------------------------------------------------------------------------------------------------------+ -| **Examples** | | -+-------------------+--------------------------------------------------------------------------------------------------------------+ +``` + +```eval_rst +Truncated Natural Policy Gradient builds on the Natural Policy Gradient, which optimizes a policy for maximum expected discounted return by following the natural gradient, i.e. the policy gradient preconditioned by the inverse Fisher information matrix. TNPG uses a conjugate gradient algorithm to compute the natural policy gradient, which keeps the computation tractable when the policy has high-dimensional parameters. See :cite:`duan2016benchmarking` for more details. ``` ## Default Parameters diff --git a/docs/user/references.bib b/docs/user/references.bib index eea7a77918..4d680cc038 100644 --- a/docs/user/references.bib +++ b/docs/user/references.bib @@ -114,3 +114,26 @@ @article{2009koberpolicy year = {2009}, month_numeric = {6} } + +@misc{duan2016benchmarking, + title={Benchmarking Deep Reinforcement Learning for Continuous Control}, + author={Yan Duan and Xi Chen and Rein Houthooft and John Schulman and Pieter Abbeel}, + year={2016}, + eprint={1604.06778}, + archivePrefix={arXiv}, + primaryClass={cs.LG} } + +@inproceedings{10.5555/2980539.2980738, +author = {Kakade, Sham}, +title = {A Natural Policy Gradient}, +year = {2001}, +publisher = {MIT Press}, +address = {Cambridge, MA, USA}, +abstract = {We provide a natural gradient method that represents the steepest descent direction based on the underlying structure of the parameter space. Although gradient methods cannot make large changes in the values of the parameters, we show that the natural gradient is moving toward choosing a greedy optimal action rather than just a better action. These greedy optimal actions are those that would be chosen under one improvement step of policy iteration with approximate, compatible value functions, as defined by Sutton et al. [9]. We then show drastic performance improvements in simple MDPs and in the more challenging MDP of Tetris.}, +booktitle = {Proceedings of the 14th International Conference on Neural Information Processing Systems: Natural and Synthetic}, +pages = {1531–1538}, +numpages = {8}, +location = {Vancouver, British Columbia, Canada}, +series = {NIPS'01} } From 15950762f1873ce01f5a09fc3caefff33c1f9bfa Mon Sep 17 00:00:00 2001 From: Nicole Ng Date: Fri, 30 Oct 2020 01:55:20 -0700 Subject: [PATCH 3/7] Fix pre-commit --- docs/index.md | 2 +- docs/user/algo_tnpg.md | 4 ++-- docs/user/references.bib | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/index.md b/docs/index.md index e36a24dc7b..968cdac56d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -62,7 +62,7 @@ and how to implement new MDPs and new algorithms. user/algo_vpg user/algo_td3 user/algo_ddpg - user/algo_tnpg + TNPG user/algo_cem .. 
toctree:: diff --git a/docs/user/algo_tnpg.md b/docs/user/algo_tnpg.md index a05a293f05..1ac4d9cdd7 100644 --- a/docs/user/algo_tnpg.md +++ b/docs/user/algo_tnpg.md @@ -1,8 +1,8 @@ -# Truncated Natural Policy Gradient +# Truncated Natural Policy Gradient (TNPG) ```eval_rst +-------------------+--------------------------------------------------------------------------------------------------------------+ -| **Paper** | Benchmarking Deep Reinforcement Learning for Continuous Control :cite:`duan2016benchmarking`, A Natural | +| **Paper** | Benchmarking Deep Reinforcement Learning for Continuous Control :cite:`duan2016benchmarking`, A Natural | | | Policy Gradient :cite:`10.5555/2980539.2980738` | +-------------------+--------------------------------------------------------------------------------------------------------------+ | **Framework(s)** | .. figure:: ./images/tf.png | diff --git a/docs/user/references.bib b/docs/user/references.bib index 38ec3db9cf..4dead1b188 100644 --- a/docs/user/references.bib +++ b/docs/user/references.bib @@ -117,7 +117,7 @@ @article{2009koberpolicy } @misc{duan2016benchmarking, - title={Benchmarking Deep Reinforcement Learning for Continuous Control}, + title={Benchmarking Deep Reinforcement Learning for Continuous Control}, author={Yan Duan and Xi Chen and Rein Houthooft and John Schulman and Pieter Abbeel}, year={2016}, eprint={1604.06778}, From 1b76b9d747b69c4c7607f7bccbb5bb2825223663 Mon Sep 17 00:00:00 2001 From: Nicole Ng Date: Sat, 31 Oct 2020 19:20:18 -0700 Subject: [PATCH 4/7] Update reference typo --- docs/user/references.bib | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/docs/user/references.bib b/docs/user/references.bib index 4dead1b188..b0f4f8cbf4 100644 --- a/docs/user/references.bib +++ b/docs/user/references.bib @@ -126,17 +126,18 @@ @misc{duan2016benchmarking } @inproceedings{10.5555/2980539.2980738, -author = {Kakade, Sham}, -title = {A Natural Policy Gradient}, -year = {2001}, -publisher = {MIT Press}, -address = {Cambridge, MA, USA}, -abstract = {We provide a natural gradient method that represents the steepest descent direction based on the underlying structure of the parameter space. Although gradient methods cannot make large changes in the values of the parameters, we show that the natural gradient is moving toward choosing a greedy optimal action rather than just a better action. These greedy optimal actions are those that would be chosen under one improvement step of policy iteration with approximate, compatible value functions, as defined by Sutton et al. [9]. We then show drastic performance improvements in simple MDPs and in the more challenging MDP of Tetris.}, -booktitle = {Proceedings of the 14th International Conference on Neural Information Processing Systems: Natural and Synthetic}, -pages = {1531–1538}, -numpages = {8}, -location = {Vancouver, British Columbia, Canada}, -series = {NIPS'01} + author = {Kakade, Sham}, + title = {A Natural Policy Gradient}, + year = {2001}, + publisher = {MIT Press}, + address = {Cambridge, MA, USA}, + abstract = {We provide a natural gradient method that represents the steepest descent direction based on the underlying structure of the parameter space. Although gradient methods cannot make large changes in the values of the parameters, we show that the natural gradient is moving toward choosing a greedy optimal action rather than just a better action. 
These greedy optimal actions are those that would be chosen under one improvement step of policy iteration with approximate, compatible value functions, as defined by Sutton et al. [9]. We then show drastic performance improvements in simple MDPs and in the more challenging MDP of Tetris.}, + booktitle = {Proceedings of the 14th International Conference on Neural Information Processing Systems: Natural and Synthetic}, + pages = {1531–1538}, + numpages = {8}, + location = {Vancouver, British Columbia, Canada}, + series = {NIPS'01} +} @misc{finn2017modelagnostic, title={Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks}, From f4912a82a210236d98806ee9255f31b2fcdcde5c Mon Sep 17 00:00:00 2001 From: Nicole Ng Date: Thu, 29 Oct 2020 02:26:37 -0700 Subject: [PATCH 5/7] Add tnpg docs --- docs/user/references.bib | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/user/references.bib b/docs/user/references.bib index b0f4f8cbf4..e3f9ef287d 100644 --- a/docs/user/references.bib +++ b/docs/user/references.bib @@ -147,3 +147,25 @@ @misc{finn2017modelagnostic archivePrefix={arXiv}, primaryClass={cs.LG} } +@misc{duan2016benchmarking, + title={Benchmarking Deep Reinforcement Learning for Continuous Control}, + author={Yan Duan and Xi Chen and Rein Houthooft and John Schulman and Pieter Abbeel}, + year={2016}, + eprint={1604.06778}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +@inproceedings{10.5555/2980539.2980738, + author = {Kakade, Sham}, + title = {A Natural Policy Gradient}, + year = {2001}, + publisher = {MIT Press}, + address = {Cambridge, MA, USA}, + abstract = {We provide a natural gradient method that represents the steepest descent direction based on the underlying structure of the parameter space. Although gradient methods cannot make large changes in the values of the parameters, we show that the natural gradient is moving toward choosing a greedy optimal action rather than just a better action. These greedy optimal actions are those that would be chosen under one improvement step of policy iteration with approximate, compatible value functions, as defined by Sutton et al. [9]. 
We then show drastic performance improvements in simple MDPs and in the more challenging MDP of Tetris.}, + booktitle = {Proceedings of the 14th International Conference on Neural Information Processing Systems: Natural and Synthetic}, + pages = {1531–1538}, + numpages = {8}, + location = {Vancouver, British Columbia, Canada}, + series = {NIPS'01} +} From f1483bddb77f82b8d53df16334eab4cbf1ae1d8d Mon Sep 17 00:00:00 2001 From: Nicole Ng Date: Fri, 30 Oct 2020 01:55:20 -0700 Subject: [PATCH 6/7] Fix pre-commit --- docs/user/references.bib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/references.bib b/docs/user/references.bib index e3f9ef287d..03dab61dc8 100644 --- a/docs/user/references.bib +++ b/docs/user/references.bib @@ -148,7 +148,7 @@ @misc{finn2017modelagnostic primaryClass={cs.LG} } @misc{duan2016benchmarking, - title={Benchmarking Deep Reinforcement Learning for Continuous Control}, + title={Benchmarking Deep Reinforcement Learning for Continuous Control}, author={Yan Duan and Xi Chen and Rein Houthooft and John Schulman and Pieter Abbeel}, year={2016}, eprint={1604.06778}, From 93a276117582a349c0c61bbf212b9477e338d666 Mon Sep 17 00:00:00 2001 From: Nicole Ng Date: Sat, 31 Oct 2020 19:20:18 -0700 Subject: [PATCH 7/7] Update reference typo --- docs/user/references.bib | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/docs/user/references.bib b/docs/user/references.bib index 03dab61dc8..b0f4f8cbf4 100644 --- a/docs/user/references.bib +++ b/docs/user/references.bib @@ -147,25 +147,3 @@ @misc{finn2017modelagnostic archivePrefix={arXiv}, primaryClass={cs.LG} } -@misc{duan2016benchmarking, - title={Benchmarking Deep Reinforcement Learning for Continuous Control}, - author={Yan Duan and Xi Chen and Rein Houthooft and John Schulman and Pieter Abbeel}, - year={2016}, - eprint={1604.06778}, - archivePrefix={arXiv}, - primaryClass={cs.LG} -} - -@inproceedings{10.5555/2980539.2980738, - author = {Kakade, Sham}, - title = {A Natural Policy Gradient}, - year = {2001}, - publisher = {MIT Press}, - address = {Cambridge, MA, USA}, - abstract = {We provide a natural gradient method that represents the steepest descent direction based on the underlying structure of the parameter space. Although gradient methods cannot make large changes in the values of the parameters, we show that the natural gradient is moving toward choosing a greedy optimal action rather than just a better action. These greedy optimal actions are those that would be chosen under one improvement step of policy iteration with approximate, compatible value functions, as defined by Sutton et al. [9]. We then show drastic performance improvements in simple MDPs and in the more challenging MDP of Tetris.}, - booktitle = {Proceedings of the 14th International Conference on Neural Information Processing Systems: Natural and Synthetic}, - pages = {1531–1538}, - numpages = {8}, - location = {Vancouver, British Columbia, Canada}, - series = {NIPS'01} -}
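The one-sentence summary added to `algo_tnpg.md` in PATCH 2/7 can be made concrete. The formulation below is standard background from the cited Kakade and Duan et al. papers, not something taken from this diff, and the exact normalization used by the implementation should be double-checked:

```eval_rst
.. math::

   \theta_{k+1} = \theta_k + \sqrt{\frac{2\,\delta_{KL}}{g^{\top} F^{-1} g}}\; F^{-1} g,
   \qquad g = \nabla_\theta J(\theta_k),

where :math:`J` is the expected discounted return, :math:`F` is the Fisher information matrix of the policy, and :math:`\delta_{KL}` plays the role of ``max_kl_step`` in the defaults above. TNPG never forms or inverts :math:`F`; it approximately solves :math:`F x = g` with a fixed (truncated) number of conjugate-gradient iterations, which is what keeps each update affordable for high-dimensional policies.
```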
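The **Examples** row dropped in PATCH 2/7 is left empty. A minimal training sketch could fill it; the snippet below follows the shape of garage's other TF examples, but the helper names (`wrap_experiment`, `TFTrainer`, `GymEnv`, `CategoricalMLPPolicy`, `LinearFeatureBaseline`) and the exact `TNPG` constructor arguments are assumptions that should be verified against the installed garage version before the page is published.

```py
#!/usr/bin/env python3
"""Hypothetical TNPG example (API names unverified against this garage release)."""
from garage import wrap_experiment
from garage.envs import GymEnv
from garage.experiment.deterministic import set_seed
from garage.np.baselines import LinearFeatureBaseline
from garage.tf.algos import TNPG
from garage.tf.policies import CategoricalMLPPolicy
from garage.trainer import TFTrainer


@wrap_experiment
def tnpg_cartpole(ctxt=None, seed=1):
    """Train TNPG on CartPole-v1 using the defaults listed on the doc page."""
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = GymEnv('CartPole-v1')

        # Any stochastic TF policy works; a small categorical MLP is enough here.
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        # A linear feature baseline keeps the advantage estimates low-variance.
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        # Keyword arguments mirror the "Default Parameters" section of the page.
        algo = TNPG(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    gae_lambda=0.98,
                    max_kl_step=0.01)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000)


tnpg_cartpole(seed=1)
```

CartPole is only a placeholder environment; any environment paired with a matching policy class would do, and since the keyword values shown equal the documented defaults they could be omitted entirely.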