learnables · seba-1511 · Feb 27, 2020 · Feb 21, 2020 · Feb 21, 2020 · Feb 26, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+* Bugfix when using `td.discount` with replays coming from vectorized environments. (thanks @galatolofederico)
 * env.action_size and env.state_size when the number of vectorized environments is 1. (thanks @galatolofederico)
 * Actor-critic integration test being to finicky.
 * `cherry.onehot` support for numpy's float and integer types. (thanks @ngoby)
diff --git a/cherry/td.py b/cherry/td.py
@@ -52,7 +52,7 @@ def discount(gamma, rewards, dones, bootstrap=0.0):
 
     msg = 'dones and rewards must have equal length.'
     assert rewards.size(0) == dones.size(0), msg
-    R = th.zeros_like(rewards[0]) + bootstrap
+    R = th.zeros_like(rewards) + bootstrap
     discounted = th.zeros_like(rewards)
     length = discounted.size(0)
     for t in reversed(range(length)):

diff --git a/tests/unit/rl_tests.py b/tests/unit/rl_tests.py
@@ -9,6 +9,8 @@
 TAU = 0.9
 NUM_SAMPLES = 10
 VECTOR_SIZE = 5
+TIME_STEPS = 10
+NUM_ENVS = 4
 
 """
 TODO: Should test each method to make sure that they properly handle different
@@ -61,6 +63,52 @@ def setUp(self):
 
     def tearDown(self):
         pass
+
+
+    def test_vectorized_discount(self):
+        """Vectorized `ch.td.discount` must match per-environment results.
+
+        Column i of the vectorized output is compared with env i alone.
+        """
+        state = th.randn(TIME_STEPS, NUM_ENVS, VECTOR_SIZE)
+        action = th.randn(TIME_STEPS, NUM_ENVS)
+        reward = th.randn(TIME_STEPS, NUM_ENVS)
+        bootstrap = th.randn(NUM_ENVS)
+        done = th.zeros_like(reward)
+        # Flag one termination per trailing time step, cycling through envs.
+        for t in list(reversed(range(TIME_STEPS)))[:NUM_ENVS]:
+            done[t, t % NUM_ENVS] = 1
+
+        # Reference: discount every environment through its own replay.
+        nonvec_discounted_rewards = []
+        for i in range(NUM_ENVS):
+            replay = ch.ExperienceReplay()
+            for t in range(TIME_STEPS):
+                replay.append(
+                    state[t, i, :], action[t, i],
+                    reward[t, i], state[t, i, :], done[t, i]
+                )
+            nonvec_discounted_rewards.append(
+                ch.td.discount(
+                    GAMMA, replay.reward(), replay.done(), bootstrap[i]
+                )
+            )
+
+        # Vectorized: one replay whose transitions carry all environments.
+        replay = ch.ExperienceReplay()
+        for t in range(TIME_STEPS):
+            replay.append(
+                state[t, :, :], action[t, :],
+                reward[t, :], state[t, :, :], done[t, :]
+            )
+        vec_discounted_rewards = ch.td.discount(
+            GAMMA, replay.reward(), replay.done(), bootstrap
+        )
+
+        for i in range(NUM_ENVS):
+            expected = nonvec_discounted_rewards[i][:, 0]
+            assert th.all(expected == vec_discounted_rewards[:, i])
+
 
     def test_discount(self):
         vector = th.randn(VECTOR_SIZE)