Update discrete PPO: take state/action dimensions from the environment

quantumiracle · quantumiracle · commit 577e61d9e067 · 2020-02-21T16:06:43.000-05:00
diff --git a/ppo_continuous_multiprocess2.py b/ppo_continuous_multiprocess2.py
@@ -50,7 +50,7 @@
 
 #####################  hyper parameters  ####################
 
-ENV_NAME = 'Pendulum-v0'  # environment name
+ENV_NAME = 'LunarLanderContinuous-v2'  # environment name, e.g. Pendulum-v0 (note: LunarLander-v2 is the discrete-action variant)
 RANDOMSEED = 2  # random seed
 
 EP_MAX = 1000  # total number of episodes for training
@@ -63,7 +63,7 @@
 C_UPDATE_STEPS = 10  # critic update steps
 EPS = 1e-8  # numerical residual
 MODEL_PATH = 'model/ppo_multi'
-NUM_WORKERS=2  # or: mp.cpu_count()
+NUM_WORKERS=1  # or: mp.cpu_count()
 ACTION_RANGE = 2.  # if unnormalized, normalized action range should be 1.
 METHOD = [
     dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
diff --git a/ppo_gae_discrete.py b/ppo_gae_discrete.py
@@ -14,12 +14,12 @@
 T_horizon     = 20
 
 class PPO(nn.Module):
-    def __init__(self):
+    def __init__(self, state_dim, action_dim):
         super(PPO, self).__init__()
         self.data = []
         
-        self.fc1   = nn.Linear(4,256)
-        self.fc_pi = nn.Linear(256,2)
+        self.fc1   = nn.Linear(state_dim,256)
+        self.fc_pi = nn.Linear(256,action_dim)
         self.fc_v  = nn.Linear(256,1)
         self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
 
@@ -86,7 +86,9 @@ def train_net(self):
         
 def main():
     env = gym.make('CartPole-v1')
-    model = PPO()
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n  # number of actions (discrete action space)
+    model = PPO(state_dim, action_dim)
     score = 0.0
     print_interval = 20
 
@@ -99,8 +101,9 @@ def main():
                 m = Categorical(prob)
                 a = m.sample().item()
                 s_prime, r, done, info = env.step(a)
-
+                # env.render()
                 model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done))
+
                 s = s_prime
 
                 score += r