Skip to content

Commit ed4d47e

Browse files
committed
PPO without GAE — not as stable as PPO with GAE
1 parent 2dd6ec5 commit ed4d47e

File tree

2 files changed

+29
-19
lines changed

2 files changed

+29
-19
lines changed

ppo_discrete.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import torch.nn.functional as F
88
import torch.optim as optim
99
from torch.distributions import Categorical
10+
import numpy as np
1011

1112
#Hyperparameters
1213
learning_rate = 0.0005
@@ -25,6 +26,7 @@ def __init__(self, state_dim, action_dim):
2526
self.fc_pi = nn.Linear(256,action_dim)
2627
self.fc_v = nn.Linear(256,1)
2728
self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
29+
self.mseLoss = nn.MSELoss()
2830

2931
def pi(self, x, softmax_dim = 0):
3032
x = F.relu(self.fc1(x))
@@ -50,30 +52,34 @@ def make_batch(self):
5052
r_lst.append([r])
5153
s_prime_lst.append(s_prime)
5254
prob_a_lst.append([prob_a])
53-
done_mask = 0 if done else 1
54-
done_lst.append([done_mask])
55+
done_lst.append([int(done)])
5556

56-
s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
57+
s,a,r,s_prime,done, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
5758
torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
5859
torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
5960
self.data = []
60-
return s, a, r, s_prime, done_mask, prob_a
61+
return s, a, r, s_prime, done, prob_a
6162

6263
def train_net(self):
63-
s, a, r, s_prime, done_mask, prob_a = self.make_batch()
64+
s, a, r, s_prime, done, prob_a = self.make_batch()
6465

65-
for i in range(K_epoch):
66-
td_target = r + gamma * self.v(s_prime) * done_mask
67-
delta = td_target - self.v(s)
68-
delta = delta.detach().numpy()
66+
rewards = []
67+
discounted_r = 0
68+
for reward, d in zip(reversed(r), reversed(done)):
69+
if d:
70+
discounted_r = 0
71+
discounted_r = reward + gamma * discounted_r
72+
rewards.insert(0, discounted_r)
73+
# rewards.append(discounted_r)
74+
rewards = torch.tensor(rewards, dtype=torch.float32)
75+
if rewards.shape[0]>1: # a batch with size 1 will cause 0 std
76+
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
77+
rewards = rewards.unsqueeze(dim=-1)
6978

70-
advantage_lst = []
71-
advantage = 0.0
72-
for delta_t in delta[::-1]:
73-
advantage = gamma * lmbda * advantage + delta_t[0]
74-
advantage_lst.append([advantage])
75-
advantage_lst.reverse()
76-
advantage = torch.tensor(advantage_lst, dtype=torch.float)
79+
for _ in range(K_epoch):
80+
vs = self.v(s)
81+
advantage = rewards - vs.detach()
82+
vs_target = rewards
7783

7884
pi = self.pi(s, softmax_dim=-1)
7985
dist_entropy = Categorical(pi).entropy()
@@ -82,7 +88,8 @@ def train_net(self):
8288

8389
surr1 = ratio * advantage
8490
surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
85-
loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach()) - 0.01*dist_entropy
91+
# loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , vs_target.detach()) - 0.01*dist_entropy
92+
loss = -torch.min(surr1, surr2) + 0.5*self.mseLoss(vs , vs_target.detach()) - 0.01*dist_entropy
8693

8794
self.optimizer.zero_grad()
8895
loss.mean().backward()
@@ -118,8 +125,9 @@ def main():
118125
model.train_net()
119126
epi_len.append(t)
120127
if n_epi%print_interval==0 and n_epi!=0:
121-
print("# of episode :{}, avg score : {:.1f}, avg epi length :{}".format(n_epi, score/print_interval, int(np.mean(epi_len)))
128+
print("# of episode :{}, avg score : {:.3f}, avg epi length :{}".format(n_epi, score/print_interval, int(np.mean(epi_len))))
122129
score = 0.0
130+
epi_len = []
123131

124132
env.close()
125133

ppo_gae_discrete.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import torch.nn.functional as F
88
import torch.optim as optim
99
from torch.distributions import Categorical
10+
import numpy as np
1011

1112
#Hyperparameters
1213
learning_rate = 0.0005
@@ -118,8 +119,9 @@ def main():
118119
model.train_net()
119120
epi_len.append(t)
120121
if n_epi%print_interval==0 and n_epi!=0:
121-
print("# of episode :{}, avg score : {:.1f}, avg epi length :{}".format(n_epi, score/print_interval, int(np.mean(epi_len)))
122+
print("# of episode :{}, avg score : {:.1f}, avg epi length :{}".format(n_epi, score/print_interval, int(np.mean(epi_len))))
122123
score = 0.0
124+
epi_len = []
123125

124126
env.close()
125127

0 commit comments

Comments (0)