import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
+import numpy as np

#Hyperparameters
learning_rate = 0.0005
@@ -25,6 +26,7 @@ def __init__(self, state_dim, action_dim):
        self.fc_pi = nn.Linear(256, action_dim)
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
+        self.mseLoss = nn.MSELoss()

    def pi(self, x, softmax_dim=0):
        x = F.relu(self.fc1(x))
@@ -50,30 +52,34 @@ def make_batch(self):
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
-            done_mask = 0 if done else 1
-            done_lst.append([done_mask])
+            done_lst.append([int(done)])

-        s, a, r, s_prime, done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
+        s, a, r, s_prime, done, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
                                   torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
                                   torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
        self.data = []
-        return s, a, r, s_prime, done_mask, prob_a
+        return s, a, r, s_prime, done, prob_a

    def train_net(self):
-        s, a, r, s_prime, done_mask, prob_a = self.make_batch()
+        s, a, r, s_prime, done, prob_a = self.make_batch()

-        for i in range(K_epoch):
-            td_target = r + gamma * self.v(s_prime) * done_mask
-            delta = td_target - self.v(s)
-            delta = delta.detach().numpy()
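+        # Monte Carlo estimate of returns: walk the batch backwards and reset
+        # the running discounted sum at each episode boundary (d == 1).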
+        rewards = []
+        discounted_r = 0
+        for reward, d in zip(reversed(r), reversed(done)):
+            if d:
+                discounted_r = 0
+            discounted_r = reward + gamma * discounted_r
+            rewards.insert(0, discounted_r)
+            # rewards.append(discounted_r)
+        rewards = torch.tensor(rewards, dtype=torch.float32)
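+        # normalize the returns to zero mean / unit std before using them as targets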
+        if rewards.shape[0] > 1:  # a batch with size 1 will cause 0 std
+            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
+        rewards = rewards.unsqueeze(dim=-1)

-            advantage_lst = []
-            advantage = 0.0
-            for delta_t in delta[::-1]:
-                advantage = gamma * lmbda * advantage + delta_t[0]
-                advantage_lst.append([advantage])
-            advantage_lst.reverse()
-            advantage = torch.tensor(advantage_lst, dtype=torch.float)
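+        # K_epoch PPO updates: the advantage is the normalized return minus the
+        # detached value baseline, replacing the GAE recursion removed above.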
+        for _ in range(K_epoch):
+            vs = self.v(s)
+            advantage = rewards - vs.detach()
+            vs_target = rewards

            pi = self.pi(s, softmax_dim=-1)
            dist_entropy = Categorical(pi).entropy()
@@ -82,7 +88,8 @@ def train_net(self):

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
-            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s), td_target.detach()) - 0.01 * dist_entropy
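+            # clipped surrogate policy loss + 0.5 * critic MSE loss - entropy bonus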
+            # loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s), vs_target.detach()) - 0.01 * dist_entropy
+            loss = -torch.min(surr1, surr2) + 0.5 * self.mseLoss(vs, vs_target.detach()) - 0.01 * dist_entropy

            self.optimizer.zero_grad()
            loss.mean().backward()
@@ -118,8 +125,9 @@ def main():
            model.train_net()
        epi_len.append(t)
        if n_epi % print_interval == 0 and n_epi != 0:
-            print("# of episode :{}, avg score : {:.1f}, avg epi length :{}".format(n_epi, score / print_interval, int(np.mean(epi_len))))
+            print("# of episode :{}, avg score : {:.3f}, avg epi length :{}".format(n_epi, score / print_interval, int(np.mean(epi_len))))
            score = 0.0
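+            # clear the episode-length window so the next average covers only the new interval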
+            epi_len = []

    env.close()