import pickle

import pandas as pd
import torch

from agent import AgentTD3_with_safe_action
from random_generator_battery import ESSEnv
from tools import Arguments, ReplayBuffer, get_episode_return_safe, test_ten_episodes_safe
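
# Train the safe-action TD3 agent (AgentTD3_with_safe_action) on the battery
# scheduling environment ESSEnv over a list of random seeds, log per-episode
# reward/unbalance/cost and critic/actor losses, pickle the records, save the
# actor weights, and optionally evaluate the saved actor over ten episodes.
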
def update_buffer(_trajectory):
    """Push one exploration trajectory into the replay buffer.

    Each trajectory item is (state, other) with other = (reward, done, *action).
    Returns the number of stored transitions and their mean reward.
    """
    ten_state = torch.as_tensor([item[0] for item in _trajectory], dtype=torch.float32)
    ary_other = torch.as_tensor([item[1] for item in _trajectory])
    # column 0 keeps the raw reward; column 1 turns the done flag into a discount mask
    ary_other[:, 1] = (1.0 - ary_other[:, 1]) * gamma  # ten_mask = (1.0 - ary_done) * gamma
    buffer.extend_buffer(ten_state, ary_other)

    _steps = ten_state.shape[0]
    _r_exp = ary_other[:, 0].mean()  # other = (reward, mask, action)
    return _steps, _r_exp
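
# update_buffer relies on the module-level names `buffer` and `gamma`, which are
# bound inside the __main__ block below before the first call.
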
if __name__ == '__main__':
    args = Arguments()
    reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
    loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
    args.visible_gpu = '0'
    all_seeds_reward_record = {}
    if args.random_seed_list:
        for seed in args.random_seed_list:
            # reward record for this seed
            reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': [], 'cost': []}
            # loss record for this seed
            loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
            args.random_seed = seed
            args.agent = AgentTD3_with_safe_action()
            agent_name = f'{args.agent.__class__.__name__}'
            args.agent.cri_target = True
            args.env = ESSEnv()
            args.init_before_training(if_main=True)

            '''init agent and environment'''
            agent = args.agent
            env = args.env
            all_seeds_reward_record[seed] = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': [],
                                             'cost': []}
            agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0], args.learning_rate,
                       args.if_per_or_gae)

            '''init replay buffer'''
            buffer = ReplayBuffer(max_len=args.max_memo, state_dim=env.state_space.shape[0],
                                  action_dim=env.action_space.shape[0])

            '''start training'''
            cwd = args.cwd
            gamma = args.gamma
            batch_size = args.batch_size  # number of transitions sampled per network update
            target_step = args.target_step  # number of environment steps collected per exploration rollout
            repeat_times = args.repeat_times  # number of update iterations per collected batch
            soft_update_tau = args.soft_update_tau
            agent.state = env.reset()

            '''collect data and train and update network'''
            num_episode = args.num_episode
            # stage toggles (uncomment to skip a stage):
            # args.train = False
            # args.save_network = False
            # args.test_network = False
            # args.save_test_data = False
            # args.compare_with_pyomo = False
            if args.train:
                # warm-up: fill the replay buffer with safe exploration before any update
                collect_data = True
                while collect_data:
                    print(f'buffer:{buffer.now_len}')
                    with torch.no_grad():
                        trajectory = agent.explore_env_safe(env, target_step)
                        steps, r_exp = update_buffer(trajectory)
                    buffer.update_now_len()
                    if buffer.now_len >= 10000:
                        collect_data = False
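
                # training loop: one network update per episode, a greedy evaluation
                # episode for logging, and a fresh safe-exploration rollout every
                # 10 episodes to keep the buffer populated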
                for i_episode in range(num_episode):
                    reward_record['episode'].append(i_episode)
                    loss_record['episode'].append(i_episode)
                    critic_loss, actor_loss = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)
                    loss_record['critic_loss'].append(critic_loss)
                    loss_record['actor_loss'].append(actor_loss)
                    with torch.no_grad():
                        episode_reward, episode_unbalance, episode_cost = get_episode_return_safe(env, agent.act,
                                                                                                  agent.device)
                        reward_record['mean_episode_reward'].append(episode_reward)
                        reward_record['unbalance'].append(episode_unbalance)
                        reward_record['cost'].append(episode_cost)
                    print(f'current episode is {i_episode}, reward:{episode_reward}, unbalance:{episode_unbalance}, '
                          f'cost:{episode_cost}, buffer_length:{buffer.now_len}')
                    if i_episode % 10 == 0:
                        with torch.no_grad():
                            trajectory = agent.explore_env_safe(env, target_step)
                            steps, r_exp = update_buffer(trajectory)
            all_seeds_reward_record[seed] = reward_record
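
    # Note: loss_record and reward_record below hold only the last seed's history;
    # all_seeds_reward_record keeps the curves for every seed.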
    loss_record_path = f'{args.cwd}/loss_data.pkl'
    reward_record_path = f'{args.cwd}/reward_data.pkl'
    all_seeds_reward_record_path = f'{args.cwd}/all_seeds_reward_record.pkl'
    with open(loss_record_path, 'wb') as tf:
        pickle.dump(loss_record, tf)
    with open(reward_record_path, 'wb') as tf:
        pickle.dump(reward_record, tf)
    with open(all_seeds_reward_record_path, 'wb') as tf:
        pickle.dump(all_seeds_reward_record, tf)
    # only the actor from the last trained seed is stored
    act_save_path = f'{args.cwd}/actor.pth'
    if args.save_network:
        torch.save(agent.act.state_dict(), act_save_path)
        print('actor parameters have been saved')
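
    # evaluation: reload the saved actor and run ten test episodes with safe actions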
    if args.test_network:
        args.cwd = agent_name
        agent.act.load_state_dict(torch.load(act_save_path))
        print('parameters have been reloaded for testing')
        state = env.reset()
        record = test_ten_episodes_safe(state, env, agent.act, agent.device)
        print(record)
        # eval_data = pd.DataFrame(record['information'])
        # eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery',
        #                      'gen1', 'gen2', 'gen3', 'unbalance', 'operation_cost']
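
# A minimal sketch for inspecting the saved training curves afterwards; it only
# assumes the dict-of-lists layout pickled above (file names and keys come from
# this script, everything else is illustrative):
#
#     with open(f'{args.cwd}/reward_data.pkl', 'rb') as tf:
#         rewards = pickle.load(tf)  # {'episode': [...], 'mean_episode_reward': [...], ...}
#     df = pd.DataFrame({k: v for k, v in rewards.items() if v})  # skip keys left empty, e.g. 'steps'
#     print(df[['episode', 'mean_episode_reward', 'unbalance', 'cost']].tail())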