machine_replacement_gen_return_dists.py
# Machine replacement MDP: compute the optimal policy for the posterior mean
# reward, then sweep lambda in a CVaR optimization to trace the efficient
# frontier, writing the resulting occupancy vectors to a results file.
import bayesian_irl
import mdp_worlds
import utils
import mdp
import numpy as np
import scipy
import random
import generate_efficient_frontier
from machine_replacement import generate_posterior_samples

if __name__ == "__main__":
    # seed every RNG used so runs are reproducible
    seed = 1234
    np.random.seed(seed)
    scipy.random.seed(seed)
    random.seed(seed)
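
    # experiment hyperparameters: a 4-state machine-replacement chain, the
    # number of posterior samples, the discount factor, and the CVaR
    # confidence level alpha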
    num_states = 4
    num_samples = 2000
    gamma = 0.95
    alpha = 0.99
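
    # draw reward samples R(s,a) from the Bayesian posterior; each column of
    # `posterior` is one sampled reward vector, so the mean over axis 1 below
    # gives the posterior mean reward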
    posterior = generate_posterior_samples(num_samples)

    r_sa = np.mean(posterior, axis=1)
    init_distribution = np.ones(num_states) / num_states  # uniform distribution
    mdp_env = mdp.MachineReplacementMDP(num_states, r_sa, gamma, init_distribution)
    print("mean MDP reward", r_sa)
    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)

    # write the mean-reward policy out to file
    # (assumes ./results/machine_replacement/ already exists)
    f = open('./results/machine_replacement/policy_usas.txt', 'w')
    f.write("--mean policy\n")
    utils.write_line(u_sa, f)
    print("mean policy from posterior")
    utils.print_stochastic_policy_action_probs(u_sa, mdp_env)
    print("MAP/Mean policy from posterior")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("rewards")
    print(mdp_env.r_sa)
    print("expected value = ", np.dot(u_sa, r_sa))
    stoch_pi = utils.get_optimal_policy_from_usa(u_sa, mdp_env)
    print("expected return", mdp.get_policy_expected_return(stoch_pi, mdp_env))
    print("values", mdp.get_state_values(u_sa, mdp_env))
    print('q-values', mdp.get_q_values(u_sa, mdp_env))

    # run CVaR optimization, maybe just the robust version for now
    u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)  # no expert/baseline occupancies
    posterior_probs = np.ones(num_samples) / num_samples  # uniform dist since samples come from MCMC

    # generate the efficient frontier by sweeping the trade-off weight lambda
    lambda_range = [0.0, 0.5, 0.95]
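
    # for each lambda, solve_max_cvar_policy returns the CVaR-optimal
    # occupancy frequencies along with the CVaR and expected return achieved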
    for i, lamda in enumerate(lambda_range):
        print("lambda = ", lamda)
        cvar_opt_usa, cvar, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, posterior, posterior_probs, alpha, False, lamda)
        print('action probs')
        utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env)
        stoch_pi = utils.get_optimal_policy_from_usa(cvar_opt_usa, mdp_env)
        print(stoch_pi[:, 1])  # probabilities of the second action in each state
        f.write('--lambda {}\n'.format(lamda))
        utils.write_line(cvar_opt_usa, f)
    f.close()