main_RL_discrete.py
from multiprocessing import Pool, freeze_support

import gym
import numpy as np
import matplotlib.pyplot as plt

from model_RL import PolicyDiscrete
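# NOTE (assumed interface): model_RL is a local module that is not shown here.
# From its usage below, PolicyDiscrete appears to expose sample_action(states)
# for stochastic action selection, predict(states) for greedy action selection,
# and update(states, actions, returns) for one policy-gradient step.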

# Run one full episode in a worker process and return its trajectory.
# Each worker builds its own environment instance, since a gym environment
# cannot be shared across processes.
def sample(model, problem):
    env = gym.make(problem)
    prev_state, info = env.reset()
    reward_list = []
    state_list = []
    action_list = []
    while True:
        action = model.sample_action(np.array([prev_state]))
        state, reward, terminated, truncated, info = env.step(action)
        reward_list.append(reward)
        state_list.append(prev_state)
        action_list.append(action)
        if terminated or truncated:
            return np.array(state_list), np.array(action_list), np.array(reward_list)
        prev_state = state
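
# The training loop below follows REINFORCE (Monte Carlo policy gradient):
# sample complete episodes, compute a discounted return G_t for each step,
# and update the policy in the direction of grad log pi(a_t | s_t) * G_t,
# with the returns standardized per episode to reduce gradient variance.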

if __name__ == "__main__":
    # Parallel setting: freeze_support() is needed when the script is frozen
    # into a Windows executable; the pool runs the episode sampling
    freeze_support()
    pool = Pool(12)
    # OpenAI Gym environment setting
    problem = "CartPole-v1"
    env = gym.make(problem, render_mode="human")
    # Get the dimensions of the state and action spaces
    num_states = env.observation_space.shape[0]
    print("Size of State Space -> {}".format(num_states))
    num_actions = env.action_space.n
    print("Size of Action Space -> {}".format(num_actions))
    print()
    # Declare the neural network policy model
    model = PolicyDiscrete([num_states, 64, 64, num_actions], learning_rate=0.001)
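    # NOTE: the model instance is shipped to each pool worker as a task
    # argument, so it must be picklable for multiprocessing to pass it along.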

    # Training
    total_episodes = 2000
    avg_reward_list = []
    for ep in range(total_episodes):
        # Sample 128 episodes in parallel
        pool_buffer = pool.starmap_async(sample, [(model, problem)] * 128)
        S, A, R, avg_R = [], [], [], []
        for s, a, r in pool_buffer.get():
            S.append(s)
            A.append(a)
            # Record the reward sum of each episode for display
            avg_R.append(np.sum(r))
            # Compute the discounted return G_t = r_t + gamma * G_{t+1}
            cumulative = 0.
            gamma = 0.999
            discount_rewards = np.zeros_like(r)
            for i in reversed(range(len(r))):
                cumulative = cumulative * gamma + r[i]
                discount_rewards[i] = cumulative
            # Standardize the returns to control the variance of the policy gradient
            discount_rewards -= np.mean(discount_rewards)
            discount_rewards /= np.std(discount_rewards)
            R.append(discount_rewards)
        # Apply the policy gradient to update the model's parameters
        model.update(S, A, R)
        # Display the average reward sum over the 128 sampled episodes
        reward_sum = np.mean(avg_R)
        print("Episode * {} * Avg Reward is ==> {}".format(ep + 1, reward_sum))
        avg_reward_list.append(reward_sum)

    # Plot episodes versus average episodic reward
    plt.plot(avg_reward_list)
    plt.xlabel("Episode")
    plt.ylabel("Avg. Episodic Reward")
    plt.show()

    # Shut down the worker pool
    pool.close()
    pool.join()

    # Testing: run 10 episodes with the greedy policy and render them
    for i in range(10):
        reward_sum = 0
        prev_state, info = env.reset()
        while True:
            env.render()
            action = model.predict(np.array([prev_state]))[0]
            state, reward, terminated, truncated, _ = env.step(action)
            reward_sum += reward
            if terminated or truncated:
                break
            prev_state = state
        print("eps:", i, "total rewards:", reward_sum)