-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathValue_Iteration.py
More file actions
92 lines (84 loc) · 2.67 KB
/
Value_Iteration.py
File metadata and controls
92 lines (84 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
## package
import numpy as np
import time, os
## define function
def ValueIteration(func_value, func_reward, trans_mat, gamma):
    """Run one in-place sweep of value iteration over the interior states.

    The first and last states are never updated. States are visited in index
    order and ``func_value`` is overwritten as the sweep proceeds, so states
    later in the sweep already see the new values of earlier ones.

    Args:
        func_value: 1-D array of current state values. It is MODIFIED IN
            PLACE and the same object is returned.
        func_reward: 1-D array added to the discounted values; it is indexed
            by the state being entered.
        trans_mat: array sliced as ``[next_state, state, action]``
            (presumably transition probabilities -- confirm against the
            file that produces it).
        gamma: discount factor applied to ``func_value``.

    Returns:
        A tuple ``(func_value, delta, best_action)`` where ``delta`` is the
        largest absolute change of any state value during this sweep and
        ``best_action`` is a float array holding, per state, the index of
        the maximizing action (0 for the two states that are not updated).
    """
    num_states = len(func_value)
    best_action = np.zeros(num_states)
    value_before = func_value.copy()
    for state in range(1, num_states - 1):
        # Recomputed for every state on purpose: func_value changes during
        # the sweep and the newest values must be used.
        future_reward = func_reward + func_value * gamma
        # One expected return per action; computed once and reused for both
        # the maximum and its index.
        action_values = np.matmul(
            np.transpose(trans_mat[:, state, :]), future_reward
        )
        func_value[state] = np.max(action_values)
        best_action[state] = np.argmax(action_values)
    delta = np.max(np.abs(func_value - value_before))
    return func_value, delta, best_action
def ShowValue(delta, theta, gamma, counter_total, func_value):
    """Print the report for one sweep: parameters, delta and the value grid.

    Args:
        delta: value printed under ``[Variables]``.
        theta: value printed as the threshold.
        gamma: value printed as gamma.
        counter_total: sweep number shown in the heading.
        func_value: array with 16 entries; printed reshaped to 4x4.
    """
    banner = '=' * 60
    # ``!s`` forces str(), matching plain string concatenation exactly.
    heading_lines = (
        banner,
        f'No. {counter_total!s} Value Iteration',
        banner,
        '[Parameters]',
        f'Gamma = {gamma!s}',
        f'Threshold = {theta!s}\n',
        '[Variables]',
        f'Delta = {delta!s}\n',
        '[State-Value]',
    )
    for line in heading_lines:
        print(line)
    print(func_value.reshape(4, 4))
    print(banner)
def ShowPolicy(counter_total, best_action):
    """Print the greedy policy as a 4x4 grid of symbols and return it flat.

    States 0 and 15 are shown as ``*``. For states 1..14 the action index
    is mapped to a symbol: 0 -> ``^``, 1 -> ``<``, 2 -> ``v``, 3 -> ``>``.
    An action index outside 0..3 contributes no symbol at all.

    Args:
        counter_total: accepted but not used by this function.
        best_action: indexable of per-state action indices (entries 1..14
            are read).

    Returns:
        1-D NumPy array of the symbols, before the 4x4 reshape.
    """
    symbol_of = {0: '^', 1: '<', 2: 'v', 3: '>'}
    symbols = ['*']
    for state in range(1, 15):
        symbol = symbol_of.get(best_action[state])
        if symbol is not None:
            symbols.append(symbol)
    symbols.append('*')
    policy_string = np.array(symbols)
    print('[Policy]')
    print(policy_string.reshape(4, 4))
    print('=' * 60)
    return policy_string
# main function
def main():
    """Run value iteration on the 16-state grid world and print each sweep.

    Loads the transition matrix from ``./gridworld/T.npy``, then repeats
    ``ValueIteration`` until the largest value change of a sweep is no
    longer above the threshold ``theta``. After every sweep the screen is
    cleared, the values and the greedy policy are printed, and the program
    pauses for two seconds. A final summary is printed at the end.
    """
    ## environment setting
    # value function
    func_value = np.zeros(16)
    # reward function: -1 everywhere except states 0 and 15
    func_reward = np.full(16, -1)
    func_reward[0] = 0
    func_reward[15] = 0
    # transition matrix
    trans_mat = np.load('./gridworld/T.npy')
    # parameters
    gamma = 0.99
    theta = 0.05
    # Start just above the threshold so the loop body runs at least once.
    delta = theta + 0.001
    counter_total = 0
    clear_command = 'cls' if os.name == 'nt' else 'clear'
    # iteration
    while delta > theta:
        counter_total += 1
        os.system(clear_command)
        # Rebind from the return value instead of relying on the callee
        # having mutated the array that was passed in.
        func_value, delta, best_action = ValueIteration(
            func_value, func_reward, trans_mat, gamma
        )
        ShowValue(delta, theta, gamma, counter_total, func_value)
        policy_string = ShowPolicy(counter_total, best_action)
        time.sleep(2)
    os.system(clear_command)
    print('='*60)
    print('Final Result')
    print('='*60)
    print('[State-value]')
    print(func_value.reshape(4, 4))
    print('='*60)
    print('[Policy]')
    print(policy_string.reshape(4, 4))
    print('='*60)
## execute
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()