forked from IgnacioCarlucho/Double_QPID
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathalgorithm_4.py
More file actions
executable file
·127 lines (102 loc) · 6.09 KB
/
algorithm_4.py
File metadata and controls
executable file
·127 lines (102 loc) · 6.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import numpy as np
from functions import identify_nearest_centroid_for_multiple_tables
from algorithm_2 import algorithm_2
from double_Q import DQPID
import time
def algorithm_4(Q_arrange,Mt,next_state,reward,maximum_depth,number_of_actions,e_greed_counter,set_point, flag_ab, K_step):
start = time.time()
#print('algoritmo 4')
state_index = Mt[0].astype(int)
action_index = Mt[1].astype(int)
Q_index = Mt[2].astype(int)
action = Mt[3:,]
#print('Q_index', Q_index)
next_Q_index = Q_index
next_state_index, min_distance_to_centroid = Q_arrange[Q_index].identify_nearest_centroid(next_state)
delta = Q_arrange[Q_index].max_state
l_depth = Q_arrange[Q_index].depth
#if(state_index != next_state_index):
#print('*********************************')
#print('****houston we have a problem****')
#print('*********************************')
# Create new table with higher depth, because next state is inside
if (np.abs(min_distance_to_centroid) <= np.abs(delta)) and (l_depth<maximum_depth):
print('new object in algorithm 4')
#
#
amount_of_objects = len(Q_arrange)
min_state = Q_arrange[Q_index].centroids[state_index] + Q_arrange[Q_index].min_state
max_state = Q_arrange[Q_index].centroids[state_index] + Q_arrange[Q_index].max_state
Q_cheat = (Q_arrange[Q_index].Q_A[state_index][action_index] + Q_arrange[Q_index].Q_B[state_index][action_index])/2.
current_depth_v = Q_arrange[Q_index].depth
new_centroid = np.array([Q_arrange[Q_index].centroids[state_index]]) # this is to give it an array format
Q_arrange.append(DQPID(new_centroid,Q_index, current_depth_v + 1, action, number_of_actions, maximum_depth, action_index, Q_cheat, K_step ))
#Q_arrange[amount_of_objects] = DQPID(new_centroid,Q_index, current_depth_v + 1, action, number_of_actions, maximum_depth, action_index, Q_cheat )
#temp_Q_index = len(Q_arrange)
temp_Q_index = amount_of_objects
new_state_index, new_min_distance_to_centroid = Q_arrange[temp_Q_index].identify_nearest_centroid(next_state)
delta_new = Q_arrange[temp_Q_index].max_state
if (np.abs(new_min_distance_to_centroid) >= delta_new):
#print('new centroid when creating object in algortihm 4')
new_state_index = Q_arrange[temp_Q_index].get_new_centroid(next_state)
#TODO this is controversial, but when Creating a new object I like to reset the greed counter
e_greed_counter = 0.
#I increase the descendence of the parent
Q_arrange[Q_index].descendence = Q_arrange[Q_index].descendence + 1
# I store the index of the son in the parent
Q_arrange[Q_index].descendence_index = np.append(Q_arrange[Q_index].descendence_index , amount_of_objects)
# I set the next table
next_Q_index = temp_Q_index
#next_state_index, min_distance_to_centroid = Q_arrange[Q_index].identify_nearest_centroid(next_state)
print('state_index',state_index, action_index)
if flag_ab=='A':
Q_B_max_next_value = np.max(Q_arrange[next_Q_index].Q_B[new_state_index])
Q_arrange[Q_index].update_Q(state_index,action_index,reward,Q_B_max_next_value,flag_ab)
else:
Q_A_max_next_value = np.max(Q_arrange[next_Q_index].Q_A[new_state_index])
Q_arrange[Q_index].update_Q(state_index,action_index,reward,Q_A_max_next_value,flag_ab)
# maximum depth
elif(l_depth==maximum_depth and np.abs(min_distance_to_centroid)<=np.abs(delta) ):
#print('algorithm 2')
Q_arrange[Q_index] = algorithm_2(Q_arrange[Q_index], state_index, next_state, reward, action_index, flag_ab)
next_Q_index = Q_index
# goes back one discratization level because next state is outside
elif(np.abs(min_distance_to_centroid)>= np.abs(delta)):
stop_loop = False
print('it went in the else algorithm_4')
print('**********************************')
print('****are you sure this is correct**')
print('**********************************')
print('*********you should check this****')
print('**********************************')
while stop_loop == False:
l_depth = l_depth - 1
# I search in lower depths to see if there is a centroid for the state
if l_depth > 0:
temp_min_distance_to_centroid, temp_state_index, temp_Q_index = identify_nearest_centroid_for_multiple_tables(Q_arrange,l_depth,next_state)
else:
l_depth = 1 # bug catcher, means deapth cant be smaller than 1
temp_min_distance_to_centroid, temp_state_index, temp_Q_index = identify_nearest_centroid_for_multiple_tables(Q_arrange,l_depth,next_state)
# if the distance the the centroid is less than the minimum I create a new centroid in higher depth
if( np.abs(temp_min_distance_to_centroid)<Q_arrange[temp_Q_index].max_state and l_depth>1):
l_depth = l_depth +1
min_distance_to_centroid, state_index, Q_index = identify_nearest_centroid_for_multiple_tables(Q_arrange,l_depth,next_state)
Q_arrange[Q_index] = algorithm_2(Q_arrange[Q_index], state_index, next_state, reward,action_index, flag_ab)
next_Q_index = Q_index
stop_loop = True
if l_depth == 1:
#TODO
l_new = 1
Q_index = 0
h_new = Q_arrange[Q_index].h
Q_arrange[Q_index] = algorithm_2(Q_arrange[Q_index], state_index,next_state,reward,action_index, flag_ab)
next_Q_index = Q_index
stop_loop = True
else:
print('*********************************')
print('**** this should not happend ****')
print('*********************************')
next_Q_index = -1
end = time.time()
print('el tiempo de ejecucion es',end-start)
return Q_arrange, Q_index, next_Q_index, e_greed_counter