# tasks_usedforTask3.py
# This is the exact code that generated Task 3 in the paper (the accidental key-door task).
import numpy as np
N = 4
NBA = 2
NBR = np.random.choice([1, 1, 1, 2, 3])
NBFR = np.random.choice([1, 1, 1, 2])
PROBAUSEOLDSTATE = 1.0
PROBAUSENEWSTATE = 0.0
PROBAUSEACTION = .33
NBSPECIALSTATES = 1
STIMSIZE = 5
PROBANOSTIM = .2
#### Now, set variables for the new meta-task:
PROBAUSESPECIALSTATE = np.random.choice([0.0, 0.0, .5]) # Most meta-tasks don't need special states
#PROBAUSEPROBABILISTICREWARDS = np.random.choice([0.0, 0.0, 0.0, .2, .5, .8]) # Or probabilistic rewards either
PROBAUSEPROBABILISTICREWARDS = np.random.choice([0.0, 0.0, 0.0, 1.0]) # Or probabilistic rewards either (but if they do, all rewards should be probabilistic)
PROBAUSEVARREWARDPROB = np.random.choice([.2, .5, .8]) # *If* a reward is probabilistic, probability that it's also variable across instances/tasks
PROBAUSEFLAG = .5 #np.random.choice([0.0, 0.0, .5]) # State variables / "flags" should be used sparingly
NBVARSTIM = 2
NBFIXEDSTIM = 3
#### From now on we generate the meta-task automatically
OK = False
while not OK:
    specialstatesranges = []
    for ns in range(NBSPECIALSTATES):
        myrangesize = 2 if np.random.rand() < .5 else np.random.randint(2, N) # A special state with range size 1 makes no sense. 2 to N-1 inclusive (excludes 0).
        myrange = list(np.random.choice(range(1, N), size=myrangesize, replace=False)) # State 0 should not be a special state (kind of arbitrary)
        specialstatesranges.append(myrange)
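    # For illustration (not in the original file): with N = 4, specialstatesranges might
    # come out as [[2, 3]], meaning special state 0 (referred to as state 100 in the
    # rules below) will be bound to either state 2 or state 3 when a concrete
    # instance/task is generated from this meta-task.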
    fixedstims = []
    for ns in range(NBFIXEDSTIM):
        fixedstims.append(np.random.randint(2, size=STIMSIZE))
    stims = np.zeros(N).astype(int)
    nbdiffvarstims = 1
    while nbdiffvarstims == 1:
        for ns in range(N):
            stims[ns] = np.random.randint(NBFIXEDSTIM) if np.random.rand() < .666 else 1000 + np.random.randint(NBVARSTIM)
            if np.random.rand() < PROBANOSTIM:
                stims[ns] = -1
        nbdiffvarstims = len(np.unique(stims[stims >= 1000])) # A single task-variable stimulus doesn't induce real variation over tasks, because it ends up being "the state that's not fixed/nothing". Must have at least 2 different variable stims, if any.
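    # For illustration (not in the original file): stims might come out as
    # [2, -1, 1000, 1001]: state 0 shows fixed stimulus 2, state 1 shows no stimulus,
    # and states 2 and 3 show the two task-variable stimuli (values >= 1000), whose
    # actual patterns vary across instances/tasks.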
    # Transition function: for each state and action, a probability distribution over all states.
    # In many cases, this distribution should be a one-hot vector. Even most of the remaining cases should be two-hot vectors (only two possible options).
    # But not always: sometimes there are more options (though probably never all possible states).
    # Furthermore, the same distribution (or sometimes, if two-hot, its mirror image) should often be replicated over all actions.
    # The only allowed probabilities are binary (equal probability among the non-zero entries), or one entry bigger than the others.
    T = np.zeros((N, NBA, N))
    for ns in range(N):
        for na in range(NBA):
            nextS = np.random.randint(N)
            T[ns, na, nextS] = 1
            if np.random.rand() < .33:
                T[ns, na, nextS] *= 5 # If there are other non-zero probabilities, this one will be the highest
            if np.random.rand() < .5:
                T[ns, na, np.random.randint(N)] = 1 # Yes, might be the same
                if np.random.rand() < .33:
                    T[ns, na, np.random.randint(N)] = 1
                    if np.random.rand() < .33:
                        T[ns, na, np.random.randint(N)] = 1
        if np.random.rand() < .5: # Make all actions have the same output distributions, possibly flipped
            for na2 in range(1, NBA):
                T[ns, na2, :] = T[ns, 0, :]
            nzp = np.nonzero(T[ns, 0, :])[0]
            if len(nzp) > 1 and np.random.rand() < .5: # Flip? (Only if >1 nonzero values - may amount to a no-op if all nonzero values are 1, that's fine)
                nanotflipped = np.random.randint(NBA)
                (p1, p2) = np.random.choice(nzp, size=2, replace=False)
                # print("Flipping state", ns, "actions outcome at positions", p1, "and", p2, "other than action", nanotflipped)
                for na2 in range(NBA):
                    if na2 == nanotflipped:
                        continue
                    tmp = T[ns, na2, p1]
                    T[ns, na2, p1] = T[ns, na2, p2]
                    T[ns, na2, p2] = tmp
    T = T / np.sum(T, axis=2)[:, :, None] # Normalize each (state, action) row into a probability distribution
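    # For illustration (not in the original file): after this normalization, each
    # T[ns, na, :] is a probability distribution, so stepping the environment would
    # amount to something like: nextstate = np.random.choice(N, p=T[ns, na, :])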
    # Reward rules
    # Old state, new state, action taken, probability, value, flag
    # Later rules override previous ones
    rules = []
    for nr in range(NBR):
        rule = [-1, -1, -1, 0, 0, -1]
        while (rule[0] == -1 and rule[1] == -1 and rule[2] == -1) or (rule[0] == 0 and rule[5] == 1): # Rules in state 0 requiring flag 1 will never apply
            if np.random.rand() < PROBAUSEOLDSTATE:
                rule[0] = np.random.randint(N)
            if np.random.rand() < PROBAUSENEWSTATE:
                rule[1] = np.random.randint(N)
            if np.random.rand() < PROBAUSEACTION:
                rule[2] = np.random.randint(NBA)
        rule[3] = 1.0 # np.random.choice([.2, .8, 1.0, 1.0])
        rule[4] = 1.0
        rules.append(rule)
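    # For illustration (not in the original file): a rule of [2, -1, 1, 1.0, 1.0, -1]
    # reads "if the agent was in state 2 and took action 1, it gets reward 1.0 with
    # probability 1.0, regardless of the new state or the flag" (-1 = wildcard).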
    # Should some of the rules make use of the special states? (The precise identity of which will be picked when we generate an actual new instance/task)
    for nr in range(NBR):
        if rules[nr][0] != -1 and np.random.rand() < PROBAUSESPECIALSTATE:
            rules[nr][0] = 100 + np.random.randint(NBSPECIALSTATES)
        if rules[nr][1] != -1 and np.random.rand() < PROBAUSESPECIALSTATE:
            rules[nr][1] = 100 + np.random.randint(NBSPECIALSTATES)
    # Should some of the rules make use of the flag?
    for nr in range(NBR):
        if np.random.rand() < PROBAUSEFLAG:
            rules[nr][5] = np.random.choice([1.0, 1.0, 1.0, 0.0]) # Mostly look for a set flag (just a design choice)
    # Probabilistic rewards?
    for nr in range(NBR):
        if np.random.rand() < PROBAUSEPROBABILISTICREWARDS:
            rules[nr][3] = np.random.choice([.2, .5, .8, 1.0])
            if np.random.rand() < PROBAUSEVARREWARDPROB: # Notice the indent!
                rules[nr][3] = 1000 # i.e. "choose it at instance/task generation time"
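    # For illustration (not in the original file): a reward probability of 1000 is a
    # placeholder, presumably resolved to a concrete probability elsewhere in the
    # pipeline whenever a new instance/task is generated from this meta-task.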
    # Flag-setting rules (in addition to the standard rule that transitioning to state 0 sets the flag to 0)
    # Old state, new state, action taken, new flag value
    # The new flag value should mostly be 1
    flagrules = []
    for nr in range(NBFR):
        ok0 = False
        while not ok0:
            flagrule = [-1, -1, -1, 1.0]
            while flagrule[0] == -1 and flagrule[1] == -1 and flagrule[2] == -1:
                if np.random.rand() < PROBAUSEOLDSTATE:
                    flagrule[0] = np.random.randint(N)
                if np.random.rand() < PROBAUSENEWSTATE:
                    flagrule[1] = np.random.randint(N)
                if np.random.rand() < PROBAUSEACTION:
                    flagrule[2] = np.random.randint(NBA)
            flagrule[3] = np.random.choice([1, 1, 1, 0])
            ok0 = flagrule[0] != 0 or flagrule[1] != -1 or flagrule[2] != -1 # State 0 cannot unconditionally set the flag
        flagrules.append(flagrule)
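    # For illustration (not in the original file): a flag rule of [-1, 3, -1, 1] reads
    # "any transition that ends in state 3 sets the flag to 1" - combined with a reward
    # rule that requires flag 1, this is the kind of structure that can produce a
    # key-door task (visit one state to unlock a reward elsewhere).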
    # Should some of the flag rules make use of the special states? (The precise identity of which will be picked when we generate an actual new instance/task)
    for nr in range(NBFR):
        if flagrules[nr][0] != -1 and np.random.rand() < PROBAUSESPECIALSTATE:
            flagrules[nr][0] = 100 + np.random.randint(NBSPECIALSTATES)
        if flagrules[nr][1] != -1 and np.random.rand() < PROBAUSESPECIALSTATE:
            flagrules[nr][1] = 100 + np.random.randint(NBSPECIALSTATES)
    flagrules[0][3] = 1 # There should be at least one rule that actually sets the flag (note that flag rules may not be used)
    # At least two states must have different outcomes for either action
    somediffoutcomes = 0
    for ns in range(N):
        if np.any(T[ns, 0] != T[ns, 1]):
            somediffoutcomes += 1
    # *Something* must be variable across instances of the meta-task
    somevar = np.any(stims > 99) or np.any(np.array(rules) > 99)
    # There must be some way out of state 0 - it must not be a terminal state
    wayoutof0 = np.any(T[0, :, 0] < 1.0)
    # Every state must be reachable from some other state
    someunreachable = 0
    for ns in range(N):
        sumprobastons = 0
        for ns2 in range(N):
            if ns2 == ns:
                continue
            sumprobastons += np.sum(T[ns2, :, ns])
        if sumprobastons == 0:
            someunreachable = 1
            break
    OK = somevar and somediffoutcomes > 1 and wayoutof0 and not someunreachable
    # Might also want to add that the graph must be weakly connected (no completely separate sub-graphs)
    # OK = True
print("PROBAUSESPECIALSTATE:", PROBAUSESPECIALSTATE, "PROBAUSEPROBABILISTICREWARDS:",
PROBAUSEPROBABILISTICREWARDS, "PROBAUSEVARREWARDPROB:", PROBAUSEVARREWARDPROB,
"PROBAUSEFLAG:", PROBAUSEFLAG) # Yeah, should use dict...
print("Special States' (if any) ranges:", specialstatesranges)
print("Transition table:\n", T)
print("Reward rules (Old state, new state, action taken, probability, value, flag):\n", rules)
print("Flag rules (Old state, new state, action taken, new flag value) (being in state 0 sets flag to 0) (may not be used!):\n", flagrules)
print("Stimuli:\n", stims)