# q2_linear.py
import tensorflow as tf
import tensorflow.contrib.layers as layers

from utils.general import get_logger
from utils.test_env import EnvTest
from core.deep_q_learning import DQN
from q1_schedule import LinearExploration, LinearSchedule

from configs.q2_linear import config

class Linear(DQN):
    """
    Implement Fully Connected with Tensorflow
    """

    def add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs to the rest of the model and will be
        fed data during training.
        """
        # this information might be useful
        state_shape = self.env.input_size
        num_actions = len(self.env.all_action_pairs)
        ##############################################################
        """
        TODO:
        Add placeholders:
        Remember that we stack 4 consecutive frames together.
            - self.s: batch of states, type = uint8
              shape = (batch_size, img height, img width, nchannels x config.state_history)
            - self.a: batch of actions, type = int32
              shape = (batch_size)
            - self.r: batch of rewards, type = float32
              shape = (batch_size)
            - self.sp: batch of next states, type = uint8
              shape = (batch_size, img height, img width, nchannels x config.state_history)
            - self.done_mask: batch of done, type = bool
              shape = (batch_size)
            - self.lr: learning rate, type = float32
        (Don't change the variable names!)

        HINT:
            Variables from config are accessible with self.config.variable_name.
            Check the use of None in the dimension for tensorflow placeholders.
            You can also use the state_shape computed above.
        """
        ##############################################################
        ################ YOUR CODE HERE (6-15 lines) ##################
        # this environment exposes a flat state vector (input_size), so the
        # placeholders are 2-D (batch, channels * state_history) rather than
        # image-shaped as described in the docstring above
        channels = state_shape
        self.s = tf.placeholder(tf.uint8, shape=[None, channels * self.config.state_history])
        self.a = tf.placeholder(tf.int32, shape=[None])
        self.r = tf.placeholder(tf.float32, shape=[None])
        self.sp = tf.placeholder(tf.uint8, shape=[None, channels * self.config.state_history])
        self.done_mask = tf.placeholder(tf.bool, shape=[None])
        # extra placeholder beyond the TODO spec: marks which actions are legal
        # in the next state; consumed by the masked loss in add_loss_op
        self.next_action_mask = tf.placeholder(tf.float32, shape=[None, num_actions])
        self.lr = tf.placeholder(tf.float32, shape=())
        ##############################################################
        ######################## END YOUR CODE #######################
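        # A minimal sketch (an assumption, not code from this file) of how these
        # placeholders might be fed at train time; `states`, `actions`, `rewards`,
        # `next_states`, `done` and `lr` are hypothetical numpy arrays / floats
        # coming from a replay buffer and a learning-rate schedule:
        #
        #   feed_dict = {
        #       self.s: states,            # uint8,   (batch, channels * state_history)
        #       self.a: actions,           # int32,   (batch,)
        #       self.r: rewards,           # float32, (batch,)
        #       self.sp: next_states,      # uint8,   (batch, channels * state_history)
        #       self.done_mask: done,      # bool,    (batch,)
        #       self.lr: lr,               # float32 scalar
        #   }
        #   loss, _ = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
        #
        # self.next_action_mask would also need to be fed when self.use_mask is set.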

    def get_q_values_op(self, state, scope, reuse=False, hidden=1024):
        """
        Returns Q values for all actions

        Args:
            state: (tf tensor)
                shape = (batch_size, img height, img width, nchannels x config.state_history)
            scope: (string) scope name, that specifies if target network or not
            reuse: (bool) reuse of variables in the scope

        Returns:
            out: (tf tensor) of shape = (batch_size, num_actions)
        """
        # this information might be useful
        num_actions = len(self.env.all_action_pairs)
        ##############################################################
        """
        TODO:
        Implement a fully connected with no hidden layer (linear
        approximation with bias) using tensorflow.

        HINT:
            - You may find the following functions useful:
                - tf.layers.flatten
                - tf.layers.dense
            - Make sure to also specify the scope and reuse
        """
        ##############################################################
        ################ YOUR CODE HERE - 2-3 lines ##################
        with tf.variable_scope(scope, reuse=reuse):
            # note: instead of the single linear layer the TODO asks for, this
            # builds two ReLU hidden layers of size `hidden` with a residual
            # (skip) connection before the output layer
            # TODO: add batch norm
            linear0 = tf.layers.dense(state, hidden, use_bias=True, activation=tf.nn.relu)
            linear1 = tf.layers.dense(linear0, hidden, use_bias=True, activation=tf.nn.relu)
            linear2 = tf.layers.dense(linear1, hidden, use_bias=True)
            linear2 = linear2 + linear0
            linear2 = tf.nn.relu(linear2)
            # linear3 = tf.layers.dense(linear2, hidden, use_bias=True, activation=tf.nn.relu)
            # linear4 = tf.layers.dense(linear3, hidden, use_bias=True)
            # linear4 = linear4 + linear2
            # linear4 = tf.nn.relu(linear4)
            out = tf.layers.dense(linear2, num_actions, use_bias=True)
        ##############################################################
        ######################## END YOUR CODE #######################
        return out
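        # For reference, a minimal sketch of the purely linear approximation the
        # TODO above asks for (no hidden layer, just weights and a bias) would be
        # roughly:
        #
        #   with tf.variable_scope(scope, reuse=reuse):
        #       out = tf.layers.dense(tf.layers.flatten(state), num_actions)
        #
        # The implementation above instead uses the residual two-hidden-layer MLP.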

    def add_update_target_op(self, q_scope, target_q_scope):
        """
        update_target_op will be called periodically
        to copy Q network weights to target Q network

        Remember that in DQN, we maintain two identical Q networks with
        2 different sets of weights. In tensorflow, we distinguish them
        with two different scopes. If you're not familiar with the scope mechanism
        in tensorflow, read the docs
        https://www.tensorflow.org/api_docs/python/tf/compat/v1/variable_scope

        Periodically, we need to update all the weights of the target Q network
        and assign them the values from the regular Q network.

        Args:
            q_scope: (string) name of the scope of variables for q
            target_q_scope: (string) name of the scope of variables
                for the target network
        """
        ##############################################################
        """
        TODO:
        Add an operator self.update_target_op that for each variable in
        tf.GraphKeys.GLOBAL_VARIABLES that is in q_scope, assigns its
        value to the corresponding variable in target_q_scope

        HINT:
            You may find the following functions useful:
                - tf.get_collection
                - tf.assign
                - tf.group (the * operator can be used to unpack a list)

        (be sure that you set self.update_target_op)
        """
        ##############################################################
        ################### YOUR CODE HERE - 5-10 lines #############
        # gather the variables created under each scope; both networks are built
        # the same way, so the two lists line up variable-for-variable
        q_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=q_scope)
        target_q_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_q_scope)
        assert len(q_vars) == len(target_q_vars)
        ops = [tf.assign(target_q_v, q_v) for target_q_v, q_v in zip(target_q_vars, q_vars)]
        self.update_target_op = tf.group(*ops)
        ##############################################################
        ######################## END YOUR CODE #######################
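        # Usage sketch (assumption: the DQN base class runs this op periodically
        # during training):
        #
        #   sess.run(self.update_target_op)
        #
        # executes every assignment in `ops` as one grouped step, overwriting the
        # target-network weights with the current Q-network weights.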

    def add_loss_op(self, q, target_q):
        """
        Sets the loss of a batch, self.loss is a scalar

        Args:
            q: (tf tensor) shape = (batch_size, num_actions)
            target_q: (tf tensor) shape = (batch_size, num_actions)
        """
        # you may need this variable
        num_actions = len(self.env.all_action_pairs)
        ##############################################################
        """
        TODO:
        The loss for an example is defined as:
            Q_samp(s) = r                                     if done
                      = r + gamma * max_a' Q_target(s', a')   otherwise
            loss = (Q_samp(s) - Q(s, a))^2

        HINT:
            - Config variables are accessible through self.config
            - You can access placeholders like self.a (for actions),
              self.r (rewards) or self.done_mask for instance
            - You may find the following functions useful
                - tf.cast
                - tf.reduce_max
                - tf.reduce_sum
                - tf.one_hot
                - tf.squared_difference
                - tf.reduce_mean
        """
        ##############################################################
        ##################### YOUR CODE HERE - 4-5 lines #############
        if not self.use_mask:
            # standard DQN target: r + gamma * max_a' Q_target(s', a'),
            # with the bootstrap term zeroed on terminal transitions
            q_samp = self.r + (tf.reduce_max(target_q, axis=1) * self.config.gamma) * (1 - tf.cast(self.done_mask, tf.float32))
        else:
            # masked target: push illegal next actions to a large negative value
            # so the max only ranges over actions allowed by next_action_mask
            q_samp = self.r + (tf.reduce_max((target_q - 100000 * (1 - self.next_action_mask)), axis=1) * self.config.gamma) * (1 - tf.cast(self.done_mask, tf.float32))
        # Q(s, a) for the action actually taken, selected with a one-hot mask
        q_orig = tf.reduce_sum(tf.one_hot(self.a, num_actions, axis=-1) * q, axis=1)
        self.loss = tf.reduce_mean(tf.squared_difference(q_samp, q_orig))
        ##############################################################
        ######################## END YOUR CODE #######################
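        # Worked example with hypothetical numbers, taking gamma = 0.99:
        # if r = 1.0, max_a' Q_target(s', a') = 2.0 and done = False, then
        #   Q_samp(s) = 1.0 + 0.99 * 2.0 = 2.98
        # while done = True zeroes the bootstrap term via (1 - done_mask), so
        #   Q_samp(s) = r = 1.0
        # and the loss is the mean of (Q_samp(s) - Q(s, a))^2 over the batch.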

    def add_optimizer_op(self, scope):
        """
        Set self.train_op and self.grad_norm

        Args:
            scope: (string) name of the scope whose variables we are
                differentiating with respect to
        """
        ##############################################################
        """
        TODO:
        1. get Adam Optimizer
        2. compute grads with respect to variables in scope for self.loss
        3. if self.config.grad_clip is True, then clip the grads
           by norm using self.config.clip_val
        4. apply the gradients and store the train op in self.train_op
           (sess.run(train_op) must update the variables)
        5. compute the global norm of the gradients (which are not None) and store
           this scalar in self.grad_norm

        HINT: you may find the following functions useful
            - tf.get_collection
            - optimizer.compute_gradients
            - tf.clip_by_norm
            - optimizer.apply_gradients
            - tf.global_norm

        you can access config variables by writing self.config.variable_name
        """
        ##############################################################
        #################### YOUR CODE HERE - 8-12 lines #############
        optimizer = tf.train.AdamOptimizer(self.lr)
        grad_vars = optimizer.compute_gradients(
            self.loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope))
        if self.config.grad_clip:
            # clip each gradient tensor by its own norm, leaving variables that
            # received no gradient untouched
            grad_vars = [(tf.clip_by_norm(grad, self.config.clip_val) if grad is not None else grad, var)
                         for grad, var in grad_vars]
        self.train_op = optimizer.apply_gradients(grad_vars)
        self.grad_norm = tf.global_norm([grad for grad, _ in grad_vars if grad is not None])
        ##############################################################
        ######################## END YOUR CODE #######################
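        # Note: tf.clip_by_norm above clips each gradient tensor separately at
        # clip_val, and self.grad_norm is just a diagnostic computed afterwards.
        # A sketch of the joint alternative (an option, not what this file does):
        #
        #   grads, variables = zip(*grad_vars)
        #   grads, _ = tf.clip_by_global_norm(grads, self.config.clip_val)
        #   self.train_op = optimizer.apply_gradients(list(zip(grads, variables)))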


# if __name__ == '__main__':
#     env = EnvTest((5, 5, 1))
#
#     # exploration strategy
#     exp_schedule = LinearExploration(env, config.eps_begin,
#                                      config.eps_end, config.eps_nsteps)
#
#     # learning rate schedule
#     lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
#                                  config.lr_nsteps)
#
#     # train model
#     model = Linear(env, config)
#     model.run(exp_schedule, lr_schedule)