Showing 5 changed files with 63 additions and 46 deletions
.vs/ProjectSettings.json
0 → 100644
.vs/VSWorkspaceState.json
0 → 100644
.vs/slnx.sqlite
0 → 100644
No preview for this file type
... | @@ -46,7 +46,7 @@ flags.DEFINE_string( | ... | @@ -46,7 +46,7 @@ flags.DEFINE_string( |
46 | "Some Frame-Level models can be decomposed into a " | 46 | "Some Frame-Level models can be decomposed into a " |
47 | "generalized pooling operation followed by a " | 47 | "generalized pooling operation followed by a " |
48 | "classifier layer") | 48 | "classifier layer") |
49 | -flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.") | 49 | +flags.DEFINE_integer("lstm_cells", 512, "Number of LSTM cells.") |
50 | flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.") | 50 | flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.") |
51 | 51 | ||
52 | 52 | ||
... | @@ -215,6 +215,54 @@ class LstmModel(models.BaseModel): | ... | @@ -215,6 +215,54 @@ class LstmModel(models.BaseModel): |
215 | vocab_size=vocab_size, | 215 | vocab_size=vocab_size, |
216 | **unused_params) | 216 | **unused_params) |
217 | 217 | ||
218 | +class GruModel(models.BaseModel): | ||
219 | + def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): | ||
220 | + """Creates a model which uses a stack of GRUs to represent the video. | ||
221 | + Args: | ||
222 | + model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
223 | + input features. | ||
224 | + vocab_size: The number of classes in the dataset. | ||
225 | + num_frames: A vector of length 'batch' which indicates the number of | ||
226 | + frames for each video (before padding). | ||
227 | + Returns: | ||
228 | + A dictionary with a tensor containing the probability predictions of the | ||
229 | + model in the 'predictions' key. The dimensions of the tensor are | ||
230 | + 'batch_size' x 'num_classes'. | ||
231 | + """ | ||
232 | + gru_size = FLAGS.lstm_cells | ||
233 | + number_of_layers = FLAGS.lstm_layers | ||
234 | + backward = False | ||
235 | + random_frames = False | ||
236 | + iterations = 30 | ||
237 | + | ||
238 | + if random_frames: | ||
239 | + num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
240 | + model_input = utils.SampleRandomFrames(model_input, num_frames_2, | ||
241 | + iterations) | ||
242 | + | ||
243 | + if backward: | ||
244 | + model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1) | ||
245 | + | ||
246 | + stacked_GRU = tf.contrib.rnn.MultiRNNCell( | ||
247 | + [ | ||
248 | + tf.contrib.rnn.GRUCell(gru_size) | ||
249 | + for _ in range(number_of_layers) | ||
250 | + ], state_is_tuple=False) | ||
251 | + | ||
252 | + loss = 0.0 | ||
253 | + with tf.variable_scope("RNN"): | ||
254 | + outputs, state = tf.nn.dynamic_rnn(stacked_GRU, model_input, | ||
255 | + sequence_length=num_frames, | ||
256 | + dtype=tf.float32) | ||
257 | + | ||
258 | + aggregated_model = getattr(video_level_models, | ||
259 | + 'MoeModel') | ||
260 | + return aggregated_model().create_model( | ||
261 | + model_input=state, | ||
262 | + vocab_size=vocab_size, | ||
263 | + is_training=is_training, | ||
264 | + **unused_params) | ||
265 | + | ||
218 | class FrameLevelLogisticModel(models.BaseModel): | 266 | class FrameLevelLogisticModel(models.BaseModel): |
219 | """Creates a logistic classifier over the aggregated frame-level features.""" | 267 | """Creates a logistic classifier over the aggregated frame-level features.""" |
220 | def create_model(self, model_input, vocab_size, num_frames, **unused_params): | 268 | def create_model(self, model_input, vocab_size, num_frames, **unused_params): |
... | @@ -249,50 +297,6 @@ class FrameLevelLogisticModel(models.BaseModel): | ... | @@ -249,50 +297,6 @@ class FrameLevelLogisticModel(models.BaseModel): |
249 | weights_regularizer=slim.l2_regularizer(1e-8)) | 297 | weights_regularizer=slim.l2_regularizer(1e-8)) |
250 | return {"predictions": output} | 298 | return {"predictions": output} |
251 | 299 | ||
252 | -class CNN(models.BaseModel): | ||
253 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
254 | - """def model(features, labels, mode, params):""" | ||
255 | - """CNN classifier model.""" | ||
256 | - images = features["image"] | ||
257 | - labels = labels["label"] | ||
258 | - | ||
259 | - tf.summary.image("images", images) | ||
260 | - | ||
261 | - drop_rate = 0.0 | ||
262 | - | ||
263 | - features = images | ||
264 | - for i, filters in enumerate([32, 64, 128]): | ||
265 | - features = tf.layers.conv2d( | ||
266 | - features, filters=filters, kernel_size=3, padding="same", | ||
267 | - name="conv_%d" % (i + 1)) | ||
268 | - features = tf.layers.max_pooling2d( | ||
269 | - inputs=features, pool_size=2, strides=2, padding="same", | ||
270 | - name="pool_%d" % (i + 1)) | ||
271 | - | ||
272 | - features = tf.contrib.layers.flatten(features) | ||
273 | - | ||
274 | - features = tf.layers.dropout(features, drop_rate) | ||
275 | - features = tf.layers.dense(features, 512, name="dense_1") | ||
276 | - | ||
277 | - features = tf.layers.dropout(features, drop_rate) | ||
278 | - logits = tf.layers.dense(features, params.num_classes, activation=None, | ||
279 | - name="dense_2") | ||
280 | - | ||
281 | - predictions = tf.argmax(logits, axis=1) | ||
282 | - | ||
283 | - loss = tf.losses.sparse_softmax_cross_entropy( | ||
284 | - labels=labels, logits=logits) | ||
285 | - | ||
286 | - output = slim.fully_connected(avg_pooled, | ||
287 | - vocab_size, | ||
288 | - activation_fn=tf.nn.sigmoid, | ||
289 | - weights_regularizer=slim.l2_regularizer(1e-8)) | ||
290 | - return {"predictions": predictions}, loss | ||
291 | - | ||
292 | - | ||
293 | - | ||
294 | - | ||
295 | - | ||
296 | 300 | ||
297 | class NetVLAD_NonLocal_types(): | 301 | class NetVLAD_NonLocal_types(): |
298 | def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): | 302 | def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): | ... | ... |
... | @@ -83,7 +83,7 @@ if __name__ == "__main__": | ... | @@ -83,7 +83,7 @@ if __name__ == "__main__": |
83 | "regularization_penalty", 1.0, | 83 | "regularization_penalty", 1.0, |
84 | "How much weight to give to the regularization loss (the label loss has " | 84 | "How much weight to give to the regularization loss (the label loss has " |
85 | "a weight of 1).") | 85 | "a weight of 1).") |
86 | - flags.DEFINE_float("base_learning_rate", 0.0006, | 86 | + flags.DEFINE_float("base_learning_rate", 0.001, |
87 | "Which learning rate to start with.") | 87 | "Which learning rate to start with.") |
88 | flags.DEFINE_float( | 88 | flags.DEFINE_float( |
89 | "learning_rate_decay", 0.8, | 89 | "learning_rate_decay", 0.8, | ... | ... |
-
Please register or login to post a comment