윤영빈

NetVLAD model test

@@ -20,8 +20,6 @@
 ![profit_hunter](/img/profit_hunter.png)
 * Team name: **Profit Hunter**
 * 윤영빈 (Computer Engineering, 2015104192)
-* 윤준현 (Computer Engineering, 2015104193)
-* 이현규 (Computer Engineering, 2015104209)
 * 이태현 (Computer Engineering, 2015104208)

 ## Links
...
@@ -50,42 +50,6 @@ flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.")
 flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.")


-class FrameLevelLogisticModel(models.BaseModel):
-  """Creates a logistic classifier over the aggregated frame-level features."""
-
-  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
-    """See base class.
-
-    This class is intended to be an example for implementors of frame level
-    models. If you want to train a model over averaged features it is more
-    efficient to average them beforehand rather than on the fly.
-
-    Args:
-      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
-        input features.
-      vocab_size: The number of classes in the dataset.
-      num_frames: A vector of length 'batch' which indicates the number of
-        frames for each video (before padding).
-
-    Returns:
-      A dictionary with a tensor containing the probability predictions of the
-      model in the 'predictions' key. The dimensions of the tensor are
-      'batch_size' x 'num_classes'.
-    """
-    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
-    feature_size = model_input.get_shape().as_list()[2]
-
-    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
-                              [-1, feature_size])
-    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
-
-    output = slim.fully_connected(avg_pooled,
-                                  vocab_size,
-                                  activation_fn=tf.nn.sigmoid,
-                                  weights_regularizer=slim.l2_regularizer(1e-8))
-    return {"predictions": output}
-
-
 class DbofModel(models.BaseModel):
   """Creates a Deep Bag of Frames model.

@@ -239,7 +203,6 @@ class LstmModel(models.BaseModel):
         tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
         for _ in range(number_of_layers)
     ])
-
     _, state = tf.nn.dynamic_rnn(stacked_lstm,
                                  model_input,
                                  sequence_length=num_frames,
@@ -251,3 +214,300 @@ class LstmModel(models.BaseModel):
     return aggregated_model().create_model(model_input=state[-1].h,
                                            vocab_size=vocab_size,
                                            **unused_params)
+
+class FrameLevelLogisticModel(models.BaseModel):
+  """Creates a logistic classifier over the aggregated frame-level features."""
+
+  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
+    """See base class.
+
+    This class is intended to be an example for implementors of frame level
+    models. If you want to train a model over averaged features it is more
+    efficient to average them beforehand rather than on the fly.
+
+    Args:
+      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
+        input features.
+      vocab_size: The number of classes in the dataset.
+      num_frames: A vector of length 'batch' which indicates the number of
+        frames for each video (before padding).
+
+    Returns:
+      A dictionary with a tensor containing the probability predictions of the
+      model in the 'predictions' key. The dimensions of the tensor are
+      'batch_size' x 'num_classes'.
+    """
+    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
+    feature_size = model_input.get_shape().as_list()[2]
+
+    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
+                              [-1, feature_size])
+    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
+
+    output = slim.fully_connected(avg_pooled,
+                                  vocab_size,
+                                  activation_fn=tf.nn.sigmoid,
+                                  weights_regularizer=slim.l2_regularizer(1e-8))
+    return {"predictions": output}
+
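The model averages the frame features by dividing the frame-sum by each video's true length, so zero padding does not dilute the mean. A minimal NumPy sketch of that pooling step (illustrative names, not part of the repo):

    import numpy as np

    def average_pool(frames, num_frames):
        # frames: [batch, max_frames, feature_size], zero-padded past
        # num_frames; num_frames: [batch] true lengths before padding.
        summed = frames.sum(axis=1)                # [batch, feature_size]
        return summed / num_frames[:, None].astype(np.float32)

    # A video with 2 real frames out of max_frames = 4:
    frames = np.array([[[2.0, 4.0], [4.0, 8.0], [0.0, 0.0], [0.0, 0.0]]])
    print(average_pool(frames, np.array([2])))     # [[3. 6.]]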
+class CNN(models.BaseModel):
+  """A simple convolutional classifier over frame-level features."""
+
+  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
+    """Applies a stack of 1-D convolutions along the frame axis.
+
+    Args:
+      model_input: A 'batch_size' x 'max_frames' x 'num_features' tensor of
+        input features.
+      vocab_size: The number of classes in the dataset.
+      num_frames: A vector of length 'batch' which indicates the number of
+        frames for each video (before padding).
+
+    Returns:
+      A dictionary with a tensor containing the probability predictions of
+      the model in the 'predictions' key.
+    """
+    drop_rate = 0.0
+
+    features = model_input
+    for i, filters in enumerate([32, 64, 128]):
+      features = tf.layers.conv1d(
+          features, filters=filters, kernel_size=3, padding="same",
+          activation=tf.nn.relu, name="conv_%d" % (i + 1))
+      features = tf.layers.max_pooling1d(
+          inputs=features, pool_size=2, strides=2, padding="same",
+          name="pool_%d" % (i + 1))
+
+    features = tf.contrib.layers.flatten(features)
+
+    features = tf.layers.dropout(features, drop_rate)
+    features = tf.layers.dense(features, 512, activation=tf.nn.relu,
+                               name="dense_1")
+
+    features = tf.layers.dropout(features, drop_rate)
+    output = slim.fully_connected(features,
+                                  vocab_size,
+                                  activation_fn=tf.nn.sigmoid,
+                                  weights_regularizer=slim.l2_regularizer(1e-8))
+    return {"predictions": output}
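With an assumed input length of max_frames = 300 (not a value fixed by this file) and 128 channels after the last convolution, the three stride-2 'same' poolings shrink the frame axis 300 → 150 → 75 → 38, so the flattened vector has 38 × 128 = 4864 values per clip:

    frames = 300                      # assumed max_frames
    for _ in range(3):
        frames = (frames + 1) // 2    # 'same' max-pooling with stride 2
    print(frames, frames * 128)       # 38 4864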
+
+
+class NetVLAD_NonLocal_types():
+  """NetVLAD aggregation refined with a non-local (self-attention) block."""
+
+  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
+               is_training):
+    self.feature_size = feature_size
+    self.max_frames = max_frames
+    self.is_training = is_training
+    self.add_batch_norm = add_batch_norm
+    self.cluster_size = cluster_size
+
+  def forward(self, reshaped_input):
+    cluster_weights = tf.get_variable(
+        "cluster_weights",
+        [self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+
+    tf.summary.histogram("cluster_weights", cluster_weights)
+    activation = tf.matmul(reshaped_input, cluster_weights)
+
+    if self.add_batch_norm:
+      activation = slim.batch_norm(
+          activation,
+          center=True,
+          scale=True,
+          is_training=self.is_training,
+          scope="cluster_bn")
+    else:
+      cluster_biases = tf.get_variable(
+          "cluster_biases",
+          [self.cluster_size],
+          initializer=tf.random_normal_initializer(
+              stddev=1 / math.sqrt(self.feature_size)))
+      tf.summary.histogram("cluster_biases", cluster_biases)
+      activation += cluster_biases
+
+    # Soft-assignment of every frame to the clusters.
+    activation = tf.nn.softmax(activation)
+    tf.summary.histogram("cluster_output", activation)
+
+    activation = tf.reshape(activation,
+                            [-1, self.max_frames, self.cluster_size])
+
+    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
+
+    cluster_weights2 = tf.get_variable(
+        "cluster_weights2",
+        [1, self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+
+    a = tf.multiply(a_sum, cluster_weights2)
+
+    activation = tf.transpose(activation, perm=[0, 2, 1])
+
+    reshaped_input = tf.reshape(reshaped_input,
+                                [-1, self.max_frames, self.feature_size])
+    vlad = tf.matmul(activation, reshaped_input)
+    vlad = tf.transpose(vlad, perm=[0, 2, 1])
+    vlad = tf.subtract(vlad, a)
+
+    vlad = tf.transpose(vlad, perm=[0, 2, 1])
+    vlad = tf.reshape(vlad, [-1, self.feature_size])
+
+    vlad_softmax = self.embedgaussian_relation(vlad, 1 / float(64))
+
+    nonlocal_g = tf.get_variable(
+        "nonlocal_g",
+        [self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+    nonlocal_out = tf.get_variable(
+        "nonlocal_out",
+        [self.cluster_size, self.feature_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.cluster_size)))
+
+    vlad_g = tf.matmul(vlad, nonlocal_g)
+    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.cluster_size])
+    vlad_g = tf.matmul(vlad_softmax, vlad_g)
+    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
+
+    vlad_g = tf.matmul(vlad_g, nonlocal_out)
+    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.feature_size])
+    vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
+    vlad = vlad + vlad_g  # residual connection around the non-local block
+
+    vlad = tf.transpose(vlad, perm=[0, 2, 1])
+    vlad = tf.nn.l2_normalize(vlad, 1)  # intra-normalization over [b, f, c]
+
+    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
+    vlad = tf.nn.l2_normalize(vlad, 1)
+
+    return vlad
+
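forward implements the NetVLAD aggregation of Arandjelović et al., followed by the non-local refinement: with soft assignments a_k(x_i) from a softmax over cluster logits, the descriptor is V(j, k) = Σ_i a_k(x_i) (x_i(j) − c_k(j)), then intra-normalized and L2-normalized. A dense NumPy sketch of the vanilla aggregation (no batch norm or non-local step; illustrative names):

    import numpy as np

    def netvlad(x, w, c):
        # x: [frames, feature]; w: [feature, clusters] assignment weights;
        # c: [feature, clusters] cluster centers (cluster_weights2 above).
        a = np.exp(x @ w)
        a /= a.sum(axis=1, keepdims=True)     # soft assignment per frame
        # V[j, k] = sum_i a[i, k] * (x[i, j] - c[j, k])
        v = x.T @ a - c * a.sum(axis=0)       # [feature, clusters]
        v /= np.linalg.norm(v, axis=0, keepdims=True) + 1e-12  # intra-norm
        v = v.flatten()
        return v / (np.linalg.norm(v) + 1e-12)  # final L2 normalization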
+  def embedgaussian_relation(self, input_, temp=1 / float(32)):
+    nonlocal_theta = tf.get_variable(
+        "nonlocal_theta",
+        [self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+    nonlocal_phi = tf.get_variable(
+        "nonlocal_phi",
+        [self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+
+    vlad_theta = tf.matmul(input_, nonlocal_theta)
+    vlad_phi = tf.matmul(input_, nonlocal_phi)
+    vlad_theta = tf.reshape(vlad_theta, [-1, self.cluster_size, self.cluster_size])
+    vlad_phi = tf.reshape(vlad_phi, [-1, self.cluster_size, self.cluster_size])
+    vlad_softmax = tf.nn.softmax(
+        temp * tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
+    return vlad_softmax
+
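embedgaussian_relation is the embedded-Gaussian pairwise function from Non-local Neural Networks (Wang et al.): attention weights softmax(τ θ(x) φ(x)ᵀ) between the cluster descriptors, with temperature τ = 1/64 at the call site above. A compact NumPy sketch of one non-local step over the cluster descriptors (illustrative names):

    import numpy as np

    def softmax(z, axis=-1):
        z = z - z.max(axis=axis, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=axis, keepdims=True)

    def non_local(vlad, w_theta, w_phi, w_g, w_out, temp=1.0 / 64):
        # vlad: [clusters, feature]; the projections mirror nonlocal_theta,
        # nonlocal_phi, nonlocal_g and nonlocal_out above.
        theta, phi, g = vlad @ w_theta, vlad @ w_phi, vlad @ w_g
        attn = softmax(temp * (theta @ phi.T))  # [clusters, clusters]
        return vlad + (attn @ g) @ w_out        # residual connection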
+class NetVLADModelLF(models.BaseModel):
+  """Creates a NetVLAD-based model.
+
+  Args:
+    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
+      input features.
+    vocab_size: The number of classes in the dataset.
+    num_frames: A vector of length 'batch' which indicates the number of
+      frames for each video (before padding).
+
+  Returns:
+    A dictionary with a tensor containing the probability predictions of the
+    model in the 'predictions' key. The dimensions of the tensor are
+    'batch_size' x 'num_classes'.
+  """
+
+  def create_model(self,
+                   model_input,
+                   vocab_size,
+                   num_frames,
+                   iterations=None,
+                   add_batch_norm=None,
+                   sample_random_frames=None,
+                   cluster_size=None,
+                   hidden_size=None,
+                   is_training=True,
+                   **unused_params):
+    # Hyperparameters are hard-coded for this experiment rather than read
+    # from the keyword arguments or flags.
+    iterations = 300
+    add_batch_norm = True
+    random_frames = True
+    cluster_size = 64
+    hidden1_size = 1024
+    relu = False
+    dimred = -1
+    gating = True
+    remove_diag = False
+
+    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
+    if random_frames:
+      model_input = utils.SampleRandomFrames(model_input, num_frames,
+                                             iterations)
+    else:
+      model_input = utils.SampleRandomSequence(model_input, num_frames,
+                                               iterations)
+
+    max_frames = model_input.get_shape().as_list()[1]
+    feature_size = model_input.get_shape().as_list()[2]
+    reshaped_input = tf.reshape(model_input, [-1, feature_size])
+
+    # The input concatenates 1024-d visual and 128-d audio features.
+    video_NetVLAD = NetVLAD_NonLocal_types(1024, int(max_frames),
+                                           int(cluster_size), add_batch_norm,
+                                           is_training)
+    audio_NetVLAD = NetVLAD_NonLocal_types(128, int(max_frames),
+                                           int(cluster_size / 2),
+                                           add_batch_norm, is_training)
+
+    if add_batch_norm:  # and not lightvlad
+      reshaped_input = slim.batch_norm(
+          reshaped_input,
+          center=True,
+          scale=True,
+          is_training=is_training,
+          scope="input_bn")
+
+    with tf.variable_scope("video_VLAD"):
+      vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])
+
+    with tf.variable_scope("audio_VLAD"):
+      vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])
+
+    vlad = tf.concat([vlad_video, vlad_audio], 1)
+
+    vlad_dim = vlad.get_shape().as_list()[1]
+    hidden1_weights = tf.get_variable(
+        "hidden1_weights",
+        [vlad_dim, hidden1_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(cluster_size)))
+
+    activation = tf.matmul(vlad, hidden1_weights)
+
+    if add_batch_norm and relu:
+      activation = slim.batch_norm(
+          activation,
+          center=True,
+          scale=True,
+          is_training=is_training,
+          scope="hidden1_bn")
+    else:
+      hidden1_biases = tf.get_variable(
+          "hidden1_biases",
+          [hidden1_size],
+          initializer=tf.random_normal_initializer(stddev=0.01))
+      tf.summary.histogram("hidden1_biases", hidden1_biases)
+      activation += hidden1_biases
+
+    if relu:
+      activation = tf.nn.relu6(activation)
+
+    if gating:
+      gating_weights = tf.get_variable(
+          "gating_weights_2",
+          [hidden1_size, hidden1_size],
+          initializer=tf.random_normal_initializer(
+              stddev=1 / math.sqrt(hidden1_size)))
+
+      gates = tf.matmul(activation, gating_weights)
+
+      if remove_diag:
+        # Removes the diagonal coefficients.
+        diagonals = tf.matrix_diag_part(gating_weights)
+        gates = gates - tf.multiply(diagonals, activation)
+
+      if add_batch_norm:
+        gates = slim.batch_norm(
+            gates,
+            center=True,
+            scale=True,
+            is_training=is_training,
+            scope="gating_bn")
+      else:
+        gating_biases = tf.get_variable(
+            "gating_biases",
+            [hidden1_size],
+            initializer=tf.random_normal_initializer(
+                stddev=1 / math.sqrt(hidden1_size)))
+        gates += gating_biases
+
+      gates = tf.sigmoid(gates)
+
+      activation = tf.multiply(activation, gates)
+
+    aggregated_model = getattr(video_level_models, 'willow_MoeModel')
+
+    return aggregated_model().create_model(
+        model_input=activation,
+        vocab_size=vocab_size,
+        is_training=is_training,
+        **unused_params)
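The gating branch implements Context Gating (Miech et al.): the hidden activation is reweighted elementwise by learned sigmoid gates, y = σ(Wx + b) ⊙ x, letting the network suppress or boost feature dimensions based on the whole clip. A minimal NumPy sketch of the idea (illustrative names; the batch-norm variant used above is omitted):

    import numpy as np

    def context_gating(x, w, b):
        # x: [batch, hidden]; w: [hidden, hidden]; b: [hidden]
        gates = 1.0 / (1.0 + np.exp(-(x @ w + b)))  # sigmoid gates in (0, 1)
        return x * gates                            # elementwise reweighting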
\ No newline at end of file
...
@@ -25,6 +25,21 @@ FLAGS = flags.FLAGS
 flags.DEFINE_integer(
     "moe_num_mixtures", 2,
     "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
+# flags.DEFINE_integer(
+#     "moe_num_mixtures", 2,
+#     "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
+flags.DEFINE_float(
+    "moe_l2", 1e-8,
+    "L2 penalty for MoeModel.")
+flags.DEFINE_integer(
+    "moe_low_rank_gating", -1,
+    "Low-rank gating dimension for MoeModel (-1 disables low-rank gating).")
+flags.DEFINE_bool(
+    "moe_prob_gating", True,
+    "Whether to apply probability gating in MoeModel.")
+flags.DEFINE_string(
+    "moe_prob_gating_input", "prob",
+    "Input to the probability gating in MoeModel ('prob' or the raw input).")


 class LogisticModel(models.BaseModel):
@@ -111,3 +126,109 @@ class MoeModel(models.BaseModel):
     final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                      [-1, vocab_size])
     return {"predictions": final_probabilities}
+
+
+class willow_MoeModel(models.BaseModel):
+  """A softmax over a mixture of logistic models (with L2 regularization)."""
+
+  def create_model(self,
+                   model_input,
+                   vocab_size,
+                   is_training,
+                   num_mixtures=None,
+                   l2_penalty=1e-8,
+                   **unused_params):
+    """Creates a Mixture of (Logistic) Experts model with optional gating.
+
+    The model consists of a per-class softmax distribution over a
+    configurable number of logistic classifiers. One of the classifiers in
+    the mixture is not trained, and always predicts 0.
+
+    Args:
+      model_input: 'batch_size' x 'num_features' matrix of input features.
+      vocab_size: The number of classes in the dataset.
+      is_training: Whether this is the training phase.
+      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
+        always predicts the non-existence of an entity).
+      l2_penalty: How much to penalize the squared magnitudes of parameter
+        values.
+
+    Returns:
+      A dictionary with a tensor containing the probability predictions of
+      the model in the 'predictions' key. The dimensions of the tensor are
+      batch_size x num_classes.
+    """
+    # The flag values below override the corresponding keyword arguments.
+    num_mixtures = 8
+    low_rank_gating = FLAGS.moe_low_rank_gating
+    l2_penalty = FLAGS.moe_l2
+    gating_probabilities = FLAGS.moe_prob_gating
+    gating_input = FLAGS.moe_prob_gating_input
+
+    input_size = model_input.get_shape().as_list()[1]
+    remove_diag = False
+
+    if low_rank_gating == -1:
+      gate_activations = slim.fully_connected(
+          model_input,
+          vocab_size * (num_mixtures + 1),
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates")
+    else:
+      gate_activations1 = slim.fully_connected(
+          model_input,
+          low_rank_gating,
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates1")
+      gate_activations = slim.fully_connected(
+          gate_activations1,
+          vocab_size * (num_mixtures + 1),
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates2")
+
+    expert_activations = slim.fully_connected(
+        model_input,
+        vocab_size * num_mixtures,
+        activation_fn=None,
+        weights_regularizer=slim.l2_regularizer(l2_penalty),
+        scope="experts")
+
+    gating_distribution = tf.nn.softmax(tf.reshape(
+        gate_activations,
+        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
+    expert_distribution = tf.nn.sigmoid(tf.reshape(
+        expert_activations,
+        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures
+
+    probabilities_by_class_and_batch = tf.reduce_sum(
+        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
+    probabilities = tf.reshape(probabilities_by_class_and_batch,
+                               [-1, vocab_size])
+
+    if gating_probabilities:
+      if gating_input == 'prob':
+        gating_weights = tf.get_variable(
+            "gating_prob_weights",
+            [vocab_size, vocab_size],
+            initializer=tf.random_normal_initializer(
+                stddev=1 / math.sqrt(vocab_size)))
+        gates = tf.matmul(probabilities, gating_weights)
+      else:
+        gating_weights = tf.get_variable(
+            "gating_prob_weights",
+            [input_size, vocab_size],
+            initializer=tf.random_normal_initializer(
+                stddev=1 / math.sqrt(vocab_size)))
+        gates = tf.matmul(model_input, gating_weights)
+
+      if remove_diag:
+        # Removes the diagonal coefficients.
+        diagonals = tf.matrix_diag_part(gating_weights)
+        gates = gates - tf.multiply(diagonals, probabilities)
+
+      gates = slim.batch_norm(
+          gates,
+          center=True,
+          scale=True,
+          is_training=is_training,
+          scope="gating_prob_bn")
+
+      gates = tf.sigmoid(gates)
+
+      probabilities = tf.multiply(probabilities, gates)
+
+    return {"predictions": probabilities}
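Per class, the gate logits are softmaxed over num_mixtures + 1 entries, where the extra entry stands for the untrained expert that always predicts 0, and the sigmoid expert outputs are mixed with the first num_mixtures gate weights. A NumPy sketch of that mixing step (illustrative names):

    import numpy as np

    def moe_predict(gate_logits, expert_logits):
        # gate_logits: [batch * classes, num_mixtures + 1]
        # expert_logits: [batch * classes, num_mixtures]
        e = np.exp(gate_logits - gate_logits.max(axis=1, keepdims=True))
        gates = e / e.sum(axis=1, keepdims=True)        # softmax over experts
        experts = 1.0 / (1.0 + np.exp(-expert_logits))  # sigmoid probabilities
        # The last gate weights the implicit always-zero expert, so it only
        # enters through the softmax normalization.
        return (gates[:, :-1] * experts).sum(axis=1)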
\ No newline at end of file
...