윤영빈

NetVLAD model test

@@ -20,8 +20,6 @@
![profit_hunter](/img/profit_hunter.png)
* Team name: **Profit Hunter**
* 윤영빈 (Dept. of Computer Engineering, 2015104192)
* 윤준현 (Dept. of Computer Engineering, 2015104193)
* 이현규 (Dept. of Computer Engineering, 2015104209)
* 이태현 (Dept. of Computer Engineering, 2015104208)
## Links
@@ -50,42 +50,6 @@ flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.")
flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.")
class FrameLevelLogisticModel(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]
    denominators = tf.reshape(
        tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
class DbofModel(models.BaseModel):
"""Creates a Deep Bag of Frames model.
@@ -239,7 +203,6 @@ class LstmModel(models.BaseModel):
            tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
            for _ in range(number_of_layers)
        ])
    _, state = tf.nn.dynamic_rnn(stacked_lstm,
                                 model_input,
                                 sequence_length=num_frames,
@@ -251,3 +214,300 @@ class LstmModel(models.BaseModel):
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class FrameLevelLogisticModel(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]
    denominators = tf.reshape(
        tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
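
The docstring above notes that pre-averaging frame features is cheaper than averaging on the fly. A minimal NumPy sketch of the same masked average the graph ops compute (shapes and array names here are illustrative, not part of the repo):

```python
import numpy as np

# Hypothetical shapes: 4 videos, padded to 300 frames, 1152-d features
# (1024 video + 128 audio). Frames past each video's true length are zeroed,
# matching the zero-padding the model relies on.
model_input = np.random.rand(4, 300, 1152).astype(np.float32)
num_frames = np.array([300, 120, 250, 60], dtype=np.float32)
mask = np.arange(300)[None, :] < num_frames[:, None]
model_input *= mask[:, :, None]

# Sum over the frame axis, divide by each video's true length -- the same
# quantity the reduce_sum / denominators ops above compute in-graph.
avg_pooled = model_input.sum(axis=1) / num_frames[:, None]
print(avg_pooled.shape)  # (4, 1152)
```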
class CNN(models.BaseModel):
  """A simple CNN classifier over frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Stacks 1-D convolutions over the frame axis, then classifies.

    Note: adapted from a generic image-CNN estimator draft; the original body
    referenced undefined 'features'/'labels'/'params' tensors, an unused
    'avg_pooled' classifier, and returned a (predictions, loss) tuple instead
    of the dictionary the BaseModel interface expects.
    """
    drop_rate = 0.0
    features = model_input  # 'batch_size' x 'max_frames' x 'num_features'
    for i, filters in enumerate([32, 64, 128]):
      features = tf.layers.conv1d(
          features, filters=filters, kernel_size=3, padding="same",
          name="conv_%d" % (i + 1))
      features = tf.layers.max_pooling1d(
          inputs=features, pool_size=2, strides=2, padding="same",
          name="pool_%d" % (i + 1))
    features = tf.contrib.layers.flatten(features)
    features = tf.layers.dropout(features, drop_rate)
    features = tf.layers.dense(features, 512, name="dense_1")
    features = tf.layers.dropout(features, drop_rate)
    logits = tf.layers.dense(
        features, vocab_size, activation=None, name="dense_2")
    # Multi-label output: an independent sigmoid probability per class.
    predictions = tf.nn.sigmoid(logits)
    return {"predictions": predictions}
class NetVLAD_NonLocal_types():
  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):
    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [self.cluster_size],  # fixed: was the undefined name 'cluster_size'
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)
    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
    cluster_weights2 = tf.get_variable(
        "cluster_weights2",
        [1, self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])
    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.reshape(vlad, [-1, self.feature_size])

    vlad_softmax = self.embedgaussian_relation(vlad, 1 / float(64))

    nonlocal_g = tf.get_variable(
        "nonlocal_g",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_out = tf.get_variable(
        "nonlocal_out",
        [self.cluster_size, self.feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.cluster_size)))

    vlad_g = tf.matmul(vlad, nonlocal_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.cluster_size])
    vlad_g = tf.matmul(vlad_softmax, vlad_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
    vlad_g = tf.matmul(vlad_g, nonlocal_out)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.feature_size])

    vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
    vlad = vlad + vlad_g
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.nn.l2_normalize(vlad, 1)  # [b, f, c]
    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
    vlad = tf.nn.l2_normalize(vlad, 1)
    return vlad
  def embedgaussian_relation(self, input_, temp=1 / float(32)):
    nonlocal_theta = tf.get_variable(
        "nonlocal_theta",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_phi = tf.get_variable(
        "nonlocal_phi",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    vlad_theta = tf.matmul(input_, nonlocal_theta)
    vlad_phi = tf.matmul(input_, nonlocal_phi)
    vlad_theta = tf.reshape(vlad_theta,
                            [-1, self.cluster_size, self.cluster_size])
    vlad_phi = tf.reshape(vlad_phi,
                          [-1, self.cluster_size, self.cluster_size])
    vlad_softmax = tf.nn.softmax(
        temp * tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
    return vlad_softmax
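
A minimal graph-construction sketch for the pooling module above, assuming TF 1.x and the 1024-d video features used later in NetVLADModelLF (placeholder names are illustrative, not repo code):

```python
import tensorflow as tf

# Illustrative sizes matching the video branch in NetVLADModelLF below.
max_frames, feature_size, cluster_size = 300, 1024, 64
netvlad = NetVLAD_NonLocal_types(feature_size, max_frames, cluster_size,
                                 add_batch_norm=True, is_training=True)

# forward() expects frames flattened into the batch axis.
frames = tf.placeholder(tf.float32, [None, max_frames, feature_size])
reshaped = tf.reshape(frames, [-1, feature_size])
with tf.variable_scope("video_VLAD"):
  vlad = netvlad.forward(reshaped)  # [batch, cluster_size * feature_size]
```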
class NetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = False
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    video_NetVLAD = NetVLAD_NonLocal_types(1024, int(max_frames),
                                           int(cluster_size), add_batch_norm,
                                           is_training)
    audio_NetVLAD = NetVLAD_NonLocal_types(128, int(max_frames),
                                           int(cluster_size / 2),
                                           add_batch_norm, is_training)

    if add_batch_norm:  # and not lightvlad:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_VLAD"):
      vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

    with tf.variable_scope("audio_VLAD"):
      vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

    vlad = tf.concat([vlad_video, vlad_audio], 1)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [vlad_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases",
          [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
      gating_weights = tf.get_variable(
          "gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))
      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # Removes the diagonal coefficients.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(
            gates,
            center=True,
            scale=True,
            is_training=is_training,
            scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases",
            [cluster_size],
            # fixed: was tf.random_normal, which is not an initializer
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)
      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models, 'willow_MoeModel')
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
\ No newline at end of file
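
For reference, a hedged sketch of building the full model graph directly, assuming the YouTube-8M starter's 1024-d video + 128-d audio frame features (the 3862-class vocabulary and placeholder names are assumptions, not part of this commit):

```python
import tensorflow as tf

# Illustrative shapes; 3862 is assumed to be the YouTube-8M label vocabulary.
batch, max_frames, feature_size = 8, 300, 1024 + 128  # video + audio
model_input = tf.placeholder(tf.float32, [batch, max_frames, feature_size])
num_frames = tf.placeholder(tf.int32, [batch])

outputs = NetVLADModelLF().create_model(
    model_input, vocab_size=3862, num_frames=num_frames, is_training=True)
predictions = outputs["predictions"]  # [batch, 3862] class probabilities
```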
@@ -25,6 +25,21 @@ FLAGS = flags.FLAGS
flags.DEFINE_integer(
    "moe_num_mixtures", 2,
    "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
flags.DEFINE_float(
    "moe_l2", 1e-8,
    "L2 penalty for MoeModel.")
flags.DEFINE_integer(
    "moe_low_rank_gating", -1,
    "Low rank gating for MoeModel.")
flags.DEFINE_bool(
    "moe_prob_gating", True,
    "Prob gating for MoeModel.")
flags.DEFINE_string(
    "moe_prob_gating_input", "prob",
    "Input source for MoeModel probability gating.")
class LogisticModel(models.BaseModel):
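
The flags added above can be overridden for quick experiments before the model graph is built; a sketch assuming TF 1.x flag parsing (not repo code):

```python
import tensorflow as tf

FLAGS = tf.flags.FLAGS
FLAGS.moe_l2 = 1e-6            # stronger L2 than the 1e-8 default
FLAGS.moe_prob_gating = False  # disable the probability-gating branch
```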
@@ -111,3 +126,109 @@ class MoeModel(models.BaseModel):
    final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                     [-1, vocab_size])
    return {"predictions": final_probabilities}
class willow_MoeModel(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model.

    It also includes the possibility of gating the probabilities.

    The model consists of a per-class softmax distribution over a
    configurable number of logistic classifiers. One of the classifiers in
    the mixture is not trained, and always predicts 0.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    num_mixtures = 8
    low_rank_gating = FLAGS.moe_low_rank_gating
    l2_penalty = FLAGS.moe_l2
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input

    input_size = model_input.get_shape().as_list()[1]
    remove_diag = False

    if low_rank_gating == -1:
      gate_activations = slim.fully_connected(
          model_input,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates")
    else:
      gate_activations1 = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_activations = slim.fully_connected(
          gate_activations1,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates2")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])

    if gating_probabilities:
      if gating_input == 'prob':
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [vocab_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(probabilities, gating_weights)
      else:
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [input_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(model_input, gating_weights)

      if remove_diag:
        # Removes the diagonal coefficients.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, probabilities)

      gates = slim.batch_norm(
          gates,
          center=True,
          scale=True,
          is_training=is_training,
          scope="gating_prob_bn")
      gates = tf.sigmoid(gates)
      probabilities = tf.multiply(probabilities, gates)

    return {"predictions": probabilities}
\ No newline at end of file
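
The mixture arithmetic in willow_MoeModel reduces to a per-class softmax gate over num_mixtures real experts plus one dummy expert that always predicts absence. A NumPy sketch of that reduction (shapes are illustrative):

```python
import numpy as np

batch, vocab_size, num_mixtures = 2, 5, 8
rng = np.random.default_rng(0)
gate_logits = rng.normal(size=(batch * vocab_size, num_mixtures + 1))
expert_logits = rng.normal(size=(batch * vocab_size, num_mixtures))

gates = np.exp(gate_logits)
gates /= gates.sum(axis=1, keepdims=True)       # softmax incl. the dummy expert
experts = 1.0 / (1.0 + np.exp(-expert_logits))  # per-expert sigmoid

# The dummy expert always predicts 0, so only the first num_mixtures gate
# columns contribute -- mirroring the reduce_sum in create_model above.
probs = (gates[:, :num_mixtures] * experts).sum(axis=1)
probs = probs.reshape(batch, vocab_size)
print(probs.shape)  # (2, 5)
```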