윤영빈

NetVLAD model test

@@ -20,8 +20,6 @@
 ![profit_hunter](/img/profit_hunter.png)
 * Team name: **Profit Hunter**
 * 윤영빈 (Computer Engineering, 2015104192)
-* 윤준현 (Computer Engineering, 2015104193)
-* 이현규 (Computer Engineering, 2015104209)
 * 이태현 (Computer Engineering, 2015104208)

 ## Links
...
@@ -50,42 +50,6 @@ flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.")
 flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.")


-class FrameLevelLogisticModel(models.BaseModel):
-  """Creates a logistic classifier over the aggregated frame-level features."""
-
-  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
-    """See base class.
-
-    This class is intended to be an example for implementors of frame level
-    models. If you want to train a model over averaged features it is more
-    efficient to average them beforehand rather than on the fly.
-
-    Args:
-      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
-        input features.
-      vocab_size: The number of classes in the dataset.
-      num_frames: A vector of length 'batch' which indicates the number of
-        frames for each video (before padding).
-
-    Returns:
-      A dictionary with a tensor containing the probability predictions of the
-      model in the 'predictions' key. The dimensions of the tensor are
-      'batch_size' x 'num_classes'.
-    """
-    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
-    feature_size = model_input.get_shape().as_list()[2]
-
-    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
-                              [-1, feature_size])
-    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
-
-    output = slim.fully_connected(avg_pooled,
-                                  vocab_size,
-                                  activation_fn=tf.nn.sigmoid,
-                                  weights_regularizer=slim.l2_regularizer(1e-8))
-    return {"predictions": output}
-
-
 class DbofModel(models.BaseModel):
   """Creates a Deep Bag of Frames model.

@@ -239,7 +203,6 @@ class LstmModel(models.BaseModel):
         tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
         for _ in range(number_of_layers)
     ])
-
     _, state = tf.nn.dynamic_rnn(stacked_lstm,
                                  model_input,
                                  sequence_length=num_frames,
@@ -251,3 +214,300 @@ class LstmModel(models.BaseModel):
     return aggregated_model().create_model(model_input=state[-1].h,
                                            vocab_size=vocab_size,
                                            **unused_params)
+
+class FrameLevelLogisticModel(models.BaseModel):
+  """Creates a logistic classifier over the aggregated frame-level features."""
+
+  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
+    """See base class.
+
+    This class is intended to be an example for implementors of frame level
+    models. If you want to train a model over averaged features it is more
+    efficient to average them beforehand rather than on the fly.
+
+    Args:
+      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
+        input features.
+      vocab_size: The number of classes in the dataset.
+      num_frames: A vector of length 'batch' which indicates the number of
+        frames for each video (before padding).
+
+    Returns:
+      A dictionary with a tensor containing the probability predictions of the
+      model in the 'predictions' key. The dimensions of the tensor are
+      'batch_size' x 'num_classes'.
+    """
+    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
+    feature_size = model_input.get_shape().as_list()[2]
+
+    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
+                              [-1, feature_size])
+    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
+
+    output = slim.fully_connected(avg_pooled,
+                                  vocab_size,
+                                  activation_fn=tf.nn.sigmoid,
+                                  weights_regularizer=slim.l2_regularizer(1e-8))
+    return {"predictions": output}
+
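The model averages the frame features by dividing the frame-sum by each video's true length, so zero padding does not dilute the mean. A minimal NumPy sketch of that pooling step (illustrative names, not part of the repo):

    import numpy as np

    def average_pool(frames, num_frames):
        # frames: [batch, max_frames, feature_size], zero-padded past
        # num_frames; num_frames: [batch] true lengths before padding.
        summed = frames.sum(axis=1)                # [batch, feature_size]
        return summed / num_frames[:, None].astype(np.float32)

    # A video with 2 real frames out of max_frames = 4:
    frames = np.array([[[2.0, 4.0], [4.0, 8.0], [0.0, 0.0], [0.0, 0.0]]])
    print(average_pool(frames, np.array([2])))     # [[3. 6.]]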
+class CNN(models.BaseModel):
+  """A simple convolutional classifier over frame-level features."""
+
+  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
+    """Applies a stack of 1-D convolutions along the frame axis.
+
+    Args:
+      model_input: A 'batch_size' x 'max_frames' x 'num_features' tensor of
+        input features.
+      vocab_size: The number of classes in the dataset.
+      num_frames: A vector of length 'batch' which indicates the number of
+        frames for each video (before padding).
+
+    Returns:
+      A dictionary with a tensor containing the probability predictions of
+      the model in the 'predictions' key.
+    """
+    drop_rate = 0.0
+
+    features = model_input
+    for i, filters in enumerate([32, 64, 128]):
+      features = tf.layers.conv1d(
+          features, filters=filters, kernel_size=3, padding="same",
+          activation=tf.nn.relu, name="conv_%d" % (i + 1))
+      features = tf.layers.max_pooling1d(
+          inputs=features, pool_size=2, strides=2, padding="same",
+          name="pool_%d" % (i + 1))
+
+    features = tf.contrib.layers.flatten(features)
+
+    features = tf.layers.dropout(features, drop_rate)
+    features = tf.layers.dense(features, 512, activation=tf.nn.relu,
+                               name="dense_1")
+
+    features = tf.layers.dropout(features, drop_rate)
+    output = slim.fully_connected(features,
+                                  vocab_size,
+                                  activation_fn=tf.nn.sigmoid,
+                                  weights_regularizer=slim.l2_regularizer(1e-8))
+    return {"predictions": output}
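With an assumed input length of max_frames = 300 (not a value fixed by this file) and 128 channels after the last convolution, the three stride-2 'same' poolings shrink the frame axis 300 → 150 → 75 → 38, so the flattened vector has 38 × 128 = 4864 values per clip:

    frames = 300                      # assumed max_frames
    for _ in range(3):
        frames = (frames + 1) // 2    # 'same' max-pooling with stride 2
    print(frames, frames * 128)       # 38 4864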
+
+
+class NetVLAD_NonLocal_types():
+  """NetVLAD aggregation refined with a non-local (self-attention) block."""
+
+  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
+               is_training):
+    self.feature_size = feature_size
+    self.max_frames = max_frames
+    self.is_training = is_training
+    self.add_batch_norm = add_batch_norm
+    self.cluster_size = cluster_size
+
+  def forward(self, reshaped_input):
+    cluster_weights = tf.get_variable(
+        "cluster_weights",
+        [self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+
+    tf.summary.histogram("cluster_weights", cluster_weights)
+    activation = tf.matmul(reshaped_input, cluster_weights)
+
+    if self.add_batch_norm:
+      activation = slim.batch_norm(
+          activation,
+          center=True,
+          scale=True,
+          is_training=self.is_training,
+          scope="cluster_bn")
+    else:
+      cluster_biases = tf.get_variable(
+          "cluster_biases",
+          [self.cluster_size],
+          initializer=tf.random_normal_initializer(
+              stddev=1 / math.sqrt(self.feature_size)))
+      tf.summary.histogram("cluster_biases", cluster_biases)
+      activation += cluster_biases
+
+    # Soft-assignment of every frame to the clusters.
+    activation = tf.nn.softmax(activation)
+    tf.summary.histogram("cluster_output", activation)
+
+    activation = tf.reshape(activation,
+                            [-1, self.max_frames, self.cluster_size])
+
+    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
+
+    cluster_weights2 = tf.get_variable(
+        "cluster_weights2",
+        [1, self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+
+    a = tf.multiply(a_sum, cluster_weights2)
+
+    activation = tf.transpose(activation, perm=[0, 2, 1])
+
+    reshaped_input = tf.reshape(reshaped_input,
+                                [-1, self.max_frames, self.feature_size])
+    vlad = tf.matmul(activation, reshaped_input)
+    vlad = tf.transpose(vlad, perm=[0, 2, 1])
+    vlad = tf.subtract(vlad, a)
+
+    vlad = tf.transpose(vlad, perm=[0, 2, 1])
+    vlad = tf.reshape(vlad, [-1, self.feature_size])
+
+    vlad_softmax = self.embedgaussian_relation(vlad, 1 / float(64))
+
+    nonlocal_g = tf.get_variable(
+        "nonlocal_g",
+        [self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+    nonlocal_out = tf.get_variable(
+        "nonlocal_out",
+        [self.cluster_size, self.feature_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.cluster_size)))
+
+    vlad_g = tf.matmul(vlad, nonlocal_g)
+    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.cluster_size])
+    vlad_g = tf.matmul(vlad_softmax, vlad_g)
+    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
+
+    vlad_g = tf.matmul(vlad_g, nonlocal_out)
+    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.feature_size])
+    vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
+    vlad = vlad + vlad_g  # residual connection around the non-local block
+
+    vlad = tf.transpose(vlad, perm=[0, 2, 1])
+    vlad = tf.nn.l2_normalize(vlad, 1)  # intra-normalization over [b, f, c]
+
+    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
+    vlad = tf.nn.l2_normalize(vlad, 1)
+
+    return vlad
+
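forward implements the NetVLAD aggregation of Arandjelović et al., followed by the non-local refinement: with soft assignments a_k(x_i) from a softmax over cluster logits, the descriptor is V(j, k) = Σ_i a_k(x_i) (x_i(j) − c_k(j)), then intra-normalized and L2-normalized. A dense NumPy sketch of the vanilla aggregation (no batch norm or non-local step; illustrative names):

    import numpy as np

    def netvlad(x, w, c):
        # x: [frames, feature]; w: [feature, clusters] assignment weights;
        # c: [feature, clusters] cluster centers (cluster_weights2 above).
        a = np.exp(x @ w)
        a /= a.sum(axis=1, keepdims=True)     # soft assignment per frame
        # V[j, k] = sum_i a[i, k] * (x[i, j] - c[j, k])
        v = x.T @ a - c * a.sum(axis=0)       # [feature, clusters]
        v /= np.linalg.norm(v, axis=0, keepdims=True) + 1e-12  # intra-norm
        v = v.flatten()
        return v / (np.linalg.norm(v) + 1e-12)  # final L2 normalization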
+  def embedgaussian_relation(self, input_, temp=1 / float(32)):
+    nonlocal_theta = tf.get_variable(
+        "nonlocal_theta",
+        [self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+    nonlocal_phi = tf.get_variable(
+        "nonlocal_phi",
+        [self.feature_size, self.cluster_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(self.feature_size)))
+
+    vlad_theta = tf.matmul(input_, nonlocal_theta)
+    vlad_phi = tf.matmul(input_, nonlocal_phi)
+    vlad_theta = tf.reshape(vlad_theta, [-1, self.cluster_size, self.cluster_size])
+    vlad_phi = tf.reshape(vlad_phi, [-1, self.cluster_size, self.cluster_size])
+    vlad_softmax = tf.nn.softmax(
+        temp * tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
+    return vlad_softmax
+
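embedgaussian_relation is the embedded-Gaussian pairwise function from Non-local Neural Networks (Wang et al.): attention weights softmax(τ θ(x) φ(x)ᵀ) between the cluster descriptors, with temperature τ = 1/64 at the call site above. A compact NumPy sketch of one non-local step over the cluster descriptors (illustrative names):

    import numpy as np

    def softmax(z, axis=-1):
        z = z - z.max(axis=axis, keepdims=True)
        e = np.exp(z)
        return e / e.sum(axis=axis, keepdims=True)

    def non_local(vlad, w_theta, w_phi, w_g, w_out, temp=1.0 / 64):
        # vlad: [clusters, feature]; the projections mirror nonlocal_theta,
        # nonlocal_phi, nonlocal_g and nonlocal_out above.
        theta, phi, g = vlad @ w_theta, vlad @ w_phi, vlad @ w_g
        attn = softmax(temp * (theta @ phi.T))  # [clusters, clusters]
        return vlad + (attn @ g) @ w_out        # residual connection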
+class NetVLADModelLF(models.BaseModel):
+  """Creates a NetVLAD-based model.
+
+  Args:
+    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
+      input features.
+    vocab_size: The number of classes in the dataset.
+    num_frames: A vector of length 'batch' which indicates the number of
+      frames for each video (before padding).
+
+  Returns:
+    A dictionary with a tensor containing the probability predictions of the
+    model in the 'predictions' key. The dimensions of the tensor are
+    'batch_size' x 'num_classes'.
+  """
+
+  def create_model(self,
+                   model_input,
+                   vocab_size,
+                   num_frames,
+                   iterations=None,
+                   add_batch_norm=None,
+                   sample_random_frames=None,
+                   cluster_size=None,
+                   hidden_size=None,
+                   is_training=True,
+                   **unused_params):
+    # Hyperparameters are hard-coded for this experiment rather than read
+    # from the keyword arguments or flags.
+    iterations = 300
+    add_batch_norm = True
+    random_frames = True
+    cluster_size = 64
+    hidden1_size = 1024
+    relu = False
+    dimred = -1
+    gating = True
+    remove_diag = False
+
+    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
+    if random_frames:
+      model_input = utils.SampleRandomFrames(model_input, num_frames,
+                                             iterations)
+    else:
+      model_input = utils.SampleRandomSequence(model_input, num_frames,
+                                               iterations)
+
+    max_frames = model_input.get_shape().as_list()[1]
+    feature_size = model_input.get_shape().as_list()[2]
+    reshaped_input = tf.reshape(model_input, [-1, feature_size])
+
+    # The input concatenates 1024-d visual and 128-d audio features.
+    video_NetVLAD = NetVLAD_NonLocal_types(1024, int(max_frames),
+                                           int(cluster_size), add_batch_norm,
+                                           is_training)
+    audio_NetVLAD = NetVLAD_NonLocal_types(128, int(max_frames),
+                                           int(cluster_size / 2),
+                                           add_batch_norm, is_training)
+
+    if add_batch_norm:  # and not lightvlad
+      reshaped_input = slim.batch_norm(
+          reshaped_input,
+          center=True,
+          scale=True,
+          is_training=is_training,
+          scope="input_bn")
+
+    with tf.variable_scope("video_VLAD"):
+      vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])
+
+    with tf.variable_scope("audio_VLAD"):
+      vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])
+
+    vlad = tf.concat([vlad_video, vlad_audio], 1)
+
+    vlad_dim = vlad.get_shape().as_list()[1]
+    hidden1_weights = tf.get_variable(
+        "hidden1_weights",
+        [vlad_dim, hidden1_size],
+        initializer=tf.random_normal_initializer(
+            stddev=1 / math.sqrt(cluster_size)))
+
+    activation = tf.matmul(vlad, hidden1_weights)
+
+    if add_batch_norm and relu:
+      activation = slim.batch_norm(
+          activation,
+          center=True,
+          scale=True,
+          is_training=is_training,
+          scope="hidden1_bn")
+    else:
+      hidden1_biases = tf.get_variable(
+          "hidden1_biases",
+          [hidden1_size],
+          initializer=tf.random_normal_initializer(stddev=0.01))
+      tf.summary.histogram("hidden1_biases", hidden1_biases)
+      activation += hidden1_biases
+
+    if relu:
+      activation = tf.nn.relu6(activation)
+
+    if gating:
+      gating_weights = tf.get_variable(
+          "gating_weights_2",
+          [hidden1_size, hidden1_size],
+          initializer=tf.random_normal_initializer(
+              stddev=1 / math.sqrt(hidden1_size)))
+
+      gates = tf.matmul(activation, gating_weights)
+
+      if remove_diag:
+        # Removes the diagonal coefficients.
+        diagonals = tf.matrix_diag_part(gating_weights)
+        gates = gates - tf.multiply(diagonals, activation)
+
+      if add_batch_norm:
+        gates = slim.batch_norm(
+            gates,
+            center=True,
+            scale=True,
+            is_training=is_training,
+            scope="gating_bn")
+      else:
+        gating_biases = tf.get_variable(
+            "gating_biases",
+            [hidden1_size],
+            initializer=tf.random_normal_initializer(
+                stddev=1 / math.sqrt(hidden1_size)))
+        gates += gating_biases
+
+      gates = tf.sigmoid(gates)
+
+      activation = tf.multiply(activation, gates)
+
+    aggregated_model = getattr(video_level_models, 'willow_MoeModel')
+
+    return aggregated_model().create_model(
+        model_input=activation,
+        vocab_size=vocab_size,
+        is_training=is_training,
+        **unused_params)
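The gating branch implements Context Gating (Miech et al.): the hidden activation is reweighted elementwise by learned sigmoid gates, y = σ(Wx + b) ⊙ x, letting the network suppress or boost feature dimensions based on the whole clip. A minimal NumPy sketch of the idea (illustrative names; the batch-norm variant used above is omitted):

    import numpy as np

    def context_gating(x, w, b):
        # x: [batch, hidden]; w: [hidden, hidden]; b: [hidden]
        gates = 1.0 / (1.0 + np.exp(-(x @ w + b)))  # sigmoid gates in (0, 1)
        return x * gates                            # elementwise reweighting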
\ No newline at end of file
...
@@ -25,6 +25,21 @@ FLAGS = flags.FLAGS
 flags.DEFINE_integer(
     "moe_num_mixtures", 2,
     "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
+# flags.DEFINE_integer(
+#     "moe_num_mixtures", 2,
+#     "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
+flags.DEFINE_float(
+    "moe_l2", 1e-8,
+    "L2 penalty for MoeModel.")
+flags.DEFINE_integer(
+    "moe_low_rank_gating", -1,
+    "Low-rank gating dimension for MoeModel (-1 disables low-rank gating).")
+flags.DEFINE_bool(
+    "moe_prob_gating", True,
+    "Whether to apply probability gating in MoeModel.")
+flags.DEFINE_string(
+    "moe_prob_gating_input", "prob",
+    "Input to the probability gating in MoeModel ('prob' or the raw input).")


 class LogisticModel(models.BaseModel):
@@ -111,3 +126,109 @@ class MoeModel(models.BaseModel):
     final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                      [-1, vocab_size])
     return {"predictions": final_probabilities}
+
+
+class willow_MoeModel(models.BaseModel):
+  """A softmax over a mixture of logistic models (with L2 regularization)."""
+
+  def create_model(self,
+                   model_input,
+                   vocab_size,
+                   is_training,
+                   num_mixtures=None,
+                   l2_penalty=1e-8,
+                   **unused_params):
+    """Creates a Mixture of (Logistic) Experts model with optional gating.
+
+    The model consists of a per-class softmax distribution over a
+    configurable number of logistic classifiers. One of the classifiers in
+    the mixture is not trained, and always predicts 0.
+
+    Args:
+      model_input: 'batch_size' x 'num_features' matrix of input features.
+      vocab_size: The number of classes in the dataset.
+      is_training: Whether this is the training phase.
+      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
+        always predicts the non-existence of an entity).
+      l2_penalty: How much to penalize the squared magnitudes of parameter
+        values.
+
+    Returns:
+      A dictionary with a tensor containing the probability predictions of
+      the model in the 'predictions' key. The dimensions of the tensor are
+      batch_size x num_classes.
+    """
+    # The flag values below override the corresponding keyword arguments.
+    num_mixtures = 8
+    low_rank_gating = FLAGS.moe_low_rank_gating
+    l2_penalty = FLAGS.moe_l2
+    gating_probabilities = FLAGS.moe_prob_gating
+    gating_input = FLAGS.moe_prob_gating_input
+
+    input_size = model_input.get_shape().as_list()[1]
+    remove_diag = False
+
+    if low_rank_gating == -1:
+      gate_activations = slim.fully_connected(
+          model_input,
+          vocab_size * (num_mixtures + 1),
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates")
+    else:
+      gate_activations1 = slim.fully_connected(
+          model_input,
+          low_rank_gating,
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates1")
+      gate_activations = slim.fully_connected(
+          gate_activations1,
+          vocab_size * (num_mixtures + 1),
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates2")
+
+    expert_activations = slim.fully_connected(
+        model_input,
+        vocab_size * num_mixtures,
+        activation_fn=None,
+        weights_regularizer=slim.l2_regularizer(l2_penalty),
+        scope="experts")
+
+    gating_distribution = tf.nn.softmax(tf.reshape(
+        gate_activations,
+        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
+    expert_distribution = tf.nn.sigmoid(tf.reshape(
+        expert_activations,
+        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures
+
+    probabilities_by_class_and_batch = tf.reduce_sum(
+        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
+    probabilities = tf.reshape(probabilities_by_class_and_batch,
+                               [-1, vocab_size])
+
+    if gating_probabilities:
+      if gating_input == 'prob':
+        gating_weights = tf.get_variable(
+            "gating_prob_weights",
+            [vocab_size, vocab_size],
+            initializer=tf.random_normal_initializer(
+                stddev=1 / math.sqrt(vocab_size)))
+        gates = tf.matmul(probabilities, gating_weights)
+      else:
+        gating_weights = tf.get_variable(
+            "gating_prob_weights",
+            [input_size, vocab_size],
+            initializer=tf.random_normal_initializer(
+                stddev=1 / math.sqrt(vocab_size)))
+        gates = tf.matmul(model_input, gating_weights)
+
+      if remove_diag:
+        # Removes the diagonal coefficients.
+        diagonals = tf.matrix_diag_part(gating_weights)
+        gates = gates - tf.multiply(diagonals, probabilities)
+
+      gates = slim.batch_norm(
+          gates,
+          center=True,
+          scale=True,
+          is_training=is_training,
+          scope="gating_prob_bn")
+
+      gates = tf.sigmoid(gates)
+
+      probabilities = tf.multiply(probabilities, gates)
+
+    return {"predictions": probabilities}
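Per class, the gate logits are softmaxed over num_mixtures + 1 entries, where the extra entry stands for the untrained expert that always predicts 0, and the sigmoid expert outputs are mixed with the first num_mixtures gate weights. A NumPy sketch of that mixing step (illustrative names):

    import numpy as np

    def moe_predict(gate_logits, expert_logits):
        # gate_logits: [batch * classes, num_mixtures + 1]
        # expert_logits: [batch * classes, num_mixtures]
        e = np.exp(gate_logits - gate_logits.max(axis=1, keepdims=True))
        gates = e / e.sum(axis=1, keepdims=True)        # softmax over experts
        experts = 1.0 / (1.0 + np.exp(-expert_logits))  # sigmoid probabilities
        # The last gate weights the implicit always-zero expert, so it only
        # enters through the softmax normalization.
        return (gates[:, :-1] * experts).sum(axis=1)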
\ No newline at end of file
...