윤영빈

final report almost done

...@@ -65,128 +65,21 @@ flags.DEFINE_integer("conv_hidden2", 1024, "Number of cnn hidden.")
65 flags.DEFINE_integer("conv_hidden3", 1024, "Number of cnn hidden.")
66 flags.DEFINE_integer("stride", 10, "Number of stride for short rnn.")
67
68 -class DbofModel(models.BaseModel):
69 -  """Creates a Deep Bag of Frames model.
70 -  The model projects the features for each frame into a higher dimensional
71 -  'clustering' space, pools across frames in that space, and then
72 -  uses a configurable video-level model to classify the now aggregated features.
73 -  The model will randomly sample either frames or sequences of frames during
74 -  training to speed up convergence.
75 -  """
76 -
77 -  ACT_FN_MAP = {
78 -      "sigmoid": tf.nn.sigmoid,
79 -      "relu6": tf.nn.relu6,
80 -  }
81 -
82 -  def create_model(self,
83 -                   model_input,
84 -                   vocab_size,
85 -                   num_frames,
86 -                   iterations=None,
87 -                   add_batch_norm=None,
88 -                   sample_random_frames=None,
89 -                   cluster_size=None,
90 -                   hidden_size=None,
91 -                   is_training=True,
92 -                   **unused_params):
93 -    """See base class.
94 -    Args:
95 -      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
96 -        input features.
97 -      vocab_size: The number of classes in the dataset.
98 -      num_frames: A vector of length 'batch' which indicates the number of
99 -        frames for each video (before padding).
100 -      iterations: the number of frames to be sampled.
101 -      add_batch_norm: whether to add batch norm during training.
102 -      sample_random_frames: whether to sample random frames or random sequences.
103 -      cluster_size: the output neuron number of the cluster layer.
104 -      hidden_size: the output neuron number of the hidden layer.
105 -      is_training: whether to build the graph in training mode.
106 -    Returns:
107 -      A dictionary with a tensor containing the probability predictions of the
108 -      model in the 'predictions' key. The dimensions of the tensor are
109 -      'batch_size' x 'num_classes'.
110 -    """
111 -    iterations = iterations or FLAGS.iterations
112 -    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
113 -    random_frames = sample_random_frames or FLAGS.sample_random_frames
114 -    cluster_size = cluster_size or FLAGS.dbof_cluster_size
115 -    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
116 -    act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation)
117 -    assert act_fn is not None, ("dbof_activation is not valid: %s." %
118 -                                FLAGS.dbof_activation)
119 -
68 +class FrameLevelLogisticModel(models.BaseModel):
69 +  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
120     num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
121 -    if random_frames:
122 -      model_input = utils.SampleRandomFrames(model_input, num_frames,
123 -                                             iterations)
124 -    else:
125 -      model_input = utils.SampleRandomSequence(model_input, num_frames,
126 -                                               iterations)
127 -    max_frames = model_input.get_shape().as_list()[1]
128     feature_size = model_input.get_shape().as_list()[2]
129 -    reshaped_input = tf.reshape(model_input, [-1, feature_size])
130 -    tf.compat.v1.summary.histogram("input_hist", reshaped_input)
131
132 -    if add_batch_norm:
133 -      reshaped_input = slim.batch_norm(reshaped_input,
134 -                                       center=True,
135 -                                       scale=True,
136 -                                       is_training=is_training,
137 -                                       scope="input_bn")
73 +    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
74 +                              [-1, feature_size])
75 +    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
138
139 -    cluster_weights = tf.compat.v1.get_variable(
140 -        "cluster_weights", [feature_size, cluster_size],
141 -        initializer=tf.random_normal_initializer(stddev=1 /
142 -                                                 math.sqrt(feature_size)))
143 -    tf.compat.v1.summary.histogram("cluster_weights", cluster_weights)
144 -    activation = tf.matmul(reshaped_input, cluster_weights)
145 -    if add_batch_norm:
146 -      activation = slim.batch_norm(activation,
147 -                                   center=True,
148 -                                   scale=True,
149 -                                   is_training=is_training,
150 -                                   scope="cluster_bn")
151 -    else:
152 -      cluster_biases = tf.compat.v1.get_variable(
153 -          "cluster_biases", [cluster_size],
154 -          initializer=tf.random_normal_initializer(stddev=1 /
155 -                                                   math.sqrt(feature_size)))
156 -      tf.compat.v1.summary.histogram("cluster_biases", cluster_biases)
157 -      activation += cluster_biases
158 -    activation = act_fn(activation)
159 -    tf.compat.v1.summary.histogram("cluster_output", activation)
160 -
161 -    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
162 -    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
163 -
164 -    hidden1_weights = tf.compat.v1.get_variable(
165 -        "hidden1_weights", [cluster_size, hidden1_size],
166 -        initializer=tf.random_normal_initializer(stddev=1 /
167 -                                                 math.sqrt(cluster_size)))
168 -    tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights)
169 -    activation = tf.matmul(activation, hidden1_weights)
170 -    if add_batch_norm:
171 -      activation = slim.batch_norm(activation,
172 -                                   center=True,
173 -                                   scale=True,
174 -                                   is_training=is_training,
175 -                                   scope="hidden1_bn")
176 -    else:
177 -      hidden1_biases = tf.compat.v1.get_variable(
178 -          "hidden1_biases", [hidden1_size],
179 -          initializer=tf.random_normal_initializer(stddev=0.01))
180 -      tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases)
181 -      activation += hidden1_biases
182 -    activation = act_fn(activation)
183 -    tf.compat.v1.summary.histogram("hidden1_output", activation)
77 +    output = slim.fully_connected(avg_pooled,
78 +                                  vocab_size,
79 +                                  activation_fn=tf.nn.sigmoid,
80 +                                  weights_regularizer=slim.l2_regularizer(1e-8))
184
185 -    aggregated_model = getattr(video_level_models,
186 -                               FLAGS.video_level_classifier_model)
187 -    return aggregated_model().create_model(model_input=activation,
188 -                                           vocab_size=vocab_size,
189 -                                           **unused_params)
82 +    return {"predictions": output}
190
191 class NetVLAD_NonLocal_types():
192   def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training):
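The new FrameLevelLogisticModel replaces DbofModel's sampled clustering pipeline with the simplest frame-level baseline: average the frame features over each video's true length (padding is zero, so a plain sum works) and classify with an independent per-label sigmoid. A minimal NumPy sketch of that forward pass (illustrative only, not part of the diff; w and b stand in for the fully connected layer's parameters):

import numpy as np

def frame_level_logistic(x, num_frames, w, b):
    # x: [batch, max_frames, feature_size], zero-padded past each video's length.
    # num_frames: [batch] true frame counts; w: [feature_size, vocab_size].
    avg_pooled = x.sum(axis=1) / num_frames[:, None]   # mean over real frames
    logits = avg_pooled @ w + b
    return 1.0 / (1.0 + np.exp(-logits))               # per-class sigmoid probs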
...@@ -286,20 +179,6 @@ class NetVLAD_NonLocal_types():
286     return vlad_softmax
287
288 class NetVLADModelLF(models.BaseModel):
289 - """Creates a NetVLAD based model.
290 - Args:
291 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
292 - input features.
293 - vocab_size: The number of classes in the dataset.
294 - num_frames: A vector of length 'batch' which indicates the number of
295 - frames for each video (before padding).
296 - Returns:
297 - A dictionary with a tensor containing the probability predictions of the
298 - model in the 'predictions' key. The dimensions of the tensor are
299 - 'batch_size' x 'num_classes'.
300 - """
301 -
302 -
303   def create_model(self,
304                    model_input,
305                    vocab_size,
...@@ -420,1558 +299,30 @@ class NetVLADModelLF(models.BaseModel):
420         is_training=is_training,
421         **unused_params)
422
423 -class GruModel(models.BaseModel):
424 -
425 -  def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
426 -    """Creates a model which uses a stack of GRUs to represent the video.
427 -    Args:
428 -      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
429 -                   input features.
430 -      vocab_size: The number of classes in the dataset.
431 -      num_frames: A vector of length 'batch' which indicates the number of
432 -                  frames for each video (before padding).
433 -    Returns:
434 -      A dictionary with a tensor containing the probability predictions of the
435 -      model in the 'predictions' key. The dimensions of the tensor are
436 -      'batch_size' x 'num_classes'.
437 -    """
438 -    gru_size = 600
439 -    number_of_layers = 4
440 -    backward = False
441 -    random_frames = False
442 -    iterations = 30
443 -
444 -    if random_frames:
445 -      num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
446 -      model_input = utils.SampleRandomFrames(model_input, num_frames_2,
447 -                                             iterations)
302 +class LstmModel(models.BaseModel):
448
449 -    if backward:
450 -      model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1)
304 +  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
305 +    lstm_size = FLAGS.lstm_cells
306 +    number_of_layers = FLAGS.lstm_layers
451
452 -    stacked_GRU = tf.contrib.rnn.MultiRNNCell(
308 +    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
453         [
454 -            tf.contrib.rnn.GRUCell(gru_size)
310 +            tf.contrib.rnn.BasicLSTMCell(
311 +                lstm_size, forget_bias=1.0)
455             for _ in range(number_of_layers)
456 -        ], state_is_tuple=False)
313 +        ])
457
458     loss = 0.0
459 -    with tf.variable_scope("RNN"):
460 -      outputs, state = tf.nn.dynamic_rnn(stacked_GRU, model_input,
316 +
317 +    outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
461                                        sequence_length=num_frames,
462                                        dtype=tf.float32)
463
464     aggregated_model = getattr(video_level_models,
465 -                               'MoeModel')
322 +                               FLAGS.video_level_classifier_model)
323 +
466     return aggregated_model().create_model(
467 -        model_input=state,
325 +        model_input=state[-1].h,
468         vocab_size=vocab_size,
469 -        is_training=is_training,
470         **unused_params)
471
\ No newline at end of file
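The added LstmModel feeds only the top layer's final hidden state (state[-1].h) to the video-level classifier. A rough tf.keras equivalent (illustrative; the diff itself targets the TF1 tf.contrib.rnn API, and lstm_size/num_layers stand in for FLAGS.lstm_cells/FLAGS.lstm_layers):

import tensorflow as tf

def build_lstm_encoder(feature_size, lstm_size=1024, num_layers=2):
    # Variable-length frame sequences in, final top-layer hidden state out.
    inputs = tf.keras.Input(shape=(None, feature_size))
    x = inputs
    for _ in range(num_layers - 1):
        x = tf.keras.layers.LSTM(lstm_size, return_sequences=True)(x)
    state = tf.keras.layers.LSTM(lstm_size)(x)   # analogous to state[-1].h
    return tf.keras.Model(inputs, state)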
472 -
473 -
474 -class SoftDBoF():
475 - def __init__(self, feature_size,max_frames,cluster_size, max_pool, add_batch_norm, is_training):
476 - self.feature_size = feature_size
477 - self.max_frames = max_frames
478 - self.is_training = is_training
479 - self.add_batch_norm = add_batch_norm
480 - self.cluster_size = cluster_size
481 - self.max_pool = max_pool
482 -
483 - def forward(self, reshaped_input):
484 -
485 - feature_size = self.feature_size
486 - cluster_size = self.cluster_size
487 - add_batch_norm = self.add_batch_norm
488 - max_frames = self.max_frames
489 - is_training = self.is_training
490 - max_pool = self.max_pool
491 -
492 - cluster_weights = tf.get_variable("cluster_weights",
493 - [feature_size, cluster_size],
494 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
495 -
496 - tf.summary.histogram("cluster_weights", cluster_weights)
497 - activation = tf.matmul(reshaped_input, cluster_weights)
498 -
499 - if add_batch_norm:
500 - activation = slim.batch_norm(
501 - activation,
502 - center=True,
503 - scale=True,
504 - is_training=is_training,
505 - scope="cluster_bn")
506 - else:
507 - cluster_biases = tf.get_variable("cluster_biases",
508 - [cluster_size],
509 - initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
510 - tf.summary.histogram("cluster_biases", cluster_biases)
511 - activation += cluster_biases
512 -
513 - activation = tf.nn.softmax(activation)
514 -
515 - activation = tf.reshape(activation, [-1, int(max_frames), int(cluster_size)])
516 -
517 - activation_sum = tf.reduce_sum(activation,1)
518 - activation_sum = tf.nn.l2_normalize(activation_sum,1)
519 -
520 - if max_pool:
521 - activation_max = tf.reduce_max(activation,1)
522 - activation_max = tf.nn.l2_normalize(activation_max,1)
523 - activation = tf.concat([activation_sum,activation_max],1)
524 - else:
525 - activation = activation_sum
526 -
527 - return activation
528 -
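SoftDBoF.forward computes a soft bag-of-frames: each frame gets a softmax assignment over cluster_size codewords, and sum-pooling those assignments over frames yields an L2-normalized soft histogram, optionally concatenated with a max-pooled one. A minimal NumPy sketch of the pooling for one video, ignoring the batch-norm/bias branch:

import numpy as np

def soft_dbof(frames, cluster_weights, max_pool=False):
    # frames: [num_frames, feature_size]; cluster_weights: [feature_size, cluster_size]
    logits = frames @ cluster_weights
    logits -= logits.max(axis=1, keepdims=True)               # stable softmax
    assign = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    hist_sum = assign.sum(axis=0)                             # soft histogram
    hist_sum /= np.linalg.norm(hist_sum) + 1e-8
    if max_pool:
        hist_max = assign.max(axis=0)
        hist_max /= np.linalg.norm(hist_max) + 1e-8
        return np.concatenate([hist_sum, hist_max])
    return hist_sum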
529 -
530 -class LightVLAD_nonlocal():
531 - def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training):
532 - self.feature_size = feature_size
533 - self.max_frames = max_frames
534 - self.is_training = is_training
535 - self.add_batch_norm = add_batch_norm
536 - self.cluster_size = cluster_size
537 -
538 - def forward(self,reshaped_input):
539 -
540 -
541 - cluster_weights = tf.get_variable("cluster_weights",
542 - [int(self.feature_size), int(self.cluster_size)],
543 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
544 -
545 - activation = tf.matmul(reshaped_input, cluster_weights)
546 -
547 - if self.add_batch_norm:
548 - activation = slim.batch_norm(
549 - activation,
550 - center=True,
551 - scale=True,
552 - is_training=self.is_training,
553 - scope="cluster_bn")
554 - else:
555 - cluster_biases = tf.get_variable("cluster_biases",
556 - [cluster_size],
557 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
558 - tf.summary.histogram("cluster_biases", cluster_biases)
559 - activation += cluster_biases
560 -
561 - activation = tf.nn.softmax(activation)
562 -
563 - activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
564 -
565 - activation = tf.transpose(activation,perm=[0,2,1])
566 -
567 - reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size])
568 - vlad = tf.matmul(activation,reshaped_input)
569 -
570 - vlad = tf.reshape(vlad, [-1,self.feature_size])
571 - vlad = nonLocal_block(vlad, feature_size=self.feature_size, hidden_size=self.feature_size//2, cluster_size=self.cluster_size)
572 -
573 - vlad = tf.reshape(vlad, [-1,self.cluster_size,self.feature_size])
574 - vlad = tf.transpose(vlad,perm=[0,2,1])
575 -
576 - vlad = tf.nn.l2_normalize(vlad,1)
577 -
578 - vlad = tf.reshape(vlad,[-1,int(self.cluster_size*self.feature_size)])
579 - vlad = tf.nn.l2_normalize(vlad,1)
580 -
581 - return vlad
582 -
583 -class LightNetVLADModelLF(models.BaseModel):
584 - """Creates a NetVLAD based model.
585 - Args:
586 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
587 - input features.
588 - vocab_size: The number of classes in the dataset.
589 - num_frames: A vector of length 'batch' which indicates the number of
590 - frames for each video (before padding).
591 - Returns:
592 - A dictionary with a tensor containing the probability predictions of the
593 - model in the 'predictions' key. The dimensions of the tensor are
594 - 'batch_size' x 'num_classes'.
595 - """
596 -
597 -
598 - def create_model(self,
599 - model_input,
600 - vocab_size,
601 - num_frames,
602 - iterations=None,
603 - add_batch_norm=None,
604 - sample_random_frames=None,
605 - cluster_size=None,
606 - hidden_size=None,
607 - is_training=True,
608 - **unused_params):
609 - iterations = 300
610 - add_batch_norm = True
611 - random_frames = True
612 - cluster_size = 64
613 - hidden1_size = 1024
614 - relu = False
615 - dimred = -1
616 - gating = True
617 - remove_diag = False
618 -
619 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
620 - if random_frames:
621 - model_input = utils.SampleRandomFrames(model_input, num_frames,
622 - iterations)
623 - else:
624 - model_input = utils.SampleRandomSequence(model_input, num_frames,
625 - iterations)
626 -
627 -
628 - max_frames = model_input.get_shape().as_list()[1]
629 - feature_size = model_input.get_shape().as_list()[2]
630 - reshaped_input = tf.reshape(model_input, [-1, feature_size])
631 -
632 -
633 - video_NetVLAD = LightVLAD_nonlocal(1024,max_frames,cluster_size, add_batch_norm, is_training)
634 - audio_NetVLAD = LightVLAD_nonlocal(128,max_frames,cluster_size/2, add_batch_norm, is_training)
635 -
636 -
637 - if add_batch_norm:# and not lightvlad:
638 - reshaped_input = slim.batch_norm(
639 - reshaped_input,
640 - center=True,
641 - scale=True,
642 - is_training=is_training,
643 - scope="input_bn")
644 -
645 - with tf.variable_scope("video_VLAD"):
646 - vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024])
647 -
648 - with tf.variable_scope("audio_VLAD"):
649 - vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:])
650 -
651 - vlad = tf.concat([vlad_video, vlad_audio],1)
652 -
653 - vlad_dim = vlad.get_shape().as_list()[1]
654 - hidden1_weights = tf.get_variable("hidden1_weights",
655 - [vlad_dim, hidden1_size],
656 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
657 -
658 - activation = tf.matmul(vlad, hidden1_weights)
659 -
660 - if add_batch_norm and relu:
661 - activation = slim.batch_norm(
662 - activation,
663 - center=True,
664 - scale=True,
665 - is_training=is_training,
666 - scope="hidden1_bn")
667 -
668 - else:
669 - hidden1_biases = tf.get_variable("hidden1_biases",
670 - [hidden1_size],
671 - initializer = tf.random_normal_initializer(stddev=0.01))
672 - tf.summary.histogram("hidden1_biases", hidden1_biases)
673 - activation += hidden1_biases
674 -
675 - if relu:
676 - activation = tf.nn.relu6(activation)
677 -
678 -
679 - if gating:
680 - gating_weights = tf.get_variable("gating_weights_2",
681 - [hidden1_size, hidden1_size],
682 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
683 -
684 - gates = tf.matmul(activation, gating_weights)
685 -
686 - if remove_diag:
687 - #removes diagonals coefficients
688 - diagonals = tf.matrix_diag_part(gating_weights)
689 - gates = gates - tf.multiply(diagonals,activation)
690 -
691 -
692 - if add_batch_norm:
693 - gates = slim.batch_norm(
694 - gates,
695 - center=True,
696 - scale=True,
697 - is_training=is_training,
698 - scope="gating_bn")
699 - else:
700 - gating_biases = tf.get_variable("gating_biases",
701 - [cluster_size],
702 - initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
703 - gates += gating_biases
704 -
705 - gates = tf.sigmoid(gates)
706 -
707 - activation = tf.multiply(activation,gates)
708 -
709 - aggregated_model = getattr(video_level_models,
710 - FLAGS.video_level_classifier_model)
711 -
712 -
713 - return aggregated_model().create_model(
714 - model_input=activation,
715 - vocab_size=vocab_size,
716 - is_training=is_training,
717 - **unused_params)
718 -
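The gating branch above implements context gating in the style of Miech et al.'s learnable-pooling models: a sigmoid gate computed from the activation itself reweights each hidden dimension. A minimal NumPy sketch (illustrative; the explicit bias stands in for the batch-norm path the code uses):

import numpy as np

def context_gating(activation, gating_weights, gating_biases):
    # activation: [batch, hidden]; gating_weights: [hidden, hidden]
    gates = 1.0 / (1.0 + np.exp(-(activation @ gating_weights + gating_biases)))
    return activation * gates   # elementwise reweighting of the activation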
719 -def nonLocal_block(vlad, feature_size, hidden_size, cluster_size):
720 - nonlocal_theta = tf.get_variable("nonlocal_theta",
721 - [feature_size, hidden_size],
722 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
723 - nonlocal_phi = tf.get_variable("nonlocal_phi",
724 - [feature_size, hidden_size],
725 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
726 - nonlocal_g = tf.get_variable("nonlocal_g",
727 - [feature_size, hidden_size],
728 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
729 - nonlocal_out = tf.get_variable("nonlocal_out",
730 - [hidden_size, feature_size],
731 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden_size)))
732 -
733 - vlad_theta = tf.matmul(vlad, nonlocal_theta)
734 - vlad_phi = tf.matmul(vlad, nonlocal_phi)
735 - vlad_g = tf.matmul(vlad, nonlocal_g)
736 -
737 - vlad_theta = tf.reshape(vlad_theta, [-1, cluster_size, hidden_size])
738 - vlad_phi = tf.reshape(vlad_phi, [-1, cluster_size, hidden_size])
739 - vlad_g = tf.reshape(vlad_phi, [-1, cluster_size, hidden_size])
740 -
741 - vlad_softmax = tf.nn.softmax(feature_size**-.5 * tf.matmul(vlad_theta, tf.transpose(vlad_phi,perm=[0,2,1])))
742 - vlad_g = tf.matmul(vlad_softmax, vlad_g)
743 - vlad_g = tf.reshape(vlad_g, [-1, hidden_size])
744 -
745 - vlad_g = tf.matmul(vlad_g, nonlocal_out)
746 - vlad = vlad + vlad_g
747 - return vlad
748 -
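nonLocal_block is scaled dot-product self-attention over the cluster axis with a residual connection: theta and phi form the affinity matrix, which reweights the g projection before it is mapped back to feature size and added to the input. Note the code assigns vlad_g = tf.reshape(vlad_phi, ...), so g is effectively overwritten by the phi projection; the NumPy sketch below (illustrative) follows the intended g path:

import numpy as np

def nonlocal_block(vlad, w_theta, w_phi, w_g, w_out):
    # vlad: [cluster_size, feature_size]; w_theta/w_phi/w_g: [feature_size, hidden]
    # w_out: [hidden, feature_size]
    theta, phi, g = vlad @ w_theta, vlad @ w_phi, vlad @ w_g
    affinity = theta @ phi.T / np.sqrt(vlad.shape[1])         # scaled dot product
    affinity = np.exp(affinity - affinity.max(axis=1, keepdims=True))
    affinity /= affinity.sum(axis=1, keepdims=True)           # row-wise softmax
    return vlad + (affinity @ g) @ w_out                      # residual connection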
749 -class SoftDbofModelLF(models.BaseModel):
750 - """Creates a Soft Deep Bag of Frames model.
751 - The model projects the features for each frame into a higher dimensional
752 - 'clustering' space, pools across frames in that space, and then
753 - uses a configurable video-level model to classify the now aggregated features.
754 - The model will randomly sample either frames or sequences of frames during
755 - training to speed up convergence.
756 - Args:
757 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
758 - input features.
759 - vocab_size: The number of classes in the dataset.
760 - num_frames: A vector of length 'batch' which indicates the number of
761 - frames for each video (before padding).
762 - Returns:
763 - A dictionary with a tensor containing the probability predictions of the
764 - model in the 'predictions' key. The dimensions of the tensor are
765 - 'batch_size' x 'num_classes'.
766 - """
767 -
768 - def create_model(self,
769 - model_input,
770 - vocab_size,
771 - num_frames,
772 - iterations=None,
773 - add_batch_norm=None,
774 - sample_random_frames=None,
775 - cluster_size=None,
776 - hidden_size=None,
777 - is_training=True,
778 - **unused_params):
779 - iterations = 300
780 - add_batch_norm = True
781 - random_frames = True
782 - cluster_size = 4000
783 - hidden1_size = 1024
784 - fc_dimred = True
785 - relu = False
786 - max_pool = False
787 -
788 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
789 - if random_frames:
790 - model_input = utils.SampleRandomFrames(model_input, num_frames,
791 - iterations)
792 - else:
793 - model_input = utils.SampleRandomSequence(model_input, num_frames,
794 - iterations)
795 - max_frames = model_input.get_shape().as_list()[1]
796 - feature_size = model_input.get_shape().as_list()[2]
797 - reshaped_input = tf.reshape(model_input, [-1, feature_size])
798 - tf.summary.histogram("input_hist", reshaped_input)
799 -
800 - video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training)
801 - audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training)
802 -
803 -
804 - if add_batch_norm:
805 - reshaped_input = slim.batch_norm(
806 - reshaped_input,
807 - center=True,
808 - scale=True,
809 - is_training=is_training,
810 - scope="input_bn")
811 -
812 - with tf.variable_scope("video_DBOF"):
813 - dbof_video = video_Dbof.forward(reshaped_input[:,0:1024])
814 -
815 - with tf.variable_scope("audio_DBOF"):
816 - dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:])
817 -
818 - dbof = tf.concat([dbof_video, dbof_audio],1)
819 -
820 - dbof_dim = dbof.get_shape().as_list()[1]
821 -
822 - if fc_dimred:
823 - hidden1_weights = tf.get_variable("hidden1_weights",
824 - [dbof_dim, hidden1_size],
825 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
826 - tf.summary.histogram("hidden1_weights", hidden1_weights)
827 - activation = tf.matmul(dbof, hidden1_weights)
828 -
829 - if add_batch_norm and relu:
830 - activation = slim.batch_norm(
831 - activation,
832 - center=True,
833 - scale=True,
834 - is_training=is_training,
835 - scope="hidden1_bn")
836 - else:
837 - hidden1_biases = tf.get_variable("hidden1_biases",
838 - [hidden1_size],
839 - initializer = tf.random_normal_initializer(stddev=0.01))
840 - tf.summary.histogram("hidden1_biases", hidden1_biases)
841 - activation += hidden1_biases
842 -
843 - if relu:
844 - activation = tf.nn.relu6(activation)
845 - tf.summary.histogram("hidden1_output", activation)
846 - else:
847 - activation = dbof
848 -
849 - aggregated_model = getattr(video_level_models,
850 - FLAGS.video_level_classifier_model)
851 -
852 - return aggregated_model().create_model(
853 - model_input=activation,
854 - vocab_size=vocab_size,
855 - is_training=is_training,
856 - **unused_params)
857 -
858 -
859 -
860 -class early_NetVLADModelLF(models.BaseModel):
861 - """Creates a NetVLAD based model.
862 - Args:
863 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
864 - input features.
865 - vocab_size: The number of classes in the dataset.
866 - num_frames: A vector of length 'batch' which indicates the number of
867 - frames for each video (before padding).
868 - Returns:
869 - A dictionary with a tensor containing the probability predictions of the
870 - model in the 'predictions' key. The dimensions of the tensor are
871 - 'batch_size' x 'num_classes'.
872 - """
873 -
874 -
875 - def create_model(self,
876 - model_input,
877 - vocab_size,
878 - num_frames,
879 - iterations=None,
880 - add_batch_norm=None,
881 - sample_random_frames=None,
882 - cluster_size=None,
883 - hidden_size=None,
884 - is_training=True,
885 - **unused_params):
886 - iterations = 300
887 - add_batch_norm = True
888 - random_frames = True
889 - cluster_size = 64
890 - hidden1_size = 1024
891 - relu = False
892 - dimred = -1
893 - gating = True
894 - remove_diag = False
895 -
896 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
897 - if random_frames:
898 - model_input = utils.SampleRandomFrames(model_input, num_frames,
899 - iterations)
900 - else:
901 - model_input = utils.SampleRandomSequence(model_input, num_frames,
902 - iterations)
903 -
904 -
905 - max_frames = model_input.get_shape().as_list()[1]
906 - feature_size = model_input.get_shape().as_list()[2]
907 - reshaped_input = tf.reshape(model_input, [-1, feature_size])
908 -
909 - video_audio_NetVLAD = NetVLAD_NonLocal(1024+128,max_frames,cluster_size, add_batch_norm, is_training)
910 -
911 - if add_batch_norm:# and not lightvlad:
912 - reshaped_input = slim.batch_norm(
913 - reshaped_input,
914 - center=True,
915 - scale=True,
916 - is_training=is_training,
917 - scope="input_bn")
918 - with tf.variable_scope("video_audio_VLAD"):
919 - vlad = video_audio_NetVLAD.forward(reshaped_input)
920 -
921 - vlad_dim = vlad.get_shape().as_list()[1]
922 - hidden1_weights = tf.get_variable("hidden1_weights",
923 - [vlad_dim, hidden1_size],
924 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
925 -
926 - activation = tf.matmul(vlad, hidden1_weights)
927 -
928 - if add_batch_norm and relu:
929 - activation = slim.batch_norm(
930 - activation,
931 - center=True,
932 - scale=True,
933 - is_training=is_training,
934 - scope="hidden1_bn")
935 -
936 - else:
937 - hidden1_biases = tf.get_variable("hidden1_biases",
938 - [hidden1_size],
939 - initializer = tf.random_normal_initializer(stddev=0.01))
940 - tf.summary.histogram("hidden1_biases", hidden1_biases)
941 - activation += hidden1_biases
942 -
943 - if relu:
944 - activation = tf.nn.relu6(activation)
945 -
946 -
947 - if gating:
948 - gating_weights = tf.get_variable("gating_weights_2",
949 - [hidden1_size, hidden1_size],
950 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
951 -
952 - gates = tf.matmul(activation, gating_weights)
953 -
954 - if remove_diag:
955 - #removes diagonals coefficients
956 - diagonals = tf.matrix_diag_part(gating_weights)
957 - gates = gates - tf.multiply(diagonals,activation)
958 -
959 -
960 - if add_batch_norm:
961 - gates = slim.batch_norm(
962 - gates,
963 - center=True,
964 - scale=True,
965 - is_training=is_training,
966 - scope="gating_bn")
967 - else:
968 - gating_biases = tf.get_variable("gating_biases",
969 - [cluster_size],
970 - initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
971 - gates += gating_biases
972 -
973 - gates = tf.sigmoid(gates)
974 -
975 - activation = tf.multiply(activation,gates)
976 -
977 - aggregated_model = getattr(video_level_models,
978 - FLAGS.video_level_classifier_model)
979 -
980 -
981 - return aggregated_model().create_model(
982 - model_input=activation,
983 - vocab_size=vocab_size,
984 - is_training=is_training,
985 - **unused_params)
986 -
987 -class NetVLAD_NonLocal():
988 - def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training):
989 - self.feature_size = feature_size
990 - self.max_frames = max_frames
991 - self.is_training = is_training
992 - self.add_batch_norm = add_batch_norm
993 - self.cluster_size = cluster_size
994 -
995 - def forward(self,reshaped_input):
996 -
997 - cluster_weights = tf.get_variable("cluster_weights",
998 - [int(self.feature_size), int(self.cluster_size)],
999 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1000 -
1001 - tf.summary.histogram("cluster_weights", cluster_weights)
1002 - activation = tf.matmul(reshaped_input, cluster_weights)
1003 -
1004 - if self.add_batch_norm:
1005 - activation = slim.batch_norm(
1006 - activation,
1007 - center=True,
1008 - scale=True,
1009 - is_training=self.is_training,
1010 - scope="cluster_bn")
1011 - else:
1012 - cluster_biases = tf.get_variable("cluster_biases",
1013 - [cluster_size],
1014 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1015 - tf.summary.histogram("cluster_biases", cluster_biases)
1016 - activation += cluster_biases
1017 -
1018 - activation = tf.nn.softmax(activation)
1019 - tf.summary.histogram("cluster_output", activation)
1020 -
1021 - activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
1022 -
1023 - a_sum = tf.reduce_sum(activation,-2,keep_dims=True)
1024 -
1025 - cluster_weights2 = tf.get_variable("cluster_weights2",
1026 - [1,int(self.feature_size), int(self.cluster_size)],
1027 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1028 -
1029 - a = tf.multiply(a_sum,cluster_weights2)
1030 -
1031 - activation = tf.transpose(activation,perm=[0,2,1])
1032 -
1033 - reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size])
1034 - vlad = tf.matmul(activation,reshaped_input)
1035 - vlad = tf.transpose(vlad,perm=[0,2,1])
1036 - vlad = tf.subtract(vlad,a)
1037 -
1038 -
1039 - vlad = tf.transpose(vlad,perm=[0,2,1])
1040 - vlad = tf.reshape(vlad, [-1, self.feature_size])
1041 -
1042 - nonlocal_theta = tf.get_variable("nonlocal_theta",
1043 - [int(self.feature_size), int(self.cluster_size)],
1044 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1045 - nonlocal_phi = tf.get_variable("nonlocal_phi",
1046 - [int(self.feature_size), int(self.cluster_size)],
1047 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1048 - nonlocal_g = tf.get_variable("nonlocal_g",
1049 - [int(self.feature_size), int(self.cluster_size)],
1050 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1051 - nonlocal_out = tf.get_variable("nonlocal_out",
1052 - [int(self.cluster_size), int(self.feature_size)],
1053 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.cluster_size)))
1054 -
1055 - vlad_theta = tf.matmul(vlad, nonlocal_theta)
1056 - vlad_phi = tf.matmul(vlad, nonlocal_phi)
1057 - vlad_g = tf.matmul(vlad, nonlocal_g)
1058 -
1059 - vlad_theta = tf.reshape(vlad_theta, [-1, int(self.cluster_size),int(self.cluster_size)])
1060 - vlad_phi = tf.reshape(vlad_phi, [-1, int(self.cluster_size),int(self.cluster_size)])
1061 - vlad_g = tf.reshape(vlad_phi, [-1, int(self.cluster_size),int(self.cluster_size)])
1062 -
1063 - vlad_softmax = tf.nn.softmax(self.feature_size**-.5 * tf.matmul(vlad_theta, tf.transpose(vlad_phi,perm=[0,2,1])))
1064 - vlad_g = tf.matmul(vlad_softmax, vlad_g)
1065 - vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
1066 -
1067 - vlad_g = tf.matmul(vlad_g, nonlocal_out)
1068 - vlad_g = tf.reshape(vlad_g, [-1, int(self.cluster_size), int(self.feature_size)])
1069 - vlad = tf.reshape(vlad, [-1, int(self.cluster_size), int(self.feature_size)])
1070 - vlad = vlad + vlad_g
1071 -
1072 - vlad = tf.transpose(vlad,perm=[0,2,1])
1073 - vlad = tf.nn.l2_normalize(vlad,1) # [b,f,c]
1074 -
1075 - vlad = tf.reshape(vlad,[-1,int(self.cluster_size*self.feature_size)])
1076 - vlad = tf.nn.l2_normalize(vlad,1)
1077 -
1078 - return vlad
1079 -
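NetVLAD_NonLocal.forward is standard NetVLAD aggregation followed by the non-local step: softmax soft-assignments weight the residuals between frame descriptors and learned cluster centers (cluster_weights2), summed over frames, then intra-normalized per cluster and globally L2-normalized. A minimal NumPy sketch of the VLAD part for a single video:

import numpy as np

def netvlad(frames, cluster_weights, centers):
    # frames: [num_frames, feature_size]; cluster_weights: [feature_size, K]
    # centers: [feature_size, K], the cluster_weights2 of the code above
    logits = frames @ cluster_weights
    logits -= logits.max(axis=1, keepdims=True)
    assign = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # [N, K]
    vlad = frames.T @ assign - centers * assign.sum(axis=0)[None, :]     # [feature, K]
    vlad /= np.linalg.norm(vlad, axis=0, keepdims=True) + 1e-8           # intra-norm
    vlad = vlad.flatten()
    return vlad / (np.linalg.norm(vlad) + 1e-8)                          # global L2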
1080 -
1081 -class SoftDbofModelLF_8k(models.BaseModel):
1082 - """Creates a Soft Deep Bag of Frames model.
1083 - The model projects the features for each frame into a higher dimensional
1084 - 'clustering' space, pools across frames in that space, and then
1085 - uses a configurable video-level model to classify the now aggregated features.
1086 - The model will randomly sample either frames or sequences of frames during
1087 - training to speed up convergence.
1088 - Args:
1089 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1090 - input features.
1091 - vocab_size: The number of classes in the dataset.
1092 - num_frames: A vector of length 'batch' which indicates the number of
1093 - frames for each video (before padding).
1094 - Returns:
1095 - A dictionary with a tensor containing the probability predictions of the
1096 - model in the 'predictions' key. The dimensions of the tensor are
1097 - 'batch_size' x 'num_classes'.
1098 - """
1099 -
1100 - def create_model(self,
1101 - model_input,
1102 - vocab_size,
1103 - num_frames,
1104 - iterations=None,
1105 - add_batch_norm=None,
1106 - sample_random_frames=None,
1107 - cluster_size=None,
1108 - hidden_size=None,
1109 - is_training=True,
1110 - **unused_params):
1111 - iterations = 300
1112 - add_batch_norm = True
1113 - random_frames = True
1114 - cluster_size = 2048
1115 - hidden1_size = 1024
1116 - fc_dimred = True
1117 - relu = False
1118 - max_pool = False
1119 -
1120 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1121 - if random_frames:
1122 - model_input = utils.SampleRandomFrames(model_input, num_frames,
1123 - iterations)
1124 - else:
1125 - model_input = utils.SampleRandomSequence(model_input, num_frames,
1126 - iterations)
1127 - max_frames = model_input.get_shape().as_list()[1]
1128 - feature_size = model_input.get_shape().as_list()[2]
1129 - reshaped_input = tf.reshape(model_input, [-1, feature_size])
1130 - tf.summary.histogram("input_hist", reshaped_input)
1131 -
1132 - video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training)
1133 - audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training)
1134 -
1135 -
1136 - if add_batch_norm:
1137 - reshaped_input = slim.batch_norm(
1138 - reshaped_input,
1139 - center=True,
1140 - scale=True,
1141 - is_training=is_training,
1142 - scope="input_bn")
1143 -
1144 - with tf.variable_scope("video_DBOF"):
1145 - dbof_video = video_Dbof.forward(reshaped_input[:,0:1024])
1146 -
1147 - with tf.variable_scope("audio_DBOF"):
1148 - dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:])
1149 -
1150 - dbof = tf.concat([dbof_video, dbof_audio],1)
1151 -
1152 - dbof_dim = dbof.get_shape().as_list()[1]
1153 -
1154 - if fc_dimred:
1155 - hidden1_weights = tf.get_variable("hidden1_weights",
1156 - [dbof_dim, hidden1_size],
1157 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
1158 - tf.summary.histogram("hidden1_weights", hidden1_weights)
1159 - activation = tf.matmul(dbof, hidden1_weights)
1160 -
1161 - if add_batch_norm and relu:
1162 - activation = slim.batch_norm(
1163 - activation,
1164 - center=True,
1165 - scale=True,
1166 - is_training=is_training,
1167 - scope="hidden1_bn")
1168 - else:
1169 - hidden1_biases = tf.get_variable("hidden1_biases",
1170 - [hidden1_size],
1171 - initializer = tf.random_normal_initializer(stddev=0.01))
1172 - tf.summary.histogram("hidden1_biases", hidden1_biases)
1173 - activation += hidden1_biases
1174 -
1175 - if relu:
1176 - activation = tf.nn.relu6(activation)
1177 - tf.summary.histogram("hidden1_output", activation)
1178 - else:
1179 - activation = dbof
1180 -
1181 - aggregated_model = getattr(video_level_models,
1182 - FLAGS.video_level_classifier_model)
1183 -
1184 -
1185 - return aggregated_model().create_model(
1186 - model_input=activation,
1187 - vocab_size=vocab_size,
1188 - is_training=is_training,
1189 - **unused_params)
1190 -
1191 -class FrameLevelLogisticModel(models.BaseModel):
1192 - """Creates a logistic classifier over the aggregated frame-level features."""
1193 -
1194 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1195 - """See base class.
1196 -
1197 - This class is intended to be an example for implementors of frame level
1198 - models. If you want to train a model over averaged features it is more
1199 - efficient to average them beforehand rather than on the fly.
1200 -
1201 - Args:
1202 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1203 - input features.
1204 - vocab_size: The number of classes in the dataset.
1205 - num_frames: A vector of length 'batch' which indicates the number of
1206 - frames for each video (before padding).
1207 -
1208 - Returns:
1209 - A dictionary with a tensor containing the probability predictions of the
1210 - model in the 'predictions' key. The dimensions of the tensor are
1211 - 'batch_size' x 'num_classes'.
1212 - """
1213 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1214 - feature_size = model_input.get_shape().as_list()[2]
1215 -
1216 - denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
1217 - [-1, feature_size])
1218 - avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
1219 -
1220 - output = slim.fully_connected(avg_pooled,
1221 - vocab_size,
1222 - activation_fn=tf.nn.sigmoid,
1223 - weights_regularizer=slim.l2_regularizer(1e-8))
1224 -
1225 - return {"predictions": output}
1226 -
1227 -class CNN(models.BaseModel):
1228 - """Creates a logistic classifier over the aggregated frame-level features."""
1229 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1230 - """See base class.
1231 -
1232 - This class is intended to be an example for implementors of frame level
1233 - models. If you want to train a model over averaged features it is more
1234 - efficient to average them beforehand rather than on the fly.
1235 -
1236 - Args:
1237 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1238 - input features.
1239 - vocab_size: The number of classes in the dataset.
1240 - num_frames: A vector of length 'batch' which indicates the number of
1241 - frames for each video (before padding).
1242 -
1243 - Returns:
1244 - A dictionary with a tensor containing the probability predictions of the
1245 - model in the 'predictions' key. The dimensions of the tensor are
1246 - 'batch_size' x 'num_classes'.
1247 - """
1248 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1249 - feature_size = model_input.get_shape().as_list()[2]
1250 -
1251 - denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
1252 - [-1, feature_size])
1253 -
1254 -
1255 -
1256 - convK3 = slim.convolution(model_input,
1257 - num_outputs=feature_size,
1258 - kernel_size=3,
1259 - scope='conv1')
1260 -
1261 - convK5 = slim.convolution(model_input,
1262 - num_outputs=feature_size,
1263 - kernel_size=5,
1264 - scope='conv2')
1265 -
1266 - convK1 = slim.convolution(model_input,
1267 - num_outputs=feature_size,
1268 - kernel_size=5,
1269 - scope='conv3')
1270 -
1271 -
1272 - avg_pooled = tf.reduce_sum(tf.concat([convK3,convK5,convK1],axis=1), axis=[1]) / denominators
1273 -
1274 - output = slim.fully_connected(avg_pooled,
1275 - vocab_size,
1276 - activation_fn=tf.nn.relu,
1277 - weights_regularizer=slim.l2_regularizer(1e-8))
1278 -
1279 - return {"predictions": output}
1280 -
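The deleted CNN model runs three parallel temporal convolutions and average-pools their concatenation along time before the classifier. (conv3 uses kernel_size=5 although its name, convK1, suggests 1 was intended, and the relu output departs from the sigmoid used elsewhere for multi-label probabilities.) A rough tf.keras sketch (illustrative) with kernels 1/3/5 and a sigmoid head:

import tensorflow as tf

def build_multi_kernel_cnn(feature_size, vocab_size):
    inputs = tf.keras.Input(shape=(None, feature_size))
    branches = [tf.keras.layers.Conv1D(feature_size, k, padding="same")(inputs)
                for k in (1, 3, 5)]
    x = tf.keras.layers.Concatenate(axis=1)(branches)   # concat along time axis
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    outputs = tf.keras.layers.Dense(vocab_size, activation="sigmoid")(x)
    return tf.keras.Model(inputs, outputs)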
1281 -class LstmModel(models.BaseModel):
1282 -
1283 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1284 - """Creates a model which uses a stack of LSTMs to represent the video.
1285 - Args:
1286 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1287 - input features.
1288 - vocab_size: The number of classes in the dataset.
1289 - num_frames: A vector of length 'batch' which indicates the number of
1290 - frames for each video (before padding).
1291 - Returns:
1292 - A dictionary with a tensor containing the probability predictions of the
1293 - model in the 'predictions' key. The dimensions of the tensor are
1294 - 'batch_size' x 'num_classes'.
1295 - """
1296 - lstm_size = FLAGS.lstm_cells
1297 - number_of_layers = FLAGS.lstm_layers
1298 -
1299 - stacked_lstm = tf.contrib.rnn.MultiRNNCell(
1300 - [
1301 - tf.contrib.rnn.BasicLSTMCell(
1302 - lstm_size, forget_bias=1.0)
1303 - for _ in range(number_of_layers)
1304 - ])
1305 -
1306 - loss = 0.0
1307 -
1308 - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
1309 - sequence_length=num_frames,
1310 - dtype=tf.float32)
1311 -
1312 - aggregated_model = getattr(video_level_models,
1313 - FLAGS.video_level_classifier_model)
1314 -
1315 - return aggregated_model().create_model(
1316 - model_input=state[-1].h,
1317 - vocab_size=vocab_size,
1318 - **unused_params)
1319 -
1320 -class BNGRUModel(models.BaseModel):
1321 -
1322 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1323 - lstm_size = FLAGS.lstm_cells
1324 - number_of_layers = FLAGS.lstm_layers
1325 -
1326 - stacked_rnn = tf.contrib.rnn.MultiRNNCell(
1327 - [
1328 - tf.contrib.rnn.GRUCell(lstm_size)
1329 - for _ in range(number_of_layers)
1330 - ], state_is_tuple=False)
1331 -
1332 - outputs, state = tf.nn.dynamic_rnn(stacked_rnn, model_input,
1333 - sequence_length=num_frames,
1334 - dtype=tf.float32)
1335 - aggregated_model = getattr(video_level_models,
1336 - FLAGS.video_level_classifier_model)
1337 -
1338 - state = slim.batch_norm(
1339 - state,
1340 - center=True,
1341 - scale=True,
1342 - is_training=True,
1343 - scope='proj')
1344 -
1345 - return aggregated_model().create_model(
1346 - model_input=state,
1347 - vocab_size=vocab_size,
1348 - **unused_params)
1349 -
1350 -
1351 -
1352 -class GruModel2(models.BaseModel):
1353 -
1354 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1355 - """Creates a model which uses a stack of LSTMs to represent the video.
1356 - Args:
1357 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1358 - input features.
1359 - vocab_size: The number of classes in the dataset.
1360 - num_frames: A vector of length 'batch' which indicates the number of
1361 - frames for each video (before padding).
1362 - Returns:
1363 - A dictionary with a tensor containing the probability predictions of the
1364 - model in the 'predictions' key. The dimensions of the tensor are
1365 - 'batch_size' x 'num_classes'.
1366 - """
1367 - lstm_size = FLAGS.lstm_cells
1368 - number_of_layers = FLAGS.lstm_layers
1369 -
1370 - stacked_lstm = tf.contrib.rnn.MultiRNNCell(
1371 - [
1372 - tf.contrib.rnn.GRUCell(lstm_size)
1373 - for _ in range(number_of_layers)
1374 - ], state_is_tuple=False)
1375 -
1376 - loss = 0.0
1377 -
1378 - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
1379 - sequence_length=num_frames,
1380 - dtype=tf.float32)
1381 - aggregated_model = getattr(video_level_models,
1382 - FLAGS.video_level_classifier_model)
1383 -
1384 - return aggregated_model().create_model(
1385 - model_input=state,
1386 - vocab_size=vocab_size,
1387 - **unused_params)
1388 -
1389 -
1390 -class BiGRUModel(models.BaseModel):
1391 -
1392 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1393 - """Creates a model which uses a stack of LSTMs to represent the video.
1394 - Args:
1395 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1396 - input features.
1397 - vocab_size: The number of classes in the dataset.
1398 - num_frames: A vector of length 'batch' which indicates the number of
1399 - frames for each video (before padding).
1400 - Returns:
1401 - A dictionary with a tensor containing the probability predictions of the
1402 - model in the 'predictions' key. The dimensions of the tensor are
1403 - 'batch_size' x 'num_classes'.
1404 - """
1405 - lstm_size = FLAGS.lstm_cells
1406 - number_of_layers = FLAGS.lstm_layers
1407 -
1408 - with tf.variable_scope('fw'):
1409 - rnn_fw = tf.contrib.rnn.MultiRNNCell(
1410 - [
1411 - tf.contrib.rnn.GRUCell(lstm_size)
1412 - for _ in range(number_of_layers)
1413 - ], state_is_tuple=False)
1414 -
1415 -
1416 - with tf.variable_scope('bw'):
1417 - rnn_bw = tf.contrib.rnn.MultiRNNCell(
1418 - [
1419 - tf.contrib.rnn.GRUCell(lstm_size)
1420 - for _ in range(number_of_layers)
1421 - ], state_is_tuple=False)
1422 -
1423 - outputs, state = tf.nn.bidirectional_dynamic_rnn(rnn_fw, rnn_bw, model_input,
1424 - sequence_length=num_frames,
1425 - dtype=tf.float32, swap_memory=True)
1426 - state = tf.concat(state, axis=1)
1427 - aggregated_model = getattr(video_level_models,
1428 - FLAGS.video_level_classifier_model)
1429 - state = slim.batch_norm(
1430 - state,
1431 - center=True,
1432 - scale=True,
1433 - is_training=True,
1434 - scope='proj')
1435 -
1436 - return aggregated_model().create_model(
1437 - model_input=state,
1438 - vocab_size=vocab_size,
1439 - **unused_params)
1440 -
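BiGRUModel concatenates the final states of forward and backward GRU stacks and batch-normalizes the result before the video-level classifier. A rough tf.keras equivalent (illustrative; gru_size stands in for FLAGS.lstm_cells):

import tensorflow as tf

def build_bigru_encoder(feature_size, gru_size=1024):
    inputs = tf.keras.Input(shape=(None, feature_size))
    # Forward and backward reads of the frames, final states concatenated.
    state = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(gru_size), merge_mode="concat")(inputs)
    state = tf.keras.layers.BatchNormalization()(state)
    return tf.keras.Model(inputs, state)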
1441 -"""
1442 -Copyright (c) 2017, University of Texas Southwestern Medical Center
1443 -All rights reserved.
1444 -Redistribution and use in source and binary forms, with or without
1445 -modification, are permitted provided that the following conditions are met:
1446 -* Redistributions of source code must retain the above copyright notice, this
1447 - list of conditions and the following disclaimer.
1448 -* Redistributions in binary form must reproduce the above copyright notice,
1449 - this list of conditions and the following disclaimer in the documentation
1450 - and/or other materials provided with the distribution.
1451 -* Neither the name of the University of Texas at Austin nor the names of its
1452 - contributors may be used to endorse or promote products derived from
1453 - this software without specific prior written permission.
1454 -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
1455 -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1456 -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1457 -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
1458 -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1459 -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
1460 -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
1461 -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
1462 -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
1463 -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1464 -Recurrent Weighted Average
1465 -Implementation modified from: https://github.com/jostmey/rwa
1466 -Paper:
1467 -@article{ostmeyer2017machine,
1468 - title={Machine Learning on Sequential Data Using a Recurrent Weighted Average},
1469 - author={Ostmeyer, Jared and Cowell, Lindsay},
1470 - journal={arXiv preprint arXiv:1703.01253},
1471 - year={2017}
1472 -}
1473 -"""
1474 -
1475 -class RwaModel(models.BaseModel):
1476 -
1477 -
1478 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1479 -
1480 - # constants
1481 -
1482 - init_factor = 1.0
1483 - num_cells = FLAGS.lstm_cells
1484 - input_shape = model_input.get_shape().as_list()
1485 - batch_size, max_steps, num_features = input_shape
1486 -
1487 - # trainable weights
1488 - s = weights_rwa.init_state(num_cells, "s", init_factor)
1489 - W_g = weights_rwa.init_weight([num_features+num_cells, num_cells], "W_g")
1490 - W_u = weights_rwa.init_weight([num_features, num_cells], "W_u")
1491 - W_a = weights_rwa.init_weight([num_features+num_cells, num_cells], "W_a")
1492 - b_g = weights_rwa.init_bias(num_cells, "b_g")
1493 - b_u = weights_rwa.init_bias(num_cells, "b_u")
1494 - b_a = weights_rwa.init_bias(num_cells, "b_a")
1495 -
1496 - #pl = tf.placeholder(tf.float32, shape=[None, num_cells])
1497 - pl = tf.reshape(model_input, [-1, max_steps*num_features])[:, :num_cells]
1498 -
1499 - # internal states
1500 - #n = tf.zeros([batch_size, num_cells])
1501 - #d = tf.zeros([batch_size, num_cells])
1502 - #h = tf.zeros([batch_size, num_cells])
1503 - #a_max = tf.fill([batch_size, num_cells], -1E38) # Start off with lowest number possible
1504 - n = tf.zeros_like(pl)
1505 - d = tf.zeros_like(pl)
1506 - h = tf.zeros_like(pl)
1507 - a_max = tf.multiply(tf.ones_like(pl), -1E38)
1508 -
1509 - # define model
1510 - h += tf.nn.tanh(tf.expand_dims(s, 0))
1511 -
1512 - for i in range(max_steps):
1513 -
1514 - x_step = model_input[:,i,:]
1515 - xh_join = tf.concat(axis=1, values=[x_step, h]) # Combine the features and hidden state into one tensor
1516 -
1517 - u = tf.matmul(x_step, W_u)+b_u
1518 - g = tf.matmul(xh_join, W_g)+b_g
1519 - a = tf.matmul(xh_join, W_a) # The bias term when factored out of the numerator and denominator cancels and is unnecessary
1520 -
1521 - z = tf.multiply(u, tf.nn.tanh(g))
1522 -
1523 - a_newmax = tf.maximum(a_max, a)
1524 - exp_diff = tf.exp(a_max-a_newmax)
1525 - exp_scaled = tf.exp(a-a_newmax)
1526 -
1527 - n = tf.multiply(n, exp_diff)+tf.multiply(z, exp_scaled) # Numerically stable update of numerator
1528 - d = tf.multiply(d, exp_diff)+exp_scaled # Numerically stable update of denominator
1529 - h_new = tf.nn.tanh(tf.div(n, d))
1530 - a_max = a_newmax
1531 -
1532 - h = tf.where(tf.greater(num_frames, i), h_new, h) # Use new hidden state only if the sequence length has not been exceeded
1533 -
1534 -
1535 - aggregated_model = getattr(video_level_models,
1536 - FLAGS.video_level_classifier_model)
1537 - return aggregated_model().create_model(
1538 - model_input=h,
1539 - vocab_size=vocab_size,
1540 - **unused_params)
1541 -
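RwaModel unrolls the recurrent weighted average of Ostmeyer & Cowell (2017): the hidden state is a softmax-weighted running average of per-step terms z_t = u_t * tanh(g_t), and the running maximum a_max keeps the exponentials numerically stable. One step of that recurrence in NumPy (illustrative):

import numpy as np

def rwa_step(n, d, a_max, z, a):
    # n, d: running numerator/denominator; a: attention logits for this step.
    a_new = np.maximum(a_max, a)
    n = n * np.exp(a_max - a_new) + z * np.exp(a - a_new)  # rescaled numerator
    d = d * np.exp(a_max - a_new) + np.exp(a - a_new)      # rescaled denominator
    return n, d, a_new, np.tanh(n / d)                     # new hidden state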
1542 -
1543 -
1544 -class DropoutGruModel(models.BaseModel):
1545 -
1546 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1547 - """Creates a model which uses a stack of LSTMs to represent the video.
1548 - Args:
1549 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1550 - input features.
1551 - vocab_size: The number of classes in the dataset.
1552 - num_frames: A vector of length 'batch' which indicates the number of
1553 - frames for each video (before padding).
1554 - Returns:
1555 - A dictionary with a tensor containing the probability predictions of the
1556 - model in the 'predictions' key. The dimensions of the tensor are
1557 - 'batch_size' x 'num_classes'.
1558 - """
1559 - lstm_size = FLAGS.lstm_cells
1560 - number_of_layers = FLAGS.lstm_layers
1561 -
1562 - stacked_lstm = tf.contrib.rnn.MultiRNNCell(
1563 - [
1564 - tf.contrib.rnn.DropoutWrapper(
1565 - tf.contrib.rnn.GRUCell(lstm_size), 0.9, 0.9)
1566 - for _ in range(number_of_layers)
1567 - ], state_is_tuple=False)
1568 -
1569 - loss = 0.0
1570 -
1571 - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
1572 - sequence_length=num_frames,
1573 - dtype=tf.float32)
1574 - aggregated_model = getattr(video_level_models,
1575 - FLAGS.video_level_classifier_model)
1576 -
1577 - aggregated_model = FrameLevelLogisticModel;
1578 - return aggregated_model().create_model(
1579 - model_input=outputs,
1580 - vocab_size=vocab_size,
1581 - num_frames=num_frames,
1582 - **unused_params)
1583 -
1584 -
1585 -
1586 -
1587 -class ResRnnModel(models.BaseModel):
1588 -
1589 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1590 - lstm_size = 1152
1591 - number_of_layers = 3
1592 -
1593 - #from rnn_cell_modern import Delta_RNN as drnn
1594 - from rnn_wrappers_modern import MultiRNNCell as mrnn
1595 -
1596 - cells = []
1597 - for i in range(number_of_layers):
1598 - with tf.variable_scope('cell_'+str(i)):
1599 - cells.append(tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0))
1600 -
1601 - stacked_rnn = mrnn(cells, use_residual_connections=True, state_is_tuple=True)
1602 -
1603 - outputs, state = tf.nn.dynamic_rnn(stacked_rnn, model_input,
1604 - sequence_length=num_frames,
1605 - dtype=tf.float32)
1606 -
1607 - aggregated_model = getattr(video_level_models,
1608 - FLAGS.video_level_classifier_model)
1609 -
1610 - return aggregated_model().create_model(
1611 - model_input=state[-1].h,
1612 - vocab_size=vocab_size,
1613 - **unused_params)
1614 -
1615 -
1616 -class LateVladModel(models.BaseModel):
1617 -
1618 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1619 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1620 - model_input = utils.SampleRandomSequence(model_input, num_frames, 128)
1621 -
1622 - input_v = model_input[:,:,:1024]
1623 - input_a = model_input[:,:,1024:]
1624 -
1625 - K = 8
1626 -
1627 - with tf.variable_scope('video'):
1628 - x = input_v
1629 - input_shape = x.get_shape().as_list()
1630 - _, N, D = input_shape
1631 - c_bound = math.sqrt(1. / (K * D))
1632 - c = tf.get_variable(name='c',
1633 - shape=[K, N],
1634 - dtype=tf.float32,
1635 - initializer=tf.random_uniform_initializer(-c_bound, c_bound))
1636 - a = slim.convolution(x,
1637 - num_outputs=K,
1638 - kernel_size=1,
1639 - data_format='NWC',
1640 - scope='conv')
1641 - a = tf.nn.softmax(a)
1642 - v = []
1643 - for k in range(K):
1644 - t = x-c[k][None, :, None]
1645 - t = tf.multiply(t, a[:,:,k][:,:,None])
1646 - t = tf.reduce_sum(t, 1)
1647 - t = tf.nn.l2_normalize(t, dim=1)
1648 - v.append(t)
1649 - v = tf.stack(v, axis=1)
1650 - v = tf.reshape(v, [-1, K*D])
1651 -
1652 - proj_weights = tf.get_variable("proj_weights",
1653 - [K*D, 1024],
1654 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(K*D)))
1655 - activation_v = tf.matmul(v, proj_weights)
1656 -
1657 - with tf.variable_scope('audio'):
1658 - x = input_a
1659 - input_shape = x.get_shape().as_list()
1660 - _, N, D = input_shape
1661 - c_bound = math.sqrt(1. / (K * D))
1662 - c = tf.get_variable(name='c',
1663 - shape=[K, N],
1664 - dtype=tf.float32,
1665 - initializer=tf.random_uniform_initializer(-c_bound, c_bound))
1666 - a = slim.convolution(x,
1667 - num_outputs=K,
1668 - kernel_size=1,
1669 - data_format='NWC',
1670 - scope='conv')
1671 - a = tf.nn.softmax(a)
1672 - v = []
1673 - for k in range(K):
1674 - t = x-c[k][None, :, None]
1675 - t = tf.multiply(t, a[:,:,k][:,:,None])
1676 - t = tf.reduce_sum(t, 1)
1677 - t = tf.nn.l2_normalize(t, dim=1)
1678 - v.append(t)
1679 - v = tf.stack(v, axis=1)
1680 - v = tf.reshape(v, [-1, K*D])
1681 -
1682 - proj_weights = tf.get_variable("proj_weights",
1683 - [K*D, 1024],
1684 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(K*D)))
1685 - activation_a = tf.matmul(v, proj_weights)
1686 -
1687 - activation = tf.concat([activation_v, activation_a], axis=1)
1688 -
1689 - activation = slim.batch_norm(
1690 - activation,
1691 - center=True,
1692 - scale=True,
1693 - is_training=True,
1694 - scope='proj')
1695 -
1696 - activation = tf.nn.relu6(activation)
1697 -
1698 - aggregated_model = getattr(video_level_models,
1699 - FLAGS.video_level_classifier_model)
1700 -
1701 - return aggregated_model().create_model(
1702 - model_input=activation,
1703 - vocab_size=vocab_size,
1704 - **unused_params)
1705 -
1706 -class LNBLstmModel(models.BaseModel):
1707 -
1708 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1709 - """Creates a model which uses a stack of LSTMs to represent the video.
1710 - Args:
1711 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1712 - input features.
1713 - vocab_size: The number of classes in the dataset.
1714 - num_frames: A vector of length 'batch' which indicates the number of
1715 - frames for each video (before padding).
1716 - Returns:
1717 - A dictionary with a tensor containing the probability predictions of the
1718 - model in the 'predictions' key. The dimensions of the tensor are
1719 - 'batch_size' x 'num_classes'.
1720 - """
1721 - lstm_size = FLAGS.lstm_cells
1722 - number_of_layers = FLAGS.lstm_layers
1723 -
1724 - stacked_lstm = tf.contrib.rnn.MultiRNNCell(
1725 - [
1726 - tf.contrib.rnn.LayerNormBasicLSTMCell(lstm_size, dropout_keep_prob=0.50)
1727 - for _ in range(number_of_layers)
1728 - ])
1729 -
1730 - loss = 0.0
1731 -
1732 - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
1733 - sequence_length=num_frames,
1734 - dtype=tf.float32)
1735 -
1736 - aggregated_model = getattr(video_level_models,
1737 - FLAGS.video_level_classifier_model)
1738 -
1739 - return aggregated_model().create_model(
1740 - model_input=state[-1].h,
1741 - vocab_size=vocab_size,
1742 - **unused_params)
1743 -
1744 -class audio_avgShort_twowayGRUModel(models.BaseModel):
1745 -
1746 - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
1747 - """Creates a model which uses a Bidirectional GRU and mean audio features to represent the video.
1748 - ---->first half GRU----->
1749 - - -
1750 - visual_feature ---- concat---------------->
1751 - - - -
1752 - ---->second half GRU----> concat -----> video level classifier
1753 - -
1754 - mean audio features--->
1755 - Args:
1756 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1757 - input features.
1758 - vocab_size: The number of classes in the dataset.
1759 - num_frames: A vector of length 'batch' which indicates the number of
1760 - frames for each video (before padding).
1761 - Returns:
1762 - A dictionary with a tensor containing the probability predictions of the
1763 - model in the 'predictions' key. The dimensions of the tensor are
1764 - 'batch_size' x 'num_classes'.
1765 - """
1766 - lstm_size = FLAGS.lstm_cells
1767 - stride = FLAGS.stride
1768 - max_frames = model_input.get_shape().as_list()[1]
1769 -
1770 - video_input = model_input[:,:,:1024]  # first 1024 dims: visual features
1771 - audio_input = model_input[:,:,1024:]  # last 128 dims: audio features
1772 -
1773 - first_num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1774 - audio_den = tf.reshape(tf.tile(first_num_frames, [1, 128]), [-1, 128])
1775 - mean_audio = tf.reduce_sum(audio_input, 1) / tf.maximum(audio_den, 1)  # mean audio feature over valid (unpadded) frames
1776 -
1777 - pooled_input, num_frames = self.avg_pooled_func(video_input, num_frames, stride)
1778 -
1779 - pooled_input = slim.batch_norm(
1780 - pooled_input,
1781 - center=True,
1782 - scale=True,
1783 - is_training=is_training,
1784 - scope="hidden1_bn")
1785 -
1786 - mean_audio = slim.batch_norm(
1787 - mean_audio,
1788 - center=True,
1789 - scale=True,
1790 - is_training=is_training,
1791 - scope="hidden1_bn_audio")
1792 -
1793 - fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
1794 - bw_gru = tf.contrib.rnn.GRUCell(lstm_size)
1795 -
1796 - fw_outputs, fw_state = tf.nn.dynamic_rnn(fw_gru, pooled_input[:,:max_frames//(2*stride),:],
1797 - sequence_length=num_frames//2, dtype=tf.float32, scope='fw')
1798 - bw_outputs, bw_state = tf.nn.dynamic_rnn(bw_gru, pooled_input[:,:max_frames//(2*stride)-1:-1,:],  # second half, reversed
1799 - sequence_length=num_frames - num_frames//2, dtype=tf.float32, scope='bw')
1800 -
1801 - state = tf.concat([fw_state, bw_state], 1)
1802 - state = tf.concat([state, mean_audio], 1)
1803 -
1804 - aggregated_model = getattr(video_level_models,
1805 - 'linear_res_mix_act_MoeModel')
1806 -
1807 - return aggregated_model().create_model(
1808 - model_input=state,
1809 - vocab_size=vocab_size,
1810 - **unused_params)
1811 -
1812 - def avg_pooled_func(self, model_input, num_frames_in, stride):
1813 - max_frames = model_input.get_shape().as_list()[1]
1814 - feature_size = model_input.get_shape().as_list()[2]
1815 - num_frames = num_frames_in // stride
1816 - step = max_frames//stride
1817 -
1818 - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])  # position (s, t) holds frame s*step + t
1819 - first_layer_input = tf.reduce_sum(first_layer_input, 1)  # sums the strided comb {t, t+step, ...} into pooled step t
1820 -
1821 - first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1),2), tf.float32)
1822 - denominators = tf.reshape(
1823 - tf.tile(first_num_frames, [1, step, feature_size]), [-1, step, feature_size])
1824 - first_layer_avg_pooled = first_layer_input / tf.maximum(denominators,1)  # normalize sums by the pooled frame count (clamped at 1)
1825 -
1826 - return first_layer_avg_pooled, num_frames
1827 -
1828 -
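A hedged note on `avg_pooled_func`: reshaping [batch, stride*step, F] to [batch, stride, step, F] places frame s*step + t at position (s, t), so reducing over axis 1 pools the strided comb {t, t+step, t+2*step, ...} rather than a consecutive window of `stride` frames. A small numpy check:

    import numpy as np

    stride, step = 2, 3                       # max_frames = stride * step = 6
    frames = np.arange(6).reshape(1, 6, 1)    # frame indices as the "feature"
    grouped = frames.reshape(1, stride, step, 1)
    print(grouped[0, :, 0, 0])                # [0 3] -> pooled step 0 mixes frames 0 and 3

Pooling consecutive windows instead would reshape to [batch, step, stride, F] and reduce over axis 2.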
1829 -class resav_ConvModel(models.BaseModel):
1830 -
1831 - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
1832 - """Creates a model which uses a Convolutional model to represent the video.
1833 - Args:
1834 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1835 - input features.
1836 - vocab_size: The number of classes in the dataset.
1837 - num_frames: A vector of length 'batch' which indicates the number of
1838 - frames for each video (before padding).
1839 - Returns:
1840 - A dictionary with a tensor containing the probability predictions of the
1841 - model in the 'predictions' key. The dimensions of the tensor are
1842 - 'batch_size' x 'num_classes'.
1843 - """
1844 - stride = FLAGS.stride
1845 - conv_length = FLAGS.conv_length
1846 - conv_hidden1 = FLAGS.conv_hidden1
1847 - conv_hidden2 = FLAGS.conv_hidden2
1848 - conv_hidden3 = FLAGS.conv_hidden3
1849 - mean_feature = tf.reduce_mean(model_input, 1)
1850 - feature_size = model_input.get_shape().as_list()[2]
1851 -
1852 - pooled_input = self.avg_pooled_func(model_input, stride)
1853 -
1854 - # To shape : 'batch_size' x 'max_frames//stride' x 1 x 'num_features'
1855 - input_expand = tf.expand_dims(pooled_input, -1)
1856 - input_expand = tf.transpose(input_expand, [0,1,3,2])
1857 -
1858 - # conv_out : batch_size x (max_frames//stride) x 1 x conv_hidden ('SAME' padding keeps the time length)
1859 - conv_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_1_1')
1860 - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_1_1"))
1861 - conv_out = slim.conv2d(conv_out, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_1_2')
1862 - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_1_2")
1863 - res_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_1_1')
1864 - res_out = res_out + conv_out
1865 - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool1')
1866 -
1867 - conv_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_2_1')
1868 - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_2_1"))
1869 - conv_out = slim.conv2d(conv_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_2_2')
1870 - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_2_2")
1871 - res_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_2_1')
1872 - res_out = res_out + conv_out
1873 - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool2')
1874 -
1875 - conv_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_3_1')
1876 - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_3_1"))
1877 - conv_out = slim.conv2d(conv_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_3_2')
1878 - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_3_2")
1879 - res_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_3_1')
1880 - res_out = res_out + conv_out
1881 - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool3')
1882 -
1883 - a = res_out.get_shape().as_list()[1]
1884 - b = res_out.get_shape().as_list()[2]
1885 - c = res_out.get_shape().as_list()[3]
1886 -
1889 - res_out = tf.reshape(res_out, [-1, a*b*c])  # flatten (time x 1 x channels) for the classifier
1890 -
1891 - state = tf.concat([res_out, mean_feature], 1)
1892 -
1893 - aggregated_model = getattr(video_level_models,
1894 - 'linear_res_mix_act_MoeModel')
1895 - return aggregated_model().create_model(
1896 - model_input=state,
1897 - vocab_size=vocab_size,
1898 - **unused_params)
1899 -
1900 - def avg_pooled_func(self, model_input, stride):
1901 - max_frames = model_input.get_shape().as_list()[1]
1902 - feature_size = model_input.get_shape().as_list()[2]
1903 - step = max_frames//stride
1904 -
1905 - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])
1906 - first_layer_input = tf.reduce_mean(first_layer_input, 1)
1907 -
1908 - return first_layer_input
1909 -
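For shape bookkeeping, under the flag defaults above (stride=10, conv_hidden3=1024) and the usual 300-frame inputs (an assumption, not fixed by this file): the pooled sequence has step = 300 // 10 = 30 time steps, and three VALID [2,1] max-pools shrink it 30 -> 15 -> 7 -> 3, so the flattened residual output is 3 * 1 * 1024 = 3072 features before the mean feature is concatenated. A quick check:

    max_frames, stride, conv_hidden3 = 300, 10, 1024
    t = max_frames // stride                  # 30 pooled steps
    for _ in range(3):
        t = (t - 2) // 2 + 1                  # VALID max-pool, kernel 2, stride 2
    print(t, t * 1 * conv_hidden3)            # 3 3072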
1910 -class pur_twowayGRUModel(models.BaseModel):
1911 -
1912 - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
1913 - """Creates a model which uses a Bidirectional GRU without explictly using mean audio feature to represent the video.
1914 -                  ----> first half GRU ---->
1915 -                 |                           |
1916 -   video feature |                           concat ----> video level classifier
1917 -                 |                           |
1918 -                  ----> second half GRU --->
1919 - Args:
1920 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1921 - input features.
1922 - vocab_size: The number of classes in the dataset.
1923 - num_frames: A vector of length 'batch' which indicates the number of
1924 - frames for each video (before padding).
1925 - Returns:
1926 - A dictionary with a tensor containing the probability predictions of the
1927 - model in the 'predictions' key. The dimensions of the tensor are
1928 - 'batch_size' x 'num_classes'.
1929 - """
1930 - lstm_size = FLAGS.lstm_cells
1931 - number_of_layers = FLAGS.lstm_layers
1932 - stride = FLAGS.stride
1933 - max_frames = model_input.get_shape().as_list()[1]
1934 -
1935 - pooled_input, num_frames = self.avg_pooled_func(model_input, num_frames, stride)
1936 -
1937 - pooled_input = slim.batch_norm(
1938 - pooled_input,
1939 - center=True,
1940 - scale=True,
1941 - is_training=is_training,
1942 - scope="hidden1_bn")
1943 -
1944 -
1945 - fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
1946 - bw_gru = tf.contrib.rnn.GRUCell(lstm_size)
1947 -
1948 - fw_outputs, fw_state = tf.nn.dynamic_rnn(fw_gru, pooled_input[:,:max_frames//(2*stride),:],
1949 - sequence_length=num_frames//2, dtype=tf.float32, scope='fw')
1950 - bw_outputs, bw_state = tf.nn.dynamic_rnn(bw_gru, pooled_input[:,:max_frames//(2*stride)-1:-1,:],  # second half, reversed
1951 - sequence_length=num_frames - num_frames//2, dtype=tf.float32, scope='bw')
1952 -
1953 - state = tf.concat([fw_state, bw_state], 1)
1954 -
1955 - aggregated_model = getattr(video_level_models,
1956 - 'linear_res_mix_act_MoeModel')
1957 -
1958 - return aggregated_model().create_model(
1959 - model_input=state,
1960 - vocab_size=vocab_size,
1961 - **unused_params)
1962 -
1963 - def avg_pooled_func(self, model_input, num_frames_in, stride):
1964 - max_frames = model_input.get_shape().as_list()[1]
1965 - feature_size = model_input.get_shape().as_list()[2]
1966 - num_frames = num_frames_in // stride
1967 - step = max_frames//stride
1968 -
1969 - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])  # position (s, t) holds frame s*step + t
1970 - first_layer_input = tf.reduce_sum(first_layer_input, 1)  # sums the strided comb {t, t+step, ...} into pooled step t
1971 -
1972 - first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1),2), tf.float32)
1973 - denominators = tf.reshape(
1974 - tf.tile(first_num_frames, [1, step, feature_size]), [-1, step, feature_size])
1975 - first_layer_avg_pooled = first_layer_input / tf.maximum(denominators,1)
1976 -
1977 - return first_layer_avg_pooled, num_frames
...\ No newline at end of file ...\ No newline at end of file
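The half-and-half split used by both two-way GRU models can be checked with plain numpy slicing (a sketch with a 12-frame toy sequence):

    import numpy as np

    x = np.arange(12).reshape(1, 12, 1)       # [batch, pooled frames, features]
    mid = 12 // 2
    first_half = x[:, :mid, :]                # frames 0..5, read forward
    second_rev = x[:, :mid-1:-1, :]           # frames 11..6, read back to front
    print(first_half[0, :, 0], second_rev[0, :, 0])   # [0 1 2 3 4 5] [11 10 9 8 7 6]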
...@@ -137,8 +137,6 @@ class MoeModel(models.BaseModel): ...@@ -137,8 +137,6 @@ class MoeModel(models.BaseModel):
137 final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, 137 final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
138 [-1, vocab_size]) 138 [-1, vocab_size])
139 139
140 -
141 - print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@", final_probabilities_by_class_and_batch)
142 return {"predictions": final_probabilities} 140 return {"predictions": final_probabilities}
143 141
144 142
...@@ -252,481 +250,3 @@ class willow_MoeModel(models.BaseModel): ...@@ -252,481 +250,3 @@ class willow_MoeModel(models.BaseModel):
252 probabilities = tf.multiply(probabilities, gates) 250 probabilities = tf.multiply(probabilities, gates)
253 251
254 return {"predictions": probabilities} 252 return {"predictions": probabilities}
...\ No newline at end of file ...\ No newline at end of file
255 -
256 -class willow_MoeModel_moe4(models.BaseModel):
257 - """A softmax over a mixture of logistic models (with L2 regularization)."""
258 -
259 - def create_model(self,
260 - model_input,
261 - vocab_size,
262 - is_training,
263 - num_mixtures=None,
264 - l2_penalty=1e-8,
265 - **unused_params):
266 - """Creates a Mixture of (Logistic) Experts model.
267 - It also includes the possibility of gating the probabilities.
268 - The model consists of a per-class softmax distribution over a
269 - configurable number of logistic classifiers. One of the classifiers in the
270 - mixture is not trained, and always predicts 0.
271 - Args:
272 - model_input: 'batch_size' x 'num_features' matrix of input features.
273 - vocab_size: The number of classes in the dataset.
274 - is_training: Is this the training phase?
275 - num_mixtures: The number of mixtures (excluding a dummy 'expert' that
276 - always predicts the non-existence of an entity).
277 - l2_penalty: How much to penalize the squared magnitudes of parameter
278 - values.
279 - Returns:
280 - A dictionary with a tensor containing the probability predictions of the
281 - model in the 'predictions' key. The dimensions of the tensor are
282 - batch_size x num_classes.
283 - """
284 - num_mixtures = 4
285 - low_rank_gating = FLAGS.moe_low_rank_gating
286 - l2_penalty = FLAGS.moe_l2
287 - gating_probabilities = FLAGS.moe_prob_gating
288 - gating_input = FLAGS.moe_prob_gating_input
289 -
290 - input_size = model_input.get_shape().as_list()[1]
291 - remove_diag = False
292 -
293 - if low_rank_gating == -1:
294 - gate_activations = slim.fully_connected(
295 - model_input,
296 - vocab_size * (num_mixtures + 1),
297 - activation_fn=None,
298 - biases_initializer=None,
299 - weights_regularizer=slim.l2_regularizer(l2_penalty),
300 - scope="gates")
301 - else:
302 - gate_activations1 = slim.fully_connected(
303 - model_input,
304 - low_rank_gating,
305 - activation_fn=None,
306 - biases_initializer=None,
307 - weights_regularizer=slim.l2_regularizer(l2_penalty),
308 - scope="gates1")
309 - gate_activations = slim.fully_connected(
310 - gate_activations1,
311 - vocab_size * (num_mixtures + 1),
312 - activation_fn=None,
313 - biases_initializer=None,
314 - weights_regularizer=slim.l2_regularizer(l2_penalty),
315 - scope="gates2")
316 -
317 - expert_activations = slim.fully_connected(
318 - model_input,
319 - vocab_size * num_mixtures,
320 - activation_fn=None,
321 - weights_regularizer=slim.l2_regularizer(l2_penalty),
322 - scope="experts")
323 -
324 - gating_distribution = tf.nn.softmax(tf.reshape(
325 - gate_activations,
326 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
327 - expert_distribution = tf.nn.sigmoid(tf.reshape(
328 - expert_activations,
329 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
330 -
331 - probabilities_by_class_and_batch = tf.reduce_sum(
332 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
333 - probabilities = tf.reshape(probabilities_by_class_and_batch,
334 - [-1, vocab_size])
335 -
336 - if gating_probabilities:
337 - if gating_input == 'prob':
338 - gating_weights = tf.get_variable("gating_prob_weights",
339 - [vocab_size, vocab_size],
340 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
341 - gates = tf.matmul(probabilities, gating_weights)
342 - else:
343 - gating_weights = tf.get_variable("gating_prob_weights",
344 - [input_size, vocab_size],
345 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
346 -
347 - gates = tf.matmul(model_input, gating_weights)
348 -
349 - if remove_diag:
350 - # removes diagonals coefficients
351 - diagonals = tf.matrix_diag_part(gating_weights)
352 - gates = gates - tf.multiply(diagonals, probabilities)
353 -
354 - gates = slim.batch_norm(
355 - gates,
356 - center=True,
357 - scale=True,
358 - is_training=is_training,
359 - scope="gating_prob_bn")
360 -
361 - gates = tf.sigmoid(gates)
362 -
363 - probabilities = tf.multiply(probabilities, gates)
364 -
365 - return {"predictions": probabilities}
366 -
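The "untrained classifier that always predicts 0" in these MoE docstrings is implicit: the gates produce num_mixtures + 1 softmax weights but only num_mixtures experts are summed, so the leftover gate mass multiplies an implicit zero expert. A numpy sketch of the combination step (random stand-ins for the learned activations):

    import numpy as np

    batch_labels, M = 6, 4                               # (batch * vocab) rows, 4 experts
    gate_logits = np.random.randn(batch_labels, M + 1)
    gates = np.exp(gate_logits)
    gates = gates / gates.sum(-1, keepdims=True)         # softmax over M + 1 gates
    experts = 1.0 / (1.0 + np.exp(-np.random.randn(batch_labels, M)))  # sigmoid experts
    probs = (gates[:, :M] * experts).sum(-1)             # the dummy expert contributes 0
    print(probs.min() >= 0, probs.max() <= 1)            # True True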
367 -class willow_MoeModel_moe4_noGP(models.BaseModel):
368 - """A softmax over a mixture of logistic models (with L2 regularization)."""
369 -
370 - def create_model(self,
371 - model_input,
372 - vocab_size,
373 - is_training,
374 - num_mixtures=None,
375 - l2_penalty=1e-8,
376 - **unused_params):
377 - """Creates a Mixture of (Logistic) Experts model.
378 - It also includes the possibility of gating the probabilities.
379 - The model consists of a per-class softmax distribution over a
380 - configurable number of logistic classifiers. One of the classifiers in the
381 - mixture is not trained, and always predicts 0.
382 - Args:
383 - model_input: 'batch_size' x 'num_features' matrix of input features.
384 - vocab_size: The number of classes in the dataset.
385 - is_training: Is this the training phase?
386 - num_mixtures: The number of mixtures (excluding a dummy 'expert' that
387 - always predicts the non-existence of an entity).
388 - l2_penalty: How much to penalize the squared magnitudes of parameter
389 - values.
390 - Returns:
391 - A dictionary with a tensor containing the probability predictions of the
392 - model in the 'predictions' key. The dimensions of the tensor are
393 - batch_size x num_classes.
394 - """
395 - num_mixtures = 4
396 - low_rank_gating = FLAGS.moe_low_rank_gating
397 - l2_penalty = FLAGS.moe_l2
398 - gating_probabilities = False
399 - gating_input = FLAGS.moe_prob_gating_input
400 -
401 - input_size = model_input.get_shape().as_list()[1]
402 - remove_diag = False
403 -
404 - if low_rank_gating == -1:
405 - gate_activations = slim.fully_connected(
406 - model_input,
407 - vocab_size * (num_mixtures + 1),
408 - activation_fn=None,
409 - biases_initializer=None,
410 - weights_regularizer=slim.l2_regularizer(l2_penalty),
411 - scope="gates")
412 - else:
413 - gate_activations1 = slim.fully_connected(
414 - model_input,
415 - low_rank_gating,
416 - activation_fn=None,
417 - biases_initializer=None,
418 - weights_regularizer=slim.l2_regularizer(l2_penalty),
419 - scope="gates1")
420 - gate_activations = slim.fully_connected(
421 - gate_activations1,
422 - vocab_size * (num_mixtures + 1),
423 - activation_fn=None,
424 - biases_initializer=None,
425 - weights_regularizer=slim.l2_regularizer(l2_penalty),
426 - scope="gates2")
427 -
428 - expert_activations = slim.fully_connected(
429 - model_input,
430 - vocab_size * num_mixtures,
431 - activation_fn=None,
432 - weights_regularizer=slim.l2_regularizer(l2_penalty),
433 - scope="experts")
434 -
435 - gating_distribution = tf.nn.softmax(tf.reshape(
436 - gate_activations,
437 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
438 - expert_distribution = tf.nn.sigmoid(tf.reshape(
439 - expert_activations,
440 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
441 -
442 - probabilities_by_class_and_batch = tf.reduce_sum(
443 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
444 - probabilities = tf.reshape(probabilities_by_class_and_batch,
445 - [-1, vocab_size])
446 -
447 - return {"predictions": probabilities}
448 -
449 -class willow_MoeModel_moe2_noGP(models.BaseModel):
450 - """A softmax over a mixture of logistic models (with L2 regularization)."""
451 -
452 - def create_model(self,
453 - model_input,
454 - vocab_size,
455 - is_training,
456 - num_mixtures=None,
457 - l2_penalty=1e-8,
458 - **unused_params):
459 - """Creates a Mixture of (Logistic) Experts model.
460 - It also includes the possibility of gating the probabilities.
461 - The model consists of a per-class softmax distribution over a
462 - configurable number of logistic classifiers. One of the classifiers in the
463 - mixture is not trained, and always predicts 0.
464 - Args:
465 - model_input: 'batch_size' x 'num_features' matrix of input features.
466 - vocab_size: The number of classes in the dataset.
467 - is_training: Is this the training phase?
468 - num_mixtures: The number of mixtures (excluding a dummy 'expert' that
469 - always predicts the non-existence of an entity).
470 - l2_penalty: How much to penalize the squared magnitudes of parameter
471 - values.
472 - Returns:
473 - A dictionary with a tensor containing the probability predictions of the
474 - model in the 'predictions' key. The dimensions of the tensor are
475 - batch_size x num_classes.
476 - """
477 - num_mixtures = 2
478 - low_rank_gating = FLAGS.moe_low_rank_gating
479 - l2_penalty = FLAGS.moe_l2
480 - gating_probabilities = False
481 - gating_input = FLAGS.moe_prob_gating_input
482 -
483 - input_size = model_input.get_shape().as_list()[1]
484 - remove_diag = False
485 -
486 - if low_rank_gating == -1:
487 - gate_activations = slim.fully_connected(
488 - model_input,
489 - vocab_size * (num_mixtures + 1),
490 - activation_fn=None,
491 - biases_initializer=None,
492 - weights_regularizer=slim.l2_regularizer(l2_penalty),
493 - scope="gates")
494 - else:
495 - gate_activations1 = slim.fully_connected(
496 - model_input,
497 - low_rank_gating,
498 - activation_fn=None,
499 - biases_initializer=None,
500 - weights_regularizer=slim.l2_regularizer(l2_penalty),
501 - scope="gates1")
502 - gate_activations = slim.fully_connected(
503 - gate_activations1,
504 - vocab_size * (num_mixtures + 1),
505 - activation_fn=None,
506 - biases_initializer=None,
507 - weights_regularizer=slim.l2_regularizer(l2_penalty),
508 - scope="gates2")
509 -
510 - expert_activations = slim.fully_connected(
511 - model_input,
512 - vocab_size * num_mixtures,
513 - activation_fn=None,
514 - weights_regularizer=slim.l2_regularizer(l2_penalty),
515 - scope="experts")
516 -
517 - gating_distribution = tf.nn.softmax(tf.reshape(
518 - gate_activations,
519 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
520 - expert_distribution = tf.nn.sigmoid(tf.reshape(
521 - expert_activations,
522 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
523 -
524 - probabilities_by_class_and_batch = tf.reduce_sum(
525 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
526 - probabilities = tf.reshape(probabilities_by_class_and_batch,
527 - [-1, vocab_size])
528 -
529 - return {"predictions": probabilities}
530 -
531 -
532 -class willow_MoeModel_moe2(models.BaseModel):
533 - """A softmax over a mixture of logistic models (with L2 regularization)."""
534 -
535 - def create_model(self,
536 - model_input,
537 - vocab_size,
538 - is_training,
539 - num_mixtures=None,
540 - l2_penalty=1e-8,
541 - **unused_params):
542 - """Creates a Mixture of (Logistic) Experts model.
543 - It also includes the possibility of gating the probabilities.
544 - The model consists of a per-class softmax distribution over a
545 - configurable number of logistic classifiers. One of the classifiers in the
546 - mixture is not trained, and always predicts 0.
547 - Args:
548 - model_input: 'batch_size' x 'num_features' matrix of input features.
549 - vocab_size: The number of classes in the dataset.
550 - is_training: Is this the training phase?
551 - num_mixtures: The number of mixtures (excluding a dummy 'expert' that
552 - always predicts the non-existence of an entity).
553 - l2_penalty: How much to penalize the squared magnitudes of parameter
554 - values.
555 - Returns:
556 - A dictionary with a tensor containing the probability predictions of the
557 - model in the 'predictions' key. The dimensions of the tensor are
558 - batch_size x num_classes.
559 - """
560 - num_mixtures = 2
561 - low_rank_gating = FLAGS.moe_low_rank_gating
562 - l2_penalty = FLAGS.moe_l2
563 - gating_probabilities = FLAGS.moe_prob_gating
564 - gating_input = FLAGS.moe_prob_gating_input
565 -
566 - input_size = model_input.get_shape().as_list()[1]
567 - remove_diag = False
568 -
569 - if low_rank_gating == -1:
570 - gate_activations = slim.fully_connected(
571 - model_input,
572 - vocab_size * (num_mixtures + 1),
573 - activation_fn=None,
574 - biases_initializer=None,
575 - weights_regularizer=slim.l2_regularizer(l2_penalty),
576 - scope="gates")
577 - else:
578 - gate_activations1 = slim.fully_connected(
579 - model_input,
580 - low_rank_gating,
581 - activation_fn=None,
582 - biases_initializer=None,
583 - weights_regularizer=slim.l2_regularizer(l2_penalty),
584 - scope="gates1")
585 - gate_activations = slim.fully_connected(
586 - gate_activations1,
587 - vocab_size * (num_mixtures + 1),
588 - activation_fn=None,
589 - biases_initializer=None,
590 - weights_regularizer=slim.l2_regularizer(l2_penalty),
591 - scope="gates2")
592 -
593 - expert_activations = slim.fully_connected(
594 - model_input,
595 - vocab_size * num_mixtures,
596 - activation_fn=None,
597 - weights_regularizer=slim.l2_regularizer(l2_penalty),
598 - scope="experts")
599 -
600 - gating_distribution = tf.nn.softmax(tf.reshape(
601 - gate_activations,
602 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
603 - expert_distribution = tf.nn.sigmoid(tf.reshape(
604 - expert_activations,
605 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
606 -
607 - probabilities_by_class_and_batch = tf.reduce_sum(
608 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
609 - probabilities = tf.reshape(probabilities_by_class_and_batch,
610 - [-1, vocab_size])
611 -
612 - if gating_probabilities:
613 - if gating_input == 'prob':
614 - gating_weights = tf.get_variable("gating_prob_weights",
615 - [vocab_size, vocab_size],
616 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
617 - gates = tf.matmul(probabilities, gating_weights)
618 - else:
619 - gating_weights = tf.get_variable("gating_prob_weights",
620 - [input_size, vocab_size],
621 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
622 -
623 - gates = tf.matmul(model_input, gating_weights)
624 -
625 - if remove_diag:
626 - # removes diagonals coefficients
627 - diagonals = tf.matrix_diag_part(gating_weights)
628 - gates = gates - tf.multiply(diagonals, probabilities)
629 -
630 - gates = slim.batch_norm(
631 - gates,
632 - center=True,
633 - scale=True,
634 - is_training=is_training,
635 - scope="gating_prob_bn")
636 -
637 - gates = tf.sigmoid(gates)
638 -
639 - probabilities = tf.multiply(probabilities, gates)
640 -
641 - return {"predictions": probabilities}
642 -
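The gating block shared by the gated willow variants is a form of context gating: a learned projection of either the probabilities or the raw input is batch-normalized, squashed with a sigmoid, and multiplied element-wise into the predictions. A minimal numpy sketch of the 'prob' path (random weights; batch norm reduced to a no-op for clarity):

    import numpy as np

    batch, vocab = 4, 10
    probs = np.random.rand(batch, vocab)                  # MoE output
    W = np.random.randn(vocab, vocab) / np.sqrt(vocab)    # gating_prob_weights stand-in
    gates = 1.0 / (1.0 + np.exp(-(probs @ W)))            # sigmoid(BN(probs . W))
    gated = probs * gates                                 # suppress or keep each class
    print(gated.shape)                                    # (4, 10)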
643 -class linear_res_mix_act_MoeModel(models.BaseModel):
644 - """A softmax over a mixture of logistic models (with L2 regularization).
645 -
646 -                    ----- linear layer (1) + sigmoid -----
647 -                   |                                       |
648 -                    ----- linear layer (2) + relu    -----
649 -                   |                                       |
650 -    input_features |                                       +----- MoE -----> output
651 -                   |                                       |
652 -                    ----- linear layer (3) + elu     -----
653 -                   |                                       |
654 -                    ----- linear layer (4) + tanh    -----
655 - """
656 - def create_model(self,
657 - model_input,
658 - vocab_size,
659 - num_mixtures=None,
660 - num_hiddens=None,
662 - l2_penalty=1e-8,
663 - **unused_params):
664 -
665 - num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
666 - num_hiddens = num_hiddens or FLAGS.moe_num_hiddens
668 -
669 - hidden_sigmoid = slim.fully_connected(
670 - model_input,
671 - num_hiddens,
672 - activation_fn=tf.nn.sigmoid,
673 - weights_regularizer=slim.l2_regularizer(l2_penalty),
674 - scope='hidden_sigmoid'
675 - )
676 - hidden_relu = slim.fully_connected(
677 - model_input,
678 - num_hiddens,
679 - activation_fn=tf.nn.relu,
680 - weights_regularizer=slim.l2_regularizer(l2_penalty),
681 - scope='hidden_relu'
682 - )
683 - hidden_elu = slim.fully_connected(
684 - model_input,
685 - num_hiddens,
686 - activation_fn=tf.nn.elu,
687 - weights_regularizer=slim.l2_regularizer(l2_penalty),
688 - scope='hidden_elu'
689 - )
690 - hidden_tanh = slim.fully_connected(
691 - model_input,
692 - num_hiddens,
693 - activation_fn=tf.nn.tanh,
694 - weights_regularizer=slim.l2_regularizer(l2_penalty),
695 - scope='hidden_tanh'
696 - )
697 -
698 - linear_input = slim.fully_connected(
699 - model_input,
700 - num_hiddens,
701 - activation_fn=None,
702 - weights_regularizer=slim.l2_regularizer(l2_penalty),
703 - scope='hidden_linear'
704 - )
705 -
706 -
707 - gate_activations = slim.fully_connected(
708 - model_input,
709 - vocab_size * (num_mixtures + 1),
710 - activation_fn=None,
711 - biases_initializer=None,
712 - weights_regularizer=slim.l2_regularizer(l2_penalty),
713 - scope="gates")
714 - expert_activations = slim.fully_connected(
715 - tf.concat([hidden_sigmoid+0.25*linear_input, hidden_relu+0.25*linear_input, hidden_elu+0.25*linear_input, hidden_tanh+0.25*linear_input], 1),
716 - vocab_size * num_mixtures,
717 - activation_fn=None,
718 - weights_regularizer=slim.l2_regularizer(l2_penalty),
719 - scope="experts")
720 -
721 - gating_distribution = tf.nn.softmax(tf.reshape(
722 - gate_activations,
723 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
724 - expert_distribution = tf.nn.sigmoid(tf.reshape(
725 - expert_activations,
726 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
727 -
728 - final_probabilities_by_class_and_batch = tf.reduce_sum(
729 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
730 - final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
731 - [-1, vocab_size])
732 - return {"predictions": final_probabilities}
...\ No newline at end of file ...\ No newline at end of file
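The expert input of linear_res_mix_act_MoeModel is four differently-activated hidden layers, each with a 0.25-scaled shared linear projection added as a residual, then concatenated. A numpy sketch of that mixing (random projections standing in for the slim.fully_connected layers):

    import numpy as np

    def fc(x, w):                       # linear layer stand-in
        return x @ w

    batch, in_dim, hidden = 2, 8, 5
    x = np.random.randn(batch, in_dim)
    ws = [np.random.randn(in_dim, hidden) for _ in range(5)]
    linear = fc(x, ws[4])               # shared residual branch
    z_sig, z_relu, z_elu, z_tanh = (fc(x, w) for w in ws[:4])
    acts = [1.0 / (1.0 + np.exp(-z_sig)),                              # sigmoid branch
            np.maximum(z_relu, 0),                                     # relu branch
            np.where(z_elu > 0, z_elu, np.exp(np.minimum(z_elu, 0)) - 1),  # elu branch
            np.tanh(z_tanh)]                                           # tanh branch
    expert_in = np.concatenate([a + 0.25 * linear for a in acts], axis=1)
    print(expert_in.shape)              # (2, 20), fed to the expert layer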