2020-2-capstone-design2 / 2015104192

Commit 7e0b563ac8ea88ce198992b6ad3aceb0eb28c3e4 (1 parent: f2dfcea2)
Authored by 윤영빈, 2020-12-09 21:05:16 +0900

    final report almost done

Showing 4 changed files with 30 additions and 2159 deletions:
web/backend/yt8m/frame_level_models.py
web/backend/yt8m/video_level_models.py
보고서/최종보고서-윤영빈.docx
보고서/최종보고서-윤영빈.pdf
web/backend/yt8m/frame_level_models.py    View file @ 7e0b563

...

@@ -65,128 +65,21 @@ flags.DEFINE_integer("conv_hidden2", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("conv_hidden3", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("stride", 10, "Number of stride for short rnn.")

class DbofModel(models.BaseModel):
  """Creates a Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated
  features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.
  """

  ACT_FN_MAP = {
      "sigmoid": tf.nn.sigmoid,
      "relu6": tf.nn.relu6,
  }

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    """See base class.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).
      iterations: the number of frames to be sampled.
      add_batch_norm: whether to add batch norm during training.
      sample_random_frames: whether to sample random frames or random sequences.
      cluster_size: the output neuron number of the cluster layer.
      hidden_size: the output neuron number of the hidden layer.
      is_training: whether to build the graph in training mode.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
    act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation)
    assert act_fn is not None, ("dbof_activation is not valid: %s." %
                                FLAGS.dbof_activation)
class FrameLevelLogisticModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.compat.v1.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    cluster_weights = tf.compat.v1.get_variable(
        "cluster_weights", [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.compat.v1.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.compat.v1.get_variable(
          "cluster_biases", [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.compat.v1.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = act_fn(activation)
    tf.compat.v1.summary.histogram("cluster_output", activation)
    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.compat.v1.get_variable(
        "hidden1_weights", [cluster_size, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)

    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.compat.v1.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    activation = act_fn(activation)
    tf.compat.v1.summary.histogram("hidden1_output", activation)

    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                              [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           **unused_params)

    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
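# For orientation, the DBoF pipeline above reduces to the following shape
# flow. This is a sketch under assumed typical yt8m defaults (300 sampled
# frames, 1152 = 1024 video + 128 audio features, cluster_size 8192,
# hidden1_size 1024); the actual sizes come from FLAGS:
#   frames   [B, 300, 1152]  --reshape-->            [B*300, 1152]
#   cluster  [B*300, 1152] @ [1152, 8192]         -> [B*300, 8192]
#   pool     [B, 300, 8192]  --max/avg over frames-> [B, 8192]
#   hidden   [B, 8192] @ [8192, 1024]             -> [B, 1024]
#   then the video-level classifier maps [B, 1024] -> [B, num_classes].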
class NetVLAD_NonLocal_types():

  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):

...

@@ -286,20 +179,6 @@ class NetVLAD_NonLocal_types():
    return vlad_softmax

class NetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,

...
@@ -420,1558 +299,30 @@ class NetVLADModelLF(models.BaseModel):
        is_training=is_training,
        **unused_params)

class GruModel(models.BaseModel):

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   is_training=True,
                   **unused_params):
    """Creates a model which uses a stack of GRUs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    gru_size = 600
    number_of_layers = 4
    backward = False
    random_frames = False
    iterations = 30

    if random_frames:
      num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
      model_input = utils.SampleRandomFrames(model_input, num_frames_2,
                                             iterations)
    if backward:
      model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1)
class LstmModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_GRU = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(gru_size)
         for _ in range(number_of_layers)],
        state_is_tuple=False)
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
         for _ in range(number_of_layers)])

    loss = 0.0
    with tf.variable_scope("RNN"):
      outputs, state = tf.nn.dynamic_rnn(stacked_GRU,
                                         model_input,
                                         sequence_length=num_frames,
                                         dtype=tf.float32)
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models, 'MoeModel')
    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
class SoftDBoF():

  def __init__(self, feature_size, max_frames, cluster_size, max_pool,
               add_batch_norm, is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size
    self.max_pool = max_pool

  def forward(self, reshaped_input):
    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training
    max_pool = self.max_pool

    cluster_weights = tf.get_variable(
        "cluster_weights", [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases", [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    activation = tf.reshape(activation,
                            [-1, int(max_frames), int(cluster_size)])

    activation_sum = tf.reduce_sum(activation, 1)
    activation_sum = tf.nn.l2_normalize(activation_sum, 1)

    if max_pool:
      activation_max = tf.reduce_max(activation, 1)
      activation_max = tf.nn.l2_normalize(activation_max, 1)
      activation = tf.concat([activation_sum, activation_max], 1)
    else:
      activation = activation_sum

    return activation
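# SoftDBoF.forward thus produces a soft histogram over clusters: each frame
# contributes a softmax assignment, the assignments are summed over frames,
# and the sum is L2-normalized (optionally concatenated with a max-pooled
# variant). A toy illustration with hypothetical numbers and 3 clusters:
#   frame 1 assignment: [0.7, 0.2, 0.1]
#   frame 2 assignment: [0.1, 0.8, 0.1]
#   summed histogram:   [0.8, 1.0, 0.2] -> l2_normalize -> video descriptor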
class LightVLAD_nonlocal():

  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):
    cluster_weights = tf.get_variable(
        "cluster_weights",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=self.is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases", [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])
    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.reshape(vlad, [-1, self.feature_size])
    vlad = nonLocal_block(vlad,
                          feature_size=self.feature_size,
                          hidden_size=self.feature_size // 2,
                          cluster_size=self.cluster_size)
    vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.nn.l2_normalize(vlad, 1)
    vlad = tf.reshape(vlad, [-1, int(self.cluster_size * self.feature_size)])
    vlad = tf.nn.l2_normalize(vlad, 1)
    return vlad
class LightNetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = False
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    video_NetVLAD = LightVLAD_nonlocal(1024, max_frames, cluster_size,
                                       add_batch_norm, is_training)
    audio_NetVLAD = LightVLAD_nonlocal(128, max_frames, cluster_size / 2,
                                       add_batch_norm, is_training)

    if add_batch_norm:  # and not lightvlad:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    with tf.variable_scope("video_VLAD"):
      vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])
    with tf.variable_scope("audio_VLAD"):
      vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])
    vlad = tf.concat([vlad_video, vlad_audio], 1)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights", [vlad_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
      gating_weights = tf.get_variable(
          "gating_weights_2", [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))
      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # removes diagonals coefficients
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases", [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)
      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
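# The gating branch above is context gating in the style of Miech et al.'s
# "Learnable pooling with Context Gating": the hidden activation is multiplied
# elementwise by sigmoid(activation @ W), letting the model suppress or
# emphasize feature dimensions conditioned on the whole vector. Stripped of
# the batch-norm/bias options, it reduces to this minimal sketch:
#   gates = tf.sigmoid(tf.matmul(activation, gating_weights))
#   activation = tf.multiply(activation, gates)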
def nonLocal_block(vlad, feature_size, hidden_size, cluster_size):
  nonlocal_theta = tf.get_variable(
      "nonlocal_theta", [feature_size, hidden_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  nonlocal_phi = tf.get_variable(
      "nonlocal_phi", [feature_size, hidden_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  nonlocal_g = tf.get_variable(
      "nonlocal_g", [feature_size, hidden_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  nonlocal_out = tf.get_variable(
      "nonlocal_out", [hidden_size, feature_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(hidden_size)))

  vlad_theta = tf.matmul(vlad, nonlocal_theta)
  vlad_phi = tf.matmul(vlad, nonlocal_phi)
  vlad_g = tf.matmul(vlad, nonlocal_g)

  vlad_theta = tf.reshape(vlad_theta, [-1, cluster_size, hidden_size])
  vlad_phi = tf.reshape(vlad_phi, [-1, cluster_size, hidden_size])
  vlad_g = tf.reshape(vlad_g, [-1, cluster_size, hidden_size])

  vlad_softmax = tf.nn.softmax(
      feature_size**-0.5 *
      tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
  vlad_g = tf.matmul(vlad_softmax, vlad_g)
  vlad_g = tf.reshape(vlad_g, [-1, hidden_size])
  vlad_g = tf.matmul(vlad_g, nonlocal_out)
  vlad = vlad + vlad_g
  return vlad
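# nonLocal_block is, in effect, single-head scaled dot-product self-attention
# over the C cluster descriptors, followed by a residual connection:
#   output = X + softmax((X W_theta)(X W_phi)^T / sqrt(F)) (X W_g) W_out
# A minimal NumPy sketch of the same computation; the names below are
# illustrative, not part of the original code:
#
#   import numpy as np
#
#   def nonlocal_sketch(x, w_theta, w_phi, w_g, w_out):
#     """x: [C, F]; w_theta/w_phi/w_g: [F, H]; w_out: [H, F]."""
#     theta, phi, g = x @ w_theta, x @ w_phi, x @ w_g   # [C, H] each
#     att = theta @ phi.T / np.sqrt(x.shape[1])         # [C, C] scores
#     att = np.exp(att - att.max(axis=1, keepdims=True))
#     att /= att.sum(axis=1, keepdims=True)             # row-wise softmax
#     return x + att @ g @ w_out                        # residual output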
class SoftDbofModelLF(models.BaseModel):
  """Creates a Soft Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated
  features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 4000
    hidden1_size = 1024
    fc_dimred = True
    relu = False
    max_pool = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                          add_batch_norm, is_training)
    audio_Dbof = SoftDBoF(128, max_frames, cluster_size / 8, max_pool,
                          add_batch_norm, is_training)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    with tf.variable_scope("video_DBOF"):
      dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])
    with tf.variable_scope("audio_DBOF"):
      dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])
    dbof = tf.concat([dbof_video, dbof_audio], 1)
    dbof_dim = dbof.get_shape().as_list()[1]

    if fc_dimred:
      hidden1_weights = tf.get_variable(
          "hidden1_weights", [dbof_dim, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(cluster_size)))
      tf.summary.histogram("hidden1_weights", hidden1_weights)
      activation = tf.matmul(dbof, hidden1_weights)

      if add_batch_norm and relu:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn")
      else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases", [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

      if relu:
        activation = tf.nn.relu6(activation)
      tf.summary.histogram("hidden1_output", activation)
    else:
      activation = dbof

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
class early_NetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = False
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    video_audio_NetVLAD = NetVLAD_NonLocal(1024 + 128, max_frames,
                                           cluster_size, add_batch_norm,
                                           is_training)

    if add_batch_norm:  # and not lightvlad:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    with tf.variable_scope("video_audio_VLAD"):
      vlad = video_audio_NetVLAD.forward(reshaped_input)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights", [vlad_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
      gating_weights = tf.get_variable(
          "gating_weights_2", [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))
      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # removes diagonals coefficients
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases", [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)
      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
class NetVLAD_NonLocal():

  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):
    cluster_weights = tf.get_variable(
        "cluster_weights",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=self.is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases", [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)
    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
    cluster_weights2 = tf.get_variable(
        "cluster_weights2",
        [1, int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])
    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.reshape(vlad, [-1, self.feature_size])

    nonlocal_theta = tf.get_variable(
        "nonlocal_theta",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_phi = tf.get_variable(
        "nonlocal_phi",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_g = tf.get_variable(
        "nonlocal_g",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_out = tf.get_variable(
        "nonlocal_out",
        [int(self.cluster_size), int(self.feature_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.cluster_size)))

    vlad_theta = tf.matmul(vlad, nonlocal_theta)
    vlad_phi = tf.matmul(vlad, nonlocal_phi)
    vlad_g = tf.matmul(vlad, nonlocal_g)

    vlad_theta = tf.reshape(
        vlad_theta, [-1, int(self.cluster_size), int(self.cluster_size)])
    vlad_phi = tf.reshape(
        vlad_phi, [-1, int(self.cluster_size), int(self.cluster_size)])
    vlad_g = tf.reshape(
        vlad_g, [-1, int(self.cluster_size), int(self.cluster_size)])

    vlad_softmax = tf.nn.softmax(
        self.feature_size**-0.5 *
        tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
    vlad_g = tf.matmul(vlad_softmax, vlad_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
    vlad_g = tf.matmul(vlad_g, nonlocal_out)
    vlad_g = tf.reshape(
        vlad_g, [-1, int(self.cluster_size), int(self.feature_size)])
    vlad = tf.reshape(
        vlad, [-1, int(self.cluster_size), int(self.feature_size)])
    vlad = vlad + vlad_g

    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.nn.l2_normalize(vlad, 1)  # [b,f,c]
    vlad = tf.reshape(vlad, [-1, int(self.cluster_size * self.feature_size)])
    vlad = tf.nn.l2_normalize(vlad, 1)
    return vlad
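# For reference, the aggregation above computes the NetVLAD descriptor of
# Arandjelovic et al., per cluster k and feature dimension j:
#   V(j, k) = sum_i a_k(x_i) * (x_i(j) - c_k(j))
# where a_k is the softmax soft-assignment and the cluster centers c_k enter
# through the learned cluster_weights2 term (the subtracted a = a_sum * c2),
# followed by the non-local self-attention refinement and intra- plus global
# L2 normalization.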
class SoftDbofModelLF_8k(models.BaseModel):
  """Creates a Soft Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated
  features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 2048
    hidden1_size = 1024
    fc_dimred = True
    relu = False
    max_pool = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                          add_batch_norm, is_training)
    audio_Dbof = SoftDBoF(128, max_frames, cluster_size / 8, max_pool,
                          add_batch_norm, is_training)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    with tf.variable_scope("video_DBOF"):
      dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])
    with tf.variable_scope("audio_DBOF"):
      dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])
    dbof = tf.concat([dbof_video, dbof_audio], 1)
    dbof_dim = dbof.get_shape().as_list()[1]

    if fc_dimred:
      hidden1_weights = tf.get_variable(
          "hidden1_weights", [dbof_dim, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(cluster_size)))
      tf.summary.histogram("hidden1_weights", hidden1_weights)
      activation = tf.matmul(dbof, hidden1_weights)

      if add_batch_norm and relu:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn")
      else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases", [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

      if relu:
        activation = tf.nn.relu6(activation)
      tf.summary.histogram("hidden1_output", activation)
    else:
      activation = dbof

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
class FrameLevelLogisticModel(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]

    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                              [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
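# The tile/reshape above simply broadcasts each video's true frame count
# across the feature dimension, so the frame-sum becomes a mean over valid
# (unpadded) frames. A hypothetical example with max_frames=4 but only 2 real
# frames f0, f1 (padded frames contribute zeros to the sum):
#   sums  = f0 + f1 + 0 + 0        # per feature dimension
#   denom = [2, 2, ..., 2]         # num_frames tiled to feature_size
#   mean  = sums / denom           # correct average over the 2 real frames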
class CNN(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]
    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                              [-1, feature_size])

    convK3 = slim.convolution(model_input, num_outputs=feature_size,
                              kernel_size=3, scope='conv1')
    convK5 = slim.convolution(model_input, num_outputs=feature_size,
                              kernel_size=5, scope='conv2')
    convK1 = slim.convolution(model_input, num_outputs=feature_size,
                              kernel_size=1, scope='conv3')

    avg_pooled = tf.reduce_sum(tf.concat([convK3, convK5, convK1], axis=1),
                               axis=[1]) / denominators
    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.relu,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
class LstmModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
         for _ in range(number_of_layers)])
    loss = 0.0
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class BNGRUModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_rnn = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(lstm_size)
         for _ in range(number_of_layers)],
        state_is_tuple=False)
    outputs, state = tf.nn.dynamic_rnn(stacked_rnn,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    state = slim.batch_norm(state,
                            center=True,
                            scale=True,
                            is_training=True,
                            scope='proj')
    return aggregated_model().create_model(model_input=state,
                                           vocab_size=vocab_size,
                                           **unused_params)
class GruModel2(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of GRUs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(lstm_size)
         for _ in range(number_of_layers)],
        state_is_tuple=False)
    loss = 0.0
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state,
                                           vocab_size=vocab_size,
                                           **unused_params)
class BiGRUModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses stacked bidirectional GRUs to represent the
    video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    with tf.variable_scope('fw'):
      rnn_fw = tf.contrib.rnn.MultiRNNCell(
          [tf.contrib.rnn.GRUCell(lstm_size)
           for _ in range(number_of_layers)],
          state_is_tuple=False)
    with tf.variable_scope('bw'):
      rnn_bw = tf.contrib.rnn.MultiRNNCell(
          [tf.contrib.rnn.GRUCell(lstm_size)
           for _ in range(number_of_layers)],
          state_is_tuple=False)

    outputs, state = tf.nn.bidirectional_dynamic_rnn(rnn_fw,
                                                     rnn_bw,
                                                     model_input,
                                                     sequence_length=num_frames,
                                                     dtype=tf.float32,
                                                     swap_memory=True)
    state = tf.concat(state, axis=1)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    state = slim.batch_norm(state,
                            center=True,
                            scale=True,
                            is_training=True,
                            scope='proj')
    return aggregated_model().create_model(model_input=state,
                                           vocab_size=vocab_size,
                                           **unused_params)
"""
Copyright (c) 2017, University of Texas Southwestern Medical Center
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University of Texas at Austin nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Recurrent Weighted Average
Implementation modified from: https://github.com/jostmey/rwa
Paper:
@article{ostmeyer2017machine,
title={Machine Learning on Sequential Data Using a Recurrent Weighted Average},
author={Ostmeyer, Jared and Cowell, Lindsay},
journal={arXiv preprint arXiv:1703.01253},
year={2017}
}
"""
class RwaModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    # constants
    init_factor = 1.0
    num_cells = FLAGS.lstm_cells
    input_shape = model_input.get_shape().as_list()
    batch_size, max_steps, num_features = input_shape

    # trainable weights
    s = weights_rwa.init_state(num_cells, "s", init_factor)
    W_g = weights_rwa.init_weight([num_features + num_cells, num_cells], "W_g")
    W_u = weights_rwa.init_weight([num_features, num_cells], "W_u")
    W_a = weights_rwa.init_weight([num_features + num_cells, num_cells], "W_a")
    b_g = weights_rwa.init_bias(num_cells, "b_g")
    b_u = weights_rwa.init_bias(num_cells, "b_u")
    b_a = weights_rwa.init_bias(num_cells, "b_a")

    # pl = tf.placeholder(tf.float32, shape=[None, num_cells])
    pl = tf.reshape(model_input,
                    [-1, max_steps * num_features])[:, :num_cells]

    # internal states
    # n = tf.zeros([batch_size, num_cells])
    # d = tf.zeros([batch_size, num_cells])
    # h = tf.zeros([batch_size, num_cells])
    # a_max = tf.fill([batch_size, num_cells], -1E38)  # Start off with lowest number possible
    n = tf.zeros_like(pl)
    d = tf.zeros_like(pl)
    h = tf.zeros_like(pl)
    a_max = tf.multiply(tf.ones_like(pl), -1E38)

    # define model
    h += tf.nn.tanh(tf.expand_dims(s, 0))
    for i in range(max_steps):
      x_step = model_input[:, i, :]
      # Combine the features and hidden state into one tensor
      xh_join = tf.concat(axis=1, values=[x_step, h])

      u = tf.matmul(x_step, W_u) + b_u
      g = tf.matmul(xh_join, W_g) + b_g
      # The bias term when factored out of the numerator and denominator
      # cancels and is unnecessary
      a = tf.matmul(xh_join, W_a)
      z = tf.multiply(u, tf.nn.tanh(g))

      a_newmax = tf.maximum(a_max, a)
      exp_diff = tf.exp(a_max - a_newmax)
      exp_scaled = tf.exp(a - a_newmax)

      # Numerically stable update of numerator
      n = tf.multiply(n, exp_diff) + tf.multiply(z, exp_scaled)
      # Numerically stable update of denominator
      d = tf.multiply(d, exp_diff) + exp_scaled
      h_new = tf.nn.tanh(tf.div(n, d))
      a_max = a_newmax
      # Use new hidden state only if the sequence length has not been exceeded
      h = tf.where(tf.greater(num_frames, i), h_new, h)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class DropoutGruModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of GRUs with dropout to represent
    the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(lstm_size),
                                       0.9, 0.9)
         for _ in range(number_of_layers)],
        state_is_tuple=False)
    loss = 0.0
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    aggregated_model = FrameLevelLogisticModel
    return aggregated_model().create_model(model_input=outputs,
                                           vocab_size=vocab_size,
                                           num_frames=num_frames,
                                           **unused_params)
class ResRnnModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    lstm_size = 1152
    number_of_layers = 3

    # from rnn_cell_modern import Delta_RNN as drnn
    from rnn_wrappers_modern import MultiRNNCell as mrnn

    cells = []
    for i in range(number_of_layers):
      with tf.variable_scope('cell_' + str(i)):
        cells.append(tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0))

    stacked_rnn = mrnn(cells,
                       use_residual_connections=True,
                       state_is_tuple=True)
    outputs, state = tf.nn.dynamic_rnn(stacked_rnn,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class LateVladModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    model_input = utils.SampleRandomSequence(model_input, num_frames, 128)
    input_v = model_input[:, :, :1024]
    input_a = model_input[:, :, 1024:]
    K = 8

    with tf.variable_scope('video'):
      x = input_v
      input_shape = x.get_shape().as_list()
      _, N, D = input_shape
      c_bound = math.sqrt(1. / (K * D))
      c = tf.get_variable(name='c',
                          shape=[K, N],
                          dtype=tf.float32,
                          initializer=tf.random_uniform_initializer(
                              -c_bound, c_bound))
      a = slim.convolution(x,
                           num_outputs=K,
                           kernel_size=1,
                           data_format='NWC',
                           scope='conv')
      a = tf.nn.softmax(a)

      v = []
      for k in range(K):
        t = x - c[k][None, :, None]
        t = tf.multiply(t, a[:, :, k][:, :, None])
        t = tf.reduce_sum(t, 1)
        t = tf.nn.l2_normalize(t, dim=1)
        v.append(t)
      v = tf.stack(v, axis=1)
      v = tf.reshape(v, [-1, K * D])

      proj_weights = tf.get_variable(
          "proj_weights", [K * D, 1024],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(K * D)))
      activation_v = tf.matmul(v, proj_weights)

    with tf.variable_scope('audio'):
      x = input_a
      input_shape = x.get_shape().as_list()
      _, N, D = input_shape
      c_bound = math.sqrt(1. / (K * D))
      c = tf.get_variable(name='c',
                          shape=[K, N],
                          dtype=tf.float32,
                          initializer=tf.random_uniform_initializer(
                              -c_bound, c_bound))
      a = slim.convolution(x,
                           num_outputs=K,
                           kernel_size=1,
                           data_format='NWC',
                           scope='conv')
      a = tf.nn.softmax(a)

      v = []
      for k in range(K):
        t = x - c[k][None, :, None]
        t = tf.multiply(t, a[:, :, k][:, :, None])
        t = tf.reduce_sum(t, 1)
        t = tf.nn.l2_normalize(t, dim=1)
        v.append(t)
      v = tf.stack(v, axis=1)
      v = tf.reshape(v, [-1, K * D])

      proj_weights = tf.get_variable(
          "proj_weights", [K * D, 1024],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(K * D)))
      activation_a = tf.matmul(v, proj_weights)

    activation = tf.concat([activation_v, activation_a], axis=1)
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=True,
                                 scope='proj')
    activation = tf.nn.relu6(activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           **unused_params)
class LNBLstmModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.LayerNormBasicLSTMCell(lstm_size,
                                               dropout_keep_prob=0.50)
         for _ in range(number_of_layers)])
    loss = 0.0
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class audio_avgShort_twowayGRUModel(models.BaseModel):

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   is_training=True,
                   **unused_params):
    """Creates a model which uses a Bidirectional GRU and mean audio features
    to represent the video.

                      ---->first half GRU----->
                     -                          -
      visual_feature                             concat ---------------
                     -                          -                      -
                      ---->second half GRU---->                         concat -----> video level classifier
                                                                       -
                                      mean audio features -------------

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    stride = FLAGS.stride
    max_frames = model_input.get_shape().as_list()[1]
    video_input = model_input[:, :, :1024]
    audio_input = model_input[:, :, 1024:]

    first_num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    audio_den = tf.reshape(tf.tile(first_num_frames, [1, 128]), [-1, 128])
    mean_audio = tf.reduce_sum(audio_input, 1) / tf.maximum(audio_den, 1)

    pooled_input, num_frames = self.avg_pooled_func(video_input, num_frames,
                                                    stride)
    pooled_input = slim.batch_norm(pooled_input,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    mean_audio = slim.batch_norm(mean_audio,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn_audio")

    fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
    bw_gru = tf.contrib.rnn.GRUCell(lstm_size)
    fw_outputs, fw_state = tf.nn.dynamic_rnn(
        fw_gru,
        pooled_input[:, :max_frames // (2 * stride), :],
        sequence_length=num_frames // 2,
        dtype=tf.float32,
        scope='fw')
    bw_outputs, bw_state = tf.nn.dynamic_rnn(
        bw_gru,
        pooled_input[:, max_frames // (2 * stride)::-1, :],
        sequence_length=num_frames - num_frames // 2,
        dtype=tf.float32,
        scope='bw')

    state = tf.concat([fw_state, bw_state], 1)
    state = tf.concat([state, mean_audio], 1)

    aggregated_model = getattr(video_level_models,
                               'linear_res_mix_act_MoeModel')
    return aggregated_model().create_model(model_input=state,
                                           vocab_size=vocab_size,
                                           **unused_params)

  def avg_pooled_func(self, model_input, num_frames_in, stride):
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    num_frames = num_frames_in // stride
    step = max_frames // stride

    first_layer_input = tf.reshape(model_input,
                                   [-1, stride, step, feature_size])
    first_layer_input = tf.reduce_sum(first_layer_input, 1)
    first_num_frames = tf.cast(
        tf.expand_dims(tf.expand_dims(num_frames, 1), 2), tf.float32)
    denominators = tf.reshape(
        tf.tile(first_num_frames, [1, step, feature_size]),
        [-1, step, feature_size])
    first_layer_avg_pooled = first_layer_input / tf.maximum(denominators, 1)
    return first_layer_avg_pooled, num_frames
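# avg_pooled_func shortens the frame axis by a factor of 'stride' via a
# reshape and a sum. Note that reshaping [B, max_frames, F] to
# [B, stride, step, F] groups frames that are 'step' apart (an interleaved
# grouping), not consecutive windows. A hypothetical check with max_frames=6
# and stride=2 (so step=3):
#   frames [f0, f1, f2, f3, f4, f5] regroup to [[f0, f1, f2], [f3, f4, f5]]
#   and summing over axis 1 yields [f0+f3, f1+f4, f2+f5].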
class resav_ConvModel(models.BaseModel):

    def create_model(self, model_input, vocab_size, num_frames,
                     is_training=True, **unused_params):
        """Creates a model which uses a residual convolutional network to represent the video.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
            input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          'batch_size' x 'num_classes'.
        """
        stride = FLAGS.stride
        conv_length = FLAGS.conv_length
        conv_hidden1 = FLAGS.conv_hidden1
        conv_hidden2 = FLAGS.conv_hidden2
        conv_hidden3 = FLAGS.conv_hidden3

        mean_feature = tf.reduce_mean(model_input, 1)
        feature_size = model_input.get_shape().as_list()[2]
        pooled_input = self.avg_pooled_func(model_input, stride)

        # To shape: 'batch_size' x 'max_frames' x 1 x 'num_features'
        input_expand = tf.expand_dims(pooled_input, -1)
        input_expand = tf.transpose(input_expand, [0, 1, 3, 2])

        # Block 1: conv -> BN/ReLU -> conv -> BN plus a parallel 1-conv shortcut.
        # conv_out: batch_size x max_frames x 1 x conv_hidden (padding='SAME')
        conv_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_1_1')
        conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True,
                                              is_training=is_training, scope="bn_1_1"))
        conv_out = slim.conv2d(conv_out, conv_hidden1, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_1_2')
        conv_out = slim.batch_norm(conv_out, center=True, scale=True,
                                   is_training=is_training, scope="bn_1_2")
        res_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1],
                              activation_fn=None, padding='SAME', scope='xconv_1_1')
        res_out = res_out + conv_out
        res_out = slim.max_pool2d(res_out, [2, 1], [2, 1], scope='max_pool1')

        # Block 2
        conv_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_2_1')
        conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True,
                                              is_training=is_training, scope="bn_2_1"))
        conv_out = slim.conv2d(conv_out, conv_hidden2, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_2_2')
        conv_out = slim.batch_norm(conv_out, center=True, scale=True,
                                   is_training=is_training, scope="bn_2_2")
        res_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1],
                              activation_fn=None, padding='SAME', scope='xconv_2_1')
        res_out = res_out + conv_out
        res_out = slim.max_pool2d(res_out, [2, 1], [2, 1], scope='max_pool2')

        # Block 3
        conv_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_3_1')
        conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True,
                                              is_training=is_training, scope="bn_3_1"))
        conv_out = slim.conv2d(conv_out, conv_hidden3, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_3_2')
        conv_out = slim.batch_norm(conv_out, center=True, scale=True,
                                   is_training=is_training, scope="bn_3_2")
        res_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1],
                              activation_fn=None, padding='SAME', scope='xconv_3_1')
        res_out = res_out + conv_out
        res_out = slim.max_pool2d(res_out, [2, 1], [2, 1], scope='max_pool3')

        a = res_out.get_shape().as_list()[1]
        b = res_out.get_shape().as_list()[2]
        c = res_out.get_shape().as_list()[3]
        print(res_out.get_shape().as_list())
        res_out = tf.reshape(res_out, [-1, a * b * c])

        state = tf.concat([res_out, mean_feature], 1)
        aggregated_model = getattr(video_level_models, 'linear_res_mix_act_MoeModel')
        return aggregated_model().create_model(
            model_input=state, vocab_size=vocab_size, **unused_params)

    def avg_pooled_func(self, model_input, stride):
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        step = max_frames // stride
        first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])
        first_layer_input = tf.reduce_mean(first_layer_input, 1)
        return first_layer_input
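To make the shape flow easier to follow, a rough walk-through of the three residual blocks under assumed defaults (30 pooled frames from the flags above; the numbers are illustrative only):

    frames, channels = 30, 1024                 # after avg pooling; conv_hidden1
    for block in (1, 2, 3):
        # conv -> BN/ReLU -> conv -> BN plus a 1-conv shortcut keeps the frame
        # axis fixed (padding='SAME'); the [2, 1] max-pool then roughly halves it.
        frames //= 2
        print("after block", block, "->", (frames, 1, channels))
    # The final tensor is flattened and concatenated with the per-video mean
    # feature before being handed to 'linear_res_mix_act_MoeModel'.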
class pur_twowayGRUModel(models.BaseModel):

    def create_model(self, model_input, vocab_size, num_frames,
                     is_training=True, **unused_params):
        """Creates a model which uses a bidirectional GRU, without explicitly using the mean audio feature, to represent the video.

                           ---->first half GRU----->
                          -                          -
            video_feature                             ----concat----> video level classifier
                          -                          -
                           ---->second half GRU---->

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
            input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          'batch_size' x 'num_classes'.
        """
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers
        stride = FLAGS.stride
        max_frames = model_input.get_shape().as_list()[1]

        pooled_input, num_frames = self.avg_pooled_func(model_input, num_frames, stride)
        pooled_input = slim.batch_norm(pooled_input, center=True, scale=True,
                                       is_training=is_training, scope="hidden1_bn")

        fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
        bw_gru = tf.contrib.rnn.GRUCell(lstm_size)

        fw_outputs, fw_state = tf.nn.dynamic_rnn(
            fw_gru, pooled_input[:, :max_frames // (2 * stride), :],
            sequence_length=num_frames // 2, dtype=tf.float32, scope='fw')
        bw_outputs, bw_state = tf.nn.dynamic_rnn(
            bw_gru, pooled_input[:, max_frames // (2 * stride)::-1, :],
            sequence_length=num_frames - num_frames // 2, dtype=tf.float32, scope='bw')

        state = tf.concat([fw_state, bw_state], 1)
        aggregated_model = getattr(video_level_models, 'linear_res_mix_act_MoeModel')
        return aggregated_model().create_model(
            model_input=state, vocab_size=vocab_size, **unused_params)

    def avg_pooled_func(self, model_input, num_frames_in, stride):
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        num_frames = num_frames_in // stride
        step = max_frames // stride
        first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])
        first_layer_input = tf.reduce_sum(first_layer_input, 1)
        first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1), 2), tf.float32)
        denominators = tf.reshape(
            tf.tile(first_num_frames, [1, step, feature_size]),
            [-1, step, feature_size])
        first_layer_avg_pooled = first_layer_input / tf.maximum(denominators, 1)
        return first_layer_avg_pooled, num_frames
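A one-dimensional stand-in makes the two slices explicit; note that, as written, the backward slice starts at the midpoint and walks back to frame 0, so the second GRU re-reads the first half in reverse rather than consuming the second half:

    pooled = list(range(30))      # stand-in for 30 pooled frames
    half = len(pooled) // 2       # max_frames // (2 * stride) with the defaults
    fw_in = pooled[:half]         # frames 0..14, read forward by the 'fw' GRU
    bw_in = pooled[half::-1]      # frames 15, 14, ..., 0 for the 'bw' GRU
    print(fw_in[:3], bw_in[:3])   # [0, 1, 2] [15, 14, 13]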
\ No newline at end of file
...
...
web/backend/yt8m/video_level_models.py
View file @
7e0b563
...
...
@@ -136,9 +136,7 @@ class MoeModel(models.BaseModel):
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                         [-1, vocab_size])
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@",
              final_probabilities_by_class_and_batch)
        return {"predictions": final_probabilities}
...
...
@@ -251,482 +249,4 @@ class willow_MoeModel(models.BaseModel):
            probabilities = tf.multiply(probabilities, gates)
        return {"predictions": probabilities}
class willow_MoeModel_moe4(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self, model_input, vocab_size, is_training,
                     num_mixtures=None, l2_penalty=1e-8, **unused_params):
        """Creates a Mixture of (Logistic) Experts model.

        It also includes the possibility of gating the probabilities.
        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in the
        mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        num_mixtures = 4
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2
        gating_probabilities = FLAGS.moe_prob_gating
        gating_input = FLAGS.moe_prob_gating_input

        input_size = model_input.get_shape().as_list()[1]
        remove_diag = False

        if low_rank_gating == -1:
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])

        if gating_probabilities:
            if gating_input == 'prob':
                gating_weights = tf.get_variable(
                    "gating_prob_weights", [vocab_size, vocab_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(vocab_size)))
                gates = tf.matmul(probabilities, gating_weights)
            else:
                gating_weights = tf.get_variable(
                    "gating_prob_weights", [input_size, vocab_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(vocab_size)))
                gates = tf.matmul(model_input, gating_weights)

            if remove_diag:
                # remove diagonal coefficients
                diagonals = tf.matrix_diag_part(gating_weights)
                gates = gates - tf.multiply(diagonals, probabilities)

            gates = slim.batch_norm(gates, center=True, scale=True,
                                    is_training=is_training, scope="gating_prob_bn")
            gates = tf.sigmoid(gates)
            probabilities = tf.multiply(probabilities, gates)

        return {"predictions": probabilities}
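For intuition, a small NumPy rendering of the expert/gate combination above (one label, num_mixtures = 4; the logits are made-up numbers, not repo values):

    import numpy as np

    gate_logits = np.array([1.0, 0.5, 0.2, 0.1, -1.0])        # +1 dummy slot
    gating = np.exp(gate_logits) / np.exp(gate_logits).sum()  # softmax over 5
    experts = 1 / (1 + np.exp(-np.array([2.0, 0.0, -1.0, 1.0])))  # 4 sigmoids
    prob = float((gating[:4] * experts).sum())  # the dummy's share predicts 0
    print(round(prob, 4))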
class willow_MoeModel_moe4_noGP(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self, model_input, vocab_size, is_training,
                     num_mixtures=None, l2_penalty=1e-8, **unused_params):
        """Creates a Mixture of (Logistic) Experts model without probability gating.

        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in the
        mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        num_mixtures = 4
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2
        gating_probabilities = False
        gating_input = FLAGS.moe_prob_gating_input

        input_size = model_input.get_shape().as_list()[1]
        remove_diag = False

        if low_rank_gating == -1:
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])
        return {"predictions": probabilities}
class willow_MoeModel_moe2_noGP(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self, model_input, vocab_size, is_training,
                     num_mixtures=None, l2_penalty=1e-8, **unused_params):
        """Creates a Mixture of (Logistic) Experts model without probability gating.

        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in the
        mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        num_mixtures = 2
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2
        gating_probabilities = False
        gating_input = FLAGS.moe_prob_gating_input

        input_size = model_input.get_shape().as_list()[1]
        remove_diag = False

        if low_rank_gating == -1:
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])
        return {"predictions": probabilities}
class willow_MoeModel_moe2(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self, model_input, vocab_size, is_training,
                     num_mixtures=None, l2_penalty=1e-8, **unused_params):
        """Creates a Mixture of (Logistic) Experts model.

        It also includes the possibility of gating the probabilities.
        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in the
        mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        num_mixtures = 2
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2
        gating_probabilities = FLAGS.moe_prob_gating
        gating_input = FLAGS.moe_prob_gating_input

        input_size = model_input.get_shape().as_list()[1]
        remove_diag = False

        if low_rank_gating == -1:
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])

        if gating_probabilities:
            if gating_input == 'prob':
                gating_weights = tf.get_variable(
                    "gating_prob_weights", [vocab_size, vocab_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(vocab_size)))
                gates = tf.matmul(probabilities, gating_weights)
            else:
                gating_weights = tf.get_variable(
                    "gating_prob_weights", [input_size, vocab_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(vocab_size)))
                gates = tf.matmul(model_input, gating_weights)

            if remove_diag:
                # remove diagonal coefficients
                diagonals = tf.matrix_diag_part(gating_weights)
                gates = gates - tf.multiply(diagonals, probabilities)

            gates = slim.batch_norm(gates, center=True, scale=True,
                                    is_training=is_training, scope="gating_prob_bn")
            gates = tf.sigmoid(gates)
            probabilities = tf.multiply(probabilities, gates)

        return {"predictions": probabilities}
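The 'prob' gating path can be read as learning class-to-class correlations on top of the MoE outputs; a minimal NumPy sketch (batch norm omitted for brevity, and the sizes are assumptions):

    import numpy as np

    vocab = 5
    probs = np.random.rand(2, vocab)                      # MoE outputs
    W = np.random.randn(vocab, vocab) / np.sqrt(vocab)    # gating_prob_weights
    gates = 1 / (1 + np.exp(-(probs @ W)))                # sigmoid(probs @ W)
    gated = probs * gates                                 # final predictions
    print(gated.shape)                                    # (2, 5)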
class linear_res_mix_act_MoeModel(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization).

                         -----linear_layers(1) + sigmoid activation-----
                        -                                               -
                         -----linear_layers(2) + relu activation--------
                        -                                               -
        input_features --                                               ----moe----output
                        -                                               -
                         -----linear_layers(3) + elu activation---------
                        -                                               -
                         -----linear_layers(4) + tanh activation--------
    """

    def create_model(self, model_input, vocab_size, num_mixtures=None,
                     num_hiddens=None, num_maxout=None, l2_penalty=1e-8,
                     **unused_params):
        num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
        num_hiddens = num_hiddens or FLAGS.moe_num_hiddens
        num_maxout = num_maxout or FLAGS.num_maxout

        hidden_sigmoid = slim.fully_connected(
            model_input, num_hiddens, activation_fn=tf.nn.sigmoid,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_sigmoid')
        hidden_relu = slim.fully_connected(
            model_input, num_hiddens, activation_fn=tf.nn.relu,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_relu')
        hidden_elu = slim.fully_connected(
            model_input, num_hiddens, activation_fn=tf.nn.elu,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_elu')
        hidden_tanh = slim.fully_connected(
            model_input, num_hiddens, activation_fn=tf.nn.tanh,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_tanh')
        linear_input = slim.fully_connected(
            model_input, num_hiddens, activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_linear')

        gate_activations = slim.fully_connected(
            model_input,
            vocab_size * (num_mixtures + 1),
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates")
        expert_activations = slim.fully_connected(
            tf.concat([hidden_sigmoid + 0.25 * linear_input,
                       hidden_relu + 0.25 * linear_input,
                       hidden_elu + 0.25 * linear_input,
                       hidden_tanh + 0.25 * linear_input], 1),
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        final_probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                         [-1, vocab_size])
        return {"predictions": final_probabilities}
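The expert input above mixes four activations of the same hidden width, each carrying a shared 0.25-weighted linear residual; a compact NumPy equivalent (all sizes are assumptions for illustration):

    import numpy as np

    x = np.random.rand(2, 2048).astype(np.float32)        # video-level features
    Ws = [np.random.randn(2048, 100) * 0.01 for _ in range(5)]
    lin = x @ Ws[4]                                        # shared linear branch
    acts = [lambda z: 1 / (1 + np.exp(-z)),                # sigmoid
            lambda z: np.maximum(z, 0.0),                  # relu
            lambda z: np.where(z > 0, z, np.exp(z) - 1),   # elu
            np.tanh]
    branches = [f(x @ W) + 0.25 * lin for f, W in zip(acts, Ws[:4])]
    expert_in = np.concatenate(branches, axis=1)           # fed to "experts"
    print(expert_in.shape)                                 # (2, 400)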
\ No newline at end of file
        return {"predictions": probabilities}
\ No newline at end of file
...
...
보고서/최종보고서-윤영빈.docx
View file @
7e0b563
This file is too large to display.
보고서/최종보고서-윤영빈.pdf
0 → 100644
View file @
7e0b563
No preview for this file type