윤영빈

Added GruModel, a frame-level model that represents a video with a stack of GRU cells.

{
"CurrentProjectSetting": null
}
\ No newline at end of file
{
"ExpandedNodes": [
"",
"\\web",
"\\web\\backend",
"\\web\\backend\\yt8m"
],
"SelectedNode": "\\web\\backend\\yt8m\\frame_level_models.py",
"PreviewInSolutionExplorer": false
}
\ No newline at end of file
No preview for this file type
......@@ -46,7 +46,7 @@ flags.DEFINE_string(
"Some Frame-Level models can be decomposed into a "
"generalized pooling operation followed by a "
"classifier layer")
# Width of each recurrent cell. GruModel also reads this flag as its GRU size.
# NOTE(review): "lstm_cells" is defined twice below (1024, then 512). This looks
# like the removed/added line pair of a diff hunk; presumably only one of the
# two definitions should remain -- confirm which default is intended.
flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.")
flags.DEFINE_integer("lstm_cells", 512, "Number of LSTM cells.")
# Depth of the recurrent stack. GruModel also reads this flag as its number of
# GRU layers.
flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.")
......@@ -215,6 +215,54 @@ class LstmModel(models.BaseModel):
vocab_size=vocab_size,
**unused_params)
class GruModel(models.BaseModel):
  """Frame-level model that encodes a video with a stack of GRU cells."""

  def create_model(self, model_input, vocab_size, num_frames, is_training=True,
                   **unused_params):
    """Creates a model which uses a stack of GRUs to represent the video.

    The final state of the GRU stack is handed to the video-level `MoeModel`
    classifier, whose result is returned unchanged.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Forwarded unchanged to the video-level classifier.
      **unused_params: Extra keyword arguments; forwarded unchanged to the
                       video-level classifier.

    Returns:
      Whatever `MoeModel.create_model` returns; expected to be a dictionary
      with a tensor containing the probability predictions of the model in the
      'predictions' key, with dimensions 'batch_size' x 'num_classes'.
    """
    # The GRU stack reuses the LSTM flags for its width and depth.
    gru_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    # Experiment switches. Both are hard-coded off, so the two branches below
    # are currently inactive.
    backward = False
    random_frames = False
    iterations = 30

    if random_frames:
      # NOTE(review): if this branch is ever enabled, `sequence_length` below
      # still uses the original `num_frames`; confirm that is still valid for
      # the sampled input.
      num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
      model_input = utils.SampleRandomFrames(model_input, num_frames_2,
                                             iterations)
    if backward:
      # Reverse only the valid (unpadded) prefix of each sequence.
      model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1)

    # state_is_tuple=False: the stack's state is a single tensor holding every
    # layer's state concatenated, so it can be fed directly to the classifier.
    stacked_gru = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(gru_size) for _ in range(number_of_layers)],
        state_is_tuple=False)

    with tf.variable_scope("RNN"):
      # Only the final state is used; the per-frame outputs are discarded.
      _, state = tf.nn.dynamic_rnn(stacked_gru, model_input,
                                   sequence_length=num_frames,
                                   dtype=tf.float32)

    return video_level_models.MoeModel().create_model(
        model_input=state,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
class FrameLevelLogisticModel(models.BaseModel):
"""Creates a logistic classifier over the aggregated frame-level features."""
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
......@@ -238,7 +286,7 @@ class FrameLevelLogisticModel(models.BaseModel):
"""
num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
feature_size = model_input.get_shape().as_list()[2]
denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
[-1, feature_size])
avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
......@@ -249,50 +297,6 @@ class FrameLevelLogisticModel(models.BaseModel):
weights_regularizer=slim.l2_regularizer(1e-8))
return {"predictions": output}
class CNN(models.BaseModel):
  """Convolutional classifier applied to the frame-by-feature input matrix."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a small CNN classifier over the padded frame features.

    The previous body was an Estimator-style `model(features, labels, mode,
    params)` function pasted in without adaptation: it read `features`,
    `labels`, `params` and `avg_pooled`, none of which exist in this method,
    so it failed on its first statement. This version keeps the same layer
    stack but is driven by this method's own arguments.

    Args:
      model_input: Frame-level features. Assumed to be 'batch_size' x
                   'max_frames' x 'num_features', as for the other frame-level
                   models in this file -- TODO confirm.
      vocab_size: The number of classes in the dataset; used as the width of
                  the output layer.
      num_frames: Accepted for interface compatibility and not used, so padded
                  frames are not masked out.
      **unused_params: Ignored.

    Returns:
      A dictionary whose 'predictions' key holds the sigmoid of the output
      layer, one value per class for each batch entry. No loss is returned
      because no labels are passed to this method.
    """
    # conv2d needs rank-4 input, so add a single trailing channel axis and
    # treat each video's frame-by-feature matrix as a one-channel image.
    images = tf.expand_dims(model_input, -1)
    tf.summary.image("images", images)

    drop_rate = 0.0

    features = images
    for i, filters in enumerate([32, 64, 128]):
      features = tf.layers.conv2d(
          features, filters=filters, kernel_size=3, padding="same",
          name="conv_%d" % (i + 1))
      features = tf.layers.max_pooling2d(
          inputs=features, pool_size=2, strides=2, padding="same",
          name="pool_%d" % (i + 1))

    # NOTE(review): the dense layers need a statically known flattened size,
    # i.e. static 'max_frames' and 'num_features' -- confirm for this input.
    features = tf.contrib.layers.flatten(features)
    features = tf.layers.dropout(features, drop_rate)
    features = tf.layers.dense(features, 512, name="dense_1")
    features = tf.layers.dropout(features, drop_rate)

    logits = tf.layers.dense(features, vocab_size, activation=None,
                             name="dense_2")

    # Per-class sigmoid probabilities, returned in the same
    # {"predictions": ...} form as the logistic model in this file.
    output = tf.nn.sigmoid(logits)
    return {"predictions": output}
class NetVLAD_NonLocal_types():
def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training):
......
......@@ -83,7 +83,7 @@ if __name__ == "__main__":
"regularization_penalty", 1.0,
"How much weight to give to the regularization loss (the label loss has "
"a weight of 1).")
flags.DEFINE_float("base_learning_rate", 0.0006,
flags.DEFINE_float("base_learning_rate", 0.001,
"Which learning rate to start with.")
flags.DEFINE_float(
"learning_rate_decay", 0.8,
......