윤영빈

NetVLAD model test

@@ -20,8 +20,6 @@
![profit_hunter](/img/profit_hunter.png)
* Team name: **Profit Hunter**
* 윤영빈 (Dept. of Computer Engineering, 2015104192)
* 윤준현 (Dept. of Computer Engineering, 2015104193)
* 이현규 (Dept. of Computer Engineering, 2015104209)
* 이태현 (Dept. of Computer Engineering, 2015104208)
## Links
@@ -50,42 +50,6 @@ flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.")
flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.")
class FrameLevelLogisticModel(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]
    denominators = tf.reshape(
        tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
class DbofModel(models.BaseModel):
"""Creates a Deep Bag of Frames model.
@@ -239,7 +203,6 @@ class LstmModel(models.BaseModel):
            tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
            for _ in range(number_of_layers)
        ])
    _, state = tf.nn.dynamic_rnn(stacked_lstm,
                                 model_input,
                                 sequence_length=num_frames,
@@ -251,3 +214,300 @@ class LstmModel(models.BaseModel):
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class FrameLevelLogisticModel(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]
    denominators = tf.reshape(
        tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
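
The docstring above notes that pre-averaging frame features is cheaper than averaging on the fly. A minimal NumPy sketch of the same masked average the graph ops compute (shapes and array names here are illustrative, not part of the repo):

```python
import numpy as np

# Hypothetical shapes: 4 videos, padded to 300 frames, 1152-d features
# (1024 video + 128 audio). Frames past each video's true length are zeroed,
# matching the zero-padding the model relies on.
model_input = np.random.rand(4, 300, 1152).astype(np.float32)
num_frames = np.array([300, 120, 250, 60], dtype=np.float32)
mask = np.arange(300)[None, :] < num_frames[:, None]
model_input *= mask[:, :, None]

# Sum over the frame axis, divide by each video's true length -- the same
# quantity the reduce_sum / denominators ops above compute in-graph.
avg_pooled = model_input.sum(axis=1) / num_frames[:, None]
print(avg_pooled.shape)  # (4, 1152)
```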
class CNN(models.BaseModel):
  """A simple CNN classifier over frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Stacks 1-D convolutions over the frame axis, then classifies.

    Note: adapted from a generic image-CNN estimator draft; the original body
    referenced undefined 'features'/'labels'/'params' tensors, an unused
    'avg_pooled' classifier, and returned a (predictions, loss) tuple instead
    of the dictionary the BaseModel interface expects.
    """
    drop_rate = 0.0
    features = model_input  # 'batch_size' x 'max_frames' x 'num_features'
    for i, filters in enumerate([32, 64, 128]):
      features = tf.layers.conv1d(
          features, filters=filters, kernel_size=3, padding="same",
          name="conv_%d" % (i + 1))
      features = tf.layers.max_pooling1d(
          inputs=features, pool_size=2, strides=2, padding="same",
          name="pool_%d" % (i + 1))
    features = tf.contrib.layers.flatten(features)
    features = tf.layers.dropout(features, drop_rate)
    features = tf.layers.dense(features, 512, name="dense_1")
    features = tf.layers.dropout(features, drop_rate)
    logits = tf.layers.dense(
        features, vocab_size, activation=None, name="dense_2")
    # Multi-label output: an independent sigmoid probability per class.
    predictions = tf.nn.sigmoid(logits)
    return {"predictions": predictions}
class NetVLAD_NonLocal_types():
  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):
    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [self.cluster_size],  # fixed: was the undefined name 'cluster_size'
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)
    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
    cluster_weights2 = tf.get_variable(
        "cluster_weights2",
        [1, self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])
    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.reshape(vlad, [-1, self.feature_size])

    vlad_softmax = self.embedgaussian_relation(vlad, 1 / float(64))

    nonlocal_g = tf.get_variable(
        "nonlocal_g",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_out = tf.get_variable(
        "nonlocal_out",
        [self.cluster_size, self.feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.cluster_size)))

    vlad_g = tf.matmul(vlad, nonlocal_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.cluster_size])
    vlad_g = tf.matmul(vlad_softmax, vlad_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
    vlad_g = tf.matmul(vlad_g, nonlocal_out)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.feature_size])

    vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
    vlad = vlad + vlad_g
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.nn.l2_normalize(vlad, 1)  # [b, f, c]
    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
    vlad = tf.nn.l2_normalize(vlad, 1)
    return vlad
  def embedgaussian_relation(self, input_, temp=1 / float(32)):
    nonlocal_theta = tf.get_variable(
        "nonlocal_theta",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_phi = tf.get_variable(
        "nonlocal_phi",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    vlad_theta = tf.matmul(input_, nonlocal_theta)
    vlad_phi = tf.matmul(input_, nonlocal_phi)
    vlad_theta = tf.reshape(vlad_theta,
                            [-1, self.cluster_size, self.cluster_size])
    vlad_phi = tf.reshape(vlad_phi,
                          [-1, self.cluster_size, self.cluster_size])
    vlad_softmax = tf.nn.softmax(
        temp * tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
    return vlad_softmax
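
A minimal graph-construction sketch for the pooling module above, assuming TF 1.x and the 1024-d video features used later in NetVLADModelLF (placeholder names are illustrative, not repo code):

```python
import tensorflow as tf

# Illustrative sizes matching the video branch in NetVLADModelLF below.
max_frames, feature_size, cluster_size = 300, 1024, 64
netvlad = NetVLAD_NonLocal_types(feature_size, max_frames, cluster_size,
                                 add_batch_norm=True, is_training=True)

# forward() expects frames flattened into the batch axis.
frames = tf.placeholder(tf.float32, [None, max_frames, feature_size])
reshaped = tf.reshape(frames, [-1, feature_size])
with tf.variable_scope("video_VLAD"):
  vlad = netvlad.forward(reshaped)  # [batch, cluster_size * feature_size]
```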
class NetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = False
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    video_NetVLAD = NetVLAD_NonLocal_types(1024, int(max_frames),
                                           int(cluster_size), add_batch_norm,
                                           is_training)
    audio_NetVLAD = NetVLAD_NonLocal_types(128, int(max_frames),
                                           int(cluster_size / 2),
                                           add_batch_norm, is_training)

    if add_batch_norm:  # and not lightvlad:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_VLAD"):
      vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

    with tf.variable_scope("audio_VLAD"):
      vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

    vlad = tf.concat([vlad_video, vlad_audio], 1)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [vlad_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases",
          [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
      gating_weights = tf.get_variable(
          "gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))
      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # Removes the diagonal coefficients.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(
            gates,
            center=True,
            scale=True,
            is_training=is_training,
            scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases",
            [cluster_size],
            # fixed: was tf.random_normal, which is not an initializer
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)
      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models, 'willow_MoeModel')
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
\ No newline at end of file
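
For reference, a hedged sketch of building the full model graph directly, assuming the YouTube-8M starter's 1024-d video + 128-d audio frame features (the 3862-class vocabulary and placeholder names are assumptions, not part of this commit):

```python
import tensorflow as tf

# Illustrative shapes; 3862 is assumed to be the YouTube-8M label vocabulary.
batch, max_frames, feature_size = 8, 300, 1024 + 128  # video + audio
model_input = tf.placeholder(tf.float32, [batch, max_frames, feature_size])
num_frames = tf.placeholder(tf.int32, [batch])

outputs = NetVLADModelLF().create_model(
    model_input, vocab_size=3862, num_frames=num_frames, is_training=True)
predictions = outputs["predictions"]  # [batch, 3862] class probabilities
```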
@@ -25,6 +25,21 @@ FLAGS = flags.FLAGS
flags.DEFINE_integer(
    "moe_num_mixtures", 2,
    "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
flags.DEFINE_float(
    "moe_l2", 1e-8,
    "L2 penalty for MoeModel.")
flags.DEFINE_integer(
    "moe_low_rank_gating", -1,
    "Low rank gating for MoeModel.")
flags.DEFINE_bool(
    "moe_prob_gating", True,
    "Prob gating for MoeModel.")
flags.DEFINE_string(
    "moe_prob_gating_input", "prob",
    "Input source for MoeModel probability gating.")
class LogisticModel(models.BaseModel):
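
The flags added above can be overridden for quick experiments before the model graph is built; a sketch assuming TF 1.x flag parsing (not repo code):

```python
import tensorflow as tf

FLAGS = tf.flags.FLAGS
FLAGS.moe_l2 = 1e-6            # stronger L2 than the 1e-8 default
FLAGS.moe_prob_gating = False  # disable the probability-gating branch
```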
@@ -111,3 +126,109 @@ class MoeModel(models.BaseModel):
    final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                     [-1, vocab_size])
    return {"predictions": final_probabilities}
class willow_MoeModel(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model.

    It also includes the possibility of gating the probabilities.

    The model consists of a per-class softmax distribution over a
    configurable number of logistic classifiers. One of the classifiers in
    the mixture is not trained, and always predicts 0.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    num_mixtures = 8
    low_rank_gating = FLAGS.moe_low_rank_gating
    l2_penalty = FLAGS.moe_l2
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input

    input_size = model_input.get_shape().as_list()[1]
    remove_diag = False

    if low_rank_gating == -1:
      gate_activations = slim.fully_connected(
          model_input,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates")
    else:
      gate_activations1 = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_activations = slim.fully_connected(
          gate_activations1,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates2")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])

    if gating_probabilities:
      if gating_input == 'prob':
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [vocab_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(probabilities, gating_weights)
      else:
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [input_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(model_input, gating_weights)

      if remove_diag:
        # Removes the diagonal coefficients.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, probabilities)

      gates = slim.batch_norm(
          gates,
          center=True,
          scale=True,
          is_training=is_training,
          scope="gating_prob_bn")
      gates = tf.sigmoid(gates)
      probabilities = tf.multiply(probabilities, gates)

    return {"predictions": probabilities}
\ No newline at end of file
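
The mixture arithmetic in willow_MoeModel reduces to a per-class softmax gate over num_mixtures real experts plus one dummy expert that always predicts absence. A NumPy sketch of that reduction (shapes are illustrative):

```python
import numpy as np

batch, vocab_size, num_mixtures = 2, 5, 8
rng = np.random.default_rng(0)
gate_logits = rng.normal(size=(batch * vocab_size, num_mixtures + 1))
expert_logits = rng.normal(size=(batch * vocab_size, num_mixtures))

gates = np.exp(gate_logits)
gates /= gates.sum(axis=1, keepdims=True)       # softmax incl. the dummy expert
experts = 1.0 / (1.0 + np.exp(-expert_logits))  # per-expert sigmoid

# The dummy expert always predicts 0, so only the first num_mixtures gate
# columns contribute -- mirroring the reduce_sum in create_model above.
probs = (gates[:, :num_mixtures] * experts).sum(axis=1)
probs = probs.reshape(batch, vocab_size)
print(probs.shape)  # (2, 5)
```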