윤영빈

final report almost done

......@@ -65,128 +65,21 @@ flags.DEFINE_integer("conv_hidden2", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("conv_hidden3", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("stride", 10, "Number of stride for short rnn.")
class DbofModel(models.BaseModel):
"""Creates a Deep Bag of Frames model.
The model projects the features for each frame into a higher dimensional
'clustering' space, pools across frames in that space, and then
uses a configurable video-level model to classify the now aggregated features.
The model will randomly sample either frames or sequences of frames during
training to speed up convergence.
"""
ACT_FN_MAP = {
"sigmoid": tf.nn.sigmoid,
"relu6": tf.nn.relu6,
}
def create_model(self,
model_input,
vocab_size,
num_frames,
iterations=None,
add_batch_norm=None,
sample_random_frames=None,
cluster_size=None,
hidden_size=None,
is_training=True,
**unused_params):
"""See base class.
Args:
model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
input features.
vocab_size: The number of classes in the dataset.
num_frames: A vector of length 'batch' which indicates the number of
frames for each video (before padding).
iterations: the number of frames to be sampled.
add_batch_norm: whether to add batch norm during training.
sample_random_frames: whether to sample random frames or random sequences.
cluster_size: the output neuron number of the cluster layer.
hidden_size: the output neuron number of the hidden layer.
is_training: whether to build the graph in training mode.
Returns:
A dictionary with a tensor containing the probability predictions of the
model in the 'predictions' key. The dimensions of the tensor are
'batch_size' x 'num_classes'.
"""
iterations = iterations or FLAGS.iterations
add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
random_frames = sample_random_frames or FLAGS.sample_random_frames
cluster_size = cluster_size or FLAGS.dbof_cluster_size
hidden1_size = hidden_size or FLAGS.dbof_hidden_size
act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation)
assert act_fn is not None, ("dbof_activation is not valid: %s." %
FLAGS.dbof_activation)
class FrameLevelLogisticModel(models.BaseModel):
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
if random_frames:
model_input = utils.SampleRandomFrames(model_input, num_frames,
iterations)
else:
model_input = utils.SampleRandomSequence(model_input, num_frames,
iterations)
max_frames = model_input.get_shape().as_list()[1]
feature_size = model_input.get_shape().as_list()[2]
reshaped_input = tf.reshape(model_input, [-1, feature_size])
tf.compat.v1.summary.histogram("input_hist", reshaped_input)
if add_batch_norm:
reshaped_input = slim.batch_norm(reshaped_input,
center=True,
scale=True,
is_training=is_training,
scope="input_bn")
cluster_weights = tf.compat.v1.get_variable(
"cluster_weights", [feature_size, cluster_size],
initializer=tf.random_normal_initializer(stddev=1 /
math.sqrt(feature_size)))
tf.compat.v1.summary.histogram("cluster_weights", cluster_weights)
activation = tf.matmul(reshaped_input, cluster_weights)
if add_batch_norm:
activation = slim.batch_norm(activation,
center=True,
scale=True,
is_training=is_training,
scope="cluster_bn")
else:
cluster_biases = tf.compat.v1.get_variable(
"cluster_biases", [cluster_size],
initializer=tf.random_normal_initializer(stddev=1 /
math.sqrt(feature_size)))
tf.compat.v1.summary.histogram("cluster_biases", cluster_biases)
activation += cluster_biases
activation = act_fn(activation)
tf.compat.v1.summary.histogram("cluster_output", activation)
activation = tf.reshape(activation, [-1, max_frames, cluster_size])
activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
hidden1_weights = tf.compat.v1.get_variable(
"hidden1_weights", [cluster_size, hidden1_size],
initializer=tf.random_normal_initializer(stddev=1 /
math.sqrt(cluster_size)))
tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights)
activation = tf.matmul(activation, hidden1_weights)
if add_batch_norm:
activation = slim.batch_norm(activation,
center=True,
scale=True,
is_training=is_training,
scope="hidden1_bn")
else:
hidden1_biases = tf.compat.v1.get_variable(
"hidden1_biases", [hidden1_size],
initializer=tf.random_normal_initializer(stddev=0.01))
tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases)
activation += hidden1_biases
activation = act_fn(activation)
tf.compat.v1.summary.histogram("hidden1_output", activation)
denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
[-1, feature_size])
avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
aggregated_model = getattr(video_level_models,
FLAGS.video_level_classifier_model)
return aggregated_model().create_model(model_input=activation,
vocab_size=vocab_size,
**unused_params)
output = slim.fully_connected(avg_pooled,
vocab_size,
activation_fn=tf.nn.sigmoid,
weights_regularizer=slim.l2_regularizer(1e-8))
return {"predictions": output}
class NetVLAD_NonLocal_types():
def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training):
......@@ -286,20 +179,6 @@ class NetVLAD_NonLocal_types():
return vlad_softmax
class NetVLADModelLF(models.BaseModel):
"""Creates a NetVLAD based model.
Args:
model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
input features.
vocab_size: The number of classes in the dataset.
num_frames: A vector of length 'batch' which indicates the number of
frames for each video (before padding).
Returns:
A dictionary with a tensor containing the probability predictions of the
model in the 'predictions' key. The dimensions of the tensor are
'batch_size' x 'num_classes'.
"""
def create_model(self,
model_input,
vocab_size,
......@@ -420,1558 +299,30 @@ class NetVLADModelLF(models.BaseModel):
is_training=is_training,
**unused_params)
class GruModel(models.BaseModel):
def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
"""Creates a model which uses a stack of GRUs to represent the video.
Args:
model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
input features.
vocab_size: The number of classes in the dataset.
num_frames: A vector of length 'batch' which indicates the number of
frames for each video (before padding).
Returns:
A dictionary with a tensor containing the probability predictions of the
model in the 'predictions' key. The dimensions of the tensor are
'batch_size' x 'num_classes'.
"""
gru_size = 600
number_of_layers = 4
backward = False
random_frames = False
iterations = 30
if random_frames:
num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
model_input = utils.SampleRandomFrames(model_input, num_frames_2,
iterations)
if backward:
model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1)
class LstmModel(models.BaseModel):
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
lstm_size = FLAGS.lstm_cells
number_of_layers = FLAGS.lstm_layers
stacked_GRU = tf.contrib.rnn.MultiRNNCell(
stacked_lstm = tf.contrib.rnn.MultiRNNCell(
[
tf.contrib.rnn.GRUCell(gru_size)
tf.contrib.rnn.BasicLSTMCell(
lstm_size, forget_bias=1.0)
for _ in range(number_of_layers)
], state_is_tuple=False)
])
loss = 0.0
with tf.variable_scope("RNN"):
outputs, state = tf.nn.dynamic_rnn(stacked_GRU, model_input,
sequence_length=num_frames,
dtype=tf.float32)
outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
sequence_length=num_frames,
dtype=tf.float32)
aggregated_model = getattr(video_level_models,
'MoeModel')
FLAGS.video_level_classifier_model)
return aggregated_model().create_model(
model_input=state,
model_input=state[-1].h,
vocab_size=vocab_size,
is_training=is_training,
**unused_params)
class SoftDBoF():
    """Soft-assignment Deep-Bag-of-Frames pooling over per-frame features.

    Every frame is projected onto `cluster_size` clusters, the projection is
    turned into a per-frame softmax, and the per-frame distributions are
    pooled over the frame axis (sum, optionally concatenated with max).
    """

    def __init__(self, feature_size, max_frames, cluster_size, max_pool,
                 add_batch_norm, is_training):
        """Stores the layer configuration.

        Args:
          feature_size: size of the last dimension of the tensor given to
            `forward`.
          max_frames: number of frames per video in the tensor given to
            `forward`.
          cluster_size: number of clusters. Coerced to `int` because callers
            in this file compute it with true division (`cluster_size/8`),
            which yields a float under Python 3.
          max_pool: whether to append max-pooled assignments to the
            sum-pooled ones.
          add_batch_norm: whether the cluster logits are batch-normalised
            (True) or get a learned bias added (False).
          is_training: forwarded to `slim.batch_norm`.
        """
        self.feature_size = feature_size
        self.max_frames = max_frames
        self.is_training = is_training
        self.add_batch_norm = add_batch_norm
        self.cluster_size = int(cluster_size)
        self.max_pool = max_pool

    def forward(self, reshaped_input):
        """Pools frame-level features into one descriptor per video.

        Args:
          reshaped_input: 2-D tensor whose last dimension is `feature_size`.
            It is reshaped below as `max_frames` consecutive rows per video.

        Returns:
          An L2-normalised tensor of shape [batch, cluster_size], or
          [batch, 2 * cluster_size] when `max_pool` is set.
        """
        feature_size = int(self.feature_size)
        cluster_size = self.cluster_size
        max_frames = int(self.max_frames)
        init_stddev = 1 / math.sqrt(feature_size)

        cluster_weights = tf.get_variable(
            "cluster_weights",
            [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=init_stddev))
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)

        if self.add_batch_norm:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=self.is_training,
                scope="cluster_bn")
        else:
            # An initializer object is required here; `tf.random_normal` is a
            # sampling op that needs a shape and cannot be used as one.
            cluster_biases = tf.get_variable(
                "cluster_biases",
                [cluster_size],
                initializer=tf.random_normal_initializer(stddev=init_stddev))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases

        # Per-frame soft assignment over clusters.
        activation = tf.nn.softmax(activation)
        activation = tf.reshape(activation, [-1, max_frames, cluster_size])

        # Pool over the frame axis.
        activation_sum = tf.reduce_sum(activation, 1)
        activation_sum = tf.nn.l2_normalize(activation_sum, 1)

        if not self.max_pool:
            return activation_sum

        activation_max = tf.reduce_max(activation, 1)
        activation_max = tf.nn.l2_normalize(activation_max, 1)
        return tf.concat([activation_sum, activation_max], 1)
class LightVLAD_nonlocal():
    """Light VLAD pooling (no cluster-centre subtraction) plus a non-local block.

    Frames are softly assigned to clusters, frame features are aggregated per
    cluster, and the per-cluster descriptors are refined by `nonLocal_block`
    before being normalised and flattened.
    """

    def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
                 is_training):
        """Stores the layer configuration.

        Args:
          feature_size: size of the last dimension of the tensor given to
            `forward`.
          max_frames: number of frames per video in the tensor given to
            `forward`.
          cluster_size: number of clusters. Coerced to `int` because a caller
            in this file passes `cluster_size/2`, a float under Python 3,
            and the value is used in `tf.reshape` shapes.
          add_batch_norm: whether the cluster logits are batch-normalised
            (True) or get a learned bias added (False).
          is_training: forwarded to `slim.batch_norm`.
        """
        self.feature_size = feature_size
        self.max_frames = max_frames
        self.is_training = is_training
        self.add_batch_norm = add_batch_norm
        self.cluster_size = int(cluster_size)

    def forward(self, reshaped_input):
        """Builds the pooled descriptor for each video.

        Args:
          reshaped_input: 2-D tensor whose last dimension is `feature_size`.
            It is reshaped below as `max_frames` consecutive rows per video.

        Returns:
          A tensor of shape [batch, cluster_size * feature_size],
          L2-normalised per cluster and then over the whole vector.
        """
        feature_size = int(self.feature_size)
        cluster_size = self.cluster_size
        init_stddev = 1 / math.sqrt(self.feature_size)

        cluster_weights = tf.get_variable(
            "cluster_weights",
            [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=init_stddev))
        activation = tf.matmul(reshaped_input, cluster_weights)

        if self.add_batch_norm:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=self.is_training,
                scope="cluster_bn")
        else:
            # The original referenced a bare `cluster_size`, which is not
            # defined in this scope and raised NameError.
            cluster_biases = tf.get_variable(
                "cluster_biases",
                [cluster_size],
                initializer=tf.random_normal_initializer(stddev=init_stddev))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases

        # Soft assignment of every frame to the clusters.
        activation = tf.nn.softmax(activation)
        activation = tf.reshape(activation,
                                [-1, self.max_frames, cluster_size])
        # [batch, cluster, frames] x [batch, frames, feature]
        #   -> [batch, cluster, feature]
        activation = tf.transpose(activation, perm=[0, 2, 1])
        reshaped_input = tf.reshape(
            reshaped_input, [-1, self.max_frames, self.feature_size])
        vlad = tf.matmul(activation, reshaped_input)

        # The non-local block works on one row per (video, cluster).
        vlad = tf.reshape(vlad, [-1, self.feature_size])
        vlad = nonLocal_block(vlad,
                              feature_size=self.feature_size,
                              hidden_size=self.feature_size // 2,
                              cluster_size=cluster_size)
        vlad = tf.reshape(vlad, [-1, cluster_size, self.feature_size])

        # Normalise each cluster descriptor along the feature axis, then
        # flatten and normalise the whole vector.
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.nn.l2_normalize(vlad, 1)
        vlad = tf.reshape(vlad, [-1, int(cluster_size * self.feature_size)])
        vlad = tf.nn.l2_normalize(vlad, 1)
        return vlad
class LightNetVLADModelLF(models.BaseModel):
    """Late-fusion light-NetVLAD model with non-local blocks and context gating.

    Two `LightVLAD_nonlocal` poolers are applied to two column ranges of the
    input, their outputs are concatenated, projected to a hidden layer,
    optionally gated, and passed to the configured video-level classifier.
    """

    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        """Builds the model graph.

        NOTE: `iterations`, `add_batch_norm`, `sample_random_frames`,
        `cluster_size` and `hidden_size` are accepted for interface
        compatibility but are overridden by the constants below.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix
            of input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).
          is_training: whether to build the graph in training mode.

        Returns:
          Whatever the classifier named by
          `FLAGS.video_level_classifier_model` returns from `create_model`.
        """
        iterations = 300
        add_batch_norm = True
        random_frames = True
        cluster_size = 64
        hidden1_size = 1024
        relu = False
        gating = True
        remove_diag = False

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])

        # Integer division: the cluster count is used as a tensor dimension.
        video_NetVLAD = LightVLAD_nonlocal(1024, max_frames, cluster_size,
                                           add_batch_norm, is_training)
        audio_NetVLAD = LightVLAD_nonlocal(128, max_frames, cluster_size // 2,
                                           add_batch_norm, is_training)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(
                reshaped_input,
                center=True,
                scale=True,
                is_training=is_training,
                scope="input_bn")

        # Assumes columns [0, 1024) are video features and the rest are audio
        # features, per the variable names - TODO confirm against the reader.
        with tf.variable_scope("video_VLAD"):
            vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_VLAD"):
            vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

        vlad = tf.concat([vlad_video, vlad_audio], 1)
        vlad_dim = vlad.get_shape().as_list()[1]

        hidden1_weights = tf.get_variable(
            "hidden1_weights",
            [vlad_dim, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size)))
        activation = tf.matmul(vlad, hidden1_weights)

        if add_batch_norm and relu:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=is_training,
                scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases",
                [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases

        if relu:
            activation = tf.nn.relu6(activation)

        if gating:
            gating_weights = tf.get_variable(
                "gating_weights_2",
                [hidden1_size, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(hidden1_size)))
            gates = tf.matmul(activation, gating_weights)

            if remove_diag:
                # Removes the diagonal coefficients.
                diagonals = tf.matrix_diag_part(gating_weights)
                gates = gates - tf.multiply(diagonals, activation)

            if add_batch_norm:
                gates = slim.batch_norm(
                    gates,
                    center=True,
                    scale=True,
                    is_training=is_training,
                    scope="gating_bn")
            else:
                # `gates` is [batch, hidden1_size], so the bias must have
                # hidden1_size entries, and it needs an initializer object
                # rather than the `tf.random_normal` sampling op.
                gating_biases = tf.get_variable(
                    "gating_biases",
                    [hidden1_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(feature_size)))
                gates += gating_biases

            gates = tf.sigmoid(gates)
            activation = tf.multiply(activation, gates)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(
            model_input=activation,
            vocab_size=vocab_size,
            is_training=is_training,
            **unused_params)
def nonLocal_block(vlad, feature_size, hidden_size, cluster_size):
    """Applies a residual non-local (self-attention) block across clusters.

    Args:
      vlad: 2-D tensor with `feature_size` columns. It is treated as
        `cluster_size` consecutive rows per video (see the reshapes below).
      feature_size: number of columns of `vlad`.
      hidden_size: width of the theta / phi / g projections.
      cluster_size: number of cluster rows per video.

    Returns:
      A tensor with the same shape as `vlad`: the input plus the attended,
      re-projected values.
    """
    in_stddev = 1 / math.sqrt(feature_size)

    # Variables are created in the same order and under the same names as
    # before, so existing checkpoints keep matching.
    nonlocal_theta = tf.get_variable(
        "nonlocal_theta",
        [feature_size, hidden_size],
        initializer=tf.random_normal_initializer(stddev=in_stddev))
    nonlocal_phi = tf.get_variable(
        "nonlocal_phi",
        [feature_size, hidden_size],
        initializer=tf.random_normal_initializer(stddev=in_stddev))
    nonlocal_g = tf.get_variable(
        "nonlocal_g",
        [feature_size, hidden_size],
        initializer=tf.random_normal_initializer(stddev=in_stddev))
    nonlocal_out = tf.get_variable(
        "nonlocal_out",
        [hidden_size, feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(hidden_size)))

    per_video = [-1, cluster_size, hidden_size]
    vlad_theta = tf.reshape(tf.matmul(vlad, nonlocal_theta), per_video)
    vlad_phi = tf.reshape(tf.matmul(vlad, nonlocal_phi), per_video)
    # Fix: the original reshaped `vlad_phi` here, which discarded the g
    # projection and left `nonlocal_g` without any effect on the output.
    vlad_g = tf.reshape(tf.matmul(vlad, nonlocal_g), per_video)

    # Cluster-to-cluster attention weights, softmax over the last axis.
    vlad_softmax = tf.nn.softmax(
        feature_size**-.5 *
        tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))

    vlad_g = tf.matmul(vlad_softmax, vlad_g)
    vlad_g = tf.reshape(vlad_g, [-1, hidden_size])
    vlad_g = tf.matmul(vlad_g, nonlocal_out)

    # Residual connection.
    return vlad + vlad_g
class SoftDbofModelLF(models.BaseModel):
    """Creates a late-fusion Soft Deep Bag of Frames model.

    Two `SoftDBoF` poolers are applied to two column ranges of the sampled
    frame features, their outputs are concatenated, optionally reduced by a
    fully connected layer, and passed to the configured video-level
    classifier.
    """

    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        """Builds the model graph.

        NOTE: `iterations`, `add_batch_norm`, `sample_random_frames`,
        `cluster_size` and `hidden_size` are accepted for interface
        compatibility but are overridden by the constants below.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix
            of input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).
          is_training: whether to build the graph in training mode.

        Returns:
          Whatever the classifier named by
          `FLAGS.video_level_classifier_model` returns from `create_model`.
        """
        iterations = 300
        add_batch_norm = True
        random_frames = True
        cluster_size = 4000
        hidden1_size = 1024
        fc_dimred = True
        relu = False
        max_pool = False

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.summary.histogram("input_hist", reshaped_input)

        # Integer division: the cluster count ends up as a variable dimension,
        # and `/` would hand SoftDBoF a float under Python 3.
        video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                              add_batch_norm, is_training)
        audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                              add_batch_norm, is_training)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(
                reshaped_input,
                center=True,
                scale=True,
                is_training=is_training,
                scope="input_bn")

        # Assumes columns [0, 1024) are video features and the rest are audio
        # features, per the variable names - TODO confirm against the reader.
        with tf.variable_scope("video_DBOF"):
            dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_DBOF"):
            dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])

        dbof = tf.concat([dbof_video, dbof_audio], 1)
        dbof_dim = dbof.get_shape().as_list()[1]

        if fc_dimred:
            hidden1_weights = tf.get_variable(
                "hidden1_weights",
                [dbof_dim, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(cluster_size)))
            tf.summary.histogram("hidden1_weights", hidden1_weights)
            activation = tf.matmul(dbof, hidden1_weights)

            if add_batch_norm and relu:
                activation = slim.batch_norm(
                    activation,
                    center=True,
                    scale=True,
                    is_training=is_training,
                    scope="hidden1_bn")
            else:
                hidden1_biases = tf.get_variable(
                    "hidden1_biases",
                    [hidden1_size],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                tf.summary.histogram("hidden1_biases", hidden1_biases)
                activation += hidden1_biases

            if relu:
                activation = tf.nn.relu6(activation)
            tf.summary.histogram("hidden1_output", activation)
        else:
            activation = dbof

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(
            model_input=activation,
            vocab_size=vocab_size,
            is_training=is_training,
            **unused_params)
class early_NetVLADModelLF(models.BaseModel):
    """Early-fusion NetVLAD model with a non-local block and context gating.

    A single `NetVLAD_NonLocal` pooler is applied to the full feature vector
    of every sampled frame; the pooled descriptor is projected to a hidden
    layer, optionally gated, and passed to the configured video-level
    classifier.
    """

    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        """Builds the model graph.

        NOTE: `iterations`, `add_batch_norm`, `sample_random_frames`,
        `cluster_size` and `hidden_size` are accepted for interface
        compatibility but are overridden by the constants below.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix
            of input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).
          is_training: whether to build the graph in training mode.

        Returns:
          Whatever the classifier named by
          `FLAGS.video_level_classifier_model` returns from `create_model`.
        """
        iterations = 300
        add_batch_norm = True
        random_frames = True
        cluster_size = 64
        hidden1_size = 1024
        relu = False
        gating = True
        remove_diag = False

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])

        # The pooler is sized for 1024 + 128 input columns, so the input is
        # presumably video and audio features concatenated - TODO confirm.
        video_audio_NetVLAD = NetVLAD_NonLocal(1024 + 128, max_frames,
                                               cluster_size, add_batch_norm,
                                               is_training)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(
                reshaped_input,
                center=True,
                scale=True,
                is_training=is_training,
                scope="input_bn")

        with tf.variable_scope("video_audio_VLAD"):
            vlad = video_audio_NetVLAD.forward(reshaped_input)

        vlad_dim = vlad.get_shape().as_list()[1]
        hidden1_weights = tf.get_variable(
            "hidden1_weights",
            [vlad_dim, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size)))
        activation = tf.matmul(vlad, hidden1_weights)

        if add_batch_norm and relu:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=is_training,
                scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases",
                [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases

        if relu:
            activation = tf.nn.relu6(activation)

        if gating:
            gating_weights = tf.get_variable(
                "gating_weights_2",
                [hidden1_size, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(hidden1_size)))
            gates = tf.matmul(activation, gating_weights)

            if remove_diag:
                # Removes the diagonal coefficients.
                diagonals = tf.matrix_diag_part(gating_weights)
                gates = gates - tf.multiply(diagonals, activation)

            if add_batch_norm:
                gates = slim.batch_norm(
                    gates,
                    center=True,
                    scale=True,
                    is_training=is_training,
                    scope="gating_bn")
            else:
                # `gates` is [batch, hidden1_size], so the bias must have
                # hidden1_size entries, and it needs an initializer object
                # rather than the `tf.random_normal` sampling op.
                gating_biases = tf.get_variable(
                    "gating_biases",
                    [hidden1_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(feature_size)))
                gates += gating_biases

            gates = tf.sigmoid(gates)
            activation = tf.multiply(activation, gates)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(
            model_input=activation,
            vocab_size=vocab_size,
            is_training=is_training,
            **unused_params)
class NetVLAD_NonLocal():
    """NetVLAD pooling followed by an inlined non-local block over clusters.

    Frames are softly assigned to clusters, residuals against learned cluster
    centres are aggregated per cluster, and the per-cluster descriptors are
    refined with a residual self-attention step whose hidden width equals
    `cluster_size`.
    """

    def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
                 is_training):
        """Stores the layer configuration.

        Args:
          feature_size: size of the last dimension of the tensor given to
            `forward`.
          max_frames: number of frames per video in the tensor given to
            `forward`.
          cluster_size: number of clusters; coerced to `int` because it is
            used in tensor shapes.
          add_batch_norm: whether the cluster logits are batch-normalised
            (True) or get a learned bias added (False).
          is_training: forwarded to `slim.batch_norm`.
        """
        self.feature_size = feature_size
        self.max_frames = max_frames
        self.is_training = is_training
        self.add_batch_norm = add_batch_norm
        self.cluster_size = int(cluster_size)

    def forward(self, reshaped_input):
        """Builds the pooled descriptor for each video.

        Args:
          reshaped_input: 2-D tensor whose last dimension is `feature_size`.
            It is reshaped below as `max_frames` consecutive rows per video.

        Returns:
          A tensor of shape [batch, cluster_size * feature_size],
          L2-normalised per cluster and then over the whole vector.
        """
        feature_size = int(self.feature_size)
        cluster_size = self.cluster_size
        feat_stddev = 1 / math.sqrt(self.feature_size)

        cluster_weights = tf.get_variable(
            "cluster_weights",
            [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=feat_stddev))
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)

        if self.add_batch_norm:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=self.is_training,
                scope="cluster_bn")
        else:
            # The original referenced a bare `cluster_size`, which is not
            # defined in this scope and raised NameError.
            cluster_biases = tf.get_variable(
                "cluster_biases",
                [cluster_size],
                initializer=tf.random_normal_initializer(stddev=feat_stddev))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases

        # Soft assignment of every frame to the clusters.
        activation = tf.nn.softmax(activation)
        tf.summary.histogram("cluster_output", activation)
        activation = tf.reshape(activation,
                                [-1, self.max_frames, cluster_size])

        # Total assignment mass per cluster times the learned centres:
        # [batch, 1, cluster] * [1, feature, cluster].
        a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
        cluster_weights2 = tf.get_variable(
            "cluster_weights2",
            [1, feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=feat_stddev))
        a = tf.multiply(a_sum, cluster_weights2)

        # [batch, cluster, frames] x [batch, frames, feature]
        #   -> [batch, cluster, feature]
        activation = tf.transpose(activation, perm=[0, 2, 1])
        reshaped_input = tf.reshape(
            reshaped_input, [-1, self.max_frames, self.feature_size])
        vlad = tf.matmul(activation, reshaped_input)

        # Subtract the centre term in [batch, feature, cluster] layout, then
        # go back to one row per (video, cluster).
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.subtract(vlad, a)
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.reshape(vlad, [-1, self.feature_size])

        # Non-local block; variables keep their original names and order.
        nonlocal_theta = tf.get_variable(
            "nonlocal_theta",
            [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=feat_stddev))
        nonlocal_phi = tf.get_variable(
            "nonlocal_phi",
            [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=feat_stddev))
        nonlocal_g = tf.get_variable(
            "nonlocal_g",
            [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=feat_stddev))
        nonlocal_out = tf.get_variable(
            "nonlocal_out",
            [cluster_size, feature_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.cluster_size)))

        per_video = [-1, cluster_size, cluster_size]
        vlad_theta = tf.reshape(tf.matmul(vlad, nonlocal_theta), per_video)
        vlad_phi = tf.reshape(tf.matmul(vlad, nonlocal_phi), per_video)
        # Fix: the original reshaped `vlad_phi` here, which discarded the g
        # projection and left `nonlocal_g` without any effect on the output.
        vlad_g = tf.reshape(tf.matmul(vlad, nonlocal_g), per_video)

        vlad_softmax = tf.nn.softmax(
            self.feature_size**-.5 *
            tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
        vlad_g = tf.matmul(vlad_softmax, vlad_g)
        vlad_g = tf.reshape(vlad_g, [-1, cluster_size])

        vlad_g = tf.matmul(vlad_g, nonlocal_out)
        vlad_g = tf.reshape(vlad_g, [-1, cluster_size, feature_size])
        vlad = tf.reshape(vlad, [-1, cluster_size, feature_size])
        # Residual connection.
        vlad = vlad + vlad_g

        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.nn.l2_normalize(vlad, 1)  # [b, f, c]
        vlad = tf.reshape(vlad, [-1, int(cluster_size * self.feature_size)])
        vlad = tf.nn.l2_normalize(vlad, 1)
        return vlad
class SoftDbofModelLF_8k(models.BaseModel):
    """Creates a late-fusion Soft Deep Bag of Frames model (2048 clusters).

    Two `SoftDBoF` poolers are applied to two column ranges of the sampled
    frame features, their outputs are concatenated, optionally reduced by a
    fully connected layer, and passed to the configured video-level
    classifier.
    """

    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        """Builds the model graph.

        NOTE: `iterations`, `add_batch_norm`, `sample_random_frames`,
        `cluster_size` and `hidden_size` are accepted for interface
        compatibility but are overridden by the constants below.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix
            of input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).
          is_training: whether to build the graph in training mode.

        Returns:
          Whatever the classifier named by
          `FLAGS.video_level_classifier_model` returns from `create_model`.
        """
        iterations = 300
        add_batch_norm = True
        random_frames = True
        cluster_size = 2048
        hidden1_size = 1024
        fc_dimred = True
        relu = False
        max_pool = False

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.summary.histogram("input_hist", reshaped_input)

        # Integer division: the cluster count ends up as a variable dimension,
        # and `/` would hand SoftDBoF a float under Python 3.
        video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                              add_batch_norm, is_training)
        audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                              add_batch_norm, is_training)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(
                reshaped_input,
                center=True,
                scale=True,
                is_training=is_training,
                scope="input_bn")

        # Assumes columns [0, 1024) are video features and the rest are audio
        # features, per the variable names - TODO confirm against the reader.
        with tf.variable_scope("video_DBOF"):
            dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_DBOF"):
            dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])

        dbof = tf.concat([dbof_video, dbof_audio], 1)
        dbof_dim = dbof.get_shape().as_list()[1]

        if fc_dimred:
            hidden1_weights = tf.get_variable(
                "hidden1_weights",
                [dbof_dim, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(cluster_size)))
            tf.summary.histogram("hidden1_weights", hidden1_weights)
            activation = tf.matmul(dbof, hidden1_weights)

            if add_batch_norm and relu:
                activation = slim.batch_norm(
                    activation,
                    center=True,
                    scale=True,
                    is_training=is_training,
                    scope="hidden1_bn")
            else:
                hidden1_biases = tf.get_variable(
                    "hidden1_biases",
                    [hidden1_size],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                tf.summary.histogram("hidden1_biases", hidden1_biases)
                activation += hidden1_biases

            if relu:
                activation = tf.nn.relu6(activation)
            tf.summary.histogram("hidden1_output", activation)
        else:
            activation = dbof

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(
            model_input=activation,
            vocab_size=vocab_size,
            is_training=is_training,
            **unused_params)
class FrameLevelLogisticModel(models.BaseModel):
    """Logistic classifier applied to frame features averaged over time."""

    def create_model(self, model_input, vocab_size, num_frames, **unused_params):
        """See base class.

        Serves as a minimal example of a frame-level model. When the goal is
        simply to train on averaged features, averaging them ahead of time is
        cheaper than doing it inside the graph as done here.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix
            of input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).

        Returns:
          A dictionary with a tensor containing the probability predictions
          of the model in the 'predictions' key. The dimensions of the tensor
          are 'batch_size' x 'num_classes'.
        """
        # Frame counts as a float column vector: [batch, 1].
        frame_counts = tf.expand_dims(num_frames, 1)
        frame_counts = tf.cast(frame_counts, tf.float32)

        feature_size = model_input.get_shape().as_list()[2]

        # Repeat each count across the feature axis so it divides the
        # per-feature sums element-wise.
        tiled_counts = tf.tile(frame_counts, [1, feature_size])
        denominators = tf.reshape(tiled_counts, [-1, feature_size])

        frame_sums = tf.reduce_sum(model_input, axis=[1])
        avg_pooled = frame_sums / denominators

        predictions = slim.fully_connected(
            avg_pooled,
            vocab_size,
            activation_fn=tf.nn.sigmoid,
            weights_regularizer=slim.l2_regularizer(1e-8))
        return {"predictions": predictions}
class CNN(models.BaseModel):
    """Multi-kernel temporal convolution model over frame-level features."""

    def create_model(self, model_input, vocab_size, num_frames, **unused_params):
        """Builds three parallel convolutions, pools them and classifies.

        The input is convolved with kernel sizes 3, 5 and 1, the three
        outputs are summed over all frames, divided by the true frame count
        and fed to a fully connected sigmoid layer.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix
            of input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).

        Returns:
          A dictionary with a tensor containing the probability predictions
          of the model in the 'predictions' key. The dimensions of the tensor
          are 'batch_size' x 'num_classes'.
        """
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        feature_size = model_input.get_shape().as_list()[2]

        denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                                  [-1, feature_size])

        convK3 = slim.convolution(model_input,
                                  num_outputs=feature_size,
                                  kernel_size=3,
                                  scope='conv1')
        convK5 = slim.convolution(model_input,
                                  num_outputs=feature_size,
                                  kernel_size=5,
                                  scope='conv2')
        # Fix: this branch used kernel_size=5, making it an exact duplicate
        # of convK5; its name indicates a kernel of size 1.
        convK1 = slim.convolution(model_input,
                                  num_outputs=feature_size,
                                  kernel_size=1,
                                  scope='conv3')

        # Stack the branches along the frame axis and sum over it, which adds
        # the three branches' per-frame outputs together before dividing by
        # the frame count.
        stacked = tf.concat([convK3, convK5, convK1], axis=1)
        avg_pooled = tf.reduce_sum(stacked, axis=[1]) / denominators

        # Fix: the output is returned as 'predictions' (probabilities), so it
        # must be bounded to [0, 1]; ReLU is unbounded above. Sigmoid matches
        # FrameLevelLogisticModel in this file.
        output = slim.fully_connected(
            avg_pooled,
            vocab_size,
            activation_fn=tf.nn.sigmoid,
            weights_regularizer=slim.l2_regularizer(1e-8))
        return {"predictions": output}
class LstmModel(models.BaseModel):
    """Stacked-LSTM frame-level model feeding a video-level classifier."""

    def create_model(self, model_input, vocab_size, num_frames, **unused_params):
        """Creates a model which uses a stack of LSTMs to represent the video.

        The stack depth and cell size come from `FLAGS.lstm_layers` and
        `FLAGS.lstm_cells`. The hidden state of the last layer after the
        final valid frame is passed to the classifier named by
        `FLAGS.video_level_classifier_model`.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix
            of input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).

        Returns:
          Whatever the configured video-level classifier returns from
          `create_model`.
        """
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers

        stacked_lstm = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
            for _ in range(number_of_layers)
        ])

        # Only the final state is used; per-step outputs are discarded.
        _, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
                                     sequence_length=num_frames,
                                     dtype=tf.float32)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        # state[-1] is the last layer's state tuple; `.h` is its hidden state.
        return aggregated_model().create_model(
            model_input=state[-1].h,
            vocab_size=vocab_size,
            **unused_params)
class BNGRUModel(models.BaseModel):
  """Stacked GRU whose final state is batch-normalized before classification."""

  def create_model(self, model_input, vocab_size, num_frames,
                   is_training=True, **unused_params):
    """Creates a stacked-GRU model with batch norm on the final state.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Whether the graph is built for training. Controls whether
                   batch norm uses batch statistics or moving averages.

    Returns:
      Whatever the video-level model selected by
      FLAGS.video_level_classifier_model returns.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_rnn = tf.contrib.rnn.MultiRNNCell(
        [
            tf.contrib.rnn.GRUCell(lstm_size)
            for _ in range(number_of_layers)
        ], state_is_tuple=False)

    # state_is_tuple=False: the per-layer states come back as one tensor.
    _, state = tf.nn.dynamic_rnn(stacked_rnn, model_input,
                                 sequence_length=num_frames,
                                 dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    # Previously hard-coded to is_training=True, which made evaluation and
    # inference normalize with batch statistics instead of moving averages.
    state = slim.batch_norm(
        state,
        center=True,
        scale=True,
        is_training=is_training,
        scope='proj')

    # is_training is now a named argument, so forward it explicitly; it used
    # to reach the video-level model through **unused_params.
    return aggregated_model().create_model(
        model_input=state,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
class GruModel2(models.BaseModel):
  """Frame-level model that summarizes a video with a stack of GRU cells."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of GRUs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).

    Returns:
      Whatever the video-level model selected by
      FLAGS.video_level_classifier_model returns; expected to be a dictionary
      with the probability predictions under the 'predictions' key.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_gru = tf.contrib.rnn.MultiRNNCell(
        [
            tf.contrib.rnn.GRUCell(lstm_size)
            for _ in range(number_of_layers)
        ], state_is_tuple=False)

    # state_is_tuple=False: the per-layer states come back as one tensor,
    # which is passed whole to the classifier.
    _, state = tf.nn.dynamic_rnn(stacked_gru, model_input,
                                 sequence_length=num_frames,
                                 dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(
        model_input=state,
        vocab_size=vocab_size,
        **unused_params)
class BiGRUModel(models.BaseModel):
  """Bidirectional stacked GRU with batch norm on the concatenated states."""

  def create_model(self, model_input, vocab_size, num_frames,
                   is_training=True, **unused_params):
    """Creates a model which uses a bidirectional GRU stack for the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Whether the graph is built for training. Controls whether
                   batch norm uses batch statistics or moving averages.

    Returns:
      Whatever the video-level model selected by
      FLAGS.video_level_classifier_model returns.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    with tf.variable_scope('fw'):
      rnn_fw = tf.contrib.rnn.MultiRNNCell(
          [
              tf.contrib.rnn.GRUCell(lstm_size)
              for _ in range(number_of_layers)
          ], state_is_tuple=False)
    with tf.variable_scope('bw'):
      rnn_bw = tf.contrib.rnn.MultiRNNCell(
          [
              tf.contrib.rnn.GRUCell(lstm_size)
              for _ in range(number_of_layers)
          ], state_is_tuple=False)

    _, state = tf.nn.bidirectional_dynamic_rnn(rnn_fw, rnn_bw, model_input,
                                               sequence_length=num_frames,
                                               dtype=tf.float32,
                                               swap_memory=True)
    # `state` is the (forward, backward) pair; join them feature-wise.
    state = tf.concat(state, axis=1)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    # Previously hard-coded to is_training=True, which made evaluation and
    # inference normalize with batch statistics instead of moving averages.
    state = slim.batch_norm(
        state,
        center=True,
        scale=True,
        is_training=is_training,
        scope='proj')

    # is_training is now a named argument, so forward it explicitly; it used
    # to reach the video-level model through **unused_params.
    return aggregated_model().create_model(
        model_input=state,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
"""
Copyright (c) 2017, University of Texas Southwestern Medical Center
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University of Texas at Austin nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Recurrent Weighted Average
Implementation modified from: https://github.com/jostmey/rwa
Paper:
@article{ostmeyer2017machine,
title={Machine Learning on Sequential Data Using a Recurrent Weighted Average},
author={Ostmeyer, Jared and Cowell, Lindsay},
journal={arXiv preprint arXiv:1703.01253},
year={2017}
}
"""
class RwaModel(models.BaseModel):
  """Frame-level model based on a Recurrent Weighted Average (RWA) cell."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Unrolls an RWA recurrence over all frames and classifies the result.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features. 'max_frames' and 'num_features' must be
                   statically known because the recurrence is unrolled in
                   Python.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).

    Returns:
      Whatever the video-level model selected by
      FLAGS.video_level_classifier_model returns for the final hidden state.
    """
    init_factor = 1.0
    num_cells = FLAGS.lstm_cells
    _, max_steps, num_features = model_input.get_shape().as_list()
    joint_size = num_features + num_cells

    # Trainable parameters. Creation order is kept as-is.
    s = weights_rwa.init_state(num_cells, "s", init_factor)
    W_g = weights_rwa.init_weight([joint_size, num_cells], "W_g")
    W_u = weights_rwa.init_weight([num_features, num_cells], "W_u")
    W_a = weights_rwa.init_weight([joint_size, num_cells], "W_a")
    b_g = weights_rwa.init_bias(num_cells, "b_g")
    b_u = weights_rwa.init_bias(num_cells, "b_u")
    # Created but never used below: the bias on `a` cancels between the
    # numerator and the denominator.
    b_a = weights_rwa.init_bias(num_cells, "b_a")

    # A [batch, num_cells] slice of the input, used only as a shape template
    # so the state tensors inherit the dynamic batch dimension.
    template = tf.reshape(
        model_input, [-1, max_steps * num_features])[:, :num_cells]

    numerator = tf.zeros_like(template)
    denominator = tf.zeros_like(template)
    hidden = tf.zeros_like(template)
    # Start the running maximum at the lowest usable value.
    running_max = tf.multiply(tf.ones_like(template), -1E38)

    hidden = hidden + tf.nn.tanh(tf.expand_dims(s, 0))

    for step in range(max_steps):
      frame = model_input[:, step, :]
      # Features and previous hidden state side by side.
      frame_and_hidden = tf.concat(axis=1, values=[frame, hidden])

      u = tf.matmul(frame, W_u) + b_u
      g = tf.matmul(frame_and_hidden, W_g) + b_g
      a = tf.matmul(frame_and_hidden, W_a)
      z = tf.multiply(u, tf.nn.tanh(g))

      # Rescale by the running maximum so the exponentials stay finite.
      new_max = tf.maximum(running_max, a)
      decay = tf.exp(running_max - new_max)
      weight = tf.exp(a - new_max)

      numerator = tf.multiply(numerator, decay) + tf.multiply(z, weight)
      denominator = tf.multiply(denominator, decay) + weight
      candidate = tf.nn.tanh(tf.div(numerator, denominator))
      running_max = new_max

      # Keep the old hidden state once a video's frame count is exceeded.
      hidden = tf.where(tf.greater(num_frames, step), candidate, hidden)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(
        model_input=hidden,
        vocab_size=vocab_size,
        **unused_params)
class DropoutGruModel(models.BaseModel):
  """Stacked GRU with dropout whose per-frame outputs feed a logistic model."""

  def create_model(self, model_input, vocab_size, num_frames,
                   is_training=True, **unused_params):
    """Creates a model which uses a stack of dropout-wrapped GRUs.

    The per-frame GRU outputs (not the final state) are passed to
    FrameLevelLogisticModel.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Python bool; dropout is applied only when True.

    Returns:
      Whatever FrameLevelLogisticModel.create_model returns for the GRU
      outputs.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    # Dropout used to be applied unconditionally, so evaluation and inference
    # were stochastic. A keep probability of 1.0 disables it outside training.
    keep_prob = 0.9 if is_training else 1.0

    stacked_gru = tf.contrib.rnn.MultiRNNCell(
        [
            tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.GRUCell(lstm_size),
                input_keep_prob=keep_prob,
                output_keep_prob=keep_prob)
            for _ in range(number_of_layers)
        ], state_is_tuple=False)

    outputs, _ = tf.nn.dynamic_rnn(stacked_gru, model_input,
                                   sequence_length=num_frames,
                                   dtype=tf.float32)

    # The classifier was always FrameLevelLogisticModel: the original looked
    # up FLAGS.video_level_classifier_model and then overwrote the result.
    return FrameLevelLogisticModel().create_model(
        model_input=outputs,
        vocab_size=vocab_size,
        num_frames=num_frames,
        is_training=is_training,
        **unused_params)
class ResRnnModel(models.BaseModel):
  """Stacked LSTM with residual connections between layers."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Builds a three-layer residual LSTM and classifies its last state.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).

    Returns:
      Whatever the video-level model selected by
      FLAGS.video_level_classifier_model returns.
    """
    # Project-local wrapper that supports residual connections.
    from rnn_wrappers_modern import MultiRNNCell as ResidualMultiRNNCell

    # Fixed sizes; presumably 1152 matches the input feature width so the
    # residual additions line up -- TODO confirm.
    hidden_units = 1152
    depth = 3

    layer_cells = []
    for layer in range(depth):
      with tf.variable_scope('cell_' + str(layer)):
        layer_cells.append(
            tf.contrib.rnn.BasicLSTMCell(hidden_units, forget_bias=1.0))

    residual_stack = ResidualMultiRNNCell(layer_cells,
                                          use_residual_connections=True,
                                          state_is_tuple=True)
    _, final_state = tf.nn.dynamic_rnn(residual_stack, model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    classifier_cls = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
    # Hidden state of the last layer.
    top_hidden = final_state[-1].h
    return classifier_cls().create_model(
        model_input=top_hidden,
        vocab_size=vocab_size,
        **unused_params)
class LateVladModel(models.BaseModel):
  """VLAD-style pooling applied separately to video and audio features."""

  def create_model(self, model_input, vocab_size, num_frames,
                   is_training=True, **unused_params):
    """Builds two VLAD-style branches and fuses them before classification.

    A random 128-frame sequence is sampled, the feature axis is split at
    index 1024 into a video part and an audio part, each part is encoded by
    `_vlad_branch`, and the two encodings are concatenated, batch-normalized
    and passed through ReLU6.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Whether the graph is built for training. Controls whether
                   batch norm uses batch statistics or moving averages.

    Returns:
      Whatever the video-level model selected by
      FLAGS.video_level_classifier_model returns.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    model_input = utils.SampleRandomSequence(model_input, num_frames, 128)

    input_v = model_input[:, :, :1024]
    input_a = model_input[:, :, 1024:]
    num_clusters = 8

    activation_v = self._vlad_branch(input_v, num_clusters, 'video')
    activation_a = self._vlad_branch(input_a, num_clusters, 'audio')

    activation = tf.concat([activation_v, activation_a], axis=1)
    # Previously hard-coded to is_training=True, which made evaluation and
    # inference normalize with batch statistics instead of moving averages.
    activation = slim.batch_norm(
        activation,
        center=True,
        scale=True,
        is_training=is_training,
        scope='proj')
    activation = tf.nn.relu6(activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    # is_training is now a named argument, so forward it explicitly; it used
    # to reach the video-level model through **unused_params.
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)

  def _vlad_branch(self, x, num_clusters, scope):
    """Encodes one modality with soft-assigned residual pooling.

    Args:
      x: A 'batch_size' x 'frames' x 'features' tensor with static 'frames'
         and 'features' dimensions.
      num_clusters: Number of soft-assignment clusters.
      scope: Variable scope under which the branch's variables are created.

    Returns:
      A 'batch_size' x 1024 tensor.
    """
    with tf.variable_scope(scope):
      _, n, d = x.get_shape().as_list()
      c_bound = math.sqrt(1. / (num_clusters * d))
      # NOTE(review): the centers are shaped [clusters, frames] and broadcast
      # across the feature axis below; confirm this is intended rather than
      # [clusters, features].
      c = tf.get_variable(name='c',
                          shape=[num_clusters, n],
                          dtype=tf.float32,
                          initializer=tf.random_uniform_initializer(
                              -c_bound, c_bound))

      # A kernel-size-1 convolution yields per-frame cluster logits.
      a = slim.convolution(x,
                           num_outputs=num_clusters,
                           kernel_size=1,
                           data_format='NWC',
                           scope='conv')
      a = tf.nn.softmax(a)

      v = []
      for k in range(num_clusters):
        t = x - c[k][None, :, None]
        t = tf.multiply(t, a[:, :, k][:, :, None])
        t = tf.reduce_sum(t, 1)
        t = tf.nn.l2_normalize(t, dim=1)
        v.append(t)
      v = tf.stack(v, axis=1)
      v = tf.reshape(v, [-1, num_clusters * d])

      proj_weights = tf.get_variable(
          "proj_weights",
          [num_clusters * d, 1024],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(num_clusters * d)))
      return tf.matmul(v, proj_weights)
class LNBLstmModel(models.BaseModel):
  """Stacked layer-normalized LSTM with recurrent dropout."""

  def create_model(self, model_input, vocab_size, num_frames,
                   is_training=True, **unused_params):
    """Creates a model which uses a stack of layer-norm LSTMs.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Python bool; recurrent dropout is applied only when True.

    Returns:
      Whatever the video-level model selected by
      FLAGS.video_level_classifier_model returns.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    # Dropout used to run with keep probability 0.5 unconditionally, so
    # evaluation and inference were stochastic. 1.0 disables it.
    keep_prob = 0.50 if is_training else 1.0

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [
            tf.contrib.rnn.LayerNormBasicLSTMCell(
                lstm_size, dropout_keep_prob=keep_prob)
            for _ in range(number_of_layers)
        ])

    _, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
                                 sequence_length=num_frames,
                                 dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    # is_training is now a named argument, so forward it explicitly; it used
    # to reach the video-level model through **unused_params.
    return aggregated_model().create_model(
        model_input=state[-1].h,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
class audio_avgShort_twowayGRUModel(models.BaseModel):
  """Two GRUs over pooled visual features plus mean audio features."""

  def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
    """Creates a two-GRU model combined with mean audio features.

                           ---->first half GRU----->
                           -                        -
       visual_feature ----                           concat---------------->
                           -                        -                      -
                           ---->second half GRU---->                        concat -----> video level classifier
                                                                           -
                                                  mean audio features--->

    NOTE(review): the second GRU reads `pooled_input[:, mid::-1, :]`, i.e.
    indices mid down to 0. That is the first half reversed rather than the
    second half the diagram describes -- confirm the intent.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features. The first 1024 features are treated as
                   visual, the rest as audio.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Passed to the two batch-norm layers.

    Returns:
      Whatever video_level_models.linear_res_mix_act_MoeModel returns.
    """
    lstm_size = FLAGS.lstm_cells
    stride = FLAGS.stride
    max_frames = model_input.get_shape().as_list()[1]

    video_input = model_input[:, :, :1024]
    audio_input = model_input[:, :, 1024:]

    # Mean of the 128 audio features over the reported frame count, with the
    # divisor clamped to at least 1.
    frame_count = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    audio_den = tf.reshape(tf.tile(frame_count, [1, 128]), [-1, 128])
    mean_audio = tf.reduce_sum(audio_input, 1) / tf.maximum(audio_den, 1)

    pooled_input, num_frames = self.avg_pooled_func(video_input, num_frames, stride)

    pooled_input = slim.batch_norm(
        pooled_input,
        center=True,
        scale=True,
        is_training=is_training,
        scope="hidden1_bn")
    mean_audio = slim.batch_norm(
        mean_audio,
        center=True,
        scale=True,
        is_training=is_training,
        scope="hidden1_bn_audio")

    fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
    bw_gru = tf.contrib.rnn.GRUCell(lstm_size)

    mid = max_frames // (2 * stride)
    fw_length = num_frames // 2
    bw_length = num_frames - num_frames // 2

    _, fw_state = tf.nn.dynamic_rnn(
        fw_gru, pooled_input[:, :mid, :],
        sequence_length=fw_length, dtype=tf.float32, scope='fw')
    _, bw_state = tf.nn.dynamic_rnn(
        bw_gru, pooled_input[:, mid::-1, :],
        sequence_length=bw_length, dtype=tf.float32, scope='bw')

    gru_state = tf.concat([fw_state, bw_state], 1)
    fused = tf.concat([gru_state, mean_audio], 1)

    classifier_cls = getattr(video_level_models,
                             'linear_res_mix_act_MoeModel')
    return classifier_cls().create_model(
        model_input=fused,
        vocab_size=vocab_size,
        **unused_params)

  def avg_pooled_func(self, model_input, num_frames_in, stride):
    """Pools the frame axis down by a factor of `stride`.

    The input is reshaped to [-1, stride, max_frames // stride, features],
    summed over the `stride` axis, and divided by `num_frames_in // stride`
    (clamped to at least 1).

    NOTE(review): with this reshape each output slot sums frames spaced
    `max_frames // stride` apart, not `stride` consecutive frames, and the
    divisor is the pooled frame count rather than `stride` -- confirm.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' tensor with
                   static 'max_frames' and 'num_features'.
      num_frames_in: Integer vector of per-video frame counts.
      stride: Pooling factor.

    Returns:
      A tuple (pooled, num_frames) where pooled is
      'batch_size' x 'max_frames // stride' x 'num_features' and num_frames
      is `num_frames_in // stride`.
    """
    _, max_frames, feature_size = model_input.get_shape().as_list()
    num_frames = num_frames_in // stride
    step = max_frames // stride

    grouped = tf.reshape(model_input, [-1, stride, step, feature_size])
    summed = tf.reduce_sum(grouped, 1)

    counts = tf.cast(
        tf.expand_dims(tf.expand_dims(num_frames, 1), 2), tf.float32)
    denominators = tf.reshape(
        tf.tile(counts, [1, step, feature_size]), [-1, step, feature_size])

    pooled = summed / tf.maximum(denominators, 1)
    return pooled, num_frames
class resav_ConvModel(models.BaseModel):
  """Residual 1-D convolutional model combined with mean frame features."""

  def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
    """Creates a model which uses a Convolutional model to represent the video.

    The pooled frame sequence goes through three residual stages (see
    `_residual_stage`); the flattened result is concatenated with the mean of
    the raw input over the frame axis and classified.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding). Not used here.
      is_training: Passed to every batch-norm layer.

    Returns:
      Whatever video_level_models.linear_res_mix_act_MoeModel returns.
    """
    stride = FLAGS.stride
    conv_length = FLAGS.conv_length
    stage_widths = (FLAGS.conv_hidden1, FLAGS.conv_hidden2, FLAGS.conv_hidden3)

    mean_feature = tf.reduce_mean(model_input, 1)
    pooled_input = self.avg_pooled_func(model_input, stride)

    # To shape : 'batch_size' x 'pooled_frames' x 1 x 'num_features'
    input_expand = tf.expand_dims(pooled_input, -1)
    input_expand = tf.transpose(input_expand, [0, 1, 3, 2])

    res_out = input_expand
    for index, width in enumerate(stage_widths, start=1):
      res_out = self._residual_stage(
          res_out, width, conv_length, is_training, index)

    # Flatten everything but the batch axis using the static shape.
    # (A leftover debug print of this shape was removed.)
    _, a, b, c = res_out.get_shape().as_list()
    res_out = tf.reshape(res_out, [-1, a * b * c])

    state = tf.concat([res_out, mean_feature], 1)
    aggregated_model = getattr(video_level_models,
                               'linear_res_mix_act_MoeModel')
    return aggregated_model().create_model(
        model_input=state,
        vocab_size=vocab_size,
        **unused_params)

  def _residual_stage(self, inputs, num_filters, conv_length, is_training, index):
    """Applies conv-BN-ReLU-conv-BN plus a conv shortcut, then max-pools.

    Scope names follow the 'conv_<index>_<n>' / 'bn_<index>_<n>' /
    'xconv_<index>_1' / 'max_pool<index>' pattern so variable names stay the
    same as in the unrolled version.
    """
    conv_out = slim.conv2d(inputs, num_filters, [conv_length, 1],
                           activation_fn=None, padding='SAME',
                           scope='conv_%d_1' % index)
    conv_out = tf.nn.relu(
        slim.batch_norm(conv_out, center=True, scale=True,
                        is_training=is_training, scope='bn_%d_1' % index))
    conv_out = slim.conv2d(conv_out, num_filters, [conv_length, 1],
                           activation_fn=None, padding='SAME',
                           scope='conv_%d_2' % index)
    conv_out = slim.batch_norm(conv_out, center=True, scale=True,
                               is_training=is_training,
                               scope='bn_%d_2' % index)

    # Convolutional shortcut so the channel count matches conv_out.
    shortcut = slim.conv2d(inputs, num_filters, [conv_length, 1],
                           activation_fn=None, padding='SAME',
                           scope='xconv_%d_1' % index)
    merged = shortcut + conv_out
    return slim.max_pool2d(merged, [2, 1], [2, 1],
                           scope='max_pool%d' % index)

  def avg_pooled_func(self, model_input, stride):
    """Reduces the frame axis by a factor of `stride` with a mean.

    The input is reshaped to [-1, stride, max_frames // stride, features] and
    averaged over the `stride` axis.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' tensor with
                   static 'max_frames' and 'num_features'.
      stride: Pooling factor.

    Returns:
      A 'batch_size' x 'max_frames // stride' x 'num_features' tensor.
    """
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    step = max_frames // stride
    first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])
    first_layer_input = tf.reduce_mean(first_layer_input, 1)
    return first_layer_input
class pur_twowayGRUModel(models.BaseModel):
  """Two GRUs over pooled frame features, without a separate audio path."""

  def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
    """Creates a two-GRU model without an explicit mean audio feature.

                           ---->first half GRU----->
                           -                        -
       video_feature ----                           concat---------------->video level classifier
                           -                        -
                           ---->second half GRU---->

    NOTE(review): the second GRU reads `pooled_input[:, mid::-1, :]`, i.e.
    indices mid down to 0. That is the first half reversed rather than the
    second half the diagram describes -- confirm the intent.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
                  frames for each video (before padding).
      is_training: Passed to the batch-norm layer.

    Returns:
      Whatever video_level_models.linear_res_mix_act_MoeModel returns.
    """
    lstm_size = FLAGS.lstm_cells
    stride = FLAGS.stride
    max_frames = model_input.get_shape().as_list()[1]

    pooled_input, num_frames = self.avg_pooled_func(model_input, num_frames, stride)

    pooled_input = slim.batch_norm(
        pooled_input,
        center=True,
        scale=True,
        is_training=is_training,
        scope="hidden1_bn")

    fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
    bw_gru = tf.contrib.rnn.GRUCell(lstm_size)

    # Only the final states are used; per-step outputs are discarded.
    _, fw_state = tf.nn.dynamic_rnn(fw_gru, pooled_input[:, :max_frames//(2*stride), :],
                                    sequence_length=num_frames//2, dtype=tf.float32, scope='fw')
    _, bw_state = tf.nn.dynamic_rnn(bw_gru, pooled_input[:, max_frames//(2*stride)::-1, :],
                                    sequence_length=num_frames - num_frames//2, dtype=tf.float32, scope='bw')

    state = tf.concat([fw_state, bw_state], 1)

    aggregated_model = getattr(video_level_models,
                               'linear_res_mix_act_MoeModel')
    return aggregated_model().create_model(
        model_input=state,
        vocab_size=vocab_size,
        **unused_params)

  def avg_pooled_func(self, model_input, num_frames_in, stride):
    """Pools the frame axis down by a factor of `stride`.

    The input is reshaped to [-1, stride, max_frames // stride, features],
    summed over the `stride` axis, and divided by `num_frames_in // stride`
    (clamped to at least 1).

    NOTE(review): with this reshape each output slot sums frames spaced
    `max_frames // stride` apart, not `stride` consecutive frames -- confirm.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' tensor with
                   static 'max_frames' and 'num_features'.
      num_frames_in: Integer vector of per-video frame counts.
      stride: Pooling factor.

    Returns:
      A tuple (pooled, num_frames) where pooled is
      'batch_size' x 'max_frames // stride' x 'num_features' and num_frames
      is `num_frames_in // stride`.
    """
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    num_frames = num_frames_in // stride
    step = max_frames//stride
    first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])
    first_layer_input = tf.reduce_sum(first_layer_input, 1)
    first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1), 2), tf.float32)
    denominators = tf.reshape(
        tf.tile(first_num_frames, [1, step, feature_size]), [-1, step, feature_size])
    first_layer_avg_pooled = first_layer_input / tf.maximum(denominators, 1)
    return first_layer_avg_pooled, num_frames
\ No newline at end of file
\ No newline at end of file
......
......@@ -136,9 +136,7 @@ class MoeModel(models.BaseModel):
gating_distribution[:, :num_mixtures] * expert_distribution, 1)
final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
[-1, vocab_size])
print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@", final_probabilities_by_class_and_batch)
return {"predictions": final_probabilities}
......@@ -251,482 +249,4 @@ class willow_MoeModel(models.BaseModel):
probabilities = tf.multiply(probabilities, gates)
return {"predictions": probabilities}
class willow_MoeModel_moe4(models.BaseModel):
  """Mixture of four logistic experts with optional probability gating."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model with four experts.

    Each class gets a softmax over four logistic experts plus one dummy
    expert that always predicts 0. When FLAGS.moe_prob_gating is set, the
    mixture probabilities are multiplied by learned sigmoid gates.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase ? Passed to the gating
        batch norm.
      num_mixtures: Ignored; the number of experts is fixed at 4.
      l2_penalty: Ignored; FLAGS.moe_l2 is used instead.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    # Both arguments are overridden on purpose by this variant.
    num_mixtures = 4
    l2_penalty = FLAGS.moe_l2

    low_rank_gating = FLAGS.moe_low_rank_gating
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input
    input_size = model_input.get_shape().as_list()[1]
    remove_diag = False

    # Optionally squeeze the gate input through a low-rank bottleneck.
    if low_rank_gating == -1:
      gate_features = model_input
      gate_scope = "gates"
    else:
      gate_features = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_scope = "gates2"

    gate_activations = slim.fully_connected(
        gate_features,
        vocab_size * (num_mixtures + 1),
        activation_fn=None,
        biases_initializer=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope=gate_scope)

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    # (Batch * #Labels) x (num_mixtures + 1)
    gate_logits = tf.reshape(gate_activations, [-1, num_mixtures + 1])
    gating_distribution = tf.nn.softmax(gate_logits)
    # (Batch * #Labels) x num_mixtures
    expert_logits = tf.reshape(expert_activations, [-1, num_mixtures])
    expert_distribution = tf.nn.sigmoid(expert_logits)

    # Dropping the last gate column is what makes the dummy expert predict 0.
    weighted = gating_distribution[:, :num_mixtures] * expert_distribution
    probabilities = tf.reshape(tf.reduce_sum(weighted, 1), [-1, vocab_size])

    if not gating_probabilities:
      return {"predictions": probabilities}

    # Gates are driven either by the predictions or by the raw input.
    if gating_input == 'prob':
      gate_source = probabilities
      gate_rows = vocab_size
    else:
      gate_source = model_input
      gate_rows = input_size

    gating_weights = tf.get_variable(
        "gating_prob_weights",
        [gate_rows, vocab_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(vocab_size)))
    gates = tf.matmul(gate_source, gating_weights)

    if remove_diag:
      # removes diagonals coefficients
      diagonals = tf.matrix_diag_part(gating_weights)
      gates = gates - tf.multiply(diagonals, probabilities)

    gates = slim.batch_norm(
        gates,
        center=True,
        scale=True,
        is_training=is_training,
        scope="gating_prob_bn")
    gates = tf.sigmoid(gates)

    probabilities = tf.multiply(probabilities, gates)
    return {"predictions": probabilities}
class willow_MoeModel_moe4_noGP(models.BaseModel):
  """Mixture of four logistic experts without probability gating."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model with four experts.

    The model consists of a per-class softmax distribution over four
    logistic classifiers plus one dummy expert that always predicts 0.
    This variant never applies probability gating.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase ? Unused by this variant; kept
        for a uniform interface.
      num_mixtures: Ignored; the number of experts is fixed at 4.
      l2_penalty: Ignored; FLAGS.moe_l2 is used instead.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    # Both arguments are overridden on purpose by this variant.
    num_mixtures = 4
    l2_penalty = FLAGS.moe_l2
    low_rank_gating = FLAGS.moe_low_rank_gating

    if low_rank_gating == -1:
      gate_activations = slim.fully_connected(
          model_input,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates")
    else:
      # Low-rank factorization of the gate projection.
      gate_activations1 = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_activations = slim.fully_connected(
          gate_activations1,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates2")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    # Dropping the last gate column is what makes the dummy expert predict 0.
    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])
    return {"predictions": probabilities}
class willow_MoeModel_moe2_noGP(models.BaseModel):
  """Mixture of two logistic experts without probability gating."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model with two experts.

    The model consists of a per-class softmax distribution over two
    logistic classifiers plus one dummy expert that always predicts 0.
    This variant never applies probability gating.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase ? Unused by this variant; kept
        for a uniform interface.
      num_mixtures: Ignored; the number of experts is fixed at 2.
      l2_penalty: Ignored; FLAGS.moe_l2 is used instead.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    # Both arguments are overridden on purpose by this variant.
    num_mixtures = 2
    l2_penalty = FLAGS.moe_l2
    low_rank_gating = FLAGS.moe_low_rank_gating

    if low_rank_gating == -1:
      gate_activations = slim.fully_connected(
          model_input,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates")
    else:
      # Low-rank factorization of the gate projection.
      gate_activations1 = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_activations = slim.fully_connected(
          gate_activations1,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates2")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    # Dropping the last gate column is what makes the dummy expert predict 0.
    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])
    return {"predictions": probabilities}
class willow_MoeModel_moe2(models.BaseModel):
  """Per-class mixture of two logistic experts with optional output gating."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Builds a two-expert Mixture of (Logistic) Experts classifier.

    For every class, a softmax over `2 + 1` gate slots weights the sigmoid
    outputs of two logistic experts. Only the first two gate slots are
    multiplied with an expert; the remaining slot contributes nothing, which
    is equivalent to an extra expert that always predicts 0.

    Gate logits are produced by a single dense layer when
    `FLAGS.moe_low_rank_gating` is -1, and otherwise by a two-layer stack whose
    intermediate width is `FLAGS.moe_low_rank_gating`.

    When `FLAGS.moe_prob_gating` is truthy, the mixture output is additionally
    multiplied element-wise by `sigmoid(batch_norm(gates))`, where `gates` is a
    learned linear map of the mixture output itself
    (`FLAGS.moe_prob_gating_input == 'prob'`) or of `model_input` (any other
    value).

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Whether the graph is built for training; forwarded to the
        batch-norm layer of the probability gate.
      num_mixtures: Ignored. The number of experts is fixed to 2 in the body.
      l2_penalty: Ignored. The L2 weight is always read from `FLAGS.moe_l2`.
      **unused_params: Accepted and ignored.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    # Configuration. The `num_mixtures` / `l2_penalty` arguments are not
    # consulted: the expert count is a constant and the penalty is a flag.
    expert_count = 2
    bottleneck = FLAGS.moe_low_rank_gating
    weight_decay = FLAGS.moe_l2
    use_prob_gating = FLAGS.moe_prob_gating
    gate_source = FLAGS.moe_prob_gating_input
    feature_dim = model_input.get_shape().as_list()[1]
    # Hard-coded off, so the diagonal-removal step further down never runs.
    subtract_diagonal = False

    # Gate logits: either straight from the input, or through a bias-free
    # linear bottleneck first. The final gate layer is identical in both cases
    # apart from its input and its scope name.
    if bottleneck == -1:
      gate_features = model_input
      gate_scope = "gates"
    else:
      gate_features = slim.fully_connected(
          model_input,
          bottleneck,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(weight_decay),
          scope="gates1")
      gate_scope = "gates2"
    gate_logits = slim.fully_connected(
        gate_features,
        vocab_size * (expert_count + 1),
        activation_fn=None,
        biases_initializer=None,
        weights_regularizer=slim.l2_regularizer(weight_decay),
        scope=gate_scope)

    expert_logits = slim.fully_connected(
        model_input,
        vocab_size * expert_count,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(weight_decay),
        scope="experts")

    # (Batch * #Labels) x (expert_count + 1)
    flat_gate_logits = tf.reshape(gate_logits, [-1, expert_count + 1])
    gate_probs = tf.nn.softmax(flat_gate_logits)
    # (Batch * #Labels) x expert_count
    flat_expert_logits = tf.reshape(expert_logits, [-1, expert_count])
    expert_probs = tf.nn.sigmoid(flat_expert_logits)

    # Drop the last gate slot, then take the gate-weighted sum over experts.
    weighted_experts = gate_probs[:, :expert_count] * expert_probs
    per_label_probs = tf.reduce_sum(weighted_experts, 1)
    predictions = tf.reshape(per_label_probs, [-1, vocab_size])

    if not use_prob_gating:
      return {"predictions": predictions}

    # Probability gating: pick what feeds the gate and size the weight matrix
    # accordingly; the variable name and initializer are the same either way.
    if gate_source == 'prob':
      gate_operand = predictions
      weight_rows = vocab_size
    else:
      gate_operand = model_input
      weight_rows = feature_dim
    gating_weights = tf.get_variable(
        "gating_prob_weights",
        [weight_rows, vocab_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(vocab_size)))
    gates = tf.matmul(gate_operand, gating_weights)

    if subtract_diagonal:
      # removes diagonals coefficients
      diagonals = tf.matrix_diag_part(gating_weights)
      gates = gates - tf.multiply(diagonals, predictions)

    gates = slim.batch_norm(
        gates,
        center=True,
        scale=True,
        is_training=is_training,
        scope="gating_prob_bn")
    gates = tf.sigmoid(gates)

    predictions = tf.multiply(predictions, gates)
    return {"predictions": predictions}
class linear_res_mix_act_MoeModel(models.BaseModel):
  """Mixture of logistic experts fed by four mixed-activation hidden branches.

  Data flow:

                   +-- dense + sigmoid --+
                   +-- dense + relu -----+
    input_features +-- dense + elu ------+-- concat -- experts --+
          |        +-- dense + tanh -----+                       +-- output
          |                                                      |
          +----------------------- gates ------------------------+

  Each activated branch is summed with 0.25 times a shared linear projection
  of the input before the branches are concatenated.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_mixtures=None,
                   num_hiddens=None,
                   num_maxout=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Builds the mixed-activation Mixture of (Logistic) Experts model.

    Gate logits are computed from `model_input` directly, while expert logits
    are computed from the concatenated hidden branches. For every class, a
    softmax over `num_mixtures + 1` gate slots weights the sigmoid outputs of
    `num_mixtures` experts; the last gate slot is not paired with any expert.

    Args:
      model_input: Input feature tensor fed to every dense layer.
      vocab_size: The number of classes in the dataset.
      num_mixtures: Number of experts; falls back to `FLAGS.moe_num_mixtures`
        when falsy.
      num_hiddens: Width of each hidden branch; falls back to
        `FLAGS.moe_num_hiddens` when falsy.
      num_maxout: Falls back to `FLAGS.num_maxout` when falsy. The resolved
        value is not used anywhere else in this method.
      l2_penalty: L2 regularization weight applied to every dense layer.
      **unused_params: Accepted and ignored.

    Returns:
      A dictionary whose 'predictions' key holds the class probabilities,
      reshaped to rows of `vocab_size` entries.
    """
    if not num_mixtures:
      num_mixtures = FLAGS.moe_num_mixtures
    if not num_hiddens:
      num_hiddens = FLAGS.moe_num_hiddens
    if not num_maxout:
      num_maxout = FLAGS.num_maxout

    # (scope, activation) for each parallel dense layer, in creation order.
    # The last entry is the un-activated projection used as a shortcut term.
    layer_specs = (
        ('hidden_sigmoid', tf.nn.sigmoid),
        ('hidden_relu', tf.nn.relu),
        ('hidden_elu', tf.nn.elu),
        ('hidden_tanh', tf.nn.tanh),
        ('hidden_linear', None),
    )
    projections = [
        slim.fully_connected(
            model_input,
            num_hiddens,
            activation_fn=activation,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope=scope_name)
        for scope_name, activation in layer_specs
    ]
    activated_branches = projections[:-1]
    linear_input = projections[-1]

    gate_logits = slim.fully_connected(
        model_input,
        vocab_size * (num_mixtures + 1),
        activation_fn=None,
        biases_initializer=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates")

    # Add the scaled linear shortcut to every activated branch, then stack the
    # branches side by side as the expert input.
    mixed_branches = [
        branch + 0.25 * linear_input for branch in activated_branches
    ]
    expert_features = tf.concat(mixed_branches, 1)
    expert_logits = slim.fully_connected(
        expert_features,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    # (Batch * #Labels) x (num_mixtures + 1)
    flat_gate_logits = tf.reshape(gate_logits, [-1, num_mixtures + 1])
    gate_probs = tf.nn.softmax(flat_gate_logits)
    # (Batch * #Labels) x num_mixtures
    flat_expert_logits = tf.reshape(expert_logits, [-1, num_mixtures])
    expert_probs = tf.nn.sigmoid(flat_expert_logits)

    # Drop the last gate slot, then take the gate-weighted sum over experts.
    weighted_experts = gate_probs[:, :num_mixtures] * expert_probs
    per_label_probs = tf.reduce_sum(weighted_experts, 1)
    final_probabilities = tf.reshape(per_label_probs, [-1, vocab_size])
    return {"predictions": final_probabilities}
\ No newline at end of file
return {"predictions": probabilities}
\ No newline at end of file
......
No preview for this file type