2020-2-capstone-design2 / 2015104192

Commit 7e0b563ac8ea88ce198992b6ad3aceb0eb28c3e4 (1 parent: f2dfcea2)
Authored by 윤영빈, 2020-12-09 21:05:16 +0900

    final report almost done

Showing 4 changed files with 30 additions and 2159 deletions:
web/backend/yt8m/frame_level_models.py
web/backend/yt8m/video_level_models.py
보고서/최종보고서-윤영빈.docx
보고서/최종보고서-윤영빈.pdf
web/backend/yt8m/frame_level_models.py    View file @ 7e0b563

...

@@ -65,128 +65,21 @@ flags.DEFINE_integer("conv_hidden2", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("conv_hidden3", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("stride", 10, "Number of stride for short rnn.")

class DbofModel(models.BaseModel):
  """Creates a Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated
  features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.
  """

  ACT_FN_MAP = {
      "sigmoid": tf.nn.sigmoid,
      "relu6": tf.nn.relu6,
  }

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    """See base class.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).
      iterations: the number of frames to be sampled.
      add_batch_norm: whether to add batch norm during training.
      sample_random_frames: whether to sample random frames or random sequences.
      cluster_size: the output neuron number of the cluster layer.
      hidden_size: the output neuron number of the hidden layer.
      is_training: whether to build the graph in training mode.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
    act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation)
    assert act_fn is not None, ("dbof_activation is not valid: %s." %
                                FLAGS.dbof_activation)
class FrameLevelLogisticModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.compat.v1.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    cluster_weights = tf.compat.v1.get_variable(
        "cluster_weights", [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.compat.v1.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.compat.v1.get_variable(
          "cluster_biases", [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.compat.v1.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = act_fn(activation)
    tf.compat.v1.summary.histogram("cluster_output", activation)
    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.compat.v1.get_variable(
        "hidden1_weights", [cluster_size, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)

    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.compat.v1.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    activation = act_fn(activation)
    tf.compat.v1.summary.histogram("hidden1_output", activation)

    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                              [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           **unused_params)

    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
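# For orientation, the DBoF pipeline above reduces to the following shape
# flow. This is a sketch under assumed typical yt8m defaults (300 sampled
# frames, 1152 = 1024 video + 128 audio features, cluster_size 8192,
# hidden1_size 1024); the actual sizes come from FLAGS:
#   frames   [B, 300, 1152]  --reshape-->            [B*300, 1152]
#   cluster  [B*300, 1152] @ [1152, 8192]         -> [B*300, 8192]
#   pool     [B, 300, 8192]  --max/avg over frames-> [B, 8192]
#   hidden   [B, 8192] @ [8192, 1024]             -> [B, 1024]
#   then the video-level classifier maps [B, 1024] -> [B, num_classes].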
class NetVLAD_NonLocal_types():

  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):

...

@@ -286,20 +179,6 @@ class NetVLAD_NonLocal_types():
    return vlad_softmax

class NetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,

...
@@ -420,1558 +299,30 @@ class NetVLADModelLF(models.BaseModel):
        is_training=is_training,
        **unused_params)

class GruModel(models.BaseModel):

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   is_training=True,
                   **unused_params):
    """Creates a model which uses a stack of GRUs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    gru_size = 600
    number_of_layers = 4
    backward = False
    random_frames = False
    iterations = 30

    if random_frames:
      num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
      model_input = utils.SampleRandomFrames(model_input, num_frames_2,
                                             iterations)
    if backward:
      model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1)
class LstmModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_GRU = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(gru_size)
         for _ in range(number_of_layers)],
        state_is_tuple=False)
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
         for _ in range(number_of_layers)])

    loss = 0.0
    with tf.variable_scope("RNN"):
      outputs, state = tf.nn.dynamic_rnn(stacked_GRU,
                                         model_input,
                                         sequence_length=num_frames,
                                         dtype=tf.float32)
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models, 'MoeModel')
    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
class SoftDBoF():

  def __init__(self, feature_size, max_frames, cluster_size, max_pool,
               add_batch_norm, is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size
    self.max_pool = max_pool

  def forward(self, reshaped_input):
    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training
    max_pool = self.max_pool

    cluster_weights = tf.get_variable(
        "cluster_weights", [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases", [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    activation = tf.reshape(activation,
                            [-1, int(max_frames), int(cluster_size)])

    activation_sum = tf.reduce_sum(activation, 1)
    activation_sum = tf.nn.l2_normalize(activation_sum, 1)

    if max_pool:
      activation_max = tf.reduce_max(activation, 1)
      activation_max = tf.nn.l2_normalize(activation_max, 1)
      activation = tf.concat([activation_sum, activation_max], 1)
    else:
      activation = activation_sum

    return activation
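# SoftDBoF.forward thus produces a soft histogram over clusters: each frame
# contributes a softmax assignment, the assignments are summed over frames,
# and the sum is L2-normalized (optionally concatenated with a max-pooled
# variant). A toy illustration with hypothetical numbers and 3 clusters:
#   frame 1 assignment: [0.7, 0.2, 0.1]
#   frame 2 assignment: [0.1, 0.8, 0.1]
#   summed histogram:   [0.8, 1.0, 0.2] -> l2_normalize -> video descriptor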
class LightVLAD_nonlocal():

  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):
    cluster_weights = tf.get_variable(
        "cluster_weights",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=self.is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases", [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])
    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.reshape(vlad, [-1, self.feature_size])
    vlad = nonLocal_block(vlad,
                          feature_size=self.feature_size,
                          hidden_size=self.feature_size // 2,
                          cluster_size=self.cluster_size)
    vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.nn.l2_normalize(vlad, 1)
    vlad = tf.reshape(vlad, [-1, int(self.cluster_size * self.feature_size)])
    vlad = tf.nn.l2_normalize(vlad, 1)
    return vlad
class LightNetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = False
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    video_NetVLAD = LightVLAD_nonlocal(1024, max_frames, cluster_size,
                                       add_batch_norm, is_training)
    audio_NetVLAD = LightVLAD_nonlocal(128, max_frames, cluster_size / 2,
                                       add_batch_norm, is_training)

    if add_batch_norm:  # and not lightvlad:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    with tf.variable_scope("video_VLAD"):
      vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])
    with tf.variable_scope("audio_VLAD"):
      vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])
    vlad = tf.concat([vlad_video, vlad_audio], 1)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights", [vlad_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
      gating_weights = tf.get_variable(
          "gating_weights_2", [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))
      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # removes diagonals coefficients
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases", [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)
      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
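# The gating branch above is context gating in the style of Miech et al.'s
# "Learnable pooling with Context Gating": the hidden activation is multiplied
# elementwise by sigmoid(activation @ W), letting the model suppress or
# emphasize feature dimensions conditioned on the whole vector. Stripped of
# the batch-norm/bias options, it reduces to this minimal sketch:
#   gates = tf.sigmoid(tf.matmul(activation, gating_weights))
#   activation = tf.multiply(activation, gates)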
def nonLocal_block(vlad, feature_size, hidden_size, cluster_size):
  nonlocal_theta = tf.get_variable(
      "nonlocal_theta", [feature_size, hidden_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  nonlocal_phi = tf.get_variable(
      "nonlocal_phi", [feature_size, hidden_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  nonlocal_g = tf.get_variable(
      "nonlocal_g", [feature_size, hidden_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  nonlocal_out = tf.get_variable(
      "nonlocal_out", [hidden_size, feature_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(hidden_size)))

  vlad_theta = tf.matmul(vlad, nonlocal_theta)
  vlad_phi = tf.matmul(vlad, nonlocal_phi)
  vlad_g = tf.matmul(vlad, nonlocal_g)

  vlad_theta = tf.reshape(vlad_theta, [-1, cluster_size, hidden_size])
  vlad_phi = tf.reshape(vlad_phi, [-1, cluster_size, hidden_size])
  vlad_g = tf.reshape(vlad_g, [-1, cluster_size, hidden_size])

  vlad_softmax = tf.nn.softmax(
      feature_size**-0.5 *
      tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
  vlad_g = tf.matmul(vlad_softmax, vlad_g)
  vlad_g = tf.reshape(vlad_g, [-1, hidden_size])
  vlad_g = tf.matmul(vlad_g, nonlocal_out)
  vlad = vlad + vlad_g
  return vlad
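# nonLocal_block is, in effect, single-head scaled dot-product self-attention
# over the C cluster descriptors, followed by a residual connection:
#   output = X + softmax((X W_theta)(X W_phi)^T / sqrt(F)) (X W_g) W_out
# A minimal NumPy sketch of the same computation; the names below are
# illustrative, not part of the original code:
#
#   import numpy as np
#
#   def nonlocal_sketch(x, w_theta, w_phi, w_g, w_out):
#     """x: [C, F]; w_theta/w_phi/w_g: [F, H]; w_out: [H, F]."""
#     theta, phi, g = x @ w_theta, x @ w_phi, x @ w_g   # [C, H] each
#     att = theta @ phi.T / np.sqrt(x.shape[1])         # [C, C] scores
#     att = np.exp(att - att.max(axis=1, keepdims=True))
#     att /= att.sum(axis=1, keepdims=True)             # row-wise softmax
#     return x + att @ g @ w_out                        # residual output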
class SoftDbofModelLF(models.BaseModel):
  """Creates a Soft Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated
  features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 4000
    hidden1_size = 1024
    fc_dimred = True
    relu = False
    max_pool = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                          add_batch_norm, is_training)
    audio_Dbof = SoftDBoF(128, max_frames, cluster_size / 8, max_pool,
                          add_batch_norm, is_training)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    with tf.variable_scope("video_DBOF"):
      dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])
    with tf.variable_scope("audio_DBOF"):
      dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])
    dbof = tf.concat([dbof_video, dbof_audio], 1)
    dbof_dim = dbof.get_shape().as_list()[1]

    if fc_dimred:
      hidden1_weights = tf.get_variable(
          "hidden1_weights", [dbof_dim, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(cluster_size)))
      tf.summary.histogram("hidden1_weights", hidden1_weights)
      activation = tf.matmul(dbof, hidden1_weights)

      if add_batch_norm and relu:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn")
      else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases", [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

      if relu:
        activation = tf.nn.relu6(activation)
      tf.summary.histogram("hidden1_output", activation)
    else:
      activation = dbof

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
class early_NetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = False
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    video_audio_NetVLAD = NetVLAD_NonLocal(1024 + 128, max_frames,
                                           cluster_size, add_batch_norm,
                                           is_training)

    if add_batch_norm:  # and not lightvlad:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    with tf.variable_scope("video_audio_VLAD"):
      vlad = video_audio_NetVLAD.forward(reshaped_input)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights", [vlad_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
      gating_weights = tf.get_variable(
          "gating_weights_2", [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))
      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # removes diagonals coefficients
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases", [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)
      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
class NetVLAD_NonLocal():

  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):
    cluster_weights = tf.get_variable(
        "cluster_weights",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=self.is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases", [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)
    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
    cluster_weights2 = tf.get_variable(
        "cluster_weights2",
        [1, int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])
    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.reshape(vlad, [-1, self.feature_size])

    nonlocal_theta = tf.get_variable(
        "nonlocal_theta",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_phi = tf.get_variable(
        "nonlocal_phi",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_g = tf.get_variable(
        "nonlocal_g",
        [int(self.feature_size), int(self.cluster_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_out = tf.get_variable(
        "nonlocal_out",
        [int(self.cluster_size), int(self.feature_size)],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.cluster_size)))

    vlad_theta = tf.matmul(vlad, nonlocal_theta)
    vlad_phi = tf.matmul(vlad, nonlocal_phi)
    vlad_g = tf.matmul(vlad, nonlocal_g)

    vlad_theta = tf.reshape(
        vlad_theta, [-1, int(self.cluster_size), int(self.cluster_size)])
    vlad_phi = tf.reshape(
        vlad_phi, [-1, int(self.cluster_size), int(self.cluster_size)])
    vlad_g = tf.reshape(
        vlad_g, [-1, int(self.cluster_size), int(self.cluster_size)])

    vlad_softmax = tf.nn.softmax(
        self.feature_size**-0.5 *
        tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
    vlad_g = tf.matmul(vlad_softmax, vlad_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
    vlad_g = tf.matmul(vlad_g, nonlocal_out)
    vlad_g = tf.reshape(
        vlad_g, [-1, int(self.cluster_size), int(self.feature_size)])
    vlad = tf.reshape(
        vlad, [-1, int(self.cluster_size), int(self.feature_size)])
    vlad = vlad + vlad_g

    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.nn.l2_normalize(vlad, 1)  # [b,f,c]
    vlad = tf.reshape(vlad, [-1, int(self.cluster_size * self.feature_size)])
    vlad = tf.nn.l2_normalize(vlad, 1)
    return vlad
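# For reference, the aggregation above computes the NetVLAD descriptor of
# Arandjelovic et al., per cluster k and feature dimension j:
#   V(j, k) = sum_i a_k(x_i) * (x_i(j) - c_k(j))
# where a_k is the softmax soft-assignment and the cluster centers c_k enter
# through the learned cluster_weights2 term (the subtracted a = a_sum * c2),
# followed by the non-local self-attention refinement and intra- plus global
# L2 normalization.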
class SoftDbofModelLF_8k(models.BaseModel):
  """Creates a Soft Deep Bag of Frames model.

  The model projects the features for each frame into a higher dimensional
  'clustering' space, pools across frames in that space, and then
  uses a configurable video-level model to classify the now aggregated
  features.

  The model will randomly sample either frames or sequences of frames during
  training to speed up convergence.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 2048
    hidden1_size = 1024
    fc_dimred = True
    relu = False
    max_pool = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                          add_batch_norm, is_training)
    audio_Dbof = SoftDBoF(128, max_frames, cluster_size / 8, max_pool,
                          add_batch_norm, is_training)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    with tf.variable_scope("video_DBOF"):
      dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])
    with tf.variable_scope("audio_DBOF"):
      dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])
    dbof = tf.concat([dbof_video, dbof_audio], 1)
    dbof_dim = dbof.get_shape().as_list()[1]

    if fc_dimred:
      hidden1_weights = tf.get_variable(
          "hidden1_weights", [dbof_dim, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(cluster_size)))
      tf.summary.histogram("hidden1_weights", hidden1_weights)
      activation = tf.matmul(dbof, hidden1_weights)

      if add_batch_norm and relu:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn")
      else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases", [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

      if relu:
        activation = tf.nn.relu6(activation)
      tf.summary.histogram("hidden1_output", activation)
    else:
      activation = dbof

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
class FrameLevelLogisticModel(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]

    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                              [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
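# The tile/reshape above simply broadcasts each video's true frame count
# across the feature dimension, so the frame-sum becomes a mean over valid
# (unpadded) frames. A hypothetical example with max_frames=4 but only 2 real
# frames f0, f1 (padded frames contribute zeros to the sum):
#   sums  = f0 + f1 + 0 + 0        # per feature dimension
#   denom = [2, 2, ..., 2]         # num_frames tiled to feature_size
#   mean  = sums / denom           # correct average over the 2 real frames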
class CNN(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]
    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                              [-1, feature_size])

    convK3 = slim.convolution(model_input, num_outputs=feature_size,
                              kernel_size=3, scope='conv1')
    convK5 = slim.convolution(model_input, num_outputs=feature_size,
                              kernel_size=5, scope='conv2')
    convK1 = slim.convolution(model_input, num_outputs=feature_size,
                              kernel_size=1, scope='conv3')

    avg_pooled = tf.reduce_sum(tf.concat([convK3, convK5, convK1], axis=1),
                               axis=[1]) / denominators
    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.relu,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
class LstmModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
         for _ in range(number_of_layers)])
    loss = 0.0
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class BNGRUModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_rnn = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(lstm_size)
         for _ in range(number_of_layers)],
        state_is_tuple=False)
    outputs, state = tf.nn.dynamic_rnn(stacked_rnn,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    state = slim.batch_norm(state,
                            center=True,
                            scale=True,
                            is_training=True,
                            scope='proj')
    return aggregated_model().create_model(model_input=state,
                                           vocab_size=vocab_size,
                                           **unused_params)
class GruModel2(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of GRUs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.GRUCell(lstm_size)
         for _ in range(number_of_layers)],
        state_is_tuple=False)
    loss = 0.0
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state,
                                           vocab_size=vocab_size,
                                           **unused_params)
class BiGRUModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses stacked bidirectional GRUs to represent the
    video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    with tf.variable_scope('fw'):
      rnn_fw = tf.contrib.rnn.MultiRNNCell(
          [tf.contrib.rnn.GRUCell(lstm_size)
           for _ in range(number_of_layers)],
          state_is_tuple=False)
    with tf.variable_scope('bw'):
      rnn_bw = tf.contrib.rnn.MultiRNNCell(
          [tf.contrib.rnn.GRUCell(lstm_size)
           for _ in range(number_of_layers)],
          state_is_tuple=False)

    outputs, state = tf.nn.bidirectional_dynamic_rnn(rnn_fw,
                                                     rnn_bw,
                                                     model_input,
                                                     sequence_length=num_frames,
                                                     dtype=tf.float32,
                                                     swap_memory=True)
    state = tf.concat(state, axis=1)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    state = slim.batch_norm(state,
                            center=True,
                            scale=True,
                            is_training=True,
                            scope='proj')
    return aggregated_model().create_model(model_input=state,
                                           vocab_size=vocab_size,
                                           **unused_params)
"""
Copyright (c) 2017, University of Texas Southwestern Medical Center
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the University of Texas at Austin nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Recurrent Weighted Average
Implementation modified from: https://github.com/jostmey/rwa
Paper:
@article{ostmeyer2017machine,
title={Machine Learning on Sequential Data Using a Recurrent Weighted Average},
author={Ostmeyer, Jared and Cowell, Lindsay},
journal={arXiv preprint arXiv:1703.01253},
year={2017}
}
"""
class RwaModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    # constants
    init_factor = 1.0
    num_cells = FLAGS.lstm_cells
    input_shape = model_input.get_shape().as_list()
    batch_size, max_steps, num_features = input_shape

    # trainable weights
    s = weights_rwa.init_state(num_cells, "s", init_factor)
    W_g = weights_rwa.init_weight([num_features + num_cells, num_cells], "W_g")
    W_u = weights_rwa.init_weight([num_features, num_cells], "W_u")
    W_a = weights_rwa.init_weight([num_features + num_cells, num_cells], "W_a")
    b_g = weights_rwa.init_bias(num_cells, "b_g")
    b_u = weights_rwa.init_bias(num_cells, "b_u")
    b_a = weights_rwa.init_bias(num_cells, "b_a")

    # pl = tf.placeholder(tf.float32, shape=[None, num_cells])
    pl = tf.reshape(model_input,
                    [-1, max_steps * num_features])[:, :num_cells]

    # internal states
    # n = tf.zeros([batch_size, num_cells])
    # d = tf.zeros([batch_size, num_cells])
    # h = tf.zeros([batch_size, num_cells])
    # a_max = tf.fill([batch_size, num_cells], -1E38)  # Start off with lowest number possible
    n = tf.zeros_like(pl)
    d = tf.zeros_like(pl)
    h = tf.zeros_like(pl)
    a_max = tf.multiply(tf.ones_like(pl), -1E38)

    # define model
    h += tf.nn.tanh(tf.expand_dims(s, 0))
    for i in range(max_steps):
      x_step = model_input[:, i, :]
      # Combine the features and hidden state into one tensor
      xh_join = tf.concat(axis=1, values=[x_step, h])

      u = tf.matmul(x_step, W_u) + b_u
      g = tf.matmul(xh_join, W_g) + b_g
      # The bias term when factored out of the numerator and denominator
      # cancels and is unnecessary
      a = tf.matmul(xh_join, W_a)
      z = tf.multiply(u, tf.nn.tanh(g))

      a_newmax = tf.maximum(a_max, a)
      exp_diff = tf.exp(a_max - a_newmax)
      exp_scaled = tf.exp(a - a_newmax)

      # Numerically stable update of numerator
      n = tf.multiply(n, exp_diff) + tf.multiply(z, exp_scaled)
      # Numerically stable update of denominator
      d = tf.multiply(d, exp_diff) + exp_scaled
      h_new = tf.nn.tanh(tf.div(n, d))
      a_max = a_newmax
      # Use new hidden state only if the sequence length has not been exceeded
      h = tf.where(tf.greater(num_frames, i), h_new, h)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class DropoutGruModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of GRUs with dropout to represent
    the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(lstm_size),
                                       0.9, 0.9)
         for _ in range(number_of_layers)],
        state_is_tuple=False)
    loss = 0.0
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    aggregated_model = FrameLevelLogisticModel
    return aggregated_model().create_model(model_input=outputs,
                                           vocab_size=vocab_size,
                                           num_frames=num_frames,
                                           **unused_params)
class ResRnnModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    lstm_size = 1152
    number_of_layers = 3

    # from rnn_cell_modern import Delta_RNN as drnn
    from rnn_wrappers_modern import MultiRNNCell as mrnn

    cells = []
    for i in range(number_of_layers):
      with tf.variable_scope('cell_' + str(i)):
        cells.append(tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0))

    stacked_rnn = mrnn(cells,
                       use_residual_connections=True,
                       state_is_tuple=True)
    outputs, state = tf.nn.dynamic_rnn(stacked_rnn,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class LateVladModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    model_input = utils.SampleRandomSequence(model_input, num_frames, 128)
    input_v = model_input[:, :, :1024]
    input_a = model_input[:, :, 1024:]
    K = 8

    with tf.variable_scope('video'):
      x = input_v
      input_shape = x.get_shape().as_list()
      _, N, D = input_shape
      c_bound = math.sqrt(1. / (K * D))
      c = tf.get_variable(name='c',
                          shape=[K, N],
                          dtype=tf.float32,
                          initializer=tf.random_uniform_initializer(
                              -c_bound, c_bound))
      a = slim.convolution(x,
                           num_outputs=K,
                           kernel_size=1,
                           data_format='NWC',
                           scope='conv')
      a = tf.nn.softmax(a)

      v = []
      for k in range(K):
        t = x - c[k][None, :, None]
        t = tf.multiply(t, a[:, :, k][:, :, None])
        t = tf.reduce_sum(t, 1)
        t = tf.nn.l2_normalize(t, dim=1)
        v.append(t)
      v = tf.stack(v, axis=1)
      v = tf.reshape(v, [-1, K * D])

      proj_weights = tf.get_variable(
          "proj_weights", [K * D, 1024],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(K * D)))
      activation_v = tf.matmul(v, proj_weights)

    with tf.variable_scope('audio'):
      x = input_a
      input_shape = x.get_shape().as_list()
      _, N, D = input_shape
      c_bound = math.sqrt(1. / (K * D))
      c = tf.get_variable(name='c',
                          shape=[K, N],
                          dtype=tf.float32,
                          initializer=tf.random_uniform_initializer(
                              -c_bound, c_bound))
      a = slim.convolution(x,
                           num_outputs=K,
                           kernel_size=1,
                           data_format='NWC',
                           scope='conv')
      a = tf.nn.softmax(a)

      v = []
      for k in range(K):
        t = x - c[k][None, :, None]
        t = tf.multiply(t, a[:, :, k][:, :, None])
        t = tf.reduce_sum(t, 1)
        t = tf.nn.l2_normalize(t, dim=1)
        v.append(t)
      v = tf.stack(v, axis=1)
      v = tf.reshape(v, [-1, K * D])

      proj_weights = tf.get_variable(
          "proj_weights", [K * D, 1024],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(K * D)))
      activation_a = tf.matmul(v, proj_weights)

    activation = tf.concat([activation_v, activation_a], axis=1)
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=True,
                                 scope='proj')
    activation = tf.nn.relu6(activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           **unused_params)
class LNBLstmModel(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.LayerNormBasicLSTMCell(lstm_size,
                                               dropout_keep_prob=0.50)
         for _ in range(number_of_layers)])
    loss = 0.0
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(model_input=state[-1].h,
                                           vocab_size=vocab_size,
                                           **unused_params)
class audio_avgShort_twowayGRUModel(models.BaseModel):

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   is_training=True,
                   **unused_params):
    """Creates a model which uses a Bidirectional GRU and mean audio features
    to represent the video.

                      ---->first half GRU----->
                     -                          -
      visual_feature                             concat ---------------
                     -                          -                      -
                      ---->second half GRU---->                         concat -----> video level classifier
                                                                       -
                                      mean audio features -------------

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    lstm_size = FLAGS.lstm_cells
    stride = FLAGS.stride
    max_frames = model_input.get_shape().as_list()[1]
    video_input = model_input[:, :, :1024]
    audio_input = model_input[:, :, 1024:]

    first_num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    audio_den = tf.reshape(tf.tile(first_num_frames, [1, 128]), [-1, 128])
    mean_audio = tf.reduce_sum(audio_input, 1) / tf.maximum(audio_den, 1)

    pooled_input, num_frames = self.avg_pooled_func(video_input, num_frames,
                                                    stride)
    pooled_input = slim.batch_norm(pooled_input,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    mean_audio = slim.batch_norm(mean_audio,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn_audio")

    fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
    bw_gru = tf.contrib.rnn.GRUCell(lstm_size)
    fw_outputs, fw_state = tf.nn.dynamic_rnn(
        fw_gru,
        pooled_input[:, :max_frames // (2 * stride), :],
        sequence_length=num_frames // 2,
        dtype=tf.float32,
        scope='fw')
    bw_outputs, bw_state = tf.nn.dynamic_rnn(
        bw_gru,
        pooled_input[:, max_frames // (2 * stride)::-1, :],
        sequence_length=num_frames - num_frames // 2,
        dtype=tf.float32,
        scope='bw')

    state = tf.concat([fw_state, bw_state], 1)
    state = tf.concat([state, mean_audio], 1)

    aggregated_model = getattr(video_level_models,
                               'linear_res_mix_act_MoeModel')
    return aggregated_model().create_model(model_input=state,
                                           vocab_size=vocab_size,
                                           **unused_params)

  def avg_pooled_func(self, model_input, num_frames_in, stride):
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    num_frames = num_frames_in // stride
    step = max_frames // stride

    first_layer_input = tf.reshape(model_input,
                                   [-1, stride, step, feature_size])
    first_layer_input = tf.reduce_sum(first_layer_input, 1)
    first_num_frames = tf.cast(
        tf.expand_dims(tf.expand_dims(num_frames, 1), 2), tf.float32)
    denominators = tf.reshape(
        tf.tile(first_num_frames, [1, step, feature_size]),
        [-1, step, feature_size])
    first_layer_avg_pooled = first_layer_input / tf.maximum(denominators, 1)
    return first_layer_avg_pooled, num_frames
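# avg_pooled_func shortens the frame axis by a factor of 'stride' via a
# reshape and a sum. Note that reshaping [B, max_frames, F] to
# [B, stride, step, F] groups frames that are 'step' apart (an interleaved
# grouping), not consecutive windows. A hypothetical check with max_frames=6
# and stride=2 (so step=3):
#   frames [f0, f1, f2, f3, f4, f5] regroup to [[f0, f1, f2], [f3, f4, f5]]
#   and summing over axis 1 yields [f0+f3, f1+f4, f2+f5].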
class resav_ConvModel(models.BaseModel):

    def create_model(self, model_input, vocab_size, num_frames,
                     is_training=True, **unused_params):
        """Creates a model which uses a residual convolutional network to represent the video.

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
            input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          'batch_size' x 'num_classes'.
        """
        stride = FLAGS.stride
        conv_length = FLAGS.conv_length
        conv_hidden1 = FLAGS.conv_hidden1
        conv_hidden2 = FLAGS.conv_hidden2
        conv_hidden3 = FLAGS.conv_hidden3

        mean_feature = tf.reduce_mean(model_input, 1)
        feature_size = model_input.get_shape().as_list()[2]
        pooled_input = self.avg_pooled_func(model_input, stride)

        # To shape: 'batch_size' x 'max_frames' x 1 x 'num_features'
        input_expand = tf.expand_dims(pooled_input, -1)
        input_expand = tf.transpose(input_expand, [0, 1, 3, 2])

        # Block 1: conv -> BN/ReLU -> conv -> BN plus a parallel 1-conv shortcut.
        # conv_out: batch_size x max_frames x 1 x conv_hidden (padding='SAME')
        conv_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_1_1')
        conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True,
                                              is_training=is_training, scope="bn_1_1"))
        conv_out = slim.conv2d(conv_out, conv_hidden1, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_1_2')
        conv_out = slim.batch_norm(conv_out, center=True, scale=True,
                                   is_training=is_training, scope="bn_1_2")
        res_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1],
                              activation_fn=None, padding='SAME', scope='xconv_1_1')
        res_out = res_out + conv_out
        res_out = slim.max_pool2d(res_out, [2, 1], [2, 1], scope='max_pool1')

        # Block 2
        conv_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_2_1')
        conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True,
                                              is_training=is_training, scope="bn_2_1"))
        conv_out = slim.conv2d(conv_out, conv_hidden2, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_2_2')
        conv_out = slim.batch_norm(conv_out, center=True, scale=True,
                                   is_training=is_training, scope="bn_2_2")
        res_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1],
                              activation_fn=None, padding='SAME', scope='xconv_2_1')
        res_out = res_out + conv_out
        res_out = slim.max_pool2d(res_out, [2, 1], [2, 1], scope='max_pool2')

        # Block 3
        conv_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_3_1')
        conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True,
                                              is_training=is_training, scope="bn_3_1"))
        conv_out = slim.conv2d(conv_out, conv_hidden3, [conv_length, 1],
                               activation_fn=None, padding='SAME', scope='conv_3_2')
        conv_out = slim.batch_norm(conv_out, center=True, scale=True,
                                   is_training=is_training, scope="bn_3_2")
        res_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1],
                              activation_fn=None, padding='SAME', scope='xconv_3_1')
        res_out = res_out + conv_out
        res_out = slim.max_pool2d(res_out, [2, 1], [2, 1], scope='max_pool3')

        a = res_out.get_shape().as_list()[1]
        b = res_out.get_shape().as_list()[2]
        c = res_out.get_shape().as_list()[3]
        print(res_out.get_shape().as_list())
        res_out = tf.reshape(res_out, [-1, a * b * c])

        state = tf.concat([res_out, mean_feature], 1)
        aggregated_model = getattr(video_level_models, 'linear_res_mix_act_MoeModel')
        return aggregated_model().create_model(
            model_input=state, vocab_size=vocab_size, **unused_params)

    def avg_pooled_func(self, model_input, stride):
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        step = max_frames // stride
        first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])
        first_layer_input = tf.reduce_mean(first_layer_input, 1)
        return first_layer_input
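To make the shape flow easier to follow, a rough walk-through of the three residual blocks under assumed defaults (30 pooled frames from the flags above; the numbers are illustrative only):

    frames, channels = 30, 1024                 # after avg pooling; conv_hidden1
    for block in (1, 2, 3):
        # conv -> BN/ReLU -> conv -> BN plus a 1-conv shortcut keeps the frame
        # axis fixed (padding='SAME'); the [2, 1] max-pool then roughly halves it.
        frames //= 2
        print("after block", block, "->", (frames, 1, channels))
    # The final tensor is flattened and concatenated with the per-video mean
    # feature before being handed to 'linear_res_mix_act_MoeModel'.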
class pur_twowayGRUModel(models.BaseModel):

    def create_model(self, model_input, vocab_size, num_frames,
                     is_training=True, **unused_params):
        """Creates a model which uses a bidirectional GRU, without explicitly using the mean audio feature, to represent the video.

                           ---->first half GRU----->
                          -                          -
            video_feature                             ----concat----> video level classifier
                          -                          -
                           ---->second half GRU---->

        Args:
          model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
            input features.
          vocab_size: The number of classes in the dataset.
          num_frames: A vector of length 'batch' which indicates the number of
            frames for each video (before padding).
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          'batch_size' x 'num_classes'.
        """
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers
        stride = FLAGS.stride
        max_frames = model_input.get_shape().as_list()[1]

        pooled_input, num_frames = self.avg_pooled_func(model_input, num_frames, stride)
        pooled_input = slim.batch_norm(pooled_input, center=True, scale=True,
                                       is_training=is_training, scope="hidden1_bn")

        fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
        bw_gru = tf.contrib.rnn.GRUCell(lstm_size)

        fw_outputs, fw_state = tf.nn.dynamic_rnn(
            fw_gru, pooled_input[:, :max_frames // (2 * stride), :],
            sequence_length=num_frames // 2, dtype=tf.float32, scope='fw')
        bw_outputs, bw_state = tf.nn.dynamic_rnn(
            bw_gru, pooled_input[:, max_frames // (2 * stride)::-1, :],
            sequence_length=num_frames - num_frames // 2, dtype=tf.float32, scope='bw')

        state = tf.concat([fw_state, bw_state], 1)
        aggregated_model = getattr(video_level_models, 'linear_res_mix_act_MoeModel')
        return aggregated_model().create_model(
            model_input=state, vocab_size=vocab_size, **unused_params)

    def avg_pooled_func(self, model_input, num_frames_in, stride):
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        num_frames = num_frames_in // stride
        step = max_frames // stride
        first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])
        first_layer_input = tf.reduce_sum(first_layer_input, 1)
        first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1), 2), tf.float32)
        denominators = tf.reshape(
            tf.tile(first_num_frames, [1, step, feature_size]),
            [-1, step, feature_size])
        first_layer_avg_pooled = first_layer_input / tf.maximum(denominators, 1)
        return first_layer_avg_pooled, num_frames
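A one-dimensional stand-in makes the two slices explicit; note that, as written, the backward slice starts at the midpoint and walks back to frame 0, so the second GRU re-reads the first half in reverse rather than consuming the second half:

    pooled = list(range(30))      # stand-in for 30 pooled frames
    half = len(pooled) // 2       # max_frames // (2 * stride) with the defaults
    fw_in = pooled[:half]         # frames 0..14, read forward by the 'fw' GRU
    bw_in = pooled[half::-1]      # frames 15, 14, ..., 0 for the 'bw' GRU
    print(fw_in[:3], bw_in[:3])   # [0, 1, 2] [15, 14, 13]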
\ No newline at end of file
...
...
web/backend/yt8m/video_level_models.py
View file @
7e0b563
...
...
@@ -136,9 +136,7 @@ class MoeModel(models.BaseModel):
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                         [-1, vocab_size])
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@",
              final_probabilities_by_class_and_batch)
        return {"predictions": final_probabilities}
...
...
@@ -251,482 +249,4 @@ class willow_MoeModel(models.BaseModel):
            probabilities = tf.multiply(probabilities, gates)
        return {"predictions": probabilities}
class willow_MoeModel_moe4(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self, model_input, vocab_size, is_training,
                     num_mixtures=None, l2_penalty=1e-8, **unused_params):
        """Creates a Mixture of (Logistic) Experts model.

        It also includes the possibility of gating the probabilities.
        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in the
        mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        num_mixtures = 4
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2
        gating_probabilities = FLAGS.moe_prob_gating
        gating_input = FLAGS.moe_prob_gating_input

        input_size = model_input.get_shape().as_list()[1]
        remove_diag = False

        if low_rank_gating == -1:
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])

        if gating_probabilities:
            if gating_input == 'prob':
                gating_weights = tf.get_variable(
                    "gating_prob_weights", [vocab_size, vocab_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(vocab_size)))
                gates = tf.matmul(probabilities, gating_weights)
            else:
                gating_weights = tf.get_variable(
                    "gating_prob_weights", [input_size, vocab_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(vocab_size)))
                gates = tf.matmul(model_input, gating_weights)

            if remove_diag:
                # remove diagonal coefficients
                diagonals = tf.matrix_diag_part(gating_weights)
                gates = gates - tf.multiply(diagonals, probabilities)

            gates = slim.batch_norm(gates, center=True, scale=True,
                                    is_training=is_training, scope="gating_prob_bn")
            gates = tf.sigmoid(gates)
            probabilities = tf.multiply(probabilities, gates)

        return {"predictions": probabilities}
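For intuition, a small NumPy rendering of the expert/gate combination above (one label, num_mixtures = 4; the logits are made-up numbers, not repo values):

    import numpy as np

    gate_logits = np.array([1.0, 0.5, 0.2, 0.1, -1.0])        # +1 dummy slot
    gating = np.exp(gate_logits) / np.exp(gate_logits).sum()  # softmax over 5
    experts = 1 / (1 + np.exp(-np.array([2.0, 0.0, -1.0, 1.0])))  # 4 sigmoids
    prob = float((gating[:4] * experts).sum())  # the dummy's share predicts 0
    print(round(prob, 4))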
class willow_MoeModel_moe4_noGP(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self, model_input, vocab_size, is_training,
                     num_mixtures=None, l2_penalty=1e-8, **unused_params):
        """Creates a Mixture of (Logistic) Experts model without probability gating.

        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in the
        mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        num_mixtures = 4
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2
        gating_probabilities = False
        gating_input = FLAGS.moe_prob_gating_input

        input_size = model_input.get_shape().as_list()[1]
        remove_diag = False

        if low_rank_gating == -1:
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])
        return {"predictions": probabilities}
class willow_MoeModel_moe2_noGP(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self, model_input, vocab_size, is_training,
                     num_mixtures=None, l2_penalty=1e-8, **unused_params):
        """Creates a Mixture of (Logistic) Experts model without probability gating.

        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in the
        mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        num_mixtures = 2
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2
        gating_probabilities = False
        gating_input = FLAGS.moe_prob_gating_input

        input_size = model_input.get_shape().as_list()[1]
        remove_diag = False

        if low_rank_gating == -1:
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])
        return {"predictions": probabilities}
class willow_MoeModel_moe2(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization)."""

    def create_model(self, model_input, vocab_size, is_training,
                     num_mixtures=None, l2_penalty=1e-8, **unused_params):
        """Creates a Mixture of (Logistic) Experts model.

        It also includes the possibility of gating the probabilities.
        The model consists of a per-class softmax distribution over a
        configurable number of logistic classifiers. One of the classifiers in the
        mixture is not trained, and always predicts 0.

        Args:
          model_input: 'batch_size' x 'num_features' matrix of input features.
          vocab_size: The number of classes in the dataset.
          is_training: Is this the training phase?
          num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
          l2_penalty: How much to penalize the squared magnitudes of parameter
            values.
        Returns:
          A dictionary with a tensor containing the probability predictions of the
          model in the 'predictions' key. The dimensions of the tensor are
          batch_size x num_classes.
        """
        num_mixtures = 2
        low_rank_gating = FLAGS.moe_low_rank_gating
        l2_penalty = FLAGS.moe_l2
        gating_probabilities = FLAGS.moe_prob_gating
        gating_input = FLAGS.moe_prob_gating_input

        input_size = model_input.get_shape().as_list()[1]
        remove_diag = False

        if low_rank_gating == -1:
            gate_activations = slim.fully_connected(
                model_input,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates")
        else:
            gate_activations1 = slim.fully_connected(
                model_input,
                low_rank_gating,
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates1")
            gate_activations = slim.fully_connected(
                gate_activations1,
                vocab_size * (num_mixtures + 1),
                activation_fn=None,
                biases_initializer=None,
                weights_regularizer=slim.l2_regularizer(l2_penalty),
                scope="gates2")

        expert_activations = slim.fully_connected(
            model_input,
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        probabilities = tf.reshape(probabilities_by_class_and_batch,
                                   [-1, vocab_size])

        if gating_probabilities:
            if gating_input == 'prob':
                gating_weights = tf.get_variable(
                    "gating_prob_weights", [vocab_size, vocab_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(vocab_size)))
                gates = tf.matmul(probabilities, gating_weights)
            else:
                gating_weights = tf.get_variable(
                    "gating_prob_weights", [input_size, vocab_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(vocab_size)))
                gates = tf.matmul(model_input, gating_weights)

            if remove_diag:
                # remove diagonal coefficients
                diagonals = tf.matrix_diag_part(gating_weights)
                gates = gates - tf.multiply(diagonals, probabilities)

            gates = slim.batch_norm(gates, center=True, scale=True,
                                    is_training=is_training, scope="gating_prob_bn")
            gates = tf.sigmoid(gates)
            probabilities = tf.multiply(probabilities, gates)

        return {"predictions": probabilities}
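The 'prob' gating path can be read as learning class-to-class correlations on top of the MoE outputs; a minimal NumPy sketch (batch norm omitted for brevity, and the sizes are assumptions):

    import numpy as np

    vocab = 5
    probs = np.random.rand(2, vocab)                      # MoE outputs
    W = np.random.randn(vocab, vocab) / np.sqrt(vocab)    # gating_prob_weights
    gates = 1 / (1 + np.exp(-(probs @ W)))                # sigmoid(probs @ W)
    gated = probs * gates                                 # final predictions
    print(gated.shape)                                    # (2, 5)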
class linear_res_mix_act_MoeModel(models.BaseModel):
    """A softmax over a mixture of logistic models (with L2 regularization).

                         -----linear_layers(1) + sigmoid activation-----
                        -                                               -
                         -----linear_layers(2) + relu activation--------
                        -                                               -
        input_features --                                               ----moe----output
                        -                                               -
                         -----linear_layers(3) + elu activation---------
                        -                                               -
                         -----linear_layers(4) + tanh activation--------
    """

    def create_model(self, model_input, vocab_size, num_mixtures=None,
                     num_hiddens=None, num_maxout=None, l2_penalty=1e-8,
                     **unused_params):
        num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
        num_hiddens = num_hiddens or FLAGS.moe_num_hiddens
        num_maxout = num_maxout or FLAGS.num_maxout

        hidden_sigmoid = slim.fully_connected(
            model_input, num_hiddens, activation_fn=tf.nn.sigmoid,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_sigmoid')
        hidden_relu = slim.fully_connected(
            model_input, num_hiddens, activation_fn=tf.nn.relu,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_relu')
        hidden_elu = slim.fully_connected(
            model_input, num_hiddens, activation_fn=tf.nn.elu,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_elu')
        hidden_tanh = slim.fully_connected(
            model_input, num_hiddens, activation_fn=tf.nn.tanh,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_tanh')
        linear_input = slim.fully_connected(
            model_input, num_hiddens, activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope='hidden_linear')

        gate_activations = slim.fully_connected(
            model_input,
            vocab_size * (num_mixtures + 1),
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates")
        expert_activations = slim.fully_connected(
            tf.concat([hidden_sigmoid + 0.25 * linear_input,
                       hidden_relu + 0.25 * linear_input,
                       hidden_elu + 0.25 * linear_input,
                       hidden_tanh + 0.25 * linear_input], 1),
            vocab_size * num_mixtures,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="experts")

        gating_distribution = tf.nn.softmax(
            tf.reshape(gate_activations, [-1, num_mixtures + 1]))
        # (Batch * #Labels) x (num_mixtures + 1)
        expert_distribution = tf.nn.sigmoid(
            tf.reshape(expert_activations, [-1, num_mixtures]))
        # (Batch * #Labels) x num_mixtures

        final_probabilities_by_class_and_batch = tf.reduce_sum(
            gating_distribution[:, :num_mixtures] * expert_distribution, 1)
        final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                         [-1, vocab_size])
        return {"predictions": final_probabilities}
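The expert input above mixes four activations of the same hidden width, each carrying a shared 0.25-weighted linear residual; a compact NumPy equivalent (all sizes are assumptions for illustration):

    import numpy as np

    x = np.random.rand(2, 2048).astype(np.float32)        # video-level features
    Ws = [np.random.randn(2048, 100) * 0.01 for _ in range(5)]
    lin = x @ Ws[4]                                        # shared linear branch
    acts = [lambda z: 1 / (1 + np.exp(-z)),                # sigmoid
            lambda z: np.maximum(z, 0.0),                  # relu
            lambda z: np.where(z > 0, z, np.exp(z) - 1),   # elu
            np.tanh]
    branches = [f(x @ W) + 0.25 * lin for f, W in zip(acts, Ws[:4])]
    expert_in = np.concatenate(branches, axis=1)           # fed to "experts"
    print(expert_in.shape)                                 # (2, 400)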
\ No newline at end of file
        return {"predictions": probabilities}
\ No newline at end of file
...
...
보고서/최종보고서-윤영빈.docx
View file @
7e0b563
This file is too large to display.
보고서/최종보고서-윤영빈.pdf
0 → 100644
View file @
7e0b563
No preview for this file type