Authored by 윤영빈 on 2020-10-16 02:11:35 +0900
Commit e3b6e3b4947b3fe3c66c5fbbf6602dbfe1e26b99 (e3b6e3b4), 1 parent: e9ac668c

NetVLAD model test
Showing 3 changed files with 418 additions and 39 deletions:

* README.md
* web/backend/yt8m/frame_level_models.py
* web/backend/yt8m/video_level_models.py
README.md (view file @ e3b6e3b)
...
...
@@ -20,8 +20,6 @@
```markdown
![profit_hunter](/img/profit_hunter.png)

* Team name: **Profit Hunter**
* 윤영빈 (Computer Engineering, 2015104192)
* 윤준현 (Computer Engineering, 2015104193)
* 이현규 (Computer Engineering, 2015104209)
* 이태현 (Computer Engineering, 2015104208)

## Links
```
...
...
web/backend/yt8m/frame_level_models.py (view file @ e3b6e3b)
...
...
@@ -50,42 +50,6 @@ flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.")
```python
flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.")

class FrameLevelLogisticModel(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]

    denominators = tf.reshape(
        tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}

class DbofModel(models.BaseModel):
  """Creates a Deep Bag of Frames model.
```
...
...
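The docstring's remark about averaging is the crux of FrameLevelLogisticModel: the `tile`/`reshape`/`reduce_sum` ops compute a mean over only the real (unpadded) frames of each video. A minimal NumPy sketch of the same computation, with made-up shapes purely for illustration:

```python
import numpy as np

# Toy batch (hypothetical sizes): 2 videos padded to 4 frames, 3 features each.
model_input = np.random.rand(2, 4, 3).astype(np.float32)
model_input[1, 2:, :] = 0.0  # video 2 has 2 real frames; the rest is zero padding
num_frames = np.array([4.0, 2.0], dtype=np.float32)

# Same computation as the graph ops above: sum over all (zero-padded) frames,
# then divide by the true frame count so padding does not dilute the average.
avg_pooled = model_input.sum(axis=1) / num_frames[:, None]  # shape (2, 3)
```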
@@ -239,7 +203,6 @@ class LstmModel(models.BaseModel):
```python
            tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
            for _ in range(number_of_layers)
        ])

    _, state = tf.nn.dynamic_rnn(stacked_lstm,
                                 model_input,
                                 sequence_length=num_frames,
```
...
@@ -251,3 +214,300 @@ class LstmModel(models.BaseModel):
```python
    return aggregated_model().create_model(
        model_input=state[-1].h,
        vocab_size=vocab_size,
        **unused_params)

class FrameLevelLogisticModel(models.BaseModel):
  """Creates a logistic classifier over the aggregated frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """See base class.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
        input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
        frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]

    denominators = tf.reshape(
        tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))
    return {"predictions": output}
```
```python
class CNN(models.BaseModel):

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    """CNN classifier model.

    Adapted from an Estimator-style ``model(features, labels, mode, params)``
    function (the original signature survives as a stray docstring in the
    commit); ``features``, ``labels`` and ``params`` below still refer to that
    signature and are undefined in this method as committed.
    """
    images = features["image"]
    labels = labels["label"]

    tf.summary.image("images", images)

    drop_rate = 0.0
    features = images
    # Three conv/max-pool stages with 32, 64, then 128 filters.
    for i, filters in enumerate([32, 64, 128]):
      features = tf.layers.conv2d(
          features, filters=filters, kernel_size=3, padding="same",
          name="conv_%d" % (i + 1))
      features = tf.layers.max_pooling2d(
          inputs=features, pool_size=2, strides=2, padding="same",
          name="pool_%d" % (i + 1))

    features = tf.contrib.layers.flatten(features)

    features = tf.layers.dropout(features, drop_rate)
    features = tf.layers.dense(features, 512, name="dense_1")
    features = tf.layers.dropout(features, drop_rate)
    logits = tf.layers.dense(
        features, params.num_classes, activation=None, name="dense_2")

    predictions = tf.argmax(logits, axis=1)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # NOTE: `avg_pooled` is undefined here (copied from
    # FrameLevelLogisticModel above); this layer is dead code as committed.
    output = slim.fully_connected(
        avg_pooled,
        vocab_size,
        activation_fn=tf.nn.sigmoid,
        weights_regularizer=slim.l2_regularizer(1e-8))

    return {"predictions": predictions}, loss
```
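Since `CNN.create_model` keeps the Estimator-era names, it may be clearer to see the shape the original presumably had. The following is a reconstruction under TF 1.x assumptions, not code from this commit; the `params["num_classes"]` entry and the dict-style `features`/`labels` inputs are hypothetical:

```python
import tensorflow as tf  # TF 1.x assumed, matching the tf.layers/tf.contrib usage

def cnn_model_fn(features, labels, mode, params):
  """Estimator-style model_fn matching the names used inside CNN above."""
  net = features["image"]
  for i, filters in enumerate([32, 64, 128]):
    net = tf.layers.conv2d(net, filters=filters, kernel_size=3, padding="same",
                           activation=tf.nn.relu, name="conv_%d" % (i + 1))
    net = tf.layers.max_pooling2d(net, pool_size=2, strides=2, padding="same",
                                  name="pool_%d" % (i + 1))
  net = tf.layers.flatten(net)
  net = tf.layers.dense(net, 512, activation=tf.nn.relu, name="dense_1")
  logits = tf.layers.dense(net, params["num_classes"], name="dense_2")

  predictions = tf.argmax(logits, axis=1)
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels["label"],
                                                logits=logits)
  train_op = tf.train.AdamOptimizer().minimize(
      loss, global_step=tf.train.get_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
```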
```python
class NetVLAD_NonLocal_types():

  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm,
               is_training):
    self.feature_size = feature_size
    self.max_frames = max_frames
    self.is_training = is_training
    self.add_batch_norm = add_batch_norm
    self.cluster_size = cluster_size

  def forward(self, reshaped_input):
    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)

    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases",
          [self.cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])
    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

    cluster_weights2 = tf.get_variable(
        "cluster_weights2",
        [1, self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.reshape(vlad, [-1, self.feature_size])

    vlad_softmax = self.embedgaussian_relation(vlad, 1 / float(64))

    nonlocal_g = tf.get_variable(
        "nonlocal_g",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_out = tf.get_variable(
        "nonlocal_out",
        [self.cluster_size, self.feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.cluster_size)))

    vlad_g = tf.matmul(vlad, nonlocal_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.cluster_size])
    vlad_g = tf.matmul(vlad_softmax, vlad_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
    vlad_g = tf.matmul(vlad_g, nonlocal_out)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.feature_size])

    vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
    vlad = vlad + vlad_g

    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.nn.l2_normalize(vlad, 1)  # [b,f,c]

    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
    vlad = tf.nn.l2_normalize(vlad, 1)

    return vlad

  def embedgaussian_relation(self, input_, temp=1 / float(32)):
    nonlocal_theta = tf.get_variable(
        "nonlocal_theta",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_phi = tf.get_variable(
        "nonlocal_phi",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))

    vlad_theta = tf.matmul(input_, nonlocal_theta)
    vlad_phi = tf.matmul(input_, nonlocal_phi)
    vlad_theta = tf.reshape(vlad_theta,
                            [-1, self.cluster_size, self.cluster_size])
    vlad_phi = tf.reshape(vlad_phi,
                          [-1, self.cluster_size, self.cluster_size])
    vlad_softmax = tf.nn.softmax(
        temp * tf.matmul(vlad_theta, tf.transpose(vlad_phi, perm=[0, 2, 1])))
    return vlad_softmax
```
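For orientation, a shape walk through `forward`, assuming the video-branch constructor arguments used by NetVLADModelLF below (feature_size=1024, cluster_size=64, max_frames=300 after frame sampling) and a hypothetical batch of 8:

```python
# Shapes through NetVLAD_NonLocal_types.forward for the assumed sizes above.
batch, max_frames, feature_size, cluster_size = 8, 300, 1024, 64

shapes = [
    ("reshaped_input (frame rows)", (batch * max_frames, feature_size)),
    ("activation after transpose (soft assignments)",
     (batch, cluster_size, max_frames)),
    ("vlad residual aggregate", (batch, cluster_size, feature_size)),
    ("vlad_softmax (non-local relation)", (batch, cluster_size, cluster_size)),
    ("returned descriptor", (batch, cluster_size * feature_size)),
]
for name, shape in shapes:
    print("%s: %s" % (name, shape))
```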
```python
class NetVLADModelLF(models.BaseModel):
  """Creates a NetVLAD based model.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    # Hyperparameters are hard-coded here, overriding the arguments above.
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = False
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    video_NetVLAD = NetVLAD_NonLocal_types(1024, int(max_frames),
                                           int(cluster_size), add_batch_norm,
                                           is_training)
    audio_NetVLAD = NetVLAD_NonLocal_types(128, int(max_frames),
                                           int(cluster_size / 2),
                                           add_batch_norm, is_training)

    if add_batch_norm:  # and not lightvlad:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_VLAD"):
      vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

    with tf.variable_scope("audio_VLAD"):
      vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

    vlad = tf.concat([vlad_video, vlad_audio], 1)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [vlad_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))

    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases",
          [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
      gating_weights = tf.get_variable(
          "gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(hidden1_size)))
      gates = tf.matmul(activation, gating_weights)

      if remove_diag:
        # removes diagonals coefficients
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, activation)

      if add_batch_norm:
        gates = slim.batch_norm(
            gates,
            center=True,
            scale=True,
            is_training=is_training,
            scope="gating_bn")
      else:
        gating_biases = tf.get_variable(
            "gating_biases",
            [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        gates += gating_biases

      gates = tf.sigmoid(gates)
      activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models, 'willow_MoeModel')
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
```
\ No newline at end of file
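One consequence of the hard-coded hyperparameters worth noting: the concatenated VLAD descriptor is large, and `hidden1_weights` dominates the parameter count. A back-of-the-envelope check, assuming the constants exactly as committed:

```python
video_dim = 1024 * 64        # video branch: feature_size * cluster_size
audio_dim = 128 * (64 // 2)  # audio branch uses cluster_size / 2 = 32 clusters
vlad_dim = video_dim + audio_dim
print(vlad_dim)              # 69632 inputs to the hidden layer
print(vlad_dim * 1024)       # hidden1_weights: 71,303,168 parameters
```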
...
...
web/backend/yt8m/video_level_models.py (view file @ e3b6e3b)
...
...
@@ -25,6 +25,21 @@ FLAGS = flags.FLAGS
```python
flags.DEFINE_integer(
    "moe_num_mixtures", 2,
    "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
# flags.DEFINE_integer(
#     "moe_num_mixtures", 2,
#     "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")

flags.DEFINE_float("moe_l2", 1e-8, "L2 penalty for MoeModel.")
flags.DEFINE_integer("moe_low_rank_gating", -1,
                     "Low rank gating for MoeModel.")
flags.DEFINE_bool("moe_prob_gating", True, "Prob gating for MoeModel.")
flags.DEFINE_string("moe_prob_gating_input", "prob",
                    "input Prob gating for MoeModel.")

class LogisticModel(models.BaseModel):
```
...
...
@@ -111,3 +126,109 @@ class MoeModel(models.BaseModel):
```python
    final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                     [-1, vocab_size])
    return {"predictions": final_probabilities}

class willow_MoeModel(models.BaseModel):
  """A softmax over a mixture of logistic models (with L2 regularization)."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model.

    It also includes the possibility of gating the probabilities.

    The model consists of a per-class softmax distribution over a
    configurable number of logistic classifiers. One of the classifiers in the
    mixture is not trained, and always predicts 0.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    num_mixtures = 8
    low_rank_gating = FLAGS.moe_low_rank_gating
    l2_penalty = FLAGS.moe_l2
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input

    input_size = model_input.get_shape().as_list()[1]
    remove_diag = False

    if low_rank_gating == -1:
      gate_activations = slim.fully_connected(
          model_input,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates")
    else:
      gate_activations1 = slim.fully_connected(
          model_input,
          low_rank_gating,
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates1")
      gate_activations = slim.fully_connected(
          gate_activations1,
          vocab_size * (num_mixtures + 1),
          activation_fn=None,
          biases_initializer=None,
          weights_regularizer=slim.l2_regularizer(l2_penalty),
          scope="gates2")

    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(
        tf.reshape(gate_activations, [-1, num_mixtures + 1]))
    # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(
        tf.reshape(expert_activations, [-1, num_mixtures]))
    # (Batch * #Labels) x num_mixtures

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])

    if gating_probabilities:
      if gating_input == 'prob':
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [vocab_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(probabilities, gating_weights)
      else:
        gating_weights = tf.get_variable(
            "gating_prob_weights",
            [input_size, vocab_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(vocab_size)))
        gates = tf.matmul(model_input, gating_weights)

      if remove_diag:
        # removes diagonals coefficients
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, probabilities)

      gates = slim.batch_norm(
          gates,
          center=True,
          scale=True,
          is_training=is_training,
          scope="gating_prob_bn")
      gates = tf.sigmoid(gates)

      probabilities = tf.multiply(probabilities, gates)

    return {"predictions": probabilities}
```
\ No newline at end of file
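The mixture computation above is easy to sanity-check: per (example, label) pair, a softmax gate over `num_mixtures + 1` options weights sigmoid experts, and the implicit extra "dummy expert" always predicts 0, so it only withholds probability mass. A toy NumPy sketch with made-up sizes:

```python
import numpy as np

batch, vocab_size, num_mixtures = 2, 5, 8
rng = np.random.default_rng(0)
gate_act = rng.normal(size=(batch * vocab_size, num_mixtures + 1))
expert_act = rng.normal(size=(batch * vocab_size, num_mixtures))

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

gating = softmax(gate_act)                    # (B*V) x (num_mixtures + 1)
experts = 1.0 / (1.0 + np.exp(-expert_act))   # (B*V) x num_mixtures, sigmoid

# The (num_mixtures + 1)-th gate multiplies an implicit expert that always
# outputs 0, so only the first num_mixtures columns contribute.
probs = (gating[:, :num_mixtures] * experts).sum(axis=1)
probs = probs.reshape(batch, vocab_size)
assert probs.min() >= 0.0 and probs.max() <= 1.0
```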