# Copyright 2017 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains a collection of models which operate on variable-length sequences.
"""
import math

import models
import video_level_models
import tensorflow as tf
import model_utils as utils

import tensorflow.contrib.slim as slim
from tensorflow import flags

FLAGS = flags.FLAGS
flags.DEFINE_integer("iterations", 30, "Number of frames per batch for DBoF.")
flags.DEFINE_bool("dbof_add_batch_norm", True,
                  "Adds batch normalization to the DBoF model.")
flags.DEFINE_bool(
    "sample_random_frames", True,
    "If true, samples random frames (for frame-level models). If false, a "
    "random sequence of frames is sampled instead.")
flags.DEFINE_integer("dbof_cluster_size", 8192,
                     "Number of units in the DBoF cluster layer.")
flags.DEFINE_integer("dbof_hidden_size", 1024,
                     "Number of units in the DBoF hidden layer.")
flags.DEFINE_string(
    "dbof_pooling_method", "max",
    "The pooling method used in the DBoF cluster layer. "
    "Choices are 'average' and 'max'.")
flags.DEFINE_string(
    "dbof_activation", "sigmoid",
    "The nonlinear activation method for cluster and hidden dense layer, e.g., "
    "sigmoid, relu6, etc.")
flags.DEFINE_string(
    "video_level_classifier_model", "MoeModel",
    "Some frame-level models can be decomposed into a "
    "generalized pooling operation followed by a "
    "classifier layer.")
flags.DEFINE_integer("lstm_cells", 512, "Number of LSTM cells.")
flags.DEFINE_integer("lstm_layers", 4, "Number of LSTM layers.")

flags.DEFINE_integer("input_type", 3,
                     "input type.")
flags.DEFINE_integer("conv_length", 3, "Receptive field of cnn.")
flags.DEFINE_integer("conv_hidden", 256, "Number of cnn hidden.")
flags.DEFINE_integer("conv_hidden1", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("conv_hidden2", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("conv_hidden3", 1024, "Number of cnn hidden.")
flags.DEFINE_integer("stride", 10, "Number of stride for short rnn.")

class FrameLevelLogisticModel(models.BaseModel):
  """Logistic classifier over the average of the frame-level features."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    # Average the frame features over the valid frames only: padded frames
    # are zero vectors, so dividing the per-feature sum by num_frames gives
    # the mean over the frames that are actually present.
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    feature_size = model_input.get_shape().as_list()[2]

    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
                              [-1, feature_size])
    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

    output = slim.fully_connected(avg_pooled,
                                  vocab_size,
                                  activation_fn=tf.nn.sigmoid,
                                  weights_regularizer=slim.l2_regularizer(1e-8))

    return {"predictions": output}

class NetVLAD_NonLocal_types():
    """NetVLAD pooling followed by a non-local (embedded Gaussian) block
    relating the cluster-level descriptors to each other."""

    def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm, is_training):
        self.feature_size = feature_size
        self.max_frames = max_frames
        self.is_training = is_training
        self.add_batch_norm = add_batch_norm
        self.cluster_size = cluster_size

    def forward(self, reshaped_input):

        cluster_weights = tf.get_variable("cluster_weights",
              [int(self.feature_size), int(self.cluster_size)],
              initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))

        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)

        if self.add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=self.is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [self.cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
          tf.summary.histogram("cluster_biases", cluster_biases)
          activation += cluster_biases

        # Soft-assignment of each frame to each cluster.
        activation = tf.nn.softmax(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, int(self.max_frames), int(self.cluster_size)])

        # Total assignment mass per cluster, used below to subtract the
        # scaled cluster centers from the aggregated descriptors.
        a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

        cluster_weights2 = tf.get_variable("cluster_weights2",
            [1,int(self.feature_size), int(self.cluster_size)],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))

        a = tf.multiply(a_sum, cluster_weights2)

        activation = tf.transpose(activation, perm=[0, 2, 1])

        # Aggregate frame features per cluster and subtract the scaled
        # cluster centers, giving the VLAD residuals.
        reshaped_input = tf.reshape(reshaped_input, [-1, self.max_frames, self.feature_size])
        vlad = tf.matmul(activation, reshaped_input)
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.subtract(vlad, a)

        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.reshape(vlad, [-1, self.feature_size])

        # Pairwise cluster-to-cluster affinities (embedded Gaussian).
        vlad_softmax = self.embedgaussian_relation(vlad, 1 / float(64))

        # Non-local block: transform the residuals (g), mix them with the
        # affinity matrix, project back (out), and add as a residual.
        nonlocal_g = tf.get_variable("nonlocal_g",
              [int(self.feature_size), int(self.cluster_size)],
              initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
        nonlocal_out = tf.get_variable("nonlocal_out",
              [int(self.cluster_size), int(self.feature_size)],
              initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.cluster_size)))

        vlad_g = tf.matmul(vlad, nonlocal_g)
        vlad_g = tf.reshape(vlad_g, [-1, int(self.cluster_size), int(self.cluster_size)])
        vlad_g = tf.matmul(vlad_softmax, vlad_g)
        vlad_g = tf.reshape(vlad_g, [-1, int(self.cluster_size)])

        vlad_g = tf.matmul(vlad_g, nonlocal_out)
        vlad_g = tf.reshape(vlad_g, [-1, int(self.cluster_size), int(self.feature_size)])
        vlad = tf.reshape(vlad, [-1, int(self.cluster_size), int(self.feature_size)])
        vlad = vlad + vlad_g

        # Intra-normalize each cluster's residual over the feature
        # dimension, then flatten and L2-normalize the full descriptor.
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.nn.l2_normalize(vlad, 1)  # [b, f, c]

        vlad = tf.reshape(vlad, [-1, int(self.cluster_size * self.feature_size)])
        vlad = tf.nn.l2_normalize(vlad, 1)

        return vlad
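
    # Shape walkthrough for forward() (hypothetical sizes: batch B,
    # max_frames M, feature_size F, cluster_size K):
    #   reshaped_input               [B*M, F]
    #   soft assignments             [B, M, K], transposed to [B, K, M]
    #   vlad = assignments @ input   [B, K, F], minus a_sum * centers
    #   non-local residual vlad_g    [B, K, F], added back onto vlad
    #   final descriptor             [B, K*F], intra- and L2-normalized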

    def embedgaussian_relation(self, input_, temp=1/float(32)):
      # Embedded-Gaussian relation from non-local networks: a row-normalized
      # affinity softmax(temp * theta(x) @ phi(x)^T) between VLAD clusters.
      nonlocal_theta = tf.get_variable("nonlocal_theta",
            [int(self.feature_size), int(self.cluster_size)],
            initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
      nonlocal_phi = tf.get_variable("nonlocal_phi",
            [int(self.feature_size), int(self.cluster_size)],
            initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))

      vlad_theta = tf.matmul(input_, nonlocal_theta)
      vlad_phi = tf.matmul(input_, nonlocal_phi)
      vlad_theta = tf.reshape(vlad_theta, [-1, int(self.cluster_size), int(self.cluster_size)])
      vlad_phi = tf.reshape(vlad_phi, [-1, int(self.cluster_size), int(self.cluster_size)])
      vlad_softmax = tf.nn.softmax(temp * tf.matmul(vlad_theta, tf.transpose(vlad_phi,perm=[0,2,1])))
      return vlad_softmax

class NetVLADModelLF(models.BaseModel):
  """NetVLAD model with non-local blocks and context gating, applied to a
  late fusion of video and audio features."""

  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    # Note: the hyperparameters below are hard-coded and override both the
    # keyword arguments and the command-line flags.
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 64
    hidden1_size = 1024
    relu = True
    dimred = -1
    gating = True
    remove_diag = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)


    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    # The first 1024 feature dims are video (RGB); the last 128 are audio.
    # The audio tower uses half as many clusters.
    video_NetVLAD = NetVLAD_NonLocal_types(1024, max_frames, cluster_size, add_batch_norm, is_training)
    audio_NetVLAD = NetVLAD_NonLocal_types(128, max_frames, cluster_size // 2, add_batch_norm, is_training)


    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_VLAD"):
        vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024])

    with tf.variable_scope("audio_VLAD"):
        vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:])

    vlad = tf.concat([vlad_video, vlad_audio],1)
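
    # Worked dimensions under the hard-coded settings above (assuming
    # YouTube-8M features): vlad_video is [B, 1024 * 64] = [B, 65536],
    # vlad_audio is [B, 128 * 32] = [B, 4096], so vlad is [B, 69632].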

    # Project the concatenated VLAD descriptor down to the hidden size.
    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable("hidden1_weights",
      [vlad_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))

    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")

    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)


    if gating:
        # Context gating: element-wise reweighting of the activations by a
        # learned sigmoid gate.
        gating_weights = tf.get_variable("gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))

        gates = tf.matmul(activation, gating_weights)

        if remove_diag:
            # Removes the diagonal coefficients so a unit cannot gate itself.
            diagonals = tf.matrix_diag_part(gating_weights)
            gates = gates - tf.multiply(diagonals, activation)

        if add_batch_norm:
          gates = slim.batch_norm(
              gates,
              center=True,
              scale=True,
              is_training=is_training,
              scope="gating_bn")
        else:
          gating_biases = tf.get_variable("gating_biases",
            [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
          gates += gating_biases

        gates = tf.sigmoid(gates)

        activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
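
# A minimal usage sketch for NetVLADModelLF (a hedged example: it assumes
# the YouTube-8M convention of 1152-dim frame features -- 1024 RGB + 128
# audio -- and a hypothetical vocabulary size of 3862):
#
#   model = NetVLADModelLF()
#   frames = tf.placeholder(tf.float32, [None, 300, 1152])
#   lengths = tf.placeholder(tf.int32, [None])
#   predictions = model.create_model(
#       model_input=frames, vocab_size=3862,
#       num_frames=lengths)["predictions"]  # -> [batch, 3862]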

class LstmModel(models.BaseModel):
  """Stacked-LSTM model whose final hidden state feeds a video-level
  classifier."""

  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
    lstm_size = FLAGS.lstm_cells
    number_of_layers = FLAGS.lstm_layers

    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
            [
                tf.contrib.rnn.BasicLSTMCell(
                    lstm_size, forget_bias=1.0)
                for _ in range(number_of_layers)
                ])

    # Run the stacked LSTM over the variable-length frame sequence; only
    # the final state is used, so the per-step outputs are discarded.
    outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    # Classify from the hidden state of the top LSTM layer.
    return aggregated_model().create_model(
        model_input=state[-1].h,
        vocab_size=vocab_size,
        **unused_params)
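
# A minimal smoke test (a hedged sketch, not part of the original training
# pipeline): it assumes the sibling YouTube-8M starter modules imported at
# the top of this file are available, and uses hypothetical shapes and a
# hypothetical vocabulary size.
if __name__ == "__main__":
  with tf.Graph().as_default():
    dummy_input = tf.random_normal([2, 300, 1152])
    dummy_frames = tf.constant([120, 300], dtype=tf.int32)
    result = FrameLevelLogisticModel().create_model(
        model_input=dummy_input, vocab_size=3862, num_frames=dummy_frames)
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      print(sess.run(result["predictions"]).shape)  # expected: (2, 3862)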