윤영빈

final report almost done

...@@ -65,128 +65,21 @@ flags.DEFINE_integer("conv_hidden2", 1024, "Number of cnn hidden.")
65 flags.DEFINE_integer("conv_hidden3", 1024, "Number of cnn hidden.")
66 flags.DEFINE_integer("stride", 10, "Number of stride for short rnn.")
67
68 -class DbofModel(models.BaseModel):
69 -  """Creates a Deep Bag of Frames model.
70 -  The model projects the features for each frame into a higher dimensional
71 -  'clustering' space, pools across frames in that space, and then
72 -  uses a configurable video-level model to classify the now aggregated features.
73 -  The model will randomly sample either frames or sequences of frames during
74 -  training to speed up convergence.
75 -  """
76 -
77 -  ACT_FN_MAP = {
78 -      "sigmoid": tf.nn.sigmoid,
79 -      "relu6": tf.nn.relu6,
80 -  }
81 -
82 -  def create_model(self,
83 -                   model_input,
84 -                   vocab_size,
85 -                   num_frames,
86 -                   iterations=None,
87 -                   add_batch_norm=None,
88 -                   sample_random_frames=None,
89 -                   cluster_size=None,
90 -                   hidden_size=None,
91 -                   is_training=True,
92 -                   **unused_params):
93 -    """See base class.
94 -    Args:
95 -      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
96 -        input features.
97 -      vocab_size: The number of classes in the dataset.
98 -      num_frames: A vector of length 'batch' which indicates the number of
99 -        frames for each video (before padding).
100 -      iterations: the number of frames to be sampled.
101 -      add_batch_norm: whether to add batch norm during training.
102 -      sample_random_frames: whether to sample random frames or random sequences.
103 -      cluster_size: the output neuron number of the cluster layer.
104 -      hidden_size: the output neuron number of the hidden layer.
105 -      is_training: whether to build the graph in training mode.
106 -    Returns:
107 -      A dictionary with a tensor containing the probability predictions of the
108 -      model in the 'predictions' key. The dimensions of the tensor are
109 -      'batch_size' x 'num_classes'.
110 -    """
111 -    iterations = iterations or FLAGS.iterations
112 -    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
113 -    random_frames = sample_random_frames or FLAGS.sample_random_frames
114 -    cluster_size = cluster_size or FLAGS.dbof_cluster_size
115 -    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
116 -    act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation)
117 -    assert act_fn is not None, ("dbof_activation is not valid: %s." %
118 -                                FLAGS.dbof_activation)
119 -
68 +class FrameLevelLogisticModel(models.BaseModel):
69 +  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
120     num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
121 -    if random_frames:
122 -      model_input = utils.SampleRandomFrames(model_input, num_frames,
123 -                                             iterations)
124 -    else:
125 -      model_input = utils.SampleRandomSequence(model_input, num_frames,
126 -                                               iterations)
127 -    max_frames = model_input.get_shape().as_list()[1]
128     feature_size = model_input.get_shape().as_list()[2]
129 -    reshaped_input = tf.reshape(model_input, [-1, feature_size])
130 -    tf.compat.v1.summary.histogram("input_hist", reshaped_input)
131
132 -    if add_batch_norm:
133 -      reshaped_input = slim.batch_norm(reshaped_input,
134 -                                       center=True,
135 -                                       scale=True,
136 -                                       is_training=is_training,
137 -                                       scope="input_bn")
73 +    denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
74 +                              [-1, feature_size])
75 +    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
138
139 -    cluster_weights = tf.compat.v1.get_variable(
140 -        "cluster_weights", [feature_size, cluster_size],
141 -        initializer=tf.random_normal_initializer(stddev=1 /
142 -                                                 math.sqrt(feature_size)))
143 -    tf.compat.v1.summary.histogram("cluster_weights", cluster_weights)
144 -    activation = tf.matmul(reshaped_input, cluster_weights)
145 -    if add_batch_norm:
146 -      activation = slim.batch_norm(activation,
147 -                                   center=True,
148 -                                   scale=True,
149 -                                   is_training=is_training,
150 -                                   scope="cluster_bn")
151 -    else:
152 -      cluster_biases = tf.compat.v1.get_variable(
153 -          "cluster_biases", [cluster_size],
154 -          initializer=tf.random_normal_initializer(stddev=1 /
155 -                                                   math.sqrt(feature_size)))
156 -      tf.compat.v1.summary.histogram("cluster_biases", cluster_biases)
157 -      activation += cluster_biases
158 -    activation = act_fn(activation)
159 -    tf.compat.v1.summary.histogram("cluster_output", activation)
160 -
161 -    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
162 -    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
163 -
164 -    hidden1_weights = tf.compat.v1.get_variable(
165 -        "hidden1_weights", [cluster_size, hidden1_size],
166 -        initializer=tf.random_normal_initializer(stddev=1 /
167 -                                                 math.sqrt(cluster_size)))
168 -    tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights)
169 -    activation = tf.matmul(activation, hidden1_weights)
170 -    if add_batch_norm:
171 -      activation = slim.batch_norm(activation,
172 -                                   center=True,
173 -                                   scale=True,
174 -                                   is_training=is_training,
175 -                                   scope="hidden1_bn")
176 -    else:
177 -      hidden1_biases = tf.compat.v1.get_variable(
178 -          "hidden1_biases", [hidden1_size],
179 -          initializer=tf.random_normal_initializer(stddev=0.01))
180 -      tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases)
181 -      activation += hidden1_biases
182 -    activation = act_fn(activation)
183 -    tf.compat.v1.summary.histogram("hidden1_output", activation)
77 +    output = slim.fully_connected(avg_pooled,
78 +                                  vocab_size,
79 +                                  activation_fn=tf.nn.sigmoid,
80 +                                  weights_regularizer=slim.l2_regularizer(1e-8))
184
185 -    aggregated_model = getattr(video_level_models,
186 -                               FLAGS.video_level_classifier_model)
187 -    return aggregated_model().create_model(model_input=activation,
188 -                                           vocab_size=vocab_size,
189 -                                           **unused_params)
82 +    return {"predictions": output}
190
191 class NetVLAD_NonLocal_types():
192   def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training):
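The new FrameLevelLogisticModel replaces DbofModel's sampled clustering pipeline with the simplest frame-level baseline: average the frame features over each video's true length (padding is zero, so a plain sum works) and classify with an independent per-label sigmoid. A minimal NumPy sketch of that forward pass (illustrative only, not part of the diff; w and b stand in for the fully connected layer's parameters):

import numpy as np

def frame_level_logistic(x, num_frames, w, b):
    # x: [batch, max_frames, feature_size], zero-padded past each video's length.
    # num_frames: [batch] true frame counts; w: [feature_size, vocab_size].
    avg_pooled = x.sum(axis=1) / num_frames[:, None]   # mean over real frames
    logits = avg_pooled @ w + b
    return 1.0 / (1.0 + np.exp(-logits))               # per-class sigmoid probs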
...@@ -286,20 +179,6 @@ class NetVLAD_NonLocal_types():
286     return vlad_softmax
287
288 class NetVLADModelLF(models.BaseModel):
289 - """Creates a NetVLAD based model.
290 - Args:
291 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
292 - input features.
293 - vocab_size: The number of classes in the dataset.
294 - num_frames: A vector of length 'batch' which indicates the number of
295 - frames for each video (before padding).
296 - Returns:
297 - A dictionary with a tensor containing the probability predictions of the
298 - model in the 'predictions' key. The dimensions of the tensor are
299 - 'batch_size' x 'num_classes'.
300 - """
301 -
302 -
303   def create_model(self,
304                    model_input,
305                    vocab_size,
...@@ -420,1558 +299,30 @@ class NetVLADModelLF(models.BaseModel):
420         is_training=is_training,
421         **unused_params)
422
423 -class GruModel(models.BaseModel):
424 -
425 -  def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
426 -    """Creates a model which uses a stack of GRUs to represent the video.
427 -    Args:
428 -      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
429 -                   input features.
430 -      vocab_size: The number of classes in the dataset.
431 -      num_frames: A vector of length 'batch' which indicates the number of
432 -                  frames for each video (before padding).
433 -    Returns:
434 -      A dictionary with a tensor containing the probability predictions of the
435 -      model in the 'predictions' key. The dimensions of the tensor are
436 -      'batch_size' x 'num_classes'.
437 -    """
438 -    gru_size = 600
439 -    number_of_layers = 4
440 -    backward = False
441 -    random_frames = False
442 -    iterations = 30
443 -
444 -    if random_frames:
445 -      num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
446 -      model_input = utils.SampleRandomFrames(model_input, num_frames_2,
447 -                                             iterations)
302 +class LstmModel(models.BaseModel):
448
449 -    if backward:
450 -      model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1)
304 +  def create_model(self, model_input, vocab_size, num_frames, **unused_params):
305 +    lstm_size = FLAGS.lstm_cells
306 +    number_of_layers = FLAGS.lstm_layers
451
452 -    stacked_GRU = tf.contrib.rnn.MultiRNNCell(
308 +    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
453         [
454 -            tf.contrib.rnn.GRUCell(gru_size)
310 +            tf.contrib.rnn.BasicLSTMCell(
311 +                lstm_size, forget_bias=1.0)
455             for _ in range(number_of_layers)
456 -        ], state_is_tuple=False)
313 +        ])
457
458     loss = 0.0
459 -    with tf.variable_scope("RNN"):
460 -      outputs, state = tf.nn.dynamic_rnn(stacked_GRU, model_input,
316 +
317 +    outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
461                                        sequence_length=num_frames,
462                                        dtype=tf.float32)
463
464     aggregated_model = getattr(video_level_models,
465 -                               'MoeModel')
322 +                               FLAGS.video_level_classifier_model)
323 +
466     return aggregated_model().create_model(
467 -        model_input=state,
325 +        model_input=state[-1].h,
468         vocab_size=vocab_size,
469 -        is_training=is_training,
470         **unused_params)
471
\ No newline at end of file
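The added LstmModel feeds only the top layer's final hidden state (state[-1].h) to the video-level classifier. A rough tf.keras equivalent (illustrative; the diff itself targets the TF1 tf.contrib.rnn API, and lstm_size/num_layers stand in for FLAGS.lstm_cells/FLAGS.lstm_layers):

import tensorflow as tf

def build_lstm_encoder(feature_size, lstm_size=1024, num_layers=2):
    # Variable-length frame sequences in, final top-layer hidden state out.
    inputs = tf.keras.Input(shape=(None, feature_size))
    x = inputs
    for _ in range(num_layers - 1):
        x = tf.keras.layers.LSTM(lstm_size, return_sequences=True)(x)
    state = tf.keras.layers.LSTM(lstm_size)(x)   # analogous to state[-1].h
    return tf.keras.Model(inputs, state)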
472 -
473 -
474 -class SoftDBoF():
475 - def __init__(self, feature_size,max_frames,cluster_size, max_pool, add_batch_norm, is_training):
476 - self.feature_size = feature_size
477 - self.max_frames = max_frames
478 - self.is_training = is_training
479 - self.add_batch_norm = add_batch_norm
480 - self.cluster_size = cluster_size
481 - self.max_pool = max_pool
482 -
483 - def forward(self, reshaped_input):
484 -
485 - feature_size = self.feature_size
486 - cluster_size = self.cluster_size
487 - add_batch_norm = self.add_batch_norm
488 - max_frames = self.max_frames
489 - is_training = self.is_training
490 - max_pool = self.max_pool
491 -
492 - cluster_weights = tf.get_variable("cluster_weights",
493 - [feature_size, cluster_size],
494 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
495 -
496 - tf.summary.histogram("cluster_weights", cluster_weights)
497 - activation = tf.matmul(reshaped_input, cluster_weights)
498 -
499 - if add_batch_norm:
500 - activation = slim.batch_norm(
501 - activation,
502 - center=True,
503 - scale=True,
504 - is_training=is_training,
505 - scope="cluster_bn")
506 - else:
507 - cluster_biases = tf.get_variable("cluster_biases",
508 - [cluster_size],
509 - initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
510 - tf.summary.histogram("cluster_biases", cluster_biases)
511 - activation += cluster_biases
512 -
513 - activation = tf.nn.softmax(activation)
514 -
515 - activation = tf.reshape(activation, [-1, int(max_frames), int(cluster_size)])
516 -
517 - activation_sum = tf.reduce_sum(activation,1)
518 - activation_sum = tf.nn.l2_normalize(activation_sum,1)
519 -
520 - if max_pool:
521 - activation_max = tf.reduce_max(activation,1)
522 - activation_max = tf.nn.l2_normalize(activation_max,1)
523 - activation = tf.concat([activation_sum,activation_max],1)
524 - else:
525 - activation = activation_sum
526 -
527 - return activation
528 -
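SoftDBoF.forward computes a soft bag-of-frames: each frame gets a softmax assignment over cluster_size codewords, and sum-pooling those assignments over frames yields an L2-normalized soft histogram, optionally concatenated with a max-pooled one. A minimal NumPy sketch of the pooling for one video, ignoring the batch-norm/bias branch:

import numpy as np

def soft_dbof(frames, cluster_weights, max_pool=False):
    # frames: [num_frames, feature_size]; cluster_weights: [feature_size, cluster_size]
    logits = frames @ cluster_weights
    logits -= logits.max(axis=1, keepdims=True)               # stable softmax
    assign = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    hist_sum = assign.sum(axis=0)                             # soft histogram
    hist_sum /= np.linalg.norm(hist_sum) + 1e-8
    if max_pool:
        hist_max = assign.max(axis=0)
        hist_max /= np.linalg.norm(hist_max) + 1e-8
        return np.concatenate([hist_sum, hist_max])
    return hist_sum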
529 -
530 -class LightVLAD_nonlocal():
531 - def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training):
532 - self.feature_size = feature_size
533 - self.max_frames = max_frames
534 - self.is_training = is_training
535 - self.add_batch_norm = add_batch_norm
536 - self.cluster_size = cluster_size
537 -
538 - def forward(self,reshaped_input):
539 -
540 -
541 - cluster_weights = tf.get_variable("cluster_weights",
542 - [int(self.feature_size), int(self.cluster_size)],
543 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
544 -
545 - activation = tf.matmul(reshaped_input, cluster_weights)
546 -
547 - if self.add_batch_norm:
548 - activation = slim.batch_norm(
549 - activation,
550 - center=True,
551 - scale=True,
552 - is_training=self.is_training,
553 - scope="cluster_bn")
554 - else:
555 - cluster_biases = tf.get_variable("cluster_biases",
556 - [cluster_size],
557 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
558 - tf.summary.histogram("cluster_biases", cluster_biases)
559 - activation += cluster_biases
560 -
561 - activation = tf.nn.softmax(activation)
562 -
563 - activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
564 -
565 - activation = tf.transpose(activation,perm=[0,2,1])
566 -
567 - reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size])
568 - vlad = tf.matmul(activation,reshaped_input)
569 -
570 - vlad = tf.reshape(vlad, [-1,self.feature_size])
571 - vlad = nonLocal_block(vlad, feature_size=self.feature_size, hidden_size=self.feature_size//2, cluster_size=self.cluster_size)
572 -
573 - vlad = tf.reshape(vlad, [-1,self.cluster_size,self.feature_size])
574 - vlad = tf.transpose(vlad,perm=[0,2,1])
575 -
576 - vlad = tf.nn.l2_normalize(vlad,1)
577 -
578 - vlad = tf.reshape(vlad,[-1,int(self.cluster_size*self.feature_size)])
579 - vlad = tf.nn.l2_normalize(vlad,1)
580 -
581 - return vlad
582 -
583 -class LightNetVLADModelLF(models.BaseModel):
584 - """Creates a NetVLAD based model.
585 - Args:
586 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
587 - input features.
588 - vocab_size: The number of classes in the dataset.
589 - num_frames: A vector of length 'batch' which indicates the number of
590 - frames for each video (before padding).
591 - Returns:
592 - A dictionary with a tensor containing the probability predictions of the
593 - model in the 'predictions' key. The dimensions of the tensor are
594 - 'batch_size' x 'num_classes'.
595 - """
596 -
597 -
598 - def create_model(self,
599 - model_input,
600 - vocab_size,
601 - num_frames,
602 - iterations=None,
603 - add_batch_norm=None,
604 - sample_random_frames=None,
605 - cluster_size=None,
606 - hidden_size=None,
607 - is_training=True,
608 - **unused_params):
609 - iterations = 300
610 - add_batch_norm = True
611 - random_frames = True
612 - cluster_size = 64
613 - hidden1_size = 1024
614 - relu = False
615 - dimred = -1
616 - gating = True
617 - remove_diag = False
618 -
619 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
620 - if random_frames:
621 - model_input = utils.SampleRandomFrames(model_input, num_frames,
622 - iterations)
623 - else:
624 - model_input = utils.SampleRandomSequence(model_input, num_frames,
625 - iterations)
626 -
627 -
628 - max_frames = model_input.get_shape().as_list()[1]
629 - feature_size = model_input.get_shape().as_list()[2]
630 - reshaped_input = tf.reshape(model_input, [-1, feature_size])
631 -
632 -
633 - video_NetVLAD = LightVLAD_nonlocal(1024,max_frames,cluster_size, add_batch_norm, is_training)
634 - audio_NetVLAD = LightVLAD_nonlocal(128,max_frames,cluster_size/2, add_batch_norm, is_training)
635 -
636 -
637 - if add_batch_norm:# and not lightvlad:
638 - reshaped_input = slim.batch_norm(
639 - reshaped_input,
640 - center=True,
641 - scale=True,
642 - is_training=is_training,
643 - scope="input_bn")
644 -
645 - with tf.variable_scope("video_VLAD"):
646 - vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024])
647 -
648 - with tf.variable_scope("audio_VLAD"):
649 - vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:])
650 -
651 - vlad = tf.concat([vlad_video, vlad_audio],1)
652 -
653 - vlad_dim = vlad.get_shape().as_list()[1]
654 - hidden1_weights = tf.get_variable("hidden1_weights",
655 - [vlad_dim, hidden1_size],
656 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
657 -
658 - activation = tf.matmul(vlad, hidden1_weights)
659 -
660 - if add_batch_norm and relu:
661 - activation = slim.batch_norm(
662 - activation,
663 - center=True,
664 - scale=True,
665 - is_training=is_training,
666 - scope="hidden1_bn")
667 -
668 - else:
669 - hidden1_biases = tf.get_variable("hidden1_biases",
670 - [hidden1_size],
671 - initializer = tf.random_normal_initializer(stddev=0.01))
672 - tf.summary.histogram("hidden1_biases", hidden1_biases)
673 - activation += hidden1_biases
674 -
675 - if relu:
676 - activation = tf.nn.relu6(activation)
677 -
678 -
679 - if gating:
680 - gating_weights = tf.get_variable("gating_weights_2",
681 - [hidden1_size, hidden1_size],
682 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
683 -
684 - gates = tf.matmul(activation, gating_weights)
685 -
686 - if remove_diag:
687 - #removes diagonals coefficients
688 - diagonals = tf.matrix_diag_part(gating_weights)
689 - gates = gates - tf.multiply(diagonals,activation)
690 -
691 -
692 - if add_batch_norm:
693 - gates = slim.batch_norm(
694 - gates,
695 - center=True,
696 - scale=True,
697 - is_training=is_training,
698 - scope="gating_bn")
699 - else:
700 - gating_biases = tf.get_variable("gating_biases",
701 - [cluster_size],
702 - initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
703 - gates += gating_biases
704 -
705 - gates = tf.sigmoid(gates)
706 -
707 - activation = tf.multiply(activation,gates)
708 -
709 - aggregated_model = getattr(video_level_models,
710 - FLAGS.video_level_classifier_model)
711 -
712 -
713 - return aggregated_model().create_model(
714 - model_input=activation,
715 - vocab_size=vocab_size,
716 - is_training=is_training,
717 - **unused_params)
718 -
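The gating branch above implements context gating in the style of Miech et al.'s learnable-pooling models: a sigmoid gate computed from the activation itself reweights each hidden dimension. A minimal NumPy sketch (illustrative; the explicit bias stands in for the batch-norm path the code uses):

import numpy as np

def context_gating(activation, gating_weights, gating_biases):
    # activation: [batch, hidden]; gating_weights: [hidden, hidden]
    gates = 1.0 / (1.0 + np.exp(-(activation @ gating_weights + gating_biases)))
    return activation * gates   # elementwise reweighting of the activation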
719 -def nonLocal_block(vlad, feature_size, hidden_size, cluster_size):
720 - nonlocal_theta = tf.get_variable("nonlocal_theta",
721 - [feature_size, hidden_size],
722 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
723 - nonlocal_phi = tf.get_variable("nonlocal_phi",
724 - [feature_size, hidden_size],
725 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
726 - nonlocal_g = tf.get_variable("nonlocal_g",
727 - [feature_size, hidden_size],
728 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
729 - nonlocal_out = tf.get_variable("nonlocal_out",
730 - [hidden_size, feature_size],
731 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden_size)))
732 -
733 - vlad_theta = tf.matmul(vlad, nonlocal_theta)
734 - vlad_phi = tf.matmul(vlad, nonlocal_phi)
735 - vlad_g = tf.matmul(vlad, nonlocal_g)
736 -
737 - vlad_theta = tf.reshape(vlad_theta, [-1, cluster_size, hidden_size])
738 - vlad_phi = tf.reshape(vlad_phi, [-1, cluster_size, hidden_size])
739 - vlad_g = tf.reshape(vlad_phi, [-1, cluster_size, hidden_size])
740 -
741 - vlad_softmax = tf.nn.softmax(feature_size**-.5 * tf.matmul(vlad_theta, tf.transpose(vlad_phi,perm=[0,2,1])))
742 - vlad_g = tf.matmul(vlad_softmax, vlad_g)
743 - vlad_g = tf.reshape(vlad_g, [-1, hidden_size])
744 -
745 - vlad_g = tf.matmul(vlad_g, nonlocal_out)
746 - vlad = vlad + vlad_g
747 - return vlad
748 -
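nonLocal_block is scaled dot-product self-attention over the cluster axis with a residual connection: theta and phi form the affinity matrix, which reweights the g projection before it is mapped back to feature size and added to the input. Note the code assigns vlad_g = tf.reshape(vlad_phi, ...), so g is effectively overwritten by the phi projection; the NumPy sketch below (illustrative) follows the intended g path:

import numpy as np

def nonlocal_block(vlad, w_theta, w_phi, w_g, w_out):
    # vlad: [cluster_size, feature_size]; w_theta/w_phi/w_g: [feature_size, hidden]
    # w_out: [hidden, feature_size]
    theta, phi, g = vlad @ w_theta, vlad @ w_phi, vlad @ w_g
    affinity = theta @ phi.T / np.sqrt(vlad.shape[1])         # scaled dot product
    affinity = np.exp(affinity - affinity.max(axis=1, keepdims=True))
    affinity /= affinity.sum(axis=1, keepdims=True)           # row-wise softmax
    return vlad + (affinity @ g) @ w_out                      # residual connection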
749 -class SoftDbofModelLF(models.BaseModel):
750 - """Creates a Soft Deep Bag of Frames model.
751 - The model projects the features for each frame into a higher dimensional
752 - 'clustering' space, pools across frames in that space, and then
753 - uses a configurable video-level model to classify the now aggregated features.
754 - The model will randomly sample either frames or sequences of frames during
755 - training to speed up convergence.
756 - Args:
757 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
758 - input features.
759 - vocab_size: The number of classes in the dataset.
760 - num_frames: A vector of length 'batch' which indicates the number of
761 - frames for each video (before padding).
762 - Returns:
763 - A dictionary with a tensor containing the probability predictions of the
764 - model in the 'predictions' key. The dimensions of the tensor are
765 - 'batch_size' x 'num_classes'.
766 - """
767 -
768 - def create_model(self,
769 - model_input,
770 - vocab_size,
771 - num_frames,
772 - iterations=None,
773 - add_batch_norm=None,
774 - sample_random_frames=None,
775 - cluster_size=None,
776 - hidden_size=None,
777 - is_training=True,
778 - **unused_params):
779 - iterations = 300
780 - add_batch_norm = True
781 - random_frames = True
782 - cluster_size = 4000
783 - hidden1_size = 1024
784 - fc_dimred = True
785 - relu = False
786 - max_pool = False
787 -
788 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
789 - if random_frames:
790 - model_input = utils.SampleRandomFrames(model_input, num_frames,
791 - iterations)
792 - else:
793 - model_input = utils.SampleRandomSequence(model_input, num_frames,
794 - iterations)
795 - max_frames = model_input.get_shape().as_list()[1]
796 - feature_size = model_input.get_shape().as_list()[2]
797 - reshaped_input = tf.reshape(model_input, [-1, feature_size])
798 - tf.summary.histogram("input_hist", reshaped_input)
799 -
800 - video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training)
801 - audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training)
802 -
803 -
804 - if add_batch_norm:
805 - reshaped_input = slim.batch_norm(
806 - reshaped_input,
807 - center=True,
808 - scale=True,
809 - is_training=is_training,
810 - scope="input_bn")
811 -
812 - with tf.variable_scope("video_DBOF"):
813 - dbof_video = video_Dbof.forward(reshaped_input[:,0:1024])
814 -
815 - with tf.variable_scope("audio_DBOF"):
816 - dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:])
817 -
818 - dbof = tf.concat([dbof_video, dbof_audio],1)
819 -
820 - dbof_dim = dbof.get_shape().as_list()[1]
821 -
822 - if fc_dimred:
823 - hidden1_weights = tf.get_variable("hidden1_weights",
824 - [dbof_dim, hidden1_size],
825 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
826 - tf.summary.histogram("hidden1_weights", hidden1_weights)
827 - activation = tf.matmul(dbof, hidden1_weights)
828 -
829 - if add_batch_norm and relu:
830 - activation = slim.batch_norm(
831 - activation,
832 - center=True,
833 - scale=True,
834 - is_training=is_training,
835 - scope="hidden1_bn")
836 - else:
837 - hidden1_biases = tf.get_variable("hidden1_biases",
838 - [hidden1_size],
839 - initializer = tf.random_normal_initializer(stddev=0.01))
840 - tf.summary.histogram("hidden1_biases", hidden1_biases)
841 - activation += hidden1_biases
842 -
843 - if relu:
844 - activation = tf.nn.relu6(activation)
845 - tf.summary.histogram("hidden1_output", activation)
846 - else:
847 - activation = dbof
848 -
849 - aggregated_model = getattr(video_level_models,
850 - FLAGS.video_level_classifier_model)
851 -
852 - return aggregated_model().create_model(
853 - model_input=activation,
854 - vocab_size=vocab_size,
855 - is_training=is_training,
856 - **unused_params)
857 -
858 -
859 -
860 -class early_NetVLADModelLF(models.BaseModel):
861 - """Creates a NetVLAD based model.
862 - Args:
863 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
864 - input features.
865 - vocab_size: The number of classes in the dataset.
866 - num_frames: A vector of length 'batch' which indicates the number of
867 - frames for each video (before padding).
868 - Returns:
869 - A dictionary with a tensor containing the probability predictions of the
870 - model in the 'predictions' key. The dimensions of the tensor are
871 - 'batch_size' x 'num_classes'.
872 - """
873 -
874 -
875 - def create_model(self,
876 - model_input,
877 - vocab_size,
878 - num_frames,
879 - iterations=None,
880 - add_batch_norm=None,
881 - sample_random_frames=None,
882 - cluster_size=None,
883 - hidden_size=None,
884 - is_training=True,
885 - **unused_params):
886 - iterations = 300
887 - add_batch_norm = True
888 - random_frames = True
889 - cluster_size = 64
890 - hidden1_size = 1024
891 - relu = False
892 - dimred = -1
893 - gating = True
894 - remove_diag = False
895 -
896 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
897 - if random_frames:
898 - model_input = utils.SampleRandomFrames(model_input, num_frames,
899 - iterations)
900 - else:
901 - model_input = utils.SampleRandomSequence(model_input, num_frames,
902 - iterations)
903 -
904 -
905 - max_frames = model_input.get_shape().as_list()[1]
906 - feature_size = model_input.get_shape().as_list()[2]
907 - reshaped_input = tf.reshape(model_input, [-1, feature_size])
908 -
909 - video_audio_NetVLAD = NetVLAD_NonLocal(1024+128,max_frames,cluster_size, add_batch_norm, is_training)
910 -
911 - if add_batch_norm:# and not lightvlad:
912 - reshaped_input = slim.batch_norm(
913 - reshaped_input,
914 - center=True,
915 - scale=True,
916 - is_training=is_training,
917 - scope="input_bn")
918 - with tf.variable_scope("video_audio_VLAD"):
919 - vlad = video_audio_NetVLAD.forward(reshaped_input)
920 -
921 - vlad_dim = vlad.get_shape().as_list()[1]
922 - hidden1_weights = tf.get_variable("hidden1_weights",
923 - [vlad_dim, hidden1_size],
924 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
925 -
926 - activation = tf.matmul(vlad, hidden1_weights)
927 -
928 - if add_batch_norm and relu:
929 - activation = slim.batch_norm(
930 - activation,
931 - center=True,
932 - scale=True,
933 - is_training=is_training,
934 - scope="hidden1_bn")
935 -
936 - else:
937 - hidden1_biases = tf.get_variable("hidden1_biases",
938 - [hidden1_size],
939 - initializer = tf.random_normal_initializer(stddev=0.01))
940 - tf.summary.histogram("hidden1_biases", hidden1_biases)
941 - activation += hidden1_biases
942 -
943 - if relu:
944 - activation = tf.nn.relu6(activation)
945 -
946 -
947 - if gating:
948 - gating_weights = tf.get_variable("gating_weights_2",
949 - [hidden1_size, hidden1_size],
950 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
951 -
952 - gates = tf.matmul(activation, gating_weights)
953 -
954 - if remove_diag:
955 - #removes diagonals coefficients
956 - diagonals = tf.matrix_diag_part(gating_weights)
957 - gates = gates - tf.multiply(diagonals,activation)
958 -
959 -
960 - if add_batch_norm:
961 - gates = slim.batch_norm(
962 - gates,
963 - center=True,
964 - scale=True,
965 - is_training=is_training,
966 - scope="gating_bn")
967 - else:
968 - gating_biases = tf.get_variable("gating_biases",
969 - [cluster_size],
970 - initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
971 - gates += gating_biases
972 -
973 - gates = tf.sigmoid(gates)
974 -
975 - activation = tf.multiply(activation,gates)
976 -
977 - aggregated_model = getattr(video_level_models,
978 - FLAGS.video_level_classifier_model)
979 -
980 -
981 - return aggregated_model().create_model(
982 - model_input=activation,
983 - vocab_size=vocab_size,
984 - is_training=is_training,
985 - **unused_params)
986 -
987 -class NetVLAD_NonLocal():
988 - def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training):
989 - self.feature_size = feature_size
990 - self.max_frames = max_frames
991 - self.is_training = is_training
992 - self.add_batch_norm = add_batch_norm
993 - self.cluster_size = cluster_size
994 -
995 - def forward(self,reshaped_input):
996 -
997 - cluster_weights = tf.get_variable("cluster_weights",
998 - [int(self.feature_size), int(self.cluster_size)],
999 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1000 -
1001 - tf.summary.histogram("cluster_weights", cluster_weights)
1002 - activation = tf.matmul(reshaped_input, cluster_weights)
1003 -
1004 - if self.add_batch_norm:
1005 - activation = slim.batch_norm(
1006 - activation,
1007 - center=True,
1008 - scale=True,
1009 - is_training=self.is_training,
1010 - scope="cluster_bn")
1011 - else:
1012 - cluster_biases = tf.get_variable("cluster_biases",
1013 - [cluster_size],
1014 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1015 - tf.summary.histogram("cluster_biases", cluster_biases)
1016 - activation += cluster_biases
1017 -
1018 - activation = tf.nn.softmax(activation)
1019 - tf.summary.histogram("cluster_output", activation)
1020 -
1021 - activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
1022 -
1023 - a_sum = tf.reduce_sum(activation,-2,keep_dims=True)
1024 -
1025 - cluster_weights2 = tf.get_variable("cluster_weights2",
1026 - [1,int(self.feature_size), int(self.cluster_size)],
1027 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1028 -
1029 - a = tf.multiply(a_sum,cluster_weights2)
1030 -
1031 - activation = tf.transpose(activation,perm=[0,2,1])
1032 -
1033 - reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size])
1034 - vlad = tf.matmul(activation,reshaped_input)
1035 - vlad = tf.transpose(vlad,perm=[0,2,1])
1036 - vlad = tf.subtract(vlad,a)
1037 -
1038 -
1039 - vlad = tf.transpose(vlad,perm=[0,2,1])
1040 - vlad = tf.reshape(vlad, [-1, self.feature_size])
1041 -
1042 - nonlocal_theta = tf.get_variable("nonlocal_theta",
1043 - [int(self.feature_size), int(self.cluster_size)],
1044 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1045 - nonlocal_phi = tf.get_variable("nonlocal_phi",
1046 - [int(self.feature_size), int(self.cluster_size)],
1047 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1048 - nonlocal_g = tf.get_variable("nonlocal_g",
1049 - [int(self.feature_size), int(self.cluster_size)],
1050 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
1051 - nonlocal_out = tf.get_variable("nonlocal_out",
1052 - [int(self.cluster_size), int(self.feature_size)],
1053 - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.cluster_size)))
1054 -
1055 - vlad_theta = tf.matmul(vlad, nonlocal_theta)
1056 - vlad_phi = tf.matmul(vlad, nonlocal_phi)
1057 - vlad_g = tf.matmul(vlad, nonlocal_g)
1058 -
1059 - vlad_theta = tf.reshape(vlad_theta, [-1, int(self.cluster_size),int(self.cluster_size)])
1060 - vlad_phi = tf.reshape(vlad_phi, [-1, int(self.cluster_size),int(self.cluster_size)])
1061 - vlad_g = tf.reshape(vlad_phi, [-1, int(self.cluster_size),int(self.cluster_size)])
1062 -
1063 - vlad_softmax = tf.nn.softmax(self.feature_size**-.5 * tf.matmul(vlad_theta, tf.transpose(vlad_phi,perm=[0,2,1])))
1064 - vlad_g = tf.matmul(vlad_softmax, vlad_g)
1065 - vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
1066 -
1067 - vlad_g = tf.matmul(vlad_g, nonlocal_out)
1068 - vlad_g = tf.reshape(vlad_g, [-1, int(self.cluster_size), int(self.feature_size)])
1069 - vlad = tf.reshape(vlad, [-1, int(self.cluster_size), int(self.feature_size)])
1070 - vlad = vlad + vlad_g
1071 -
1072 - vlad = tf.transpose(vlad,perm=[0,2,1])
1073 - vlad = tf.nn.l2_normalize(vlad,1) # [b,f,c]
1074 -
1075 - vlad = tf.reshape(vlad,[-1,int(self.cluster_size*self.feature_size)])
1076 - vlad = tf.nn.l2_normalize(vlad,1)
1077 -
1078 - return vlad
1079 -
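NetVLAD_NonLocal.forward is standard NetVLAD aggregation followed by the non-local step: softmax soft-assignments weight the residuals between frame descriptors and learned cluster centers (cluster_weights2), summed over frames, then intra-normalized per cluster and globally L2-normalized. A minimal NumPy sketch of the VLAD part for a single video:

import numpy as np

def netvlad(frames, cluster_weights, centers):
    # frames: [num_frames, feature_size]; cluster_weights: [feature_size, K]
    # centers: [feature_size, K], the cluster_weights2 of the code above
    logits = frames @ cluster_weights
    logits -= logits.max(axis=1, keepdims=True)
    assign = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # [N, K]
    vlad = frames.T @ assign - centers * assign.sum(axis=0)[None, :]     # [feature, K]
    vlad /= np.linalg.norm(vlad, axis=0, keepdims=True) + 1e-8           # intra-norm
    vlad = vlad.flatten()
    return vlad / (np.linalg.norm(vlad) + 1e-8)                          # global L2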
1080 -
1081 -class SoftDbofModelLF_8k(models.BaseModel):
1082 - """Creates a Soft Deep Bag of Frames model.
1083 - The model projects the features for each frame into a higher dimensional
1084 - 'clustering' space, pools across frames in that space, and then
1085 - uses a configurable video-level model to classify the now aggregated features.
1086 - The model will randomly sample either frames or sequences of frames during
1087 - training to speed up convergence.
1088 - Args:
1089 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1090 - input features.
1091 - vocab_size: The number of classes in the dataset.
1092 - num_frames: A vector of length 'batch' which indicates the number of
1093 - frames for each video (before padding).
1094 - Returns:
1095 - A dictionary with a tensor containing the probability predictions of the
1096 - model in the 'predictions' key. The dimensions of the tensor are
1097 - 'batch_size' x 'num_classes'.
1098 - """
1099 -
1100 - def create_model(self,
1101 - model_input,
1102 - vocab_size,
1103 - num_frames,
1104 - iterations=None,
1105 - add_batch_norm=None,
1106 - sample_random_frames=None,
1107 - cluster_size=None,
1108 - hidden_size=None,
1109 - is_training=True,
1110 - **unused_params):
1111 - iterations = 300
1112 - add_batch_norm = True
1113 - random_frames = True
1114 - cluster_size = 2048
1115 - hidden1_size = 1024
1116 - fc_dimred = True
1117 - relu = False
1118 - max_pool = False
1119 -
1120 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1121 - if random_frames:
1122 - model_input = utils.SampleRandomFrames(model_input, num_frames,
1123 - iterations)
1124 - else:
1125 - model_input = utils.SampleRandomSequence(model_input, num_frames,
1126 - iterations)
1127 - max_frames = model_input.get_shape().as_list()[1]
1128 - feature_size = model_input.get_shape().as_list()[2]
1129 - reshaped_input = tf.reshape(model_input, [-1, feature_size])
1130 - tf.summary.histogram("input_hist", reshaped_input)
1131 -
1132 - video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training)
1133 - audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training)
1134 -
1135 -
1136 - if add_batch_norm:
1137 - reshaped_input = slim.batch_norm(
1138 - reshaped_input,
1139 - center=True,
1140 - scale=True,
1141 - is_training=is_training,
1142 - scope="input_bn")
1143 -
1144 - with tf.variable_scope("video_DBOF"):
1145 - dbof_video = video_Dbof.forward(reshaped_input[:,0:1024])
1146 -
1147 - with tf.variable_scope("audio_DBOF"):
1148 - dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:])
1149 -
1150 - dbof = tf.concat([dbof_video, dbof_audio],1)
1151 -
1152 - dbof_dim = dbof.get_shape().as_list()[1]
1153 -
1154 - if fc_dimred:
1155 - hidden1_weights = tf.get_variable("hidden1_weights",
1156 - [dbof_dim, hidden1_size],
1157 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
1158 - tf.summary.histogram("hidden1_weights", hidden1_weights)
1159 - activation = tf.matmul(dbof, hidden1_weights)
1160 -
1161 - if add_batch_norm and relu:
1162 - activation = slim.batch_norm(
1163 - activation,
1164 - center=True,
1165 - scale=True,
1166 - is_training=is_training,
1167 - scope="hidden1_bn")
1168 - else:
1169 - hidden1_biases = tf.get_variable("hidden1_biases",
1170 - [hidden1_size],
1171 - initializer = tf.random_normal_initializer(stddev=0.01))
1172 - tf.summary.histogram("hidden1_biases", hidden1_biases)
1173 - activation += hidden1_biases
1174 -
1175 - if relu:
1176 - activation = tf.nn.relu6(activation)
1177 - tf.summary.histogram("hidden1_output", activation)
1178 - else:
1179 - activation = dbof
1180 -
1181 - aggregated_model = getattr(video_level_models,
1182 - FLAGS.video_level_classifier_model)
1183 -
1184 -
1185 - return aggregated_model().create_model(
1186 - model_input=activation,
1187 - vocab_size=vocab_size,
1188 - is_training=is_training,
1189 - **unused_params)
1190 -
1191 -class FrameLevelLogisticModel(models.BaseModel):
1192 - """Creates a logistic classifier over the aggregated frame-level features."""
1193 -
1194 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1195 - """See base class.
1196 -
1197 - This class is intended to be an example for implementors of frame level
1198 - models. If you want to train a model over averaged features it is more
1199 - efficient to average them beforehand rather than on the fly.
1200 -
1201 - Args:
1202 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1203 - input features.
1204 - vocab_size: The number of classes in the dataset.
1205 - num_frames: A vector of length 'batch' which indicates the number of
1206 - frames for each video (before padding).
1207 -
1208 - Returns:
1209 - A dictionary with a tensor containing the probability predictions of the
1210 - model in the 'predictions' key. The dimensions of the tensor are
1211 - 'batch_size' x 'num_classes'.
1212 - """
1213 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1214 - feature_size = model_input.get_shape().as_list()[2]
1215 -
1216 - denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
1217 - [-1, feature_size])
1218 - avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
1219 -
1220 - output = slim.fully_connected(avg_pooled,
1221 - vocab_size,
1222 - activation_fn=tf.nn.sigmoid,
1223 - weights_regularizer=slim.l2_regularizer(1e-8))
1224 -
1225 - return {"predictions": output}
1226 -
1227 -class CNN(models.BaseModel):
1228 - """Creates a logistic classifier over the aggregated frame-level features."""
1229 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1230 - """See base class.
1231 -
1232 - This class is intended to be an example for implementors of frame level
1233 - models. If you want to train a model over averaged features it is more
1234 - efficient to average them beforehand rather than on the fly.
1235 -
1236 - Args:
1237 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1238 - input features.
1239 - vocab_size: The number of classes in the dataset.
1240 - num_frames: A vector of length 'batch' which indicates the number of
1241 - frames for each video (before padding).
1242 -
1243 - Returns:
1244 - A dictionary with a tensor containing the probability predictions of the
1245 - model in the 'predictions' key. The dimensions of the tensor are
1246 - 'batch_size' x 'num_classes'.
1247 - """
1248 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1249 - feature_size = model_input.get_shape().as_list()[2]
1250 -
1251 - denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]),
1252 - [-1, feature_size])
1253 -
1254 -
1255 -
1256 - convK3 = slim.convolution(model_input,
1257 - num_outputs=feature_size,
1258 - kernel_size=3,
1259 - scope='conv1')
1260 -
1261 - convK5 = slim.convolution(model_input,
1262 - num_outputs=feature_size,
1263 - kernel_size=5,
1264 - scope='conv2')
1265 -
1266 - convK1 = slim.convolution(model_input,
1267 - num_outputs=feature_size,
1268 - kernel_size=5,
1269 - scope='conv3')
1270 -
1271 -
1272 - avg_pooled = tf.reduce_sum(tf.concat([convK3,convK5,convK1],axis=1), axis=[1]) / denominators
1273 -
1274 - output = slim.fully_connected(avg_pooled,
1275 - vocab_size,
1276 - activation_fn=tf.nn.relu,
1277 - weights_regularizer=slim.l2_regularizer(1e-8))
1278 -
1279 - return {"predictions": output}
1280 -
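The deleted CNN model runs three parallel temporal convolutions and average-pools their concatenation along time before the classifier. (conv3 uses kernel_size=5 although its name, convK1, suggests 1 was intended, and the relu output departs from the sigmoid used elsewhere for multi-label probabilities.) A rough tf.keras sketch (illustrative) with kernels 1/3/5 and a sigmoid head:

import tensorflow as tf

def build_multi_kernel_cnn(feature_size, vocab_size):
    inputs = tf.keras.Input(shape=(None, feature_size))
    branches = [tf.keras.layers.Conv1D(feature_size, k, padding="same")(inputs)
                for k in (1, 3, 5)]
    x = tf.keras.layers.Concatenate(axis=1)(branches)   # concat along time axis
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    outputs = tf.keras.layers.Dense(vocab_size, activation="sigmoid")(x)
    return tf.keras.Model(inputs, outputs)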
1281 -class LstmModel(models.BaseModel):
1282 -
1283 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1284 - """Creates a model which uses a stack of LSTMs to represent the video.
1285 - Args:
1286 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1287 - input features.
1288 - vocab_size: The number of classes in the dataset.
1289 - num_frames: A vector of length 'batch' which indicates the number of
1290 - frames for each video (before padding).
1291 - Returns:
1292 - A dictionary with a tensor containing the probability predictions of the
1293 - model in the 'predictions' key. The dimensions of the tensor are
1294 - 'batch_size' x 'num_classes'.
1295 - """
1296 - lstm_size = FLAGS.lstm_cells
1297 - number_of_layers = FLAGS.lstm_layers
1298 -
1299 - stacked_lstm = tf.contrib.rnn.MultiRNNCell(
1300 - [
1301 - tf.contrib.rnn.BasicLSTMCell(
1302 - lstm_size, forget_bias=1.0)
1303 - for _ in range(number_of_layers)
1304 - ])
1305 -
1306 - loss = 0.0
1307 -
1308 - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
1309 - sequence_length=num_frames,
1310 - dtype=tf.float32)
1311 -
1312 - aggregated_model = getattr(video_level_models,
1313 - FLAGS.video_level_classifier_model)
1314 -
1315 - return aggregated_model().create_model(
1316 - model_input=state[-1].h,
1317 - vocab_size=vocab_size,
1318 - **unused_params)
1319 -
1320 -class BNGRUModel(models.BaseModel):
1321 -
1322 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1323 - lstm_size = FLAGS.lstm_cells
1324 - number_of_layers = FLAGS.lstm_layers
1325 -
1326 - stacked_rnn = tf.contrib.rnn.MultiRNNCell(
1327 - [
1328 - tf.contrib.rnn.GRUCell(lstm_size)
1329 - for _ in range(number_of_layers)
1330 - ], state_is_tuple=False)
1331 -
1332 - outputs, state = tf.nn.dynamic_rnn(stacked_rnn, model_input,
1333 - sequence_length=num_frames,
1334 - dtype=tf.float32)
1335 - aggregated_model = getattr(video_level_models,
1336 - FLAGS.video_level_classifier_model)
1337 -
1338 - state = slim.batch_norm(
1339 - state,
1340 - center=True,
1341 - scale=True,
1342 - is_training=True,
1343 - scope='proj')
1344 -
1345 - return aggregated_model().create_model(
1346 - model_input=state,
1347 - vocab_size=vocab_size,
1348 - **unused_params)
1349 -
1350 -
1351 -
1352 -class GruModel2(models.BaseModel):
1353 -
1354 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1355 - """Creates a model which uses a stack of LSTMs to represent the video.
1356 - Args:
1357 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1358 - input features.
1359 - vocab_size: The number of classes in the dataset.
1360 - num_frames: A vector of length 'batch' which indicates the number of
1361 - frames for each video (before padding).
1362 - Returns:
1363 - A dictionary with a tensor containing the probability predictions of the
1364 - model in the 'predictions' key. The dimensions of the tensor are
1365 - 'batch_size' x 'num_classes'.
1366 - """
1367 - lstm_size = FLAGS.lstm_cells
1368 - number_of_layers = FLAGS.lstm_layers
1369 -
1370 - stacked_lstm = tf.contrib.rnn.MultiRNNCell(
1371 - [
1372 - tf.contrib.rnn.GRUCell(lstm_size)
1373 - for _ in range(number_of_layers)
1374 - ], state_is_tuple=False)
1375 -
1376 - loss = 0.0
1377 -
1378 - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
1379 - sequence_length=num_frames,
1380 - dtype=tf.float32)
1381 - aggregated_model = getattr(video_level_models,
1382 - FLAGS.video_level_classifier_model)
1383 -
1384 - return aggregated_model().create_model(
1385 - model_input=state,
1386 - vocab_size=vocab_size,
1387 - **unused_params)
1388 -
1389 -
1390 -class BiGRUModel(models.BaseModel):
1391 -
1392 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1393 - """Creates a model which uses a stack of LSTMs to represent the video.
1394 - Args:
1395 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1396 - input features.
1397 - vocab_size: The number of classes in the dataset.
1398 - num_frames: A vector of length 'batch' which indicates the number of
1399 - frames for each video (before padding).
1400 - Returns:
1401 - A dictionary with a tensor containing the probability predictions of the
1402 - model in the 'predictions' key. The dimensions of the tensor are
1403 - 'batch_size' x 'num_classes'.
1404 - """
1405 - lstm_size = FLAGS.lstm_cells
1406 - number_of_layers = FLAGS.lstm_layers
1407 -
1408 - with tf.variable_scope('fw'):
1409 - rnn_fw = tf.contrib.rnn.MultiRNNCell(
1410 - [
1411 - tf.contrib.rnn.GRUCell(lstm_size)
1412 - for _ in range(number_of_layers)
1413 - ], state_is_tuple=False)
1414 -
1415 -
1416 - with tf.variable_scope('bw'):
1417 - rnn_bw = tf.contrib.rnn.MultiRNNCell(
1418 - [
1419 - tf.contrib.rnn.GRUCell(lstm_size)
1420 - for _ in range(number_of_layers)
1421 - ], state_is_tuple=False)
1422 -
1423 - outputs, state = tf.nn.bidirectional_dynamic_rnn(rnn_fw, rnn_bw, model_input,
1424 - sequence_length=num_frames,
1425 - dtype=tf.float32, swap_memory=True)
1426 - state = tf.concat(state, axis=1)
1427 - aggregated_model = getattr(video_level_models,
1428 - FLAGS.video_level_classifier_model)
1429 - state = slim.batch_norm(
1430 - state,
1431 - center=True,
1432 - scale=True,
1433 - is_training=True,
1434 - scope='proj')
1435 -
1436 - return aggregated_model().create_model(
1437 - model_input=state,
1438 - vocab_size=vocab_size,
1439 - **unused_params)
1440 -
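BiGRUModel concatenates the final states of forward and backward GRU stacks and batch-normalizes the result before the video-level classifier. A rough tf.keras equivalent (illustrative; gru_size stands in for FLAGS.lstm_cells):

import tensorflow as tf

def build_bigru_encoder(feature_size, gru_size=1024):
    inputs = tf.keras.Input(shape=(None, feature_size))
    # Forward and backward reads of the frames, final states concatenated.
    state = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(gru_size), merge_mode="concat")(inputs)
    state = tf.keras.layers.BatchNormalization()(state)
    return tf.keras.Model(inputs, state)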
1441 -"""
1442 -Copyright (c) 2017, University of Texas Southwestern Medical Center
1443 -All rights reserved.
1444 -Redistribution and use in source and binary forms, with or without
1445 -modification, are permitted provided that the following conditions are met:
1446 -* Redistributions of source code must retain the above copyright notice, this
1447 - list of conditions and the following disclaimer.
1448 -* Redistributions in binary form must reproduce the above copyright notice,
1449 - this list of conditions and the following disclaimer in the documentation
1450 - and/or other materials provided with the distribution.
1451 -* Neither the name of the University of Texas at Austin nor the names of its
1452 - contributors may be used to endorse or promote products derived from
1453 - this software without specific prior written permission.
1454 -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
1455 -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1456 -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1457 -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
1458 -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1459 -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
1460 -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
1461 -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
1462 -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
1463 -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1464 -Recurrent Weighted Average
1465 -Implementation modified from: https://github.com/jostmey/rwa
1466 -Paper:
1467 -@article{ostmeyer2017machine,
1468 - title={Machine Learning on Sequential Data Using a Recurrent Weighted Average},
1469 - author={Ostmeyer, Jared and Cowell, Lindsay},
1470 - journal={arXiv preprint arXiv:1703.01253},
1471 - year={2017}
1472 -}
1473 -"""
1474 -
1475 -class RwaModel(models.BaseModel):
1476 -
1477 -
1478 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1479 -
1480 - # constants
1481 -
1482 - init_factor = 1.0
1483 - num_cells = FLAGS.lstm_cells
1484 - input_shape = model_input.get_shape().as_list()
1485 - batch_size, max_steps, num_features = input_shape
1486 -
1487 - # trainable weights
1488 - s = weights_rwa.init_state(num_cells, "s", init_factor)
1489 - W_g = weights_rwa.init_weight([num_features+num_cells, num_cells], "W_g")
1490 - W_u = weights_rwa.init_weight([num_features, num_cells], "W_u")
1491 - W_a = weights_rwa.init_weight([num_features+num_cells, num_cells], "W_a")
1492 - b_g = weights_rwa.init_bias(num_cells, "b_g")
1493 - b_u = weights_rwa.init_bias(num_cells, "b_u")
1494 - b_a = weights_rwa.init_bias(num_cells, "b_a")
1495 -
1496 - #pl = tf.placeholder(tf.float32, shape=[None, num_cells])
1497 - pl = tf.reshape(model_input, [-1, max_steps*num_features])[:, :num_cells]
1498 -
1499 - # internal states
1500 - #n = tf.zeros([batch_size, num_cells])
1501 - #d = tf.zeros([batch_size, num_cells])
1502 - #h = tf.zeros([batch_size, num_cells])
1503 - #a_max = tf.fill([batch_size, num_cells], -1E38) # Start off with lowest number possible
1504 - n = tf.zeros_like(pl)
1505 - d = tf.zeros_like(pl)
1506 - h = tf.zeros_like(pl)
1507 - a_max = tf.multiply(tf.ones_like(pl), -1E38)
1508 -
1509 - # define model
1510 - h += tf.nn.tanh(tf.expand_dims(s, 0))
1511 -
1512 - for i in range(max_steps):
1513 -
1514 - x_step = model_input[:,i,:]
1515 - xh_join = tf.concat(axis=1, values=[x_step, h]) # Combine the features and hidden state into one tensor
1516 -
1517 - u = tf.matmul(x_step, W_u)+b_u
1518 - g = tf.matmul(xh_join, W_g)+b_g
1519 - a = tf.matmul(xh_join, W_a) # The bias term when factored out of the numerator and denominator cancels and is unnecessary
1520 -
1521 - z = tf.multiply(u, tf.nn.tanh(g))
1522 -
1523 - a_newmax = tf.maximum(a_max, a)
1524 - exp_diff = tf.exp(a_max-a_newmax)
1525 - exp_scaled = tf.exp(a-a_newmax)
1526 -
1527 - n = tf.multiply(n, exp_diff)+tf.multiply(z, exp_scaled) # Numerically stable update of numerator
1528 - d = tf.multiply(d, exp_diff)+exp_scaled # Numerically stable update of denominator
1529 - h_new = tf.nn.tanh(tf.div(n, d))
1530 - a_max = a_newmax
1531 -
1532 - h = tf.where(tf.greater(num_frames, i), h_new, h) # Use new hidden state only if the sequence length has not been exceeded
1533 -
1534 -
1535 - aggregated_model = getattr(video_level_models,
1536 - FLAGS.video_level_classifier_model)
1537 - return aggregated_model().create_model(
1538 - model_input=h,
1539 - vocab_size=vocab_size,
1540 - **unused_params)
1541 -
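RwaModel unrolls the recurrent weighted average of Ostmeyer & Cowell (2017): the hidden state is a softmax-weighted running average of per-step terms z_t = u_t * tanh(g_t), and the running maximum a_max keeps the exponentials numerically stable. One step of that recurrence in NumPy (illustrative):

import numpy as np

def rwa_step(n, d, a_max, z, a):
    # n, d: running numerator/denominator; a: attention logits for this step.
    a_new = np.maximum(a_max, a)
    n = n * np.exp(a_max - a_new) + z * np.exp(a - a_new)  # rescaled numerator
    d = d * np.exp(a_max - a_new) + np.exp(a - a_new)      # rescaled denominator
    return n, d, a_new, np.tanh(n / d)                     # new hidden state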
1542 -
1543 -
1544 -class DropoutGruModel(models.BaseModel):
1545 -
1546 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1547 - """Creates a model which uses a stack of LSTMs to represent the video.
1548 - Args:
1549 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1550 - input features.
1551 - vocab_size: The number of classes in the dataset.
1552 - num_frames: A vector of length 'batch' which indicates the number of
1553 - frames for each video (before padding).
1554 - Returns:
1555 - A dictionary with a tensor containing the probability predictions of the
1556 - model in the 'predictions' key. The dimensions of the tensor are
1557 - 'batch_size' x 'num_classes'.
1558 - """
1559 - lstm_size = FLAGS.lstm_cells
1560 - number_of_layers = FLAGS.lstm_layers
1561 -
1562 - stacked_lstm = tf.contrib.rnn.MultiRNNCell(
1563 - [
1564 - tf.contrib.rnn.DropoutWrapper(
1565 - tf.contrib.rnn.GRUCell(lstm_size), 0.9, 0.9)
1566 - for _ in range(number_of_layers)
1567 - ], state_is_tuple=False)
1568 -
1569 - loss = 0.0
1570 -
1571 - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
1572 - sequence_length=num_frames,
1573 - dtype=tf.float32)
1574 - aggregated_model = getattr(video_level_models,
1575 - FLAGS.video_level_classifier_model)
1576 -
1577 - aggregated_model = FrameLevelLogisticModel;
1578 - return aggregated_model().create_model(
1579 - model_input=outputs,
1580 - vocab_size=vocab_size,
1581 - num_frames=num_frames,
1582 - **unused_params)
1583 -
1584 -
1585 -
1586 -
1587 -class ResRnnModel(models.BaseModel):
1588 -
1589 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1590 - lstm_size = 1152
1591 - number_of_layers = 3
1592 -
1593 - #from rnn_cell_modern import Delta_RNN as drnn
1594 - from rnn_wrappers_modern import MultiRNNCell as mrnn
1595 -
1596 - cells = []
1597 - for i in range(number_of_layers):
1598 - with tf.variable_scope('cell_'+str(i)):
1599 - cells.append(tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0))
1600 -
1601 - stacked_rnn = mrnn(cells, use_residual_connections=True, state_is_tuple=True)
1602 -
1603 - outputs, state = tf.nn.dynamic_rnn(stacked_rnn, model_input,
1604 - sequence_length=num_frames,
1605 - dtype=tf.float32)
1606 -
1607 - aggregated_model = getattr(video_level_models,
1608 - FLAGS.video_level_classifier_model)
1609 -
1610 - return aggregated_model().create_model(
1611 - model_input=state[-1].h,
1612 - vocab_size=vocab_size,
1613 - **unused_params)
1614 -
1615 -
1616 -class LateVladModel(models.BaseModel):
1617 -
1618 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1619 - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1620 - model_input = utils.SampleRandomSequence(model_input, num_frames, 128)
1621 -
1622 - input_v = model_input[:,:,:1024]
1623 - input_a = model_input[:,:,1024:]
1624 -
1625 - K = 8
1626 -
1627 - with tf.variable_scope('video'):
1628 - x = input_v
1629 - input_shape = x.get_shape().as_list()
1630 - _, N, D = input_shape
1631 - c_bound = math.sqrt(1. / (K * D))
1632 - c = tf.get_variable(name='c',
1633 - shape=[K, N],
1634 - dtype=tf.float32,
1635 - initializer=tf.random_uniform_initializer(-c_bound, c_bound))
1636 - a = slim.convolution(x,
1637 - num_outputs=K,
1638 - kernel_size=1,
1639 - data_format='NWC',
1640 - scope='conv')
1641 - a = tf.nn.softmax(a)
1642 - v = []
1643 - for k in range(K):
1644 - t = x-c[k][None, :, None]
1645 - t = tf.multiply(t, a[:,:,k][:,:,None])
1646 - t = tf.reduce_sum(t, 1)
1647 - t = tf.nn.l2_normalize(t, dim=1)
1648 - v.append(t)
1649 - v = tf.stack(v, axis=1)
1650 - v = tf.reshape(v, [-1, K*D])
1651 -
1652 - proj_weights = tf.get_variable("proj_weights",
1653 - [K*D, 1024],
1654 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(K*D)))
1655 - activation_v = tf.matmul(v, proj_weights)
1656 -
1657 - with tf.variable_scope('audio'):
1658 - x = input_a
1659 - input_shape = x.get_shape().as_list()
1660 - _, N, D = input_shape
1661 - c_bound = math.sqrt(1. / (K * D))
1662 - c = tf.get_variable(name='c',
1663 - shape=[K, N],
1664 - dtype=tf.float32,
1665 - initializer=tf.random_uniform_initializer(-c_bound, c_bound))
1666 - a = slim.convolution(x,
1667 - num_outputs=K,
1668 - kernel_size=1,
1669 - data_format='NWC',
1670 - scope='conv')
1671 - a = tf.nn.softmax(a)
1672 - v = []
1673 - for k in range(K):
1674 - t = x-c[k][None, :, None]
1675 - t = tf.multiply(t, a[:,:,k][:,:,None])
1676 - t = tf.reduce_sum(t, 1)
1677 - t = tf.nn.l2_normalize(t, dim=1)
1678 - v.append(t)
1679 - v = tf.stack(v, axis=1)
1680 - v = tf.reshape(v, [-1, K*D])
1681 -
1682 - proj_weights = tf.get_variable("proj_weights",
1683 - [K*D, 1024],
1684 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(K*D)))
1685 - activation_a = tf.matmul(v, proj_weights)
1686 -
1687 - activation = tf.concat([activation_v, activation_a], axis=1)
1688 -
1689 - activation = slim.batch_norm(
1690 - activation,
1691 - center=True,
1692 - scale=True,
1693 - is_training=True,
1694 - scope='proj')
1695 -
1696 - activation = tf.nn.relu6(activation)
1697 -
1698 - aggregated_model = getattr(video_level_models,
1699 - FLAGS.video_level_classifier_model)
1700 -
1701 - return aggregated_model().create_model(
1702 - model_input=activation,
1703 - vocab_size=vocab_size,
1704 - **unused_params)
1705 -
1706 -class LNBLstmModel(models.BaseModel):
1707 -
1708 - def create_model(self, model_input, vocab_size, num_frames, **unused_params):
1709 - """Creates a model which uses a stack of LSTMs to represent the video.
1710 - Args:
1711 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1712 - input features.
1713 - vocab_size: The number of classes in the dataset.
1714 - num_frames: A vector of length 'batch' which indicates the number of
1715 - frames for each video (before padding).
1716 - Returns:
1717 - A dictionary with a tensor containing the probability predictions of the
1718 - model in the 'predictions' key. The dimensions of the tensor are
1719 - 'batch_size' x 'num_classes'.
1720 - """
1721 - lstm_size = FLAGS.lstm_cells
1722 - number_of_layers = FLAGS.lstm_layers
1723 -
1724 - stacked_lstm = tf.contrib.rnn.MultiRNNCell(
1725 - [
1726 - tf.contrib.rnn.LayerNormBasicLSTMCell(lstm_size, dropout_keep_prob=0.50)
1727 - for _ in range(number_of_layers)
1728 - ])
1729 -
1730 - loss = 0.0
1731 -
1732 - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input,
1733 - sequence_length=num_frames,
1734 - dtype=tf.float32)
1735 -
1736 - aggregated_model = getattr(video_level_models,
1737 - FLAGS.video_level_classifier_model)
1738 -
1739 - return aggregated_model().create_model(
1740 - model_input=state[-1].h,
1741 - vocab_size=vocab_size,
1742 - **unused_params)
1743 -
1744 -class audio_avgShort_twowayGRUModel(models.BaseModel):
1745 -
1746 - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
1747 - """Creates a model which uses a Bidirectional GRU and mean audio features to represent the video.
1748 - ---->first half GRU----->
1749 - - -
1750 - visual_feature ---- concat---------------->
1751 - - - -
1752 - ---->second half GRU----> concat -----> video level classifier
1753 - -
1754 - mean audio features--->
1755 - Args:
1756 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1757 - input features.
1758 - vocab_size: The number of classes in the dataset.
1759 - num_frames: A vector of length 'batch' which indicates the number of
1760 - frames for each video (before padding).
1761 - Returns:
1762 - A dictionary with a tensor containing the probability predictions of the
1763 - model in the 'predictions' key. The dimensions of the tensor are
1764 - 'batch_size' x 'num_classes'.
1765 - """
1766 - lstm_size = FLAGS.lstm_cells
1767 - stride = FLAGS.stride
1768 - max_frames = model_input.get_shape().as_list()[1]
1769 -
1770 - video_input = model_input[:,:,:1024]  # first 1024 dims: visual features
1771 - audio_input = model_input[:,:,1024:]  # last 128 dims: audio features
1772 -
1773 - first_num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
1774 - audio_den = tf.reshape(tf.tile(first_num_frames, [1, 128]), [-1, 128])
1775 - mean_audio = tf.reduce_sum(audio_input, 1) / tf.maximum(audio_den, 1)  # mean audio feature over valid (unpadded) frames
1776 -
1777 - pooled_input, num_frames = self.avg_pooled_func(video_input, num_frames, stride)
1778 -
1779 - pooled_input = slim.batch_norm(
1780 - pooled_input,
1781 - center=True,
1782 - scale=True,
1783 - is_training=is_training,
1784 - scope="hidden1_bn")
1785 -
1786 - mean_audio = slim.batch_norm(
1787 - mean_audio,
1788 - center=True,
1789 - scale=True,
1790 - is_training=is_training,
1791 - scope="hidden1_bn_audio")
1792 -
1793 - fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
1794 - bw_gru = tf.contrib.rnn.GRUCell(lstm_size)
1795 -
1796 - fw_outputs, fw_state = tf.nn.dynamic_rnn(fw_gru, pooled_input[:,:max_frames//(2*stride),:],
1797 - sequence_length=num_frames//2, dtype=tf.float32, scope='fw')
1798 - bw_outputs, bw_state = tf.nn.dynamic_rnn(bw_gru, pooled_input[:,:max_frames//(2*stride)-1:-1,:],  # second half, reversed
1799 - sequence_length=num_frames - num_frames//2, dtype=tf.float32, scope='bw')
1800 -
1801 - state = tf.concat([fw_state, bw_state], 1)
1802 - state = tf.concat([state, mean_audio], 1)
1803 -
1804 - aggregated_model = getattr(video_level_models,
1805 - 'linear_res_mix_act_MoeModel')
1806 -
1807 - return aggregated_model().create_model(
1808 - model_input=state,
1809 - vocab_size=vocab_size,
1810 - **unused_params)
1811 -
1812 - def avg_pooled_func(self, model_input, num_frames_in, stride):
1813 - max_frames = model_input.get_shape().as_list()[1]
1814 - feature_size = model_input.get_shape().as_list()[2]
1815 - num_frames = num_frames_in // stride
1816 - step = max_frames//stride
1817 -
1818 - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])  # position (s, t) holds frame s*step + t
1819 - first_layer_input = tf.reduce_sum(first_layer_input, 1)  # sums the strided comb {t, t+step, ...} into pooled step t
1820 -
1821 - first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1),2), tf.float32)
1822 - denominators = tf.reshape(
1823 - tf.tile(first_num_frames, [1, step, feature_size]), [-1, step, feature_size])
1824 - first_layer_avg_pooled = first_layer_input / tf.maximum(denominators,1)  # normalize sums by the pooled frame count (clamped at 1)
1825 -
1826 - return first_layer_avg_pooled, num_frames
1827 -
1828 -
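A hedged note on `avg_pooled_func`: reshaping [batch, stride*step, F] to [batch, stride, step, F] places frame s*step + t at position (s, t), so reducing over axis 1 pools the strided comb {t, t+step, t+2*step, ...} rather than a consecutive window of `stride` frames. A small numpy check:

    import numpy as np

    stride, step = 2, 3                       # max_frames = stride * step = 6
    frames = np.arange(6).reshape(1, 6, 1)    # frame indices as the "feature"
    grouped = frames.reshape(1, stride, step, 1)
    print(grouped[0, :, 0, 0])                # [0 3] -> pooled step 0 mixes frames 0 and 3

Pooling consecutive windows instead would reshape to [batch, step, stride, F] and reduce over axis 2.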
1829 -class resav_ConvModel(models.BaseModel):
1830 -
1831 - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
1832 - """Creates a model which uses a Convolutional model to represent the video.
1833 - Args:
1834 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1835 - input features.
1836 - vocab_size: The number of classes in the dataset.
1837 - num_frames: A vector of length 'batch' which indicates the number of
1838 - frames for each video (before padding).
1839 - Returns:
1840 - A dictionary with a tensor containing the probability predictions of the
1841 - model in the 'predictions' key. The dimensions of the tensor are
1842 - 'batch_size' x 'num_classes'.
1843 - """
1844 - stride = FLAGS.stride
1845 - conv_length = FLAGS.conv_length
1846 - conv_hidden1 = FLAGS.conv_hidden1
1847 - conv_hidden2 = FLAGS.conv_hidden2
1848 - conv_hidden3 = FLAGS.conv_hidden3
1849 - mean_feature = tf.reduce_mean(model_input, 1)
1850 - feature_size = model_input.get_shape().as_list()[2]
1851 -
1852 - pooled_input = self.avg_pooled_func(model_input, stride)
1853 -
1854 - # To shape : 'batch_size' x 'max_frames//stride' x 1 x 'num_features'
1855 - input_expand = tf.expand_dims(pooled_input, -1)
1856 - input_expand = tf.transpose(input_expand, [0,1,3,2])
1857 -
1858 - # conv_out : batch_size x (max_frames//stride) x 1 x conv_hidden ('SAME' padding keeps the time length)
1859 - conv_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_1_1')
1860 - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_1_1"))
1861 - conv_out = slim.conv2d(conv_out, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_1_2')
1862 - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_1_2")
1863 - res_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_1_1')
1864 - res_out = res_out + conv_out
1865 - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool1')
1866 -
1867 - conv_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_2_1')
1868 - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_2_1"))
1869 - conv_out = slim.conv2d(conv_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_2_2')
1870 - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_2_2")
1871 - res_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_2_1')
1872 - res_out = res_out + conv_out
1873 - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool2')
1874 -
1875 - conv_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_3_1')
1876 - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_3_1"))
1877 - conv_out = slim.conv2d(conv_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_3_2')
1878 - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_3_2")
1879 - res_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_3_1')
1880 - res_out = res_out + conv_out
1881 - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool3')
1882 -
1883 - a = res_out.get_shape().as_list()[1]
1884 - b = res_out.get_shape().as_list()[2]
1885 - c = res_out.get_shape().as_list()[3]
1886 -
1889 - res_out = tf.reshape(res_out, [-1, a*b*c])  # flatten (time x 1 x channels) for the classifier
1890 -
1891 - state = tf.concat([res_out, mean_feature], 1)
1892 -
1893 - aggregated_model = getattr(video_level_models,
1894 - 'linear_res_mix_act_MoeModel')
1895 - return aggregated_model().create_model(
1896 - model_input=state,
1897 - vocab_size=vocab_size,
1898 - **unused_params)
1899 -
1900 - def avg_pooled_func(self, model_input, stride):
1901 - max_frames = model_input.get_shape().as_list()[1]
1902 - feature_size = model_input.get_shape().as_list()[2]
1903 - step = max_frames//stride
1904 -
1905 - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])
1906 - first_layer_input = tf.reduce_mean(first_layer_input, 1)
1907 -
1908 - return first_layer_input
1909 -
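For shape bookkeeping, under the flag defaults above (stride=10, conv_hidden3=1024) and the usual 300-frame inputs (an assumption, not fixed by this file): the pooled sequence has step = 300 // 10 = 30 time steps, and three VALID [2,1] max-pools shrink it 30 -> 15 -> 7 -> 3, so the flattened residual output is 3 * 1 * 1024 = 3072 features before the mean feature is concatenated. A quick check:

    max_frames, stride, conv_hidden3 = 300, 10, 1024
    t = max_frames // stride                  # 30 pooled steps
    for _ in range(3):
        t = (t - 2) // 2 + 1                  # VALID max-pool, kernel 2, stride 2
    print(t, t * 1 * conv_hidden3)            # 3 3072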
1910 -class pur_twowayGRUModel(models.BaseModel):
1911 -
1912 - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params):
1913 - """Creates a model which uses a Bidirectional GRU without explictly using mean audio feature to represent the video.
1914 -                  ----> first half GRU ---->
1915 -                 |                           |
1916 -   video feature |                           concat ----> video level classifier
1917 -                 |                           |
1918 -                  ----> second half GRU --->
1919 - Args:
1920 - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
1921 - input features.
1922 - vocab_size: The number of classes in the dataset.
1923 - num_frames: A vector of length 'batch' which indicates the number of
1924 - frames for each video (before padding).
1925 - Returns:
1926 - A dictionary with a tensor containing the probability predictions of the
1927 - model in the 'predictions' key. The dimensions of the tensor are
1928 - 'batch_size' x 'num_classes'.
1929 - """
1930 - lstm_size = FLAGS.lstm_cells
1931 - number_of_layers = FLAGS.lstm_layers
1932 - stride = FLAGS.stride
1933 - max_frames = model_input.get_shape().as_list()[1]
1934 -
1935 - pooled_input, num_frames = self.avg_pooled_func(model_input, num_frames, stride)
1936 -
1937 - pooled_input = slim.batch_norm(
1938 - pooled_input,
1939 - center=True,
1940 - scale=True,
1941 - is_training=is_training,
1942 - scope="hidden1_bn")
1943 -
1944 -
1945 - fw_gru = tf.contrib.rnn.GRUCell(lstm_size)
1946 - bw_gru = tf.contrib.rnn.GRUCell(lstm_size)
1947 -
1948 - fw_outputs, fw_state = tf.nn.dynamic_rnn(fw_gru, pooled_input[:,:max_frames//(2*stride),:],
1949 - sequence_length=num_frames//2, dtype=tf.float32, scope='fw')
1950 - bw_outputs, bw_state = tf.nn.dynamic_rnn(bw_gru, pooled_input[:,:max_frames//(2*stride)-1:-1,:],  # second half, reversed
1951 - sequence_length=num_frames - num_frames//2, dtype=tf.float32, scope='bw')
1952 -
1953 - state = tf.concat([fw_state, bw_state], 1)
1954 -
1955 - aggregated_model = getattr(video_level_models,
1956 - 'linear_res_mix_act_MoeModel')
1957 -
1958 - return aggregated_model().create_model(
1959 - model_input=state,
1960 - vocab_size=vocab_size,
1961 - **unused_params)
1962 -
1963 - def avg_pooled_func(self, model_input, num_frames_in, stride):
1964 - max_frames = model_input.get_shape().as_list()[1]
1965 - feature_size = model_input.get_shape().as_list()[2]
1966 - num_frames = num_frames_in // stride
1967 - step = max_frames//stride
1968 -
1969 - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size])  # position (s, t) holds frame s*step + t
1970 - first_layer_input = tf.reduce_sum(first_layer_input, 1)  # sums the strided comb {t, t+step, ...} into pooled step t
1971 -
1972 - first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1),2), tf.float32)
1973 - denominators = tf.reshape(
1974 - tf.tile(first_num_frames, [1, step, feature_size]), [-1, step, feature_size])
1975 - first_layer_avg_pooled = first_layer_input / tf.maximum(denominators,1)
1976 -
1977 - return first_layer_avg_pooled, num_frames
...\ No newline at end of file ...\ No newline at end of file
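The half-and-half split used by both two-way GRU models can be checked with plain numpy slicing (a sketch with a 12-frame toy sequence):

    import numpy as np

    x = np.arange(12).reshape(1, 12, 1)       # [batch, pooled frames, features]
    mid = 12 // 2
    first_half = x[:, :mid, :]                # frames 0..5, read forward
    second_rev = x[:, :mid-1:-1, :]           # frames 11..6, read back to front
    print(first_half[0, :, 0], second_rev[0, :, 0])   # [0 1 2 3 4 5] [11 10 9 8 7 6]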
...@@ -137,8 +137,6 @@ class MoeModel(models.BaseModel): ...@@ -137,8 +137,6 @@ class MoeModel(models.BaseModel):
137 final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, 137 final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
138 [-1, vocab_size]) 138 [-1, vocab_size])
139 139
140 -
141 - print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@", final_probabilities_by_class_and_batch)
142 return {"predictions": final_probabilities} 140 return {"predictions": final_probabilities}
143 141
144 142
...@@ -252,481 +250,3 @@ class willow_MoeModel(models.BaseModel): ...@@ -252,481 +250,3 @@ class willow_MoeModel(models.BaseModel):
252 probabilities = tf.multiply(probabilities, gates) 250 probabilities = tf.multiply(probabilities, gates)
253 251
254 return {"predictions": probabilities} 252 return {"predictions": probabilities}
...\ No newline at end of file ...\ No newline at end of file
255 -
256 -class willow_MoeModel_moe4(models.BaseModel):
257 - """A softmax over a mixture of logistic models (with L2 regularization)."""
258 -
259 - def create_model(self,
260 - model_input,
261 - vocab_size,
262 - is_training,
263 - num_mixtures=None,
264 - l2_penalty=1e-8,
265 - **unused_params):
266 - """Creates a Mixture of (Logistic) Experts model.
267 - It also includes the possibility of gating the probabilities.
268 - The model consists of a per-class softmax distribution over a
269 - configurable number of logistic classifiers. One of the classifiers in the
270 - mixture is not trained, and always predicts 0.
271 - Args:
272 - model_input: 'batch_size' x 'num_features' matrix of input features.
273 - vocab_size: The number of classes in the dataset.
274 - is_training: Is this the training phase?
275 - num_mixtures: The number of mixtures (excluding a dummy 'expert' that
276 - always predicts the non-existence of an entity).
277 - l2_penalty: How much to penalize the squared magnitudes of parameter
278 - values.
279 - Returns:
280 - A dictionary with a tensor containing the probability predictions of the
281 - model in the 'predictions' key. The dimensions of the tensor are
282 - batch_size x num_classes.
283 - """
284 - num_mixtures = 4
285 - low_rank_gating = FLAGS.moe_low_rank_gating
286 - l2_penalty = FLAGS.moe_l2
287 - gating_probabilities = FLAGS.moe_prob_gating
288 - gating_input = FLAGS.moe_prob_gating_input
289 -
290 - input_size = model_input.get_shape().as_list()[1]
291 - remove_diag = False
292 -
293 - if low_rank_gating == -1:
294 - gate_activations = slim.fully_connected(
295 - model_input,
296 - vocab_size * (num_mixtures + 1),
297 - activation_fn=None,
298 - biases_initializer=None,
299 - weights_regularizer=slim.l2_regularizer(l2_penalty),
300 - scope="gates")
301 - else:
302 - gate_activations1 = slim.fully_connected(
303 - model_input,
304 - low_rank_gating,
305 - activation_fn=None,
306 - biases_initializer=None,
307 - weights_regularizer=slim.l2_regularizer(l2_penalty),
308 - scope="gates1")
309 - gate_activations = slim.fully_connected(
310 - gate_activations1,
311 - vocab_size * (num_mixtures + 1),
312 - activation_fn=None,
313 - biases_initializer=None,
314 - weights_regularizer=slim.l2_regularizer(l2_penalty),
315 - scope="gates2")
316 -
317 - expert_activations = slim.fully_connected(
318 - model_input,
319 - vocab_size * num_mixtures,
320 - activation_fn=None,
321 - weights_regularizer=slim.l2_regularizer(l2_penalty),
322 - scope="experts")
323 -
324 - gating_distribution = tf.nn.softmax(tf.reshape(
325 - gate_activations,
326 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
327 - expert_distribution = tf.nn.sigmoid(tf.reshape(
328 - expert_activations,
329 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
330 -
331 - probabilities_by_class_and_batch = tf.reduce_sum(
332 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
333 - probabilities = tf.reshape(probabilities_by_class_and_batch,
334 - [-1, vocab_size])
335 -
336 - if gating_probabilities:
337 - if gating_input == 'prob':
338 - gating_weights = tf.get_variable("gating_prob_weights",
339 - [vocab_size, vocab_size],
340 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
341 - gates = tf.matmul(probabilities, gating_weights)
342 - else:
343 - gating_weights = tf.get_variable("gating_prob_weights",
344 - [input_size, vocab_size],
345 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
346 -
347 - gates = tf.matmul(model_input, gating_weights)
348 -
349 - if remove_diag:
350 - # removes diagonals coefficients
351 - diagonals = tf.matrix_diag_part(gating_weights)
352 - gates = gates - tf.multiply(diagonals, probabilities)
353 -
354 - gates = slim.batch_norm(
355 - gates,
356 - center=True,
357 - scale=True,
358 - is_training=is_training,
359 - scope="gating_prob_bn")
360 -
361 - gates = tf.sigmoid(gates)
362 -
363 - probabilities = tf.multiply(probabilities, gates)
364 -
365 - return {"predictions": probabilities}
366 -
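The "untrained classifier that always predicts 0" in these MoE docstrings is implicit: the gates produce num_mixtures + 1 softmax weights but only num_mixtures experts are summed, so the leftover gate mass multiplies an implicit zero expert. A numpy sketch of the combination step (random stand-ins for the learned activations):

    import numpy as np

    batch_labels, M = 6, 4                               # (batch * vocab) rows, 4 experts
    gate_logits = np.random.randn(batch_labels, M + 1)
    gates = np.exp(gate_logits)
    gates = gates / gates.sum(-1, keepdims=True)         # softmax over M + 1 gates
    experts = 1.0 / (1.0 + np.exp(-np.random.randn(batch_labels, M)))  # sigmoid experts
    probs = (gates[:, :M] * experts).sum(-1)             # the dummy expert contributes 0
    print(probs.min() >= 0, probs.max() <= 1)            # True True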
367 -class willow_MoeModel_moe4_noGP(models.BaseModel):
368 - """A softmax over a mixture of logistic models (with L2 regularization)."""
369 -
370 - def create_model(self,
371 - model_input,
372 - vocab_size,
373 - is_training,
374 - num_mixtures=None,
375 - l2_penalty=1e-8,
376 - **unused_params):
377 - """Creates a Mixture of (Logistic) Experts model.
378 - It also includes the possibility of gating the probabilities.
379 - The model consists of a per-class softmax distribution over a
380 - configurable number of logistic classifiers. One of the classifiers in the
381 - mixture is not trained, and always predicts 0.
382 - Args:
383 - model_input: 'batch_size' x 'num_features' matrix of input features.
384 - vocab_size: The number of classes in the dataset.
385 - is_training: Is this the training phase?
386 - num_mixtures: The number of mixtures (excluding a dummy 'expert' that
387 - always predicts the non-existence of an entity).
388 - l2_penalty: How much to penalize the squared magnitudes of parameter
389 - values.
390 - Returns:
391 - A dictionary with a tensor containing the probability predictions of the
392 - model in the 'predictions' key. The dimensions of the tensor are
393 - batch_size x num_classes.
394 - """
395 - num_mixtures = 4
396 - low_rank_gating = FLAGS.moe_low_rank_gating
397 - l2_penalty = FLAGS.moe_l2
398 - gating_probabilities = False
399 - gating_input = FLAGS.moe_prob_gating_input
400 -
401 - input_size = model_input.get_shape().as_list()[1]
402 - remove_diag = False
403 -
404 - if low_rank_gating == -1:
405 - gate_activations = slim.fully_connected(
406 - model_input,
407 - vocab_size * (num_mixtures + 1),
408 - activation_fn=None,
409 - biases_initializer=None,
410 - weights_regularizer=slim.l2_regularizer(l2_penalty),
411 - scope="gates")
412 - else:
413 - gate_activations1 = slim.fully_connected(
414 - model_input,
415 - low_rank_gating,
416 - activation_fn=None,
417 - biases_initializer=None,
418 - weights_regularizer=slim.l2_regularizer(l2_penalty),
419 - scope="gates1")
420 - gate_activations = slim.fully_connected(
421 - gate_activations1,
422 - vocab_size * (num_mixtures + 1),
423 - activation_fn=None,
424 - biases_initializer=None,
425 - weights_regularizer=slim.l2_regularizer(l2_penalty),
426 - scope="gates2")
427 -
428 - expert_activations = slim.fully_connected(
429 - model_input,
430 - vocab_size * num_mixtures,
431 - activation_fn=None,
432 - weights_regularizer=slim.l2_regularizer(l2_penalty),
433 - scope="experts")
434 -
435 - gating_distribution = tf.nn.softmax(tf.reshape(
436 - gate_activations,
437 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
438 - expert_distribution = tf.nn.sigmoid(tf.reshape(
439 - expert_activations,
440 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
441 -
442 - probabilities_by_class_and_batch = tf.reduce_sum(
443 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
444 - probabilities = tf.reshape(probabilities_by_class_and_batch,
445 - [-1, vocab_size])
446 -
447 - return {"predictions": probabilities}
448 -
449 -class willow_MoeModel_moe2_noGP(models.BaseModel):
450 - """A softmax over a mixture of logistic models (with L2 regularization)."""
451 -
452 - def create_model(self,
453 - model_input,
454 - vocab_size,
455 - is_training,
456 - num_mixtures=None,
457 - l2_penalty=1e-8,
458 - **unused_params):
459 - """Creates a Mixture of (Logistic) Experts model.
460 - It also includes the possibility of gating the probabilities.
461 - The model consists of a per-class softmax distribution over a
462 - configurable number of logistic classifiers. One of the classifiers in the
463 - mixture is not trained, and always predicts 0.
464 - Args:
465 - model_input: 'batch_size' x 'num_features' matrix of input features.
466 - vocab_size: The number of classes in the dataset.
467 - is_training: Is this the training phase?
468 - num_mixtures: The number of mixtures (excluding a dummy 'expert' that
469 - always predicts the non-existence of an entity).
470 - l2_penalty: How much to penalize the squared magnitudes of parameter
471 - values.
472 - Returns:
473 - A dictionary with a tensor containing the probability predictions of the
474 - model in the 'predictions' key. The dimensions of the tensor are
475 - batch_size x num_classes.
476 - """
477 - num_mixtures = 2
478 - low_rank_gating = FLAGS.moe_low_rank_gating
479 - l2_penalty = FLAGS.moe_l2
480 - gating_probabilities = False
481 - gating_input = FLAGS.moe_prob_gating_input
482 -
483 - input_size = model_input.get_shape().as_list()[1]
484 - remove_diag = False
485 -
486 - if low_rank_gating == -1:
487 - gate_activations = slim.fully_connected(
488 - model_input,
489 - vocab_size * (num_mixtures + 1),
490 - activation_fn=None,
491 - biases_initializer=None,
492 - weights_regularizer=slim.l2_regularizer(l2_penalty),
493 - scope="gates")
494 - else:
495 - gate_activations1 = slim.fully_connected(
496 - model_input,
497 - low_rank_gating,
498 - activation_fn=None,
499 - biases_initializer=None,
500 - weights_regularizer=slim.l2_regularizer(l2_penalty),
501 - scope="gates1")
502 - gate_activations = slim.fully_connected(
503 - gate_activations1,
504 - vocab_size * (num_mixtures + 1),
505 - activation_fn=None,
506 - biases_initializer=None,
507 - weights_regularizer=slim.l2_regularizer(l2_penalty),
508 - scope="gates2")
509 -
510 - expert_activations = slim.fully_connected(
511 - model_input,
512 - vocab_size * num_mixtures,
513 - activation_fn=None,
514 - weights_regularizer=slim.l2_regularizer(l2_penalty),
515 - scope="experts")
516 -
517 - gating_distribution = tf.nn.softmax(tf.reshape(
518 - gate_activations,
519 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
520 - expert_distribution = tf.nn.sigmoid(tf.reshape(
521 - expert_activations,
522 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
523 -
524 - probabilities_by_class_and_batch = tf.reduce_sum(
525 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
526 - probabilities = tf.reshape(probabilities_by_class_and_batch,
527 - [-1, vocab_size])
528 -
529 - return {"predictions": probabilities}
530 -
531 -
532 -class willow_MoeModel_moe2(models.BaseModel):
533 - """A softmax over a mixture of logistic models (with L2 regularization)."""
534 -
535 - def create_model(self,
536 - model_input,
537 - vocab_size,
538 - is_training,
539 - num_mixtures=None,
540 - l2_penalty=1e-8,
541 - **unused_params):
542 - """Creates a Mixture of (Logistic) Experts model.
543 - It also includes the possibility of gating the probabilities.
544 - The model consists of a per-class softmax distribution over a
545 - configurable number of logistic classifiers. One of the classifiers in the
546 - mixture is not trained, and always predicts 0.
547 - Args:
548 - model_input: 'batch_size' x 'num_features' matrix of input features.
549 - vocab_size: The number of classes in the dataset.
550 - is_training: Is this the training phase?
551 - num_mixtures: The number of mixtures (excluding a dummy 'expert' that
552 - always predicts the non-existence of an entity).
553 - l2_penalty: How much to penalize the squared magnitudes of parameter
554 - values.
555 - Returns:
556 - A dictionary with a tensor containing the probability predictions of the
557 - model in the 'predictions' key. The dimensions of the tensor are
558 - batch_size x num_classes.
559 - """
560 - num_mixtures = 2
561 - low_rank_gating = FLAGS.moe_low_rank_gating
562 - l2_penalty = FLAGS.moe_l2
563 - gating_probabilities = FLAGS.moe_prob_gating
564 - gating_input = FLAGS.moe_prob_gating_input
565 -
566 - input_size = model_input.get_shape().as_list()[1]
567 - remove_diag = False
568 -
569 - if low_rank_gating == -1:
570 - gate_activations = slim.fully_connected(
571 - model_input,
572 - vocab_size * (num_mixtures + 1),
573 - activation_fn=None,
574 - biases_initializer=None,
575 - weights_regularizer=slim.l2_regularizer(l2_penalty),
576 - scope="gates")
577 - else:
578 - gate_activations1 = slim.fully_connected(
579 - model_input,
580 - low_rank_gating,
581 - activation_fn=None,
582 - biases_initializer=None,
583 - weights_regularizer=slim.l2_regularizer(l2_penalty),
584 - scope="gates1")
585 - gate_activations = slim.fully_connected(
586 - gate_activations1,
587 - vocab_size * (num_mixtures + 1),
588 - activation_fn=None,
589 - biases_initializer=None,
590 - weights_regularizer=slim.l2_regularizer(l2_penalty),
591 - scope="gates2")
592 -
593 - expert_activations = slim.fully_connected(
594 - model_input,
595 - vocab_size * num_mixtures,
596 - activation_fn=None,
597 - weights_regularizer=slim.l2_regularizer(l2_penalty),
598 - scope="experts")
599 -
600 - gating_distribution = tf.nn.softmax(tf.reshape(
601 - gate_activations,
602 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
603 - expert_distribution = tf.nn.sigmoid(tf.reshape(
604 - expert_activations,
605 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
606 -
607 - probabilities_by_class_and_batch = tf.reduce_sum(
608 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
609 - probabilities = tf.reshape(probabilities_by_class_and_batch,
610 - [-1, vocab_size])
611 -
612 - if gating_probabilities:
613 - if gating_input == 'prob':
614 - gating_weights = tf.get_variable("gating_prob_weights",
615 - [vocab_size, vocab_size],
616 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
617 - gates = tf.matmul(probabilities, gating_weights)
618 - else:
619 - gating_weights = tf.get_variable("gating_prob_weights",
620 - [input_size, vocab_size],
621 - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
622 -
623 - gates = tf.matmul(model_input, gating_weights)
624 -
625 - if remove_diag:
626 - # removes diagonals coefficients
627 - diagonals = tf.matrix_diag_part(gating_weights)
628 - gates = gates - tf.multiply(diagonals, probabilities)
629 -
630 - gates = slim.batch_norm(
631 - gates,
632 - center=True,
633 - scale=True,
634 - is_training=is_training,
635 - scope="gating_prob_bn")
636 -
637 - gates = tf.sigmoid(gates)
638 -
639 - probabilities = tf.multiply(probabilities, gates)
640 -
641 - return {"predictions": probabilities}
642 -
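The gating block shared by the gated willow variants is a form of context gating: a learned projection of either the probabilities or the raw input is batch-normalized, squashed with a sigmoid, and multiplied element-wise into the predictions. A minimal numpy sketch of the 'prob' path (random weights; batch norm reduced to a no-op for clarity):

    import numpy as np

    batch, vocab = 4, 10
    probs = np.random.rand(batch, vocab)                  # MoE output
    W = np.random.randn(vocab, vocab) / np.sqrt(vocab)    # gating_prob_weights stand-in
    gates = 1.0 / (1.0 + np.exp(-(probs @ W)))            # sigmoid(BN(probs . W))
    gated = probs * gates                                 # suppress or keep each class
    print(gated.shape)                                    # (4, 10)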
643 -class linear_res_mix_act_MoeModel(models.BaseModel):
644 - """A softmax over a mixture of logistic models (with L2 regularization).
645 -
646 -                    ----- linear layer (1) + sigmoid -----
647 -                   |                                       |
648 -                    ----- linear layer (2) + relu    -----
649 -                   |                                       |
650 -    input_features |                                       +----- MoE -----> output
651 -                   |                                       |
652 -                    ----- linear layer (3) + elu     -----
653 -                   |                                       |
654 -                    ----- linear layer (4) + tanh    -----
655 - """
656 - def create_model(self,
657 - model_input,
658 - vocab_size,
659 - num_mixtures=None,
660 - num_hiddens=None,
662 - l2_penalty=1e-8,
663 - **unused_params):
664 -
665 - num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
666 - num_hiddens = num_hiddens or FLAGS.moe_num_hiddens
668 -
669 - hidden_sigmoid = slim.fully_connected(
670 - model_input,
671 - num_hiddens,
672 - activation_fn=tf.nn.sigmoid,
673 - weights_regularizer=slim.l2_regularizer(l2_penalty),
674 - scope='hidden_sigmoid'
675 - )
676 - hidden_relu = slim.fully_connected(
677 - model_input,
678 - num_hiddens,
679 - activation_fn=tf.nn.relu,
680 - weights_regularizer=slim.l2_regularizer(l2_penalty),
681 - scope='hidden_relu'
682 - )
683 - hidden_elu = slim.fully_connected(
684 - model_input,
685 - num_hiddens,
686 - activation_fn=tf.nn.elu,
687 - weights_regularizer=slim.l2_regularizer(l2_penalty),
688 - scope='hidden_elu'
689 - )
690 - hidden_tanh = slim.fully_connected(
691 - model_input,
692 - num_hiddens,
693 - activation_fn=tf.nn.tanh,
694 - weights_regularizer=slim.l2_regularizer(l2_penalty),
695 - scope='hidden_tanh'
696 - )
697 -
698 - linear_input = slim.fully_connected(
699 - model_input,
700 - num_hiddens,
701 - activation_fn=None,
702 - weights_regularizer=slim.l2_regularizer(l2_penalty),
703 - scope='hidden_linear'
704 - )
705 -
706 -
707 - gate_activations = slim.fully_connected(
708 - model_input,
709 - vocab_size * (num_mixtures + 1),
710 - activation_fn=None,
711 - biases_initializer=None,
712 - weights_regularizer=slim.l2_regularizer(l2_penalty),
713 - scope="gates")
714 - expert_activations = slim.fully_connected(
715 - tf.concat([hidden_sigmoid+0.25*linear_input, hidden_relu+0.25*linear_input, hidden_elu+0.25*linear_input, hidden_tanh+0.25*linear_input], 1),
716 - vocab_size * num_mixtures,
717 - activation_fn=None,
718 - weights_regularizer=slim.l2_regularizer(l2_penalty),
719 - scope="experts")
720 -
721 - gating_distribution = tf.nn.softmax(tf.reshape(
722 - gate_activations,
723 - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1)
724 - expert_distribution = tf.nn.sigmoid(tf.reshape(
725 - expert_activations,
726 - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures
727 -
728 - final_probabilities_by_class_and_batch = tf.reduce_sum(
729 - gating_distribution[:, :num_mixtures] * expert_distribution, 1)
730 - final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
731 - [-1, vocab_size])
732 - return {"predictions": final_probabilities}
...\ No newline at end of file ...\ No newline at end of file
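The expert input of linear_res_mix_act_MoeModel is four differently-activated hidden layers, each with a 0.25-scaled shared linear projection added as a residual, then concatenated. A numpy sketch of that mixing (random projections standing in for the slim.fully_connected layers):

    import numpy as np

    def fc(x, w):                       # linear layer stand-in
        return x @ w

    batch, in_dim, hidden = 2, 8, 5
    x = np.random.randn(batch, in_dim)
    ws = [np.random.randn(in_dim, hidden) for _ in range(5)]
    linear = fc(x, ws[4])               # shared residual branch
    z_sig, z_relu, z_elu, z_tanh = (fc(x, w) for w in ws[:4])
    acts = [1.0 / (1.0 + np.exp(-z_sig)),                              # sigmoid branch
            np.maximum(z_relu, 0),                                     # relu branch
            np.where(z_elu > 0, z_elu, np.exp(np.minimum(z_elu, 0)) - 1),  # elu branch
            np.tanh(z_tanh)]                                           # tanh branch
    expert_in = np.concatenate([a + 0.25 * linear for a in acts], axis=1)
    print(expert_in.shape)              # (2, 20), fed to the expert layer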