Showing 4 changed files with 30 additions and 2159 deletions.
... | @@ -65,128 +65,21 @@ flags.DEFINE_integer("conv_hidden2", 1024, "Number of cnn hidden.") | ... | @@ -65,128 +65,21 @@ flags.DEFINE_integer("conv_hidden2", 1024, "Number of cnn hidden.") |
65 | flags.DEFINE_integer("conv_hidden3", 1024, "Number of cnn hidden.") | 65 | flags.DEFINE_integer("conv_hidden3", 1024, "Number of cnn hidden.") |
66 | flags.DEFINE_integer("stride", 10, "Number of stride for short rnn.") | 66 | flags.DEFINE_integer("stride", 10, "Number of stride for short rnn.") |
67 | 67 | ||
68 | -class DbofModel(models.BaseModel): | 68 | +class FrameLevelLogisticModel(models.BaseModel): |
69 | - """Creates a Deep Bag of Frames model. | 69 | + def create_model(self, model_input, vocab_size, num_frames, **unused_params): |
70 | - The model projects the features for each frame into a higher dimensional | ||
71 | - 'clustering' space, pools across frames in that space, and then | ||
72 | - uses a configurable video-level model to classify the now aggregated features. | ||
73 | - The model will randomly sample either frames or sequences of frames during | ||
74 | - training to speed up convergence. | ||
75 | - """ | ||
76 | - | ||
77 | - ACT_FN_MAP = { | ||
78 | - "sigmoid": tf.nn.sigmoid, | ||
79 | - "relu6": tf.nn.relu6, | ||
80 | - } | ||
81 | - | ||
82 | - def create_model(self, | ||
83 | - model_input, | ||
84 | - vocab_size, | ||
85 | - num_frames, | ||
86 | - iterations=None, | ||
87 | - add_batch_norm=None, | ||
88 | - sample_random_frames=None, | ||
89 | - cluster_size=None, | ||
90 | - hidden_size=None, | ||
91 | - is_training=True, | ||
92 | - **unused_params): | ||
93 | - """See base class. | ||
94 | - Args: | ||
95 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
96 | - input features. | ||
97 | - vocab_size: The number of classes in the dataset. | ||
98 | - num_frames: A vector of length 'batch' which indicates the number of | ||
99 | - frames for each video (before padding). | ||
100 | - iterations: the number of frames to be sampled. | ||
101 | - add_batch_norm: whether to add batch norm during training. | ||
102 | - sample_random_frames: whether to sample random frames or random sequences. | ||
103 | - cluster_size: the output neuron number of the cluster layer. | ||
104 | - hidden_size: the output neuron number of the hidden layer. | ||
105 | - is_training: whether to build the graph in training mode. | ||
106 | - Returns: | ||
107 | - A dictionary with a tensor containing the probability predictions of the | ||
108 | - model in the 'predictions' key. The dimensions of the tensor are | ||
109 | - 'batch_size' x 'num_classes'. | ||
110 | - """ | ||
111 | - iterations = iterations or FLAGS.iterations | ||
112 | - add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm | ||
113 | - random_frames = sample_random_frames or FLAGS.sample_random_frames | ||
114 | - cluster_size = cluster_size or FLAGS.dbof_cluster_size | ||
115 | - hidden1_size = hidden_size or FLAGS.dbof_hidden_size | ||
116 | - act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation) | ||
117 | - assert act_fn is not None, ("dbof_activation is not valid: %s." % | ||
118 | - FLAGS.dbof_activation) | ||
119 | - | ||
120 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | 70 | num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) |
121 | - if random_frames: | ||
122 | - model_input = utils.SampleRandomFrames(model_input, num_frames, | ||
123 | - iterations) | ||
124 | - else: | ||
125 | - model_input = utils.SampleRandomSequence(model_input, num_frames, | ||
126 | - iterations) | ||
127 | - max_frames = model_input.get_shape().as_list()[1] | ||
128 | feature_size = model_input.get_shape().as_list()[2] | 71 | feature_size = model_input.get_shape().as_list()[2] |
129 | - reshaped_input = tf.reshape(model_input, [-1, feature_size]) | ||
130 | - tf.compat.v1.summary.histogram("input_hist", reshaped_input) | ||
131 | 72 | ||
132 | - if add_batch_norm: | 73 | + denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]), |
133 | - reshaped_input = slim.batch_norm(reshaped_input, | 74 | + [-1, feature_size]) |
134 | - center=True, | 75 | + avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators |
135 | - scale=True, | ||
136 | - is_training=is_training, | ||
137 | - scope="input_bn") | ||
138 | - | ||
139 | - cluster_weights = tf.compat.v1.get_variable( | ||
140 | - "cluster_weights", [feature_size, cluster_size], | ||
141 | - initializer=tf.random_normal_initializer(stddev=1 / | ||
142 | - math.sqrt(feature_size))) | ||
143 | - tf.compat.v1.summary.histogram("cluster_weights", cluster_weights) | ||
144 | - activation = tf.matmul(reshaped_input, cluster_weights) | ||
145 | - if add_batch_norm: | ||
146 | - activation = slim.batch_norm(activation, | ||
147 | - center=True, | ||
148 | - scale=True, | ||
149 | - is_training=is_training, | ||
150 | - scope="cluster_bn") | ||
151 | - else: | ||
152 | - cluster_biases = tf.compat.v1.get_variable( | ||
153 | - "cluster_biases", [cluster_size], | ||
154 | - initializer=tf.random_normal_initializer(stddev=1 / | ||
155 | - math.sqrt(feature_size))) | ||
156 | - tf.compat.v1.summary.histogram("cluster_biases", cluster_biases) | ||
157 | - activation += cluster_biases | ||
158 | - activation = act_fn(activation) | ||
159 | - tf.compat.v1.summary.histogram("cluster_output", activation) | ||
160 | - | ||
161 | - activation = tf.reshape(activation, [-1, max_frames, cluster_size]) | ||
162 | - activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method) | ||
163 | - | ||
164 | - hidden1_weights = tf.compat.v1.get_variable( | ||
165 | - "hidden1_weights", [cluster_size, hidden1_size], | ||
166 | - initializer=tf.random_normal_initializer(stddev=1 / | ||
167 | - math.sqrt(cluster_size))) | ||
168 | - tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights) | ||
169 | - activation = tf.matmul(activation, hidden1_weights) | ||
170 | - if add_batch_norm: | ||
171 | - activation = slim.batch_norm(activation, | ||
172 | - center=True, | ||
173 | - scale=True, | ||
174 | - is_training=is_training, | ||
175 | - scope="hidden1_bn") | ||
176 | - else: | ||
177 | - hidden1_biases = tf.compat.v1.get_variable( | ||
178 | - "hidden1_biases", [hidden1_size], | ||
179 | - initializer=tf.random_normal_initializer(stddev=0.01)) | ||
180 | - tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases) | ||
181 | - activation += hidden1_biases | ||
182 | - activation = act_fn(activation) | ||
183 | - tf.compat.v1.summary.histogram("hidden1_output", activation) | ||
184 | 76 | ||
185 | - aggregated_model = getattr(video_level_models, | 77 | + output = slim.fully_connected(avg_pooled, |
186 | - FLAGS.video_level_classifier_model) | 78 | + vocab_size, |
187 | - return aggregated_model().create_model(model_input=activation, | 79 | + activation_fn=tf.nn.sigmoid, |
188 | - vocab_size=vocab_size, | 80 | + weights_regularizer=slim.l2_regularizer(1e-8)) |
189 | - **unused_params) | 81 | + |
82 | + return {"predictions": output} | ||
190 | 83 | ||
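The retained FrameLevelLogisticModel averages each video's frame features over its true (pre-padding) frame count and applies a single sigmoid fully connected layer. The tile/reshape denominator above is just a broadcasted divide; a minimal numpy sketch of the pooling arithmetic, assuming padded frame rows are zero-filled (names and shapes are illustrative, not taken from the graph):

    import numpy as np

    def average_pool_frames(model_input, num_frames):
        # model_input: [batch, max_frames, feature_size]; padded frames assumed zero.
        # num_frames:  [batch] true frame count per video (before padding).
        summed = model_input.sum(axis=1)         # zeros from padding drop out of the sum
        return summed / num_frames[:, None]      # mean over valid frames only

    batch = np.zeros((2, 4, 3))
    batch[0, :2] = 1.0                           # video 0: 2 valid frames of ones
    batch[1, :4] = 2.0                           # video 1: 4 valid frames of twos
    print(average_pool_frames(batch, np.array([2.0, 4.0])))  # [[1. 1. 1.] [2. 2. 2.]]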
191 | class NetVLAD_NonLocal_types(): | 84 | class NetVLAD_NonLocal_types(): |
192 | def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): | 85 | def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): |
... | @@ -286,20 +179,6 @@ class NetVLAD_NonLocal_types(): | ... | @@ -286,20 +179,6 @@ class NetVLAD_NonLocal_types(): |
286 | return vlad_softmax | 179 | return vlad_softmax |
287 | 180 | ||
288 | class NetVLADModelLF(models.BaseModel): | 181 | class NetVLADModelLF(models.BaseModel): |
289 | - """Creates a NetVLAD based model. | ||
290 | - Args: | ||
291 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
292 | - input features. | ||
293 | - vocab_size: The number of classes in the dataset. | ||
294 | - num_frames: A vector of length 'batch' which indicates the number of | ||
295 | - frames for each video (before padding). | ||
296 | - Returns: | ||
297 | - A dictionary with a tensor containing the probability predictions of the | ||
298 | - model in the 'predictions' key. The dimensions of the tensor are | ||
299 | - 'batch_size' x 'num_classes'. | ||
300 | - """ | ||
301 | - | ||
302 | - | ||
303 | def create_model(self, | 182 | def create_model(self, |
304 | model_input, | 183 | model_input, |
305 | vocab_size, | 184 | vocab_size, |
... | @@ -420,1558 +299,30 @@ class NetVLADModelLF(models.BaseModel): | ... | @@ -420,1558 +299,30 @@ class NetVLADModelLF(models.BaseModel): |
420 | is_training=is_training, | 299 | is_training=is_training, |
421 | **unused_params) | 300 | **unused_params) |
422 | 301 | ||
423 | -class GruModel(models.BaseModel): | 302 | +class LstmModel(models.BaseModel): |
424 | - | 303 | + |
425 | - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): | 304 | + def create_model(self, model_input, vocab_size, num_frames, **unused_params): |
426 | - """Creates a model which uses a stack of GRUs to represent the video. | 305 | + lstm_size = FLAGS.lstm_cells |
427 | - Args: | 306 | + number_of_layers = FLAGS.lstm_layers |
428 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
429 | - input features. | ||
430 | - vocab_size: The number of classes in the dataset. | ||
431 | - num_frames: A vector of length 'batch' which indicates the number of | ||
432 | - frames for each video (before padding). | ||
433 | - Returns: | ||
434 | - A dictionary with a tensor containing the probability predictions of the | ||
435 | - model in the 'predictions' key. The dimensions of the tensor are | ||
436 | - 'batch_size' x 'num_classes'. | ||
437 | - """ | ||
438 | - gru_size = 600 | ||
439 | - number_of_layers = 4 | ||
440 | - backward = False | ||
441 | - random_frames = False | ||
442 | - iterations = 30 | ||
443 | - | ||
444 | - if random_frames: | ||
445 | - num_frames_2 = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
446 | - model_input = utils.SampleRandomFrames(model_input, num_frames_2, | ||
447 | - iterations) | ||
448 | - | ||
449 | - if backward: | ||
450 | - model_input = tf.reverse_sequence(model_input, num_frames, seq_axis=1) | ||
451 | 307 | ||
452 | - stacked_GRU = tf.contrib.rnn.MultiRNNCell( | 308 | + stacked_lstm = tf.contrib.rnn.MultiRNNCell( |
453 | [ | 309 | [ |
454 | - tf.contrib.rnn.GRUCell(gru_size) | 310 | + tf.contrib.rnn.BasicLSTMCell( |
311 | + lstm_size, forget_bias=1.0) | ||
455 | for _ in range(number_of_layers) | 312 | for _ in range(number_of_layers) |
456 | - ], state_is_tuple=False) | 313 | + ]) |
457 | 314 | ||
458 | loss = 0.0 | 315 | loss = 0.0 |
459 | - with tf.variable_scope("RNN"): | 316 | + |
460 | - outputs, state = tf.nn.dynamic_rnn(stacked_GRU, model_input, | 317 | + outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, |
461 | - sequence_length=num_frames, | 318 | + sequence_length=num_frames, |
462 | - dtype=tf.float32) | 319 | + dtype=tf.float32) |
463 | 320 | ||
464 | aggregated_model = getattr(video_level_models, | 321 | aggregated_model = getattr(video_level_models, |
465 | - 'MoeModel') | 322 | + FLAGS.video_level_classifier_model) |
323 | + | ||
466 | return aggregated_model().create_model( | 324 | return aggregated_model().create_model( |
467 | - model_input=state, | 325 | + model_input=state[-1].h, |
468 | vocab_size=vocab_size, | 326 | vocab_size=vocab_size, |
469 | - is_training=is_training, | ||
470 | **unused_params) | 327 | **unused_params) |
471 | - | ||
472 | - | ||
473 | - | ||
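The new LstmModel classifies from state[-1].h. With the default state_is_tuple=True, a TF1 MultiRNNCell's final state is one LSTMStateTuple(c, h) per layer, so state[-1].h is the top layer's last hidden state. A toy sketch of that indexing (the string values are stand-ins for tensors):

    from collections import namedtuple

    # Stand-in for tf.contrib.rnn.LSTMStateTuple.
    LSTMStateTuple = namedtuple("LSTMStateTuple", ["c", "h"])

    # Shape of dynamic_rnn's final state for a 3-layer MultiRNNCell
    # built with the default state_is_tuple=True:
    state = (
        LSTMStateTuple(c="c_layer0", h="h_layer0"),
        LSTMStateTuple(c="c_layer1", h="h_layer1"),
        LSTMStateTuple(c="c_layer2", h="h_layer2"),
    )
    print(state[-1].h)  # h_layer2: the top layer's final hidden state

The deleted GruModel instead built its cells with state_is_tuple=False, which concatenates the per-layer states into one tensor, which is why it passed state to the classifier directly.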
474 | -class SoftDBoF(): | ||
475 | - def __init__(self, feature_size,max_frames,cluster_size, max_pool, add_batch_norm, is_training): | ||
476 | - self.feature_size = feature_size | ||
477 | - self.max_frames = max_frames | ||
478 | - self.is_training = is_training | ||
479 | - self.add_batch_norm = add_batch_norm | ||
480 | - self.cluster_size = cluster_size | ||
481 | - self.max_pool = max_pool | ||
482 | - | ||
483 | - def forward(self, reshaped_input): | ||
484 | - | ||
485 | - feature_size = self.feature_size | ||
486 | - cluster_size = self.cluster_size | ||
487 | - add_batch_norm = self.add_batch_norm | ||
488 | - max_frames = self.max_frames | ||
489 | - is_training = self.is_training | ||
490 | - max_pool = self.max_pool | ||
491 | - | ||
492 | - cluster_weights = tf.get_variable("cluster_weights", | ||
493 | - [feature_size, cluster_size], | ||
494 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))) | ||
495 | - | ||
496 | - tf.summary.histogram("cluster_weights", cluster_weights) | ||
497 | - activation = tf.matmul(reshaped_input, cluster_weights) | ||
498 | - | ||
499 | - if add_batch_norm: | ||
500 | - activation = slim.batch_norm( | ||
501 | - activation, | ||
502 | - center=True, | ||
503 | - scale=True, | ||
504 | - is_training=is_training, | ||
505 | - scope="cluster_bn") | ||
506 | - else: | ||
507 | - cluster_biases = tf.get_variable("cluster_biases", | ||
508 | - [cluster_size], | ||
509 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))) | ||
510 | - tf.summary.histogram("cluster_biases", cluster_biases) | ||
511 | - activation += cluster_biases | ||
512 | - | ||
513 | - activation = tf.nn.softmax(activation) | ||
514 | - | ||
515 | - activation = tf.reshape(activation, [-1, int(max_frames), int(cluster_size)]) | ||
516 | - | ||
517 | - activation_sum = tf.reduce_sum(activation,1) | ||
518 | - activation_sum = tf.nn.l2_normalize(activation_sum,1) | ||
519 | - | ||
520 | - if max_pool: | ||
521 | - activation_max = tf.reduce_max(activation,1) | ||
522 | - activation_max = tf.nn.l2_normalize(activation_max,1) | ||
523 | - activation = tf.concat([activation_sum,activation_max],1) | ||
524 | - else: | ||
525 | - activation = activation_sum | ||
526 | - | ||
527 | - return activation | ||
528 | - | ||
529 | - | ||
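SoftDBoF.forward soft-assigns every frame to clusters with a softmax, sum-pools the assignments over frames, L2-normalizes, and optionally concatenates an L2-normalized max pool. A numpy sketch of the pooling, assuming the per-frame cluster logits have already been computed:

    import numpy as np

    def soft_dbof_pool(logits, max_pool=False, eps=1e-12):
        # logits: [batch, max_frames, cluster_size] frame-to-cluster scores.
        e = np.exp(logits - logits.max(axis=-1, keepdims=True))
        assign = e / e.sum(axis=-1, keepdims=True)       # softmax per frame
        pooled = assign.sum(axis=1)                      # sum-pool over frames
        pooled /= np.linalg.norm(pooled, axis=1, keepdims=True) + eps
        if max_pool:
            mx = assign.max(axis=1)                      # max-pool over frames
            mx /= np.linalg.norm(mx, axis=1, keepdims=True) + eps
            pooled = np.concatenate([pooled, mx], axis=1)
        return pooled

    print(soft_dbof_pool(np.random.randn(2, 300, 8), max_pool=True).shape)  # (2, 16)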
530 | -class LightVLAD_nonlocal(): | ||
531 | - def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): | ||
532 | - self.feature_size = feature_size | ||
533 | - self.max_frames = max_frames | ||
534 | - self.is_training = is_training | ||
535 | - self.add_batch_norm = add_batch_norm | ||
536 | - self.cluster_size = cluster_size | ||
537 | - | ||
538 | - def forward(self,reshaped_input): | ||
539 | - | ||
540 | - | ||
541 | - cluster_weights = tf.get_variable("cluster_weights", | ||
542 | - [int(self.feature_size), int(self.cluster_size)], | ||
543 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
544 | - | ||
545 | - activation = tf.matmul(reshaped_input, cluster_weights) | ||
546 | - | ||
547 | - if self.add_batch_norm: | ||
548 | - activation = slim.batch_norm( | ||
549 | - activation, | ||
550 | - center=True, | ||
551 | - scale=True, | ||
552 | - is_training=self.is_training, | ||
553 | - scope="cluster_bn") | ||
554 | - else: | ||
555 | - cluster_biases = tf.get_variable("cluster_biases", | ||
556 | - [self.cluster_size], | ||
557 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
558 | - tf.summary.histogram("cluster_biases", cluster_biases) | ||
559 | - activation += cluster_biases | ||
560 | - | ||
561 | - activation = tf.nn.softmax(activation) | ||
562 | - | ||
563 | - activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size]) | ||
564 | - | ||
565 | - activation = tf.transpose(activation,perm=[0,2,1]) | ||
566 | - | ||
567 | - reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size]) | ||
568 | - vlad = tf.matmul(activation,reshaped_input) | ||
569 | - | ||
570 | - vlad = tf.reshape(vlad, [-1,self.feature_size]) | ||
571 | - vlad = nonLocal_block(vlad, feature_size=self.feature_size, hidden_size=self.feature_size//2, cluster_size=self.cluster_size) | ||
572 | - | ||
573 | - vlad = tf.reshape(vlad, [-1,self.cluster_size,self.feature_size]) | ||
574 | - vlad = tf.transpose(vlad,perm=[0,2,1]) | ||
575 | - | ||
576 | - vlad = tf.nn.l2_normalize(vlad,1) | ||
577 | - | ||
578 | - vlad = tf.reshape(vlad,[-1,int(self.cluster_size*self.feature_size)]) | ||
579 | - vlad = tf.nn.l2_normalize(vlad,1) | ||
580 | - | ||
581 | - return vlad | ||
582 | - | ||
583 | -class LightNetVLADModelLF(models.BaseModel): | ||
584 | - """Creates a NetVLAD based model. | ||
585 | - Args: | ||
586 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
587 | - input features. | ||
588 | - vocab_size: The number of classes in the dataset. | ||
589 | - num_frames: A vector of length 'batch' which indicates the number of | ||
590 | - frames for each video (before padding). | ||
591 | - Returns: | ||
592 | - A dictionary with a tensor containing the probability predictions of the | ||
593 | - model in the 'predictions' key. The dimensions of the tensor are | ||
594 | - 'batch_size' x 'num_classes'. | ||
595 | - """ | ||
596 | - | ||
597 | - | ||
598 | - def create_model(self, | ||
599 | - model_input, | ||
600 | - vocab_size, | ||
601 | - num_frames, | ||
602 | - iterations=None, | ||
603 | - add_batch_norm=None, | ||
604 | - sample_random_frames=None, | ||
605 | - cluster_size=None, | ||
606 | - hidden_size=None, | ||
607 | - is_training=True, | ||
608 | - **unused_params): | ||
609 | - iterations = 300 | ||
610 | - add_batch_norm = True | ||
611 | - random_frames = True | ||
612 | - cluster_size = 64 | ||
613 | - hidden1_size = 1024 | ||
614 | - relu = False | ||
615 | - dimred = -1 | ||
616 | - gating = True | ||
617 | - remove_diag = False | ||
618 | - | ||
619 | - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
620 | - if random_frames: | ||
621 | - model_input = utils.SampleRandomFrames(model_input, num_frames, | ||
622 | - iterations) | ||
623 | - else: | ||
624 | - model_input = utils.SampleRandomSequence(model_input, num_frames, | ||
625 | - iterations) | ||
626 | - | ||
627 | - | ||
628 | - max_frames = model_input.get_shape().as_list()[1] | ||
629 | - feature_size = model_input.get_shape().as_list()[2] | ||
630 | - reshaped_input = tf.reshape(model_input, [-1, feature_size]) | ||
631 | - | ||
632 | - | ||
633 | - video_NetVLAD = LightVLAD_nonlocal(1024,max_frames,cluster_size, add_batch_norm, is_training) | ||
634 | - audio_NetVLAD = LightVLAD_nonlocal(128,max_frames,cluster_size/2, add_batch_norm, is_training) | ||
635 | - | ||
636 | - | ||
637 | - if add_batch_norm:# and not lightvlad: | ||
638 | - reshaped_input = slim.batch_norm( | ||
639 | - reshaped_input, | ||
640 | - center=True, | ||
641 | - scale=True, | ||
642 | - is_training=is_training, | ||
643 | - scope="input_bn") | ||
644 | - | ||
645 | - with tf.variable_scope("video_VLAD"): | ||
646 | - vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024]) | ||
647 | - | ||
648 | - with tf.variable_scope("audio_VLAD"): | ||
649 | - vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:]) | ||
650 | - | ||
651 | - vlad = tf.concat([vlad_video, vlad_audio],1) | ||
652 | - | ||
653 | - vlad_dim = vlad.get_shape().as_list()[1] | ||
654 | - hidden1_weights = tf.get_variable("hidden1_weights", | ||
655 | - [vlad_dim, hidden1_size], | ||
656 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) | ||
657 | - | ||
658 | - activation = tf.matmul(vlad, hidden1_weights) | ||
659 | - | ||
660 | - if add_batch_norm and relu: | ||
661 | - activation = slim.batch_norm( | ||
662 | - activation, | ||
663 | - center=True, | ||
664 | - scale=True, | ||
665 | - is_training=is_training, | ||
666 | - scope="hidden1_bn") | ||
667 | - | ||
668 | - else: | ||
669 | - hidden1_biases = tf.get_variable("hidden1_biases", | ||
670 | - [hidden1_size], | ||
671 | - initializer = tf.random_normal_initializer(stddev=0.01)) | ||
672 | - tf.summary.histogram("hidden1_biases", hidden1_biases) | ||
673 | - activation += hidden1_biases | ||
674 | - | ||
675 | - if relu: | ||
676 | - activation = tf.nn.relu6(activation) | ||
677 | - | ||
678 | - | ||
679 | - if gating: | ||
680 | - gating_weights = tf.get_variable("gating_weights_2", | ||
681 | - [hidden1_size, hidden1_size], | ||
682 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size))) | ||
683 | - | ||
684 | - gates = tf.matmul(activation, gating_weights) | ||
685 | - | ||
686 | - if remove_diag: | ||
687 | - #removes diagonals coefficients | ||
688 | - diagonals = tf.matrix_diag_part(gating_weights) | ||
689 | - gates = gates - tf.multiply(diagonals,activation) | ||
690 | - | ||
691 | - | ||
692 | - if add_batch_norm: | ||
693 | - gates = slim.batch_norm( | ||
694 | - gates, | ||
695 | - center=True, | ||
696 | - scale=True, | ||
697 | - is_training=is_training, | ||
698 | - scope="gating_bn") | ||
699 | - else: | ||
700 | - gating_biases = tf.get_variable("gating_biases", | ||
701 | - [hidden1_size], | ||
702 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))) | ||
703 | - gates += gating_biases | ||
704 | - | ||
705 | - gates = tf.sigmoid(gates) | ||
706 | - | ||
707 | - activation = tf.multiply(activation,gates) | ||
708 | - | ||
709 | - aggregated_model = getattr(video_level_models, | ||
710 | - FLAGS.video_level_classifier_model) | ||
711 | - | ||
712 | - | ||
713 | - return aggregated_model().create_model( | ||
714 | - model_input=activation, | ||
715 | - vocab_size=vocab_size, | ||
716 | - is_training=is_training, | ||
717 | - **unused_params) | ||
718 | - | ||
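The gating branch in LightNetVLADModelLF is context gating: a sigmoid gate computed from the activation itself reweights each dimension before classification. With batch norm stripped out, the computation reduces to the sketch below (the random weights are a stand-in for the learned gating_weights_2):

    import numpy as np

    def context_gate(activation, gating_weights):
        # The gate depends on the activation itself, so dimensions can
        # suppress or boost one another before the classifier.
        gates = 1.0 / (1.0 + np.exp(-activation @ gating_weights))  # sigmoid
        return activation * gates                                   # elementwise reweight

    hidden1_size = 1024
    w = np.random.randn(hidden1_size, hidden1_size) / np.sqrt(hidden1_size)
    print(context_gate(np.random.randn(2, hidden1_size), w).shape)  # (2, 1024)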
719 | -def nonLocal_block(vlad, feature_size, hidden_size, cluster_size): | ||
720 | - nonlocal_theta = tf.get_variable("nonlocal_theta", | ||
721 | - [feature_size, hidden_size], | ||
722 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))) | ||
723 | - nonlocal_phi = tf.get_variable("nonlocal_phi", | ||
724 | - [feature_size, hidden_size], | ||
725 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))) | ||
726 | - nonlocal_g = tf.get_variable("nonlocal_g", | ||
727 | - [feature_size, hidden_size], | ||
728 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))) | ||
729 | - nonlocal_out = tf.get_variable("nonlocal_out", | ||
730 | - [hidden_size, feature_size], | ||
731 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden_size))) | ||
732 | - | ||
733 | - vlad_theta = tf.matmul(vlad, nonlocal_theta) | ||
734 | - vlad_phi = tf.matmul(vlad, nonlocal_phi) | ||
735 | - vlad_g = tf.matmul(vlad, nonlocal_g) | ||
736 | - | ||
737 | - vlad_theta = tf.reshape(vlad_theta, [-1, cluster_size, hidden_size]) | ||
738 | - vlad_phi = tf.reshape(vlad_phi, [-1, cluster_size, hidden_size]) | ||
739 | - vlad_g = tf.reshape(vlad_g, [-1, cluster_size, hidden_size]) | ||
740 | - | ||
741 | - vlad_softmax = tf.nn.softmax(feature_size**-.5 * tf.matmul(vlad_theta, tf.transpose(vlad_phi,perm=[0,2,1]))) | ||
742 | - vlad_g = tf.matmul(vlad_softmax, vlad_g) | ||
743 | - vlad_g = tf.reshape(vlad_g, [-1, hidden_size]) | ||
744 | - | ||
745 | - vlad_g = tf.matmul(vlad_g, nonlocal_out) | ||
746 | - vlad = vlad + vlad_g | ||
747 | - return vlad | ||
748 | - | ||
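nonLocal_block is scaled dot-product self-attention over the cluster axis with a residual connection: softmax(theta(x) phi(x)^T / sqrt(d)) is applied to g(x), projected back to feature_size, and added to the input. A per-video numpy sketch of that arithmetic, with random stand-in projection weights:

    import numpy as np

    def softmax(x, axis=-1):
        e = np.exp(x - x.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)

    def nonlocal_block(vlad, w_theta, w_phi, w_g, w_out, feature_size):
        # vlad: [cluster_size, feature_size] per-cluster descriptors.
        theta, phi, g = vlad @ w_theta, vlad @ w_phi, vlad @ w_g
        attn = softmax(feature_size ** -0.5 * theta @ phi.T)   # [c, c] attention
        return vlad + (attn @ g) @ w_out                       # residual add

    c, f, h = 64, 1024, 512
    ws = [np.random.randn(*s) / np.sqrt(s[0]) for s in [(f, h)] * 3 + [(h, f)]]
    print(nonlocal_block(np.random.randn(c, f), *ws, feature_size=f).shape)  # (64, 1024)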
749 | -class SoftDbofModelLF(models.BaseModel): | ||
750 | - """Creates a Soft Deep Bag of Frames model. | ||
751 | - The model projects the features for each frame into a higher dimensional | ||
752 | - 'clustering' space, pools across frames in that space, and then | ||
753 | - uses a configurable video-level model to classify the now aggregated features. | ||
754 | - The model will randomly sample either frames or sequences of frames during | ||
755 | - training to speed up convergence. | ||
756 | - Args: | ||
757 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
758 | - input features. | ||
759 | - vocab_size: The number of classes in the dataset. | ||
760 | - num_frames: A vector of length 'batch' which indicates the number of | ||
761 | - frames for each video (before padding). | ||
762 | - Returns: | ||
763 | - A dictionary with a tensor containing the probability predictions of the | ||
764 | - model in the 'predictions' key. The dimensions of the tensor are | ||
765 | - 'batch_size' x 'num_classes'. | ||
766 | - """ | ||
767 | - | ||
768 | - def create_model(self, | ||
769 | - model_input, | ||
770 | - vocab_size, | ||
771 | - num_frames, | ||
772 | - iterations=None, | ||
773 | - add_batch_norm=None, | ||
774 | - sample_random_frames=None, | ||
775 | - cluster_size=None, | ||
776 | - hidden_size=None, | ||
777 | - is_training=True, | ||
778 | - **unused_params): | ||
779 | - iterations = 300 | ||
780 | - add_batch_norm = True | ||
781 | - random_frames = True | ||
782 | - cluster_size = 4000 | ||
783 | - hidden1_size = 1024 | ||
784 | - fc_dimred = True | ||
785 | - relu = False | ||
786 | - max_pool = False | ||
787 | - | ||
788 | - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
789 | - if random_frames: | ||
790 | - model_input = utils.SampleRandomFrames(model_input, num_frames, | ||
791 | - iterations) | ||
792 | - else: | ||
793 | - model_input = utils.SampleRandomSequence(model_input, num_frames, | ||
794 | - iterations) | ||
795 | - max_frames = model_input.get_shape().as_list()[1] | ||
796 | - feature_size = model_input.get_shape().as_list()[2] | ||
797 | - reshaped_input = tf.reshape(model_input, [-1, feature_size]) | ||
798 | - tf.summary.histogram("input_hist", reshaped_input) | ||
799 | - | ||
800 | - video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training) | ||
801 | - audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training) | ||
802 | - | ||
803 | - | ||
804 | - if add_batch_norm: | ||
805 | - reshaped_input = slim.batch_norm( | ||
806 | - reshaped_input, | ||
807 | - center=True, | ||
808 | - scale=True, | ||
809 | - is_training=is_training, | ||
810 | - scope="input_bn") | ||
811 | - | ||
812 | - with tf.variable_scope("video_DBOF"): | ||
813 | - dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) | ||
814 | - | ||
815 | - with tf.variable_scope("audio_DBOF"): | ||
816 | - dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:]) | ||
817 | - | ||
818 | - dbof = tf.concat([dbof_video, dbof_audio],1) | ||
819 | - | ||
820 | - dbof_dim = dbof.get_shape().as_list()[1] | ||
821 | - | ||
822 | - if fc_dimred: | ||
823 | - hidden1_weights = tf.get_variable("hidden1_weights", | ||
824 | - [dbof_dim, hidden1_size], | ||
825 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) | ||
826 | - tf.summary.histogram("hidden1_weights", hidden1_weights) | ||
827 | - activation = tf.matmul(dbof, hidden1_weights) | ||
828 | - | ||
829 | - if add_batch_norm and relu: | ||
830 | - activation = slim.batch_norm( | ||
831 | - activation, | ||
832 | - center=True, | ||
833 | - scale=True, | ||
834 | - is_training=is_training, | ||
835 | - scope="hidden1_bn") | ||
836 | - else: | ||
837 | - hidden1_biases = tf.get_variable("hidden1_biases", | ||
838 | - [hidden1_size], | ||
839 | - initializer = tf.random_normal_initializer(stddev=0.01)) | ||
840 | - tf.summary.histogram("hidden1_biases", hidden1_biases) | ||
841 | - activation += hidden1_biases | ||
842 | - | ||
843 | - if relu: | ||
844 | - activation = tf.nn.relu6(activation) | ||
845 | - tf.summary.histogram("hidden1_output", activation) | ||
846 | - else: | ||
847 | - activation = dbof | ||
848 | - | ||
849 | - aggregated_model = getattr(video_level_models, | ||
850 | - FLAGS.video_level_classifier_model) | ||
851 | - | ||
852 | - return aggregated_model().create_model( | ||
853 | - model_input=activation, | ||
854 | - vocab_size=vocab_size, | ||
855 | - is_training=is_training, | ||
856 | - **unused_params) | ||
857 | - | ||
858 | - | ||
859 | - | ||
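Like the other late-fusion models in this file, SoftDbofModelLF assumes each 1152-dim frame feature is 1024 visual dims followed by 128 audio dims, pools each modality with its own module, and concatenates the pooled descriptors. A sketch of that split-and-fuse pattern with the pooling stubbed out:

    import numpy as np

    VIDEO_DIMS, AUDIO_DIMS = 1024, 128   # layout assumed by reshaped_input[:, 0:1024]

    def late_fuse(frames, pool_video, pool_audio):
        # frames: [batch * max_frames, 1152] concatenated video+audio features.
        video, audio = frames[:, :VIDEO_DIMS], frames[:, VIDEO_DIMS:]
        return np.concatenate([pool_video(video), pool_audio(audio)], axis=1)

    mean_pool = lambda x: x.mean(axis=0, keepdims=True)   # stand-in pooling module
    print(late_fuse(np.random.randn(300, 1152), mean_pool, mean_pool).shape)  # (1, 1152)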
860 | -class early_NetVLADModelLF(models.BaseModel): | ||
861 | - """Creates a NetVLAD based model. | ||
862 | - Args: | ||
863 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
864 | - input features. | ||
865 | - vocab_size: The number of classes in the dataset. | ||
866 | - num_frames: A vector of length 'batch' which indicates the number of | ||
867 | - frames for each video (before padding). | ||
868 | - Returns: | ||
869 | - A dictionary with a tensor containing the probability predictions of the | ||
870 | - model in the 'predictions' key. The dimensions of the tensor are | ||
871 | - 'batch_size' x 'num_classes'. | ||
872 | - """ | ||
873 | - | ||
874 | - | ||
875 | - def create_model(self, | ||
876 | - model_input, | ||
877 | - vocab_size, | ||
878 | - num_frames, | ||
879 | - iterations=None, | ||
880 | - add_batch_norm=None, | ||
881 | - sample_random_frames=None, | ||
882 | - cluster_size=None, | ||
883 | - hidden_size=None, | ||
884 | - is_training=True, | ||
885 | - **unused_params): | ||
886 | - iterations = 300 | ||
887 | - add_batch_norm = True | ||
888 | - random_frames = True | ||
889 | - cluster_size = 64 | ||
890 | - hidden1_size = 1024 | ||
891 | - relu = False | ||
892 | - dimred = -1 | ||
893 | - gating = True | ||
894 | - remove_diag = False | ||
895 | - | ||
896 | - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
897 | - if random_frames: | ||
898 | - model_input = utils.SampleRandomFrames(model_input, num_frames, | ||
899 | - iterations) | ||
900 | - else: | ||
901 | - model_input = utils.SampleRandomSequence(model_input, num_frames, | ||
902 | - iterations) | ||
903 | - | ||
904 | - | ||
905 | - max_frames = model_input.get_shape().as_list()[1] | ||
906 | - feature_size = model_input.get_shape().as_list()[2] | ||
907 | - reshaped_input = tf.reshape(model_input, [-1, feature_size]) | ||
908 | - | ||
909 | - video_audio_NetVLAD = NetVLAD_NonLocal(1024+128,max_frames,cluster_size, add_batch_norm, is_training) | ||
910 | - | ||
911 | - if add_batch_norm:# and not lightvlad: | ||
912 | - reshaped_input = slim.batch_norm( | ||
913 | - reshaped_input, | ||
914 | - center=True, | ||
915 | - scale=True, | ||
916 | - is_training=is_training, | ||
917 | - scope="input_bn") | ||
918 | - with tf.variable_scope("video_audio_VLAD"): | ||
919 | - vlad = video_audio_NetVLAD.forward(reshaped_input) | ||
920 | - | ||
921 | - vlad_dim = vlad.get_shape().as_list()[1] | ||
922 | - hidden1_weights = tf.get_variable("hidden1_weights", | ||
923 | - [vlad_dim, hidden1_size], | ||
924 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) | ||
925 | - | ||
926 | - activation = tf.matmul(vlad, hidden1_weights) | ||
927 | - | ||
928 | - if add_batch_norm and relu: | ||
929 | - activation = slim.batch_norm( | ||
930 | - activation, | ||
931 | - center=True, | ||
932 | - scale=True, | ||
933 | - is_training=is_training, | ||
934 | - scope="hidden1_bn") | ||
935 | - | ||
936 | - else: | ||
937 | - hidden1_biases = tf.get_variable("hidden1_biases", | ||
938 | - [hidden1_size], | ||
939 | - initializer = tf.random_normal_initializer(stddev=0.01)) | ||
940 | - tf.summary.histogram("hidden1_biases", hidden1_biases) | ||
941 | - activation += hidden1_biases | ||
942 | - | ||
943 | - if relu: | ||
944 | - activation = tf.nn.relu6(activation) | ||
945 | - | ||
946 | - | ||
947 | - if gating: | ||
948 | - gating_weights = tf.get_variable("gating_weights_2", | ||
949 | - [hidden1_size, hidden1_size], | ||
950 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size))) | ||
951 | - | ||
952 | - gates = tf.matmul(activation, gating_weights) | ||
953 | - | ||
954 | - if remove_diag: | ||
955 | - #removes diagonals coefficients | ||
956 | - diagonals = tf.matrix_diag_part(gating_weights) | ||
957 | - gates = gates - tf.multiply(diagonals,activation) | ||
958 | - | ||
959 | - | ||
960 | - if add_batch_norm: | ||
961 | - gates = slim.batch_norm( | ||
962 | - gates, | ||
963 | - center=True, | ||
964 | - scale=True, | ||
965 | - is_training=is_training, | ||
966 | - scope="gating_bn") | ||
967 | - else: | ||
968 | - gating_biases = tf.get_variable("gating_biases", | ||
969 | - [hidden1_size], | ||
970 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))) | ||
971 | - gates += gating_biases | ||
972 | - | ||
973 | - gates = tf.sigmoid(gates) | ||
974 | - | ||
975 | - activation = tf.multiply(activation,gates) | ||
976 | - | ||
977 | - aggregated_model = getattr(video_level_models, | ||
978 | - FLAGS.video_level_classifier_model) | ||
979 | - | ||
980 | - | ||
981 | - return aggregated_model().create_model( | ||
982 | - model_input=activation, | ||
983 | - vocab_size=vocab_size, | ||
984 | - is_training=is_training, | ||
985 | - **unused_params) | ||
986 | - | ||
987 | -class NetVLAD_NonLocal(): | ||
988 | - def __init__(self, feature_size,max_frames,cluster_size, add_batch_norm, is_training): | ||
989 | - self.feature_size = feature_size | ||
990 | - self.max_frames = max_frames | ||
991 | - self.is_training = is_training | ||
992 | - self.add_batch_norm = add_batch_norm | ||
993 | - self.cluster_size = cluster_size | ||
994 | - | ||
995 | - def forward(self,reshaped_input): | ||
996 | - | ||
997 | - cluster_weights = tf.get_variable("cluster_weights", | ||
998 | - [int(self.feature_size), int(self.cluster_size)], | ||
999 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
1000 | - | ||
1001 | - tf.summary.histogram("cluster_weights", cluster_weights) | ||
1002 | - activation = tf.matmul(reshaped_input, cluster_weights) | ||
1003 | - | ||
1004 | - if self.add_batch_norm: | ||
1005 | - activation = slim.batch_norm( | ||
1006 | - activation, | ||
1007 | - center=True, | ||
1008 | - scale=True, | ||
1009 | - is_training=self.is_training, | ||
1010 | - scope="cluster_bn") | ||
1011 | - else: | ||
1012 | - cluster_biases = tf.get_variable("cluster_biases", | ||
1013 | - [self.cluster_size], | ||
1014 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
1015 | - tf.summary.histogram("cluster_biases", cluster_biases) | ||
1016 | - activation += cluster_biases | ||
1017 | - | ||
1018 | - activation = tf.nn.softmax(activation) | ||
1019 | - tf.summary.histogram("cluster_output", activation) | ||
1020 | - | ||
1021 | - activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size]) | ||
1022 | - | ||
1023 | - a_sum = tf.reduce_sum(activation,-2,keep_dims=True) | ||
1024 | - | ||
1025 | - cluster_weights2 = tf.get_variable("cluster_weights2", | ||
1026 | - [1,int(self.feature_size), int(self.cluster_size)], | ||
1027 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
1028 | - | ||
1029 | - a = tf.multiply(a_sum,cluster_weights2) | ||
1030 | - | ||
1031 | - activation = tf.transpose(activation,perm=[0,2,1]) | ||
1032 | - | ||
1033 | - reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size]) | ||
1034 | - vlad = tf.matmul(activation,reshaped_input) | ||
1035 | - vlad = tf.transpose(vlad,perm=[0,2,1]) | ||
1036 | - vlad = tf.subtract(vlad,a) | ||
1037 | - | ||
1038 | - | ||
1039 | - vlad = tf.transpose(vlad,perm=[0,2,1]) | ||
1040 | - vlad = tf.reshape(vlad, [-1, self.feature_size]) | ||
1041 | - | ||
1042 | - nonlocal_theta = tf.get_variable("nonlocal_theta", | ||
1043 | - [int(self.feature_size), int(self.cluster_size)], | ||
1044 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
1045 | - nonlocal_phi = tf.get_variable("nonlocal_phi", | ||
1046 | - [int(self.feature_size), int(self.cluster_size)], | ||
1047 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
1048 | - nonlocal_g = tf.get_variable("nonlocal_g", | ||
1049 | - [int(self.feature_size), int(self.cluster_size)], | ||
1050 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
1051 | - nonlocal_out = tf.get_variable("nonlocal_out", | ||
1052 | - [int(self.cluster_size), int(self.feature_size)], | ||
1053 | - initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.cluster_size))) | ||
1054 | - | ||
1055 | - vlad_theta = tf.matmul(vlad, nonlocal_theta) | ||
1056 | - vlad_phi = tf.matmul(vlad, nonlocal_phi) | ||
1057 | - vlad_g = tf.matmul(vlad, nonlocal_g) | ||
1058 | - | ||
1059 | - vlad_theta = tf.reshape(vlad_theta, [-1, int(self.cluster_size),int(self.cluster_size)]) | ||
1060 | - vlad_phi = tf.reshape(vlad_phi, [-1, int(self.cluster_size),int(self.cluster_size)]) | ||
1061 | - vlad_g = tf.reshape(vlad_g, [-1, int(self.cluster_size),int(self.cluster_size)]) | ||
1062 | - | ||
1063 | - vlad_softmax = tf.nn.softmax(self.feature_size**-.5 * tf.matmul(vlad_theta, tf.transpose(vlad_phi,perm=[0,2,1]))) | ||
1064 | - vlad_g = tf.matmul(vlad_softmax, vlad_g) | ||
1065 | - vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size]) | ||
1066 | - | ||
1067 | - vlad_g = tf.matmul(vlad_g, nonlocal_out) | ||
1068 | - vlad_g = tf.reshape(vlad_g, [-1, int(self.cluster_size), int(self.feature_size)]) | ||
1069 | - vlad = tf.reshape(vlad, [-1, int(self.cluster_size), int(self.feature_size)]) | ||
1070 | - vlad = vlad + vlad_g | ||
1071 | - | ||
1072 | - vlad = tf.transpose(vlad,perm=[0,2,1]) | ||
1073 | - vlad = tf.nn.l2_normalize(vlad,1) # [b,f,c] | ||
1074 | - | ||
1075 | - vlad = tf.reshape(vlad,[-1,int(self.cluster_size*self.feature_size)]) | ||
1076 | - vlad = tf.nn.l2_normalize(vlad,1) | ||
1077 | - | ||
1078 | - return vlad | ||
1079 | - | ||
1080 | - | ||
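Before its non-local step, NetVLAD_NonLocal.forward computes standard NetVLAD aggregation: accumulate assignment-weighted frame features per cluster, subtract each cluster's total assignment mass times a learned center (cluster_weights2 above), then intra-normalize and globally L2-normalize. A per-video numpy sketch of that aggregation:

    import numpy as np

    def netvlad(frames, assign, centers, eps=1e-12):
        # frames:  [max_frames, feature_size] frame features
        # assign:  [max_frames, cluster_size] per-frame soft assignments
        # centers: [feature_size, cluster_size] learned centers (cluster_weights2)
        vlad = frames.T @ assign                     # [f, c] weighted feature sums
        vlad -= assign.sum(axis=0) * centers         # subtract assignment mass * center
        vlad /= np.linalg.norm(vlad, axis=0, keepdims=True) + eps   # intra-normalize
        flat = vlad.reshape(-1)
        return flat / (np.linalg.norm(flat) + eps)   # global L2 normalize

    m, f, c = 300, 1024, 64
    a = np.random.rand(m, c); a /= a.sum(axis=1, keepdims=True)
    print(netvlad(np.random.randn(m, f), a, np.random.randn(f, c)).shape)  # (65536,)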
1081 | -class SoftDbofModelLF_8k(models.BaseModel): | ||
1082 | - """Creates a Soft Deep Bag of Frames model. | ||
1083 | - The model projects the features for each frame into a higher dimensional | ||
1084 | - 'clustering' space, pools across frames in that space, and then | ||
1085 | - uses a configurable video-level model to classify the now aggregated features. | ||
1086 | - The model will randomly sample either frames or sequences of frames during | ||
1087 | - training to speed up convergence. | ||
1088 | - Args: | ||
1089 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1090 | - input features. | ||
1091 | - vocab_size: The number of classes in the dataset. | ||
1092 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1093 | - frames for each video (before padding). | ||
1094 | - Returns: | ||
1095 | - A dictionary with a tensor containing the probability predictions of the | ||
1096 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1097 | - 'batch_size' x 'num_classes'. | ||
1098 | - """ | ||
1099 | - | ||
1100 | - def create_model(self, | ||
1101 | - model_input, | ||
1102 | - vocab_size, | ||
1103 | - num_frames, | ||
1104 | - iterations=None, | ||
1105 | - add_batch_norm=None, | ||
1106 | - sample_random_frames=None, | ||
1107 | - cluster_size=None, | ||
1108 | - hidden_size=None, | ||
1109 | - is_training=True, | ||
1110 | - **unused_params): | ||
1111 | - iterations = 300 | ||
1112 | - add_batch_norm = True | ||
1113 | - random_frames = True | ||
1114 | - cluster_size = 2048 | ||
1115 | - hidden1_size = 1024 | ||
1116 | - fc_dimred = True | ||
1117 | - relu = False | ||
1118 | - max_pool = False | ||
1119 | - | ||
1120 | - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
1121 | - if random_frames: | ||
1122 | - model_input = utils.SampleRandomFrames(model_input, num_frames, | ||
1123 | - iterations) | ||
1124 | - else: | ||
1125 | - model_input = utils.SampleRandomSequence(model_input, num_frames, | ||
1126 | - iterations) | ||
1127 | - max_frames = model_input.get_shape().as_list()[1] | ||
1128 | - feature_size = model_input.get_shape().as_list()[2] | ||
1129 | - reshaped_input = tf.reshape(model_input, [-1, feature_size]) | ||
1130 | - tf.summary.histogram("input_hist", reshaped_input) | ||
1131 | - | ||
1132 | - video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training) | ||
1133 | - audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training) | ||
1134 | - | ||
1135 | - | ||
1136 | - if add_batch_norm: | ||
1137 | - reshaped_input = slim.batch_norm( | ||
1138 | - reshaped_input, | ||
1139 | - center=True, | ||
1140 | - scale=True, | ||
1141 | - is_training=is_training, | ||
1142 | - scope="input_bn") | ||
1143 | - | ||
1144 | - with tf.variable_scope("video_DBOF"): | ||
1145 | - dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) | ||
1146 | - | ||
1147 | - with tf.variable_scope("audio_DBOF"): | ||
1148 | - dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:]) | ||
1149 | - | ||
1150 | - dbof = tf.concat([dbof_video, dbof_audio],1) | ||
1151 | - | ||
1152 | - dbof_dim = dbof.get_shape().as_list()[1] | ||
1153 | - | ||
1154 | - if fc_dimred: | ||
1155 | - hidden1_weights = tf.get_variable("hidden1_weights", | ||
1156 | - [dbof_dim, hidden1_size], | ||
1157 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) | ||
1158 | - tf.summary.histogram("hidden1_weights", hidden1_weights) | ||
1159 | - activation = tf.matmul(dbof, hidden1_weights) | ||
1160 | - | ||
1161 | - if add_batch_norm and relu: | ||
1162 | - activation = slim.batch_norm( | ||
1163 | - activation, | ||
1164 | - center=True, | ||
1165 | - scale=True, | ||
1166 | - is_training=is_training, | ||
1167 | - scope="hidden1_bn") | ||
1168 | - else: | ||
1169 | - hidden1_biases = tf.get_variable("hidden1_biases", | ||
1170 | - [hidden1_size], | ||
1171 | - initializer = tf.random_normal_initializer(stddev=0.01)) | ||
1172 | - tf.summary.histogram("hidden1_biases", hidden1_biases) | ||
1173 | - activation += hidden1_biases | ||
1174 | - | ||
1175 | - if relu: | ||
1176 | - activation = tf.nn.relu6(activation) | ||
1177 | - tf.summary.histogram("hidden1_output", activation) | ||
1178 | - else: | ||
1179 | - activation = dbof | ||
1180 | - | ||
1181 | - aggregated_model = getattr(video_level_models, | ||
1182 | - FLAGS.video_level_classifier_model) | ||
1183 | - | ||
1184 | - | ||
1185 | - return aggregated_model().create_model( | ||
1186 | - model_input=activation, | ||
1187 | - vocab_size=vocab_size, | ||
1188 | - is_training=is_training, | ||
1189 | - **unused_params) | ||
1190 | - | ||
1191 | -class FrameLevelLogisticModel(models.BaseModel): | ||
1192 | - """Creates a logistic classifier over the aggregated frame-level features.""" | ||
1193 | - | ||
1194 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1195 | - """See base class. | ||
1196 | - | ||
1197 | - This class is intended to be an example for implementors of frame level | ||
1198 | - models. If you want to train a model over averaged features it is more | ||
1199 | - efficient to average them beforehand rather than on the fly. | ||
1200 | - | ||
1201 | - Args: | ||
1202 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1203 | - input features. | ||
1204 | - vocab_size: The number of classes in the dataset. | ||
1205 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1206 | - frames for each video (before padding). | ||
1207 | - | ||
1208 | - Returns: | ||
1209 | - A dictionary with a tensor containing the probability predictions of the | ||
1210 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1211 | - 'batch_size' x 'num_classes'. | ||
1212 | - """ | ||
1213 | - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
1214 | - feature_size = model_input.get_shape().as_list()[2] | ||
1215 | - | ||
1216 | - denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]), | ||
1217 | - [-1, feature_size]) | ||
1218 | - avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators | ||
1219 | - | ||
1220 | - output = slim.fully_connected(avg_pooled, | ||
1221 | - vocab_size, | ||
1222 | - activation_fn=tf.nn.sigmoid, | ||
1223 | - weights_regularizer=slim.l2_regularizer(1e-8)) | ||
1224 | - | ||
1225 | - return {"predictions": output} | ||
1226 | - | ||
1227 | -class CNN(models.BaseModel): | ||
1228 | - """Creates a logistic classifier over the aggregated frame-level features.""" | ||
1229 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1230 | - """See base class. | ||
1231 | - | ||
1232 | - This class is intended to be an example for implementors of frame level | ||
1233 | - models. If you want to train a model over averaged features it is more | ||
1234 | - efficient to average them beforehand rather than on the fly. | ||
1235 | - | ||
1236 | - Args: | ||
1237 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1238 | - input features. | ||
1239 | - vocab_size: The number of classes in the dataset. | ||
1240 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1241 | - frames for each video (before padding). | ||
1242 | - | ||
1243 | - Returns: | ||
1244 | - A dictionary with a tensor containing the probability predictions of the | ||
1245 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1246 | - 'batch_size' x 'num_classes'. | ||
1247 | - """ | ||
1248 | - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
1249 | - feature_size = model_input.get_shape().as_list()[2] | ||
1250 | - | ||
1251 | - denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]), | ||
1252 | - [-1, feature_size]) | ||
1253 | - | ||
1254 | - | ||
1255 | - | ||
1256 | - convK3 = slim.convolution(model_input, | ||
1257 | - num_outputs=feature_size, | ||
1258 | - kernel_size=3, | ||
1259 | - scope='conv1') | ||
1260 | - | ||
1261 | - convK5 = slim.convolution(model_input, | ||
1262 | - num_outputs=feature_size, | ||
1263 | - kernel_size=5, | ||
1264 | - scope='conv2') | ||
1265 | - | ||
1266 | - convK1 = slim.convolution(model_input, | ||
1267 | - num_outputs=feature_size, | ||
1268 | - kernel_size=1, | ||
1269 | - scope='conv3') | ||
1270 | - | ||
1271 | - | ||
1272 | - avg_pooled = tf.reduce_sum(tf.concat([convK3,convK5,convK1],axis=1), axis=[1]) / denominators | ||
1273 | - | ||
1274 | - output = slim.fully_connected(avg_pooled, | ||
1275 | - vocab_size, | ||
1276 | - activation_fn=tf.nn.sigmoid, | ||
1277 | - weights_regularizer=slim.l2_regularizer(1e-8)) | ||
1278 | - | ||
1279 | - return {"predictions": output} | ||
1280 | - | ||
1281 | -class LstmModel(models.BaseModel): | ||
1282 | - | ||
1283 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1284 | - """Creates a model which uses a stack of LSTMs to represent the video. | ||
1285 | - Args: | ||
1286 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1287 | - input features. | ||
1288 | - vocab_size: The number of classes in the dataset. | ||
1289 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1290 | - frames for each video (before padding). | ||
1291 | - Returns: | ||
1292 | - A dictionary with a tensor containing the probability predictions of the | ||
1293 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1294 | - 'batch_size' x 'num_classes'. | ||
1295 | - """ | ||
1296 | - lstm_size = FLAGS.lstm_cells | ||
1297 | - number_of_layers = FLAGS.lstm_layers | ||
1298 | - | ||
1299 | - stacked_lstm = tf.contrib.rnn.MultiRNNCell( | ||
1300 | - [ | ||
1301 | - tf.contrib.rnn.BasicLSTMCell( | ||
1302 | - lstm_size, forget_bias=1.0) | ||
1303 | - for _ in range(number_of_layers) | ||
1304 | - ]) | ||
1305 | - | ||
1306 | - loss = 0.0 | ||
1307 | - | ||
1308 | - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, | ||
1309 | - sequence_length=num_frames, | ||
1310 | - dtype=tf.float32) | ||
1311 | - | ||
1312 | - aggregated_model = getattr(video_level_models, | ||
1313 | - FLAGS.video_level_classifier_model) | ||
1314 | - | ||
1315 | - return aggregated_model().create_model( | ||
1316 | - model_input=state[-1].h, | ||
1317 | - vocab_size=vocab_size, | ||
1318 | - **unused_params) | ||
1319 | - | ||
1320 | -class BNGRUModel(models.BaseModel): | ||
1321 | - | ||
1322 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1323 | - lstm_size = FLAGS.lstm_cells | ||
1324 | - number_of_layers = FLAGS.lstm_layers | ||
1325 | - | ||
1326 | - stacked_rnn = tf.contrib.rnn.MultiRNNCell( | ||
1327 | - [ | ||
1328 | - tf.contrib.rnn.GRUCell(lstm_size) | ||
1329 | - for _ in range(number_of_layers) | ||
1330 | - ], state_is_tuple=False) | ||
1331 | - | ||
1332 | - outputs, state = tf.nn.dynamic_rnn(stacked_rnn, model_input, | ||
1333 | - sequence_length=num_frames, | ||
1334 | - dtype=tf.float32) | ||
1335 | - aggregated_model = getattr(video_level_models, | ||
1336 | - FLAGS.video_level_classifier_model) | ||
1337 | - | ||
1338 | - state = slim.batch_norm( | ||
1339 | - state, | ||
1340 | - center=True, | ||
1341 | - scale=True, | ||
1342 | - is_training=True, | ||
1343 | - scope='proj') | ||
1344 | - | ||
1345 | - return aggregated_model().create_model( | ||
1346 | - model_input=state, | ||
1347 | - vocab_size=vocab_size, | ||
1348 | - **unused_params) | ||
1349 | - | ||
1350 | - | ||
1351 | - | ||
1352 | -class GruModel2(models.BaseModel): | ||
1353 | - | ||
1354 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1355 | - """Creates a model which uses a stack of GRUs to represent the video. | ||
1356 | - Args: | ||
1357 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1358 | - input features. | ||
1359 | - vocab_size: The number of classes in the dataset. | ||
1360 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1361 | - frames for each video (before padding). | ||
1362 | - Returns: | ||
1363 | - A dictionary with a tensor containing the probability predictions of the | ||
1364 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1365 | - 'batch_size' x 'num_classes'. | ||
1366 | - """ | ||
1367 | - lstm_size = FLAGS.lstm_cells | ||
1368 | - number_of_layers = FLAGS.lstm_layers | ||
1369 | - | ||
1370 | - stacked_lstm = tf.contrib.rnn.MultiRNNCell( | ||
1371 | - [ | ||
1372 | - tf.contrib.rnn.GRUCell(lstm_size) | ||
1373 | - for _ in range(number_of_layers) | ||
1374 | - ], state_is_tuple=False) | ||
1375 | - | ||
1376 | - loss = 0.0 | ||
1377 | - | ||
1378 | - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, | ||
1379 | - sequence_length=num_frames, | ||
1380 | - dtype=tf.float32) | ||
1381 | - aggregated_model = getattr(video_level_models, | ||
1382 | - FLAGS.video_level_classifier_model) | ||
1383 | - | ||
1384 | - return aggregated_model().create_model( | ||
1385 | - model_input=state, | ||
1386 | - vocab_size=vocab_size, | ||
1387 | - **unused_params) | ||
1388 | - | ||
1389 | - | ||
1390 | -class BiGRUModel(models.BaseModel): | ||
1391 | - | ||
1392 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1393 | - """Creates a model which uses a bidirectional stack of GRUs to represent the video. | ||
1394 | - Args: | ||
1395 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1396 | - input features. | ||
1397 | - vocab_size: The number of classes in the dataset. | ||
1398 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1399 | - frames for each video (before padding). | ||
1400 | - Returns: | ||
1401 | - A dictionary with a tensor containing the probability predictions of the | ||
1402 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1403 | - 'batch_size' x 'num_classes'. | ||
1404 | - """ | ||
1405 | - lstm_size = FLAGS.lstm_cells | ||
1406 | - number_of_layers = FLAGS.lstm_layers | ||
1407 | - | ||
1408 | - with tf.variable_scope('fw'): | ||
1409 | - rnn_fw = tf.contrib.rnn.MultiRNNCell( | ||
1410 | - [ | ||
1411 | - tf.contrib.rnn.GRUCell(lstm_size) | ||
1412 | - for _ in range(number_of_layers) | ||
1413 | - ], state_is_tuple=False) | ||
1414 | - | ||
1415 | - | ||
1416 | - with tf.variable_scope('bw'): | ||
1417 | - rnn_bw = tf.contrib.rnn.MultiRNNCell( | ||
1418 | - [ | ||
1419 | - tf.contrib.rnn.GRUCell(lstm_size) | ||
1420 | - for _ in range(number_of_layers) | ||
1421 | - ], state_is_tuple=False) | ||
1422 | - | ||
1423 | - outputs, state = tf.nn.bidirectional_dynamic_rnn(rnn_fw, rnn_bw, model_input, | ||
1424 | - sequence_length=num_frames, | ||
1425 | - dtype=tf.float32, swap_memory=True) | ||
1426 | - state = tf.concat(state, axis=1) | ||
1427 | - aggregated_model = getattr(video_level_models, | ||
1428 | - FLAGS.video_level_classifier_model) | ||
1429 | - state = slim.batch_norm( | ||
1430 | - state, | ||
1431 | - center=True, | ||
1432 | - scale=True, | ||
1433 | - is_training=True, | ||
1434 | - scope='proj') | ||
1435 | - | ||
1436 | - return aggregated_model().create_model( | ||
1437 | - model_input=state, | ||
1438 | - vocab_size=vocab_size, | ||
1439 | - **unused_params) | ||
1440 | - | ||
1441 | -""" | ||
1442 | -Copyright (c) 2017, University of Texas Southwestern Medical Center | ||
1443 | -All rights reserved. | ||
1444 | -Redistribution and use in source and binary forms, with or without | ||
1445 | -modification, are permitted provided that the following conditions are met: | ||
1446 | -* Redistributions of source code must retain the above copyright notice, this | ||
1447 | - list of conditions and the following disclaimer. | ||
1448 | -* Redistributions in binary form must reproduce the above copyright notice, | ||
1449 | - this list of conditions and the following disclaimer in the documentation | ||
1450 | - and/or other materials provided with the distribution. | ||
1451 | -* Neither the name of the University of Texas at Austin nor the names of its | ||
1452 | - contributors may be used to endorse or promote products derived from | ||
1453 | - this software without specific prior written permission. | ||
1454 | -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
1455 | -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
1456 | -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
1457 | -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
1458 | -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
1459 | -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
1460 | -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
1461 | -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
1462 | -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
1463 | -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
1464 | -Recurrent Weighted Average | ||
1465 | -Implementation modified from: https://github.com/jostmey/rwa | ||
1466 | -Paper: | ||
1467 | -@article{ostmeyer2017machine, | ||
1468 | - title={Machine Learning on Sequential Data Using a Recurrent Weighted Average}, | ||
1469 | - author={Ostmeyer, Jared and Cowell, Lindsay}, | ||
1470 | - journal={arXiv preprint arXiv:1703.01253}, | ||
1471 | - year={2017} | ||
1472 | -} | ||
1473 | -""" | ||
1474 | - | ||
1475 | -class RwaModel(models.BaseModel): | ||
1476 | - | ||
1477 | - | ||
1478 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1479 | - | ||
1480 | - # constants | ||
1481 | - | ||
1482 | - init_factor = 1.0 | ||
1483 | - num_cells = FLAGS.lstm_cells | ||
1484 | - input_shape = model_input.get_shape().as_list() | ||
1485 | - batch_size, max_steps, num_features = input_shape | ||
1486 | - | ||
1487 | - # trainable weights | ||
1488 | - s = weights_rwa.init_state(num_cells, "s", init_factor) | ||
1489 | - W_g = weights_rwa.init_weight([num_features+num_cells, num_cells], "W_g") | ||
1490 | - W_u = weights_rwa.init_weight([num_features, num_cells], "W_u") | ||
1491 | - W_a = weights_rwa.init_weight([num_features+num_cells, num_cells], "W_a") | ||
1492 | - b_g = weights_rwa.init_bias(num_cells, "b_g") | ||
1493 | - b_u = weights_rwa.init_bias(num_cells, "b_u") | ||
1494 | - b_a = weights_rwa.init_bias(num_cells, "b_a") | ||
1495 | - | ||
1496 | - #pl = tf.placeholder(tf.float32, shape=[None, num_cells]) | ||
1497 | - pl = tf.reshape(model_input, [-1, max_steps*num_features])[:, :num_cells] | ||
1498 | - | ||
1499 | - # internal states | ||
1500 | - #n = tf.zeros([batch_size, num_cells]) | ||
1501 | - #d = tf.zeros([batch_size, num_cells]) | ||
1502 | - #h = tf.zeros([batch_size, num_cells]) | ||
1503 | - #a_max = tf.fill([batch_size, num_cells], -1E38) # Start off with lowest number possible | ||
1504 | - n = tf.zeros_like(pl) | ||
1505 | - d = tf.zeros_like(pl) | ||
1506 | - h = tf.zeros_like(pl) | ||
1507 | - a_max = tf.multiply(tf.ones_like(pl), -1E38) | ||
1508 | - | ||
1509 | - # define model | ||
1510 | - h += tf.nn.tanh(tf.expand_dims(s, 0)) | ||
1511 | - | ||
1512 | - for i in range(max_steps): | ||
1513 | - | ||
1514 | - x_step = model_input[:,i,:] | ||
1515 | - xh_join = tf.concat(axis=1, values=[x_step, h]) # Combine the features and hidden state into one tensor | ||
1516 | - | ||
1517 | - u = tf.matmul(x_step, W_u)+b_u | ||
1518 | - g = tf.matmul(xh_join, W_g)+b_g | ||
1519 | - a = tf.matmul(xh_join, W_a) # The bias term when factored out of the numerator and denominator cancels and is unnecessary | ||
1520 | - | ||
1521 | - z = tf.multiply(u, tf.nn.tanh(g)) | ||
1522 | - | ||
1523 | - a_newmax = tf.maximum(a_max, a) | ||
1524 | - exp_diff = tf.exp(a_max-a_newmax) | ||
1525 | - exp_scaled = tf.exp(a-a_newmax) | ||
1526 | - | ||
1527 | - n = tf.multiply(n, exp_diff)+tf.multiply(z, exp_scaled) # Numerically stable update of numerator | ||
1528 | - d = tf.multiply(d, exp_diff)+exp_scaled # Numerically stable update of denominator | ||
1529 | - h_new = tf.nn.tanh(tf.div(n, d)) | ||
1530 | - a_max = a_newmax | ||
1531 | - | ||
1532 | - h = tf.where(tf.greater(num_frames, i), h_new, h) # Use new hidden state only if the sequence length has not been exceeded | ||
1533 | - | ||
1534 | - | ||
1535 | - aggregated_model = getattr(video_level_models, | ||
1536 | - FLAGS.video_level_classifier_model) | ||
1537 | - return aggregated_model().create_model( | ||
1538 | - model_input=h, | ||
1539 | - vocab_size=vocab_size, | ||
1540 | - **unused_params) | ||
1541 | - | ||
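For reference, the loop in RwaModel above implements the Recurrent Weighted Average recurrence from the paper cited in the header. With u_t = W_u x_t + b_u, g_t = W_g [x_t; h_{t-1}] + b_g, a_t = W_a [x_t; h_{t-1}], and z_t = u_t \odot \tanh(g_t), the hidden state is an attention-like running average over all steps so far:

    h_t = \tanh\!\left( \frac{\sum_{i \le t} z_i \, e^{a_i}}{\sum_{i \le t} e^{a_i}} \right)

The exp_diff / exp_scaled lines keep this numerically stable by tracking the running maximum a^*_t = \max(a^*_{t-1}, a_t) and rescaling both accumulators:

    n_t = n_{t-1}\, e^{a^*_{t-1} - a^*_t} + z_t\, e^{a_t - a^*_t}, \qquad
    d_t = d_{t-1}\, e^{a^*_{t-1} - a^*_t} + e^{a_t - a^*_t}, \qquad
    h_t = \tanh(n_t / d_t)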
1542 | - | ||
1543 | - | ||
1544 | -class DropoutGruModel(models.BaseModel): | ||
1545 | - | ||
1546 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1547 | - """Creates a model which uses a stack of LSTMs to represent the video. | ||
1548 | - Args: | ||
1549 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1550 | - input features. | ||
1551 | - vocab_size: The number of classes in the dataset. | ||
1552 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1553 | - frames for each video (before padding). | ||
1554 | - Returns: | ||
1555 | - A dictionary with a tensor containing the probability predictions of the | ||
1556 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1557 | - 'batch_size' x 'num_classes'. | ||
1558 | - """ | ||
1559 | - lstm_size = FLAGS.lstm_cells | ||
1560 | - number_of_layers = FLAGS.lstm_layers | ||
1561 | - | ||
1562 | - stacked_lstm = tf.contrib.rnn.MultiRNNCell( | ||
1563 | - [ | ||
1564 | - tf.contrib.rnn.DropoutWrapper( | ||
1565 | - tf.contrib.rnn.GRUCell(lstm_size), 0.9, 0.9) | ||
1566 | - for _ in range(number_of_layers) | ||
1567 | - ], state_is_tuple=False) | ||
1568 | - | ||
1570 | - | ||
1571 | - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, | ||
1572 | - sequence_length=num_frames, | ||
1573 | - dtype=tf.float32) | ||
1574 | - # The classifier is hard-coded: FrameLevelLogisticModel consumes the | ||
1575 | - # per-frame RNN outputs (and num_frames) directly, unlike the video-level | ||
1576 | - # classifiers normally looked up via FLAGS.video_level_classifier_model. | ||
1577 | - aggregated_model = FrameLevelLogisticModel | ||
1578 | - return aggregated_model().create_model( | ||
1579 | - model_input=outputs, | ||
1580 | - vocab_size=vocab_size, | ||
1581 | - num_frames=num_frames, | ||
1582 | - **unused_params) | ||
1583 | - | ||
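A note on the positional arguments above: in TF 1.x, tf.contrib.rnn.DropoutWrapper(cell, 0.9, 0.9) sets input_keep_prob and output_keep_prob. A minimal equivalent sketch with explicit keywords (the helper name and the shared keep probability are illustrative only, mirroring the 0.9 used above):

    import tensorflow as tf

    def dropout_gru_stack(num_units, num_layers, keep_prob=0.9):
        # Wrap each GRU cell so 90% of input/output activations are kept;
        # state_is_tuple=False concatenates the per-layer states, matching
        # the usage in DropoutGruModel above.
        cells = [
            tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.GRUCell(num_units),
                input_keep_prob=keep_prob,
                output_keep_prob=keep_prob)
            for _ in range(num_layers)
        ]
        return tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=False)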
1584 | - | ||
1585 | - | ||
1586 | - | ||
1587 | -class ResRnnModel(models.BaseModel): | ||
1588 | - | ||
1589 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1590 | - lstm_size = 1152 | ||
1591 | - number_of_layers = 3 | ||
1592 | - | ||
1593 | - #from rnn_cell_modern import Delta_RNN as drnn | ||
1594 | - from rnn_wrappers_modern import MultiRNNCell as mrnn | ||
1595 | - | ||
1596 | - cells = [] | ||
1597 | - for i in range(number_of_layers): | ||
1598 | - with tf.variable_scope('cell_'+str(i)): | ||
1599 | - cells.append(tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)) | ||
1600 | - | ||
1601 | - stacked_rnn = mrnn(cells, use_residual_connections=True, state_is_tuple=True) | ||
1602 | - | ||
1603 | - outputs, state = tf.nn.dynamic_rnn(stacked_rnn, model_input, | ||
1604 | - sequence_length=num_frames, | ||
1605 | - dtype=tf.float32) | ||
1606 | - | ||
1607 | - aggregated_model = getattr(video_level_models, | ||
1608 | - FLAGS.video_level_classifier_model) | ||
1609 | - | ||
1610 | - return aggregated_model().create_model( | ||
1611 | - model_input=state[-1].h, | ||
1612 | - vocab_size=vocab_size, | ||
1613 | - **unused_params) | ||
1614 | - | ||
1615 | - | ||
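The residual wiring requested from the external rnn_wrappers_modern.MultiRNNCell (use_residual_connections=True) presumably adds each layer's input to its output, i.e. for layer l

    h^{(l)}_t = \mathrm{LSTM}^{(l)}\big(h^{(l-1)}_t\big) + h^{(l-1)}_t,

which requires matching widths at every layer; that is consistent with lstm_size being fixed at 1152, the width of the concatenated input features (1024 video + 128 audio).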
1616 | -class LateVladModel(models.BaseModel): | ||
1617 | - | ||
1618 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1619 | - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
1620 | - model_input = utils.SampleRandomSequence(model_input, num_frames, 128) | ||
1621 | - | ||
1622 | - input_v = model_input[:,:,:1024] | ||
1623 | - input_a = model_input[:,:,1024:] | ||
1624 | - | ||
1625 | - K = 8 | ||
1626 | - | ||
1627 | - with tf.variable_scope('video'): | ||
1628 | - x = input_v | ||
1629 | - input_shape = x.get_shape().as_list() | ||
1630 | - _, N, D = input_shape | ||
1631 | - c_bound = math.sqrt(1. / (K * D)) | ||
1632 | - c = tf.get_variable(name='c', | ||
1633 | - shape=[K, N], | ||
1634 | - dtype=tf.float32, | ||
1635 | - initializer=tf.random_uniform_initializer(-c_bound, c_bound)) | ||
1636 | - a = slim.convolution(x, | ||
1637 | - num_outputs=K, | ||
1638 | - kernel_size=1, | ||
1639 | - data_format='NWC', | ||
1640 | - scope='conv') | ||
1641 | - a = tf.nn.softmax(a) | ||
1642 | - v = [] | ||
1643 | - for k in range(K): | ||
1644 | - t = x-c[k][None, :, None] | ||
1645 | - t = tf.multiply(t, a[:,:,k][:,:,None]) | ||
1646 | - t = tf.reduce_sum(t, 1) | ||
1647 | - t = tf.nn.l2_normalize(t, dim=1) | ||
1648 | - v.append(t) | ||
1649 | - v = tf.stack(v, axis=1) | ||
1650 | - v = tf.reshape(v, [-1, K*D]) | ||
1651 | - | ||
1652 | - proj_weights = tf.get_variable("proj_weights", | ||
1653 | - [K*D, 1024], | ||
1654 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(K*D))) | ||
1655 | - activation_v = tf.matmul(v, proj_weights) | ||
1656 | - | ||
1657 | - with tf.variable_scope('audio'): | ||
1658 | - x = input_a | ||
1659 | - input_shape = x.get_shape().as_list() | ||
1660 | - _, N, D = input_shape | ||
1661 | - c_bound = math.sqrt(1. / (K * D)) | ||
1662 | - c = tf.get_variable(name='c', | ||
1663 | - shape=[K, N], | ||
1664 | - dtype=tf.float32, | ||
1665 | - initializer=tf.random_uniform_initializer(-c_bound, c_bound)) | ||
1666 | - a = slim.convolution(x, | ||
1667 | - num_outputs=K, | ||
1668 | - kernel_size=1, | ||
1669 | - data_format='NWC', | ||
1670 | - scope='conv') | ||
1671 | - a = tf.nn.softmax(a) | ||
1672 | - v = [] | ||
1673 | - for k in range(K): | ||
1674 | - t = x-c[k][None, :, None] | ||
1675 | - t = tf.multiply(t, a[:,:,k][:,:,None]) | ||
1676 | - t = tf.reduce_sum(t, 1) | ||
1677 | - t = tf.nn.l2_normalize(t, dim=1) | ||
1678 | - v.append(t) | ||
1679 | - v = tf.stack(v, axis=1) | ||
1680 | - v = tf.reshape(v, [-1, K*D]) | ||
1681 | - | ||
1682 | - proj_weights = tf.get_variable("proj_weights", | ||
1683 | - [K*D, 1024], | ||
1684 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(K*D))) | ||
1685 | - activation_a = tf.matmul(v, proj_weights) | ||
1686 | - | ||
1687 | - activation = tf.concat([activation_v, activation_a], axis=1) | ||
1688 | - | ||
1689 | - activation = slim.batch_norm( | ||
1690 | - activation, | ||
1691 | - center=True, | ||
1692 | - scale=True, | ||
1693 | - is_training=True, | ||
1694 | - scope='proj') | ||
1695 | - | ||
1696 | - activation = tf.nn.relu6(activation) | ||
1697 | - | ||
1698 | - aggregated_model = getattr(video_level_models, | ||
1699 | - FLAGS.video_level_classifier_model) | ||
1700 | - | ||
1701 | - return aggregated_model().create_model( | ||
1702 | - model_input=activation, | ||
1703 | - vocab_size=vocab_size, | ||
1704 | - **unused_params) | ||
1705 | - | ||
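Both towers of LateVladModel compute the same soft-assignment, VLAD-style descriptor: the 1-D convolution plus softmax produces per-frame cluster weights a_k(x_t), and each of the K clusters accumulates weighted residuals that are l2-normalized and concatenated before the linear projection:

    v_k = \mathrm{l2norm}\!\Big(\sum_{t=1}^{N} a_k(x_t)\,(x_t - c_{k,t})\Big), \qquad
    V = [v_1; \dots; v_K] \in \mathbb{R}^{KD}

Note one departure from standard NetVLAD: the learned centers c have shape [K, N] (one scalar per cluster per frame position, broadcast across the feature dimension by c[k][None, :, None]) rather than [K, D] in feature space.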
1706 | -class LNBLstmModel(models.BaseModel): | ||
1707 | - | ||
1708 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
1709 | - """Creates a model which uses a stack of LSTMs to represent the video. | ||
1710 | - Args: | ||
1711 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1712 | - input features. | ||
1713 | - vocab_size: The number of classes in the dataset. | ||
1714 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1715 | - frames for each video (before padding). | ||
1716 | - Returns: | ||
1717 | - A dictionary with a tensor containing the probability predictions of the | ||
1718 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1719 | - 'batch_size' x 'num_classes'. | ||
1720 | - """ | ||
1721 | - lstm_size = FLAGS.lstm_cells | ||
1722 | - number_of_layers = FLAGS.lstm_layers | ||
1723 | - | ||
1724 | - stacked_lstm = tf.contrib.rnn.MultiRNNCell( | ||
1725 | - [ | ||
1726 | - tf.contrib.rnn.LayerNormBasicLSTMCell(lstm_size, dropout_keep_prob=0.50) | ||
1727 | - for _ in range(number_of_layers) | ||
1728 | - ]) | ||
1729 | - | ||
1731 | - | ||
1732 | - outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, | ||
1733 | - sequence_length=num_frames, | ||
1734 | - dtype=tf.float32) | ||
1735 | - | ||
1736 | - aggregated_model = getattr(video_level_models, | ||
1737 | - FLAGS.video_level_classifier_model) | ||
1738 | - | ||
1739 | - return aggregated_model().create_model( | ||
1740 | - model_input=state[-1].h, | ||
1741 | - vocab_size=vocab_size, | ||
1742 | - **unused_params) | ||
1743 | - | ||
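LayerNormBasicLSTMCell differs from the plain LSTM stacks elsewhere in this file in two ways: every gate pre-activation is layer-normalized,

    \mathrm{LN}(z) = \gamma \odot \frac{z - \mu(z)}{\sigma(z)} + \beta,

with per-timestep statistics \mu, \sigma, and dropout_keep_prob=0.50 applies recurrent dropout to the candidate cell update g rather than to inputs or outputs (per the tf.contrib implementation, this dropout is not gated by is_training).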
1744 | -class audio_avgShort_twowayGRUModel(models.BaseModel): | ||
1745 | - | ||
1746 | - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): | ||
1747 | - """Creates a model which uses a Bidirectional GRU and mean audio features to represent the video. | ||
1748 | - ---->first half GRU-----> | ||
1749 | - - - | ||
1750 | - visual_feature ---- concat----------------> | ||
1751 | - - - - | ||
1752 | - ---->second half GRU----> concat -----> video level classifier | ||
1753 | - - | ||
1754 | - mean audio features---> | ||
1755 | - Args: | ||
1756 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1757 | - input features. | ||
1758 | - vocab_size: The number of classes in the dataset. | ||
1759 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1760 | - frames for each video (before padding). | ||
1761 | - Returns: | ||
1762 | - A dictionary with a tensor containing the probability predictions of the | ||
1763 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1764 | - 'batch_size' x 'num_classes'. | ||
1765 | - """ | ||
1766 | - lstm_size = FLAGS.lstm_cells | ||
1767 | - stride = FLAGS.stride | ||
1768 | - max_frames = model_input.get_shape().as_list()[1] | ||
1769 | - | ||
1770 | - video_input = model_input[:,:,:1024] | ||
1771 | - audio_input = model_input[:,:,1024:] | ||
1772 | - | ||
1773 | - first_num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
1774 | - audio_den = tf.reshape(tf.tile(first_num_frames, [1, 128]), [-1, 128]) | ||
1775 | - mean_audio = tf.reduce_sum(audio_input, 1) / tf.maximum(audio_den, 1) | ||
1776 | - | ||
1777 | - pooled_input, num_frames = self.avg_pooled_func(video_input, num_frames, stride) | ||
1778 | - | ||
1779 | - pooled_input = slim.batch_norm( | ||
1780 | - pooled_input, | ||
1781 | - center=True, | ||
1782 | - scale=True, | ||
1783 | - is_training=is_training, | ||
1784 | - scope="hidden1_bn") | ||
1785 | - | ||
1786 | - mean_audio = slim.batch_norm( | ||
1787 | - mean_audio, | ||
1788 | - center=True, | ||
1789 | - scale=True, | ||
1790 | - is_training=is_training, | ||
1791 | - scope="hidden1_bn_audio") | ||
1792 | - | ||
1793 | - fw_gru = tf.contrib.rnn.GRUCell(lstm_size) | ||
1794 | - bw_gru = tf.contrib.rnn.GRUCell(lstm_size) | ||
1795 | - | ||
1796 | - fw_outputs, fw_state = tf.nn.dynamic_rnn(fw_gru, pooled_input[:,:max_frames//(2*stride),:], | ||
1797 | - sequence_length=num_frames//2, dtype=tf.float32, scope='fw') | ||
1798 | - # second half of the pooled clip, reversed (read end -> middle) | ||
1798 | - bw_outputs, bw_state = tf.nn.dynamic_rnn(bw_gru, pooled_input[:,:max_frames//(2*stride)-1:-1,:], | ||
1799 | - sequence_length=num_frames - num_frames//2, dtype=tf.float32, scope='bw') | ||
1800 | - | ||
1801 | - state = tf.concat([fw_state, bw_state], 1) | ||
1802 | - state = tf.concat([state, mean_audio], 1) | ||
1803 | - | ||
1804 | - aggregated_model = getattr(video_level_models, | ||
1805 | - 'linear_res_mix_act_MoeModel') | ||
1806 | - | ||
1807 | - return aggregated_model().create_model( | ||
1808 | - model_input=state, | ||
1809 | - vocab_size=vocab_size, | ||
1810 | - **unused_params) | ||
1811 | - | ||
1812 | - def avg_pooled_func(self, model_input, num_frames_in, stride): | ||
1813 | - max_frames = model_input.get_shape().as_list()[1] | ||
1814 | - feature_size = model_input.get_shape().as_list()[2] | ||
1815 | - num_frames = num_frames_in // stride | ||
1816 | - step = max_frames//stride | ||
1817 | - | ||
1818 | - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size]) | ||
1819 | - first_layer_input = tf.reduce_sum(first_layer_input, 1) | ||
1820 | - | ||
1821 | - first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1),2), tf.float32) | ||
1822 | - denominators = tf.reshape( | ||
1823 | - tf.tile(first_num_frames, [1, step, feature_size]), [-1, step, feature_size]) | ||
1824 | - first_layer_avg_pooled = first_layer_input / tf.maximum(denominators,1) | ||
1825 | - | ||
1826 | - return first_layer_avg_pooled, num_frames | ||
1827 | - | ||
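It is worth spelling out what avg_pooled_func actually averages. Reshaping the max_frames axis to [stride, step] in row-major order places frame s*step + t at position (s, t), so reducing over the stride axis mixes frames that are step apart (a "comb" over the whole clip), not stride consecutive frames; dividing by num_frames_in // stride then approximates the number of valid (unpadded) frames landing in each slot. A tiny NumPy sketch of the indexing (hypothetical helper, single video, no padding handling):

    import numpy as np

    def comb_pool(frames, stride):
        # frames: [max_frames, feature_size]; max_frames divisible by stride.
        max_frames, feature_size = frames.shape
        step = max_frames // stride
        # Position (s, t) holds frame s*step + t, so averaging over axis 0
        # combines frames spaced `step` apart, not a contiguous window.
        return frames.reshape(stride, step, feature_size).mean(axis=0)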
1828 | - | ||
1829 | -class resav_ConvModel(models.BaseModel): | ||
1830 | - | ||
1831 | - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): | ||
1832 | - """Creates a model which uses a Convolutional model to represent the video. | ||
1833 | - Args: | ||
1834 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1835 | - input features. | ||
1836 | - vocab_size: The number of classes in the dataset. | ||
1837 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1838 | - frames for each video (before padding). | ||
1839 | - Returns: | ||
1840 | - A dictionary with a tensor containing the probability predictions of the | ||
1841 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1842 | - 'batch_size' x 'num_classes'. | ||
1843 | - """ | ||
1844 | - stride = FLAGS.stride | ||
1845 | - conv_length = FLAGS.conv_length | ||
1846 | - conv_hidden1 = FLAGS.conv_hidden1 | ||
1847 | - conv_hidden2 = FLAGS.conv_hidden2 | ||
1848 | - conv_hidden3 = FLAGS.conv_hidden3 | ||
1849 | - mean_feature = tf.reduce_mean(model_input, 1) | ||
1850 | - feature_size = model_input.get_shape().as_list()[2] | ||
1851 | - | ||
1852 | - pooled_input = self.avg_pooled_func(model_input, stride) | ||
1853 | - | ||
1854 | - # To shape : 'batch_size' x 'max_frames' x 1 x 'num_features' | ||
1855 | - input_expand = tf.expand_dims(pooled_input, -1) | ||
1856 | - input_expand = tf.transpose(input_expand, [0,1,3,2]) | ||
1857 | - | ||
1858 | - # conv_out : batch_size x max_frames-conv_length x 1 x conv_hidden | ||
1859 | - conv_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_1_1') | ||
1860 | - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_1_1")) | ||
1861 | - conv_out = slim.conv2d(conv_out, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_1_2') | ||
1862 | - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_1_2") | ||
1863 | - res_out = slim.conv2d(input_expand, conv_hidden1, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_1_1') | ||
1864 | - res_out = res_out + conv_out | ||
1865 | - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool1') | ||
1866 | - | ||
1867 | - conv_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_2_1') | ||
1868 | - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_2_1")) | ||
1869 | - conv_out = slim.conv2d(conv_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_2_2') | ||
1870 | - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_2_2") | ||
1871 | - res_out = slim.conv2d(res_out, conv_hidden2, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_2_1') | ||
1872 | - res_out = res_out + conv_out | ||
1873 | - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool2') | ||
1874 | - | ||
1875 | - conv_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_3_1') | ||
1876 | - conv_out = tf.nn.relu(slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_3_1")) | ||
1877 | - conv_out = slim.conv2d(conv_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='conv_3_2') | ||
1878 | - conv_out = slim.batch_norm(conv_out, center=True, scale=True, is_training=is_training, scope="bn_3_2") | ||
1879 | - res_out = slim.conv2d(res_out, conv_hidden3, [conv_length, 1], activation_fn=None, padding= 'SAME', scope='xconv_3_1') | ||
1880 | - res_out = res_out + conv_out | ||
1881 | - res_out = slim.max_pool2d(res_out, [2,1], [2,1], scope='max_pool3') | ||
1882 | - | ||
1883 | - # flatten the conv stack output: [batch, time, 1, channels] -> [batch, -1] | ||
1883 | - _, a, b, c = res_out.get_shape().as_list() | ||
1885 | - | ||
1889 | - res_out = tf.reshape(res_out, [-1, a*b*c]) | ||
1890 | - | ||
1891 | - state = tf.concat([res_out, mean_feature], 1) | ||
1892 | - | ||
1893 | - aggregated_model = getattr(video_level_models, | ||
1894 | - 'linear_res_mix_act_MoeModel') | ||
1895 | - return aggregated_model().create_model( | ||
1896 | - model_input=state, | ||
1897 | - vocab_size=vocab_size, | ||
1898 | - **unused_params) | ||
1899 | - | ||
1900 | - def avg_pooled_func(self, model_input, stride): | ||
1901 | - max_frames = model_input.get_shape().as_list()[1] | ||
1902 | - feature_size = model_input.get_shape().as_list()[2] | ||
1903 | - step = max_frames//stride | ||
1904 | - | ||
1905 | - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size]) | ||
1906 | - first_layer_input = tf.reduce_mean(first_layer_input, 1) | ||
1907 | - | ||
1908 | - return first_layer_input | ||
1909 | - | ||
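Each of the three blocks in resav_ConvModel is a projection-shortcut residual unit over the (pooled) time axis, followed by 2x temporal max-pooling; schematically, with BN denoting batch norm,

    y = \mathrm{MaxPool}_{2\times 1}\big(\,\mathrm{Conv}_{\mathrm{proj}}(x) + \mathrm{BN}(\mathrm{Conv}_2(\mathrm{ReLU}(\mathrm{BN}(\mathrm{Conv}_1(x)))))\,\big).

The shortcut is itself a convolution (the xconv_* scopes) because the channel count changes from block to block, so an identity skip would not match shapes.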
1910 | -class pur_twowayGRUModel(models.BaseModel): | ||
1911 | - | ||
1912 | - def create_model(self, model_input, vocab_size, num_frames, is_training=True, **unused_params): | ||
1913 | - """Creates a model which uses a Bidirectional GRU without explictly using mean audio feature to represent the video. | ||
1914 | - ---->first half GRU-----> | ||
1915 | - - - | ||
1916 | - video_feature ---- concat---------------->video level classifier | ||
1917 | - - - | ||
1918 | - ---->second half GRU----> | ||
1919 | - Args: | ||
1920 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
1921 | - input features. | ||
1922 | - vocab_size: The number of classes in the dataset. | ||
1923 | - num_frames: A vector of length 'batch' which indicates the number of | ||
1924 | - frames for each video (before padding). | ||
1925 | - Returns: | ||
1926 | - A dictionary with a tensor containing the probability predictions of the | ||
1927 | - model in the 'predictions' key. The dimensions of the tensor are | ||
1928 | - 'batch_size' x 'num_classes'. | ||
1929 | - """ | ||
1930 | - lstm_size = FLAGS.lstm_cells | ||
1931 | - number_of_layers = FLAGS.lstm_layers | ||
1932 | - stride = FLAGS.stride | ||
1933 | - max_frames = model_input.get_shape().as_list()[1] | ||
1934 | - | ||
1935 | - pooled_input, num_frames = self.avg_pooled_func(model_input, num_frames, stride) | ||
1936 | - | ||
1937 | - pooled_input = slim.batch_norm( | ||
1938 | - pooled_input, | ||
1939 | - center=True, | ||
1940 | - scale=True, | ||
1941 | - is_training=is_training, | ||
1942 | - scope="hidden1_bn") | ||
1943 | - | ||
1944 | - | ||
1945 | - fw_gru = tf.contrib.rnn.GRUCell(lstm_size) | ||
1946 | - bw_gru = tf.contrib.rnn.GRUCell(lstm_size) | ||
1947 | - | ||
1948 | - fw_outputs, fw_state = tf.nn.dynamic_rnn(fw_gru, pooled_input[:,:max_frames//(2*stride),:], | ||
1949 | - sequence_length=num_frames//2, dtype=tf.float32, scope='fw') | ||
1950 | - # second half of the pooled clip, reversed (read end -> middle) | ||
1950 | - bw_outputs, bw_state = tf.nn.dynamic_rnn(bw_gru, pooled_input[:,:max_frames//(2*stride)-1:-1,:], | ||
1951 | - sequence_length=num_frames - num_frames//2, dtype=tf.float32, scope='bw') | ||
1952 | - | ||
1953 | - state = tf.concat([fw_state, bw_state], 1) | ||
1954 | - | ||
1955 | - aggregated_model = getattr(video_level_models, | ||
1956 | - 'linear_res_mix_act_MoeModel') | ||
1957 | - | ||
1958 | - return aggregated_model().create_model( | ||
1959 | - model_input=state, | ||
1960 | - vocab_size=vocab_size, | ||
1961 | - **unused_params) | ||
1962 | - | ||
1963 | - def avg_pooled_func(self, model_input, num_frames_in, stride): | ||
1964 | - max_frames = model_input.get_shape().as_list()[1] | ||
1965 | - feature_size = model_input.get_shape().as_list()[2] | ||
1966 | - num_frames = num_frames_in // stride | ||
1967 | - step = max_frames//stride | ||
1968 | - | ||
1969 | - first_layer_input = tf.reshape(model_input, [-1, stride, step, feature_size]) | ||
1970 | - first_layer_input = tf.reduce_sum(first_layer_input, 1) | ||
1971 | - | ||
1972 | - first_num_frames = tf.cast(tf.expand_dims(tf.expand_dims(num_frames, 1),2), tf.float32) | ||
1973 | - denominators = tf.reshape( | ||
1974 | - tf.tile(first_num_frames, [1, step, feature_size]), [-1, step, feature_size]) | ||
1975 | - first_layer_avg_pooled = first_layer_input / tf.maximum(denominators,1) | ||
1976 | - | ||
1977 | - return first_layer_avg_pooled, num_frames | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
328 | + | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
... | @@ -136,9 +136,7 @@ class MoeModel(models.BaseModel): | ... | @@ -136,9 +136,7 @@ class MoeModel(models.BaseModel): |
136 | gating_distribution[:, :num_mixtures] * expert_distribution, 1) | 136 | gating_distribution[:, :num_mixtures] * expert_distribution, 1) |
137 | final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, | 137 | final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, |
138 | [-1, vocab_size]) | 138 | [-1, vocab_size]) |
139 | - | 139 | + |
140 | - | ||
141 | - print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@", final_probabilities_by_class_and_batch) | ||
142 | return {"predictions": final_probabilities} | 140 | return {"predictions": final_probabilities} |
143 | 141 | ||
144 | 142 | ||
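For context on the hunk above, which only drops a stray debug print: per class, the MoE head mixes num_mixtures logistic experts under a softmax gate with one extra, untrained column reserved for a dummy expert that always predicts 0,

    p(c \mid x) = \sum_{m=1}^{M} \mathrm{softmax}(W_g x)_{m}\; \sigma(w_{e,m}^{\top} x),

which is why the code softmaxes over num_mixtures + 1 gate logits but sums only gating_distribution[:, :num_mixtures].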
... | @@ -251,482 +249,4 @@ class willow_MoeModel(models.BaseModel): | ... | @@ -251,482 +249,4 @@ class willow_MoeModel(models.BaseModel): |
251 | 249 | ||
252 | probabilities = tf.multiply(probabilities, gates) | 250 | probabilities = tf.multiply(probabilities, gates) |
253 | 251 | ||
254 | - return {"predictions": probabilities} | ||
255 | - | ||
256 | -class willow_MoeModel_moe4(models.BaseModel): | ||
257 | - """A softmax over a mixture of logistic models (with L2 regularization).""" | ||
258 | - | ||
259 | - def create_model(self, | ||
260 | - model_input, | ||
261 | - vocab_size, | ||
262 | - is_training, | ||
263 | - num_mixtures=None, | ||
264 | - l2_penalty=1e-8, | ||
265 | - **unused_params): | ||
266 | - """Creates a Mixture of (Logistic) Experts model. | ||
267 | - It also includes the possibility of gating the probabilities | ||
268 | - The model consists of a per-class softmax distribution over a | ||
269 | - configurable number of logistic classifiers. One of the classifiers in the | ||
270 | - mixture is not trained, and always predicts 0. | ||
271 | - Args: | ||
272 | - model_input: 'batch_size' x 'num_features' matrix of input features. | ||
273 | - vocab_size: The number of classes in the dataset. | ||
274 | - is_training: Is this the training phase? | ||
275 | - num_mixtures: The number of mixtures (excluding a dummy 'expert' that | ||
276 | - always predicts the non-existence of an entity). | ||
277 | - l2_penalty: How much to penalize the squared magnitudes of parameter | ||
278 | - values. | ||
279 | - Returns: | ||
280 | - A dictionary with a tensor containing the probability predictions of the | ||
281 | - model in the 'predictions' key. The dimensions of the tensor are | ||
282 | - batch_size x num_classes. | ||
283 | - """ | ||
284 | - num_mixtures = 4 | ||
285 | - low_rank_gating = FLAGS.moe_low_rank_gating | ||
286 | - l2_penalty = FLAGS.moe_l2 | ||
287 | - gating_probabilities = FLAGS.moe_prob_gating | ||
288 | - gating_input = FLAGS.moe_prob_gating_input | ||
289 | - | ||
290 | - input_size = model_input.get_shape().as_list()[1] | ||
291 | - remove_diag = False | ||
292 | - | ||
293 | - if low_rank_gating == -1: | ||
294 | - gate_activations = slim.fully_connected( | ||
295 | - model_input, | ||
296 | - vocab_size * (num_mixtures + 1), | ||
297 | - activation_fn=None, | ||
298 | - biases_initializer=None, | ||
299 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
300 | - scope="gates") | ||
301 | - else: | ||
302 | - gate_activations1 = slim.fully_connected( | ||
303 | - model_input, | ||
304 | - low_rank_gating, | ||
305 | - activation_fn=None, | ||
306 | - biases_initializer=None, | ||
307 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
308 | - scope="gates1") | ||
309 | - gate_activations = slim.fully_connected( | ||
310 | - gate_activations1, | ||
311 | - vocab_size * (num_mixtures + 1), | ||
312 | - activation_fn=None, | ||
313 | - biases_initializer=None, | ||
314 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
315 | - scope="gates2") | ||
316 | - | ||
317 | - expert_activations = slim.fully_connected( | ||
318 | - model_input, | ||
319 | - vocab_size * num_mixtures, | ||
320 | - activation_fn=None, | ||
321 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
322 | - scope="experts") | ||
323 | - | ||
324 | - gating_distribution = tf.nn.softmax(tf.reshape( | ||
325 | - gate_activations, | ||
326 | - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1) | ||
327 | - expert_distribution = tf.nn.sigmoid(tf.reshape( | ||
328 | - expert_activations, | ||
329 | - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures | ||
330 | - | ||
331 | - probabilities_by_class_and_batch = tf.reduce_sum( | ||
332 | - gating_distribution[:, :num_mixtures] * expert_distribution, 1) | ||
333 | - probabilities = tf.reshape(probabilities_by_class_and_batch, | ||
334 | - [-1, vocab_size]) | ||
335 | - | ||
336 | - if gating_probabilities: | ||
337 | - if gating_input == 'prob': | ||
338 | - gating_weights = tf.get_variable("gating_prob_weights", | ||
339 | - [vocab_size, vocab_size], | ||
340 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size))) | ||
341 | - gates = tf.matmul(probabilities, gating_weights) | ||
342 | - else: | ||
343 | - gating_weights = tf.get_variable("gating_prob_weights", | ||
344 | - [input_size, vocab_size], | ||
345 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size))) | ||
346 | - | ||
347 | - gates = tf.matmul(model_input, gating_weights) | ||
348 | - | ||
349 | - if remove_diag: | ||
350 | - # removes diagonals coefficients | ||
351 | - diagonals = tf.matrix_diag_part(gating_weights) | ||
352 | - gates = gates - tf.multiply(diagonals, probabilities) | ||
353 | - | ||
354 | - gates = slim.batch_norm( | ||
355 | - gates, | ||
356 | - center=True, | ||
357 | - scale=True, | ||
358 | - is_training=is_training, | ||
359 | - scope="gating_prob_bn") | ||
360 | - | ||
361 | - gates = tf.sigmoid(gates) | ||
362 | - | ||
363 | - probabilities = tf.multiply(probabilities, gates) | ||
364 | - | ||
365 | - return {"predictions": probabilities} | ||
366 | - | ||
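The gating_probabilities branch shared by these willow_MoeModel variants re-weights the MoE output with a learned, batch-normalized sigmoid gate. With z either the probabilities themselves (moe_prob_gating_input == 'prob') or the raw model input,

    p' = p \odot \sigma\big(\mathrm{BN}(W_{\mathrm{gate}}\, z)\big),

while the _noGP variants below simply skip this step (their gating_input and remove_diag locals are left unused).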
367 | -class willow_MoeModel_moe4_noGP(models.BaseModel): | ||
368 | - """A softmax over a mixture of logistic models (with L2 regularization).""" | ||
369 | - | ||
370 | - def create_model(self, | ||
371 | - model_input, | ||
372 | - vocab_size, | ||
373 | - is_training, | ||
374 | - num_mixtures=None, | ||
375 | - l2_penalty=1e-8, | ||
376 | - **unused_params): | ||
377 | - """Creates a Mixture of (Logistic) Experts model. | ||
378 | - It also includes the possibility of gating the probabilities | ||
379 | - The model consists of a per-class softmax distribution over a | ||
380 | - configurable number of logistic classifiers. One of the classifiers in the | ||
381 | - mixture is not trained, and always predicts 0. | ||
382 | - Args: | ||
383 | - model_input: 'batch_size' x 'num_features' matrix of input features. | ||
384 | - vocab_size: The number of classes in the dataset. | ||
385 | - is_training: Is this the training phase? | ||
386 | - num_mixtures: The number of mixtures (excluding a dummy 'expert' that | ||
387 | - always predicts the non-existence of an entity). | ||
388 | - l2_penalty: How much to penalize the squared magnitudes of parameter | ||
389 | - values. | ||
390 | - Returns: | ||
391 | - A dictionary with a tensor containing the probability predictions of the | ||
392 | - model in the 'predictions' key. The dimensions of the tensor are | ||
393 | - batch_size x num_classes. | ||
394 | - """ | ||
395 | - num_mixtures = 4 | ||
396 | - low_rank_gating = FLAGS.moe_low_rank_gating | ||
397 | - l2_penalty = FLAGS.moe_l2 | ||
398 | - gating_probabilities = False | ||
399 | - gating_input = FLAGS.moe_prob_gating_input | ||
400 | - | ||
401 | - input_size = model_input.get_shape().as_list()[1] | ||
402 | - remove_diag = False | ||
403 | - | ||
404 | - if low_rank_gating == -1: | ||
405 | - gate_activations = slim.fully_connected( | ||
406 | - model_input, | ||
407 | - vocab_size * (num_mixtures + 1), | ||
408 | - activation_fn=None, | ||
409 | - biases_initializer=None, | ||
410 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
411 | - scope="gates") | ||
412 | - else: | ||
413 | - gate_activations1 = slim.fully_connected( | ||
414 | - model_input, | ||
415 | - low_rank_gating, | ||
416 | - activation_fn=None, | ||
417 | - biases_initializer=None, | ||
418 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
419 | - scope="gates1") | ||
420 | - gate_activations = slim.fully_connected( | ||
421 | - gate_activations1, | ||
422 | - vocab_size * (num_mixtures + 1), | ||
423 | - activation_fn=None, | ||
424 | - biases_initializer=None, | ||
425 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
426 | - scope="gates2") | ||
427 | - | ||
428 | - expert_activations = slim.fully_connected( | ||
429 | - model_input, | ||
430 | - vocab_size * num_mixtures, | ||
431 | - activation_fn=None, | ||
432 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
433 | - scope="experts") | ||
434 | - | ||
435 | - gating_distribution = tf.nn.softmax(tf.reshape( | ||
436 | - gate_activations, | ||
437 | - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1) | ||
438 | - expert_distribution = tf.nn.sigmoid(tf.reshape( | ||
439 | - expert_activations, | ||
440 | - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures | ||
441 | - | ||
442 | - probabilities_by_class_and_batch = tf.reduce_sum( | ||
443 | - gating_distribution[:, :num_mixtures] * expert_distribution, 1) | ||
444 | - probabilities = tf.reshape(probabilities_by_class_and_batch, | ||
445 | - [-1, vocab_size]) | ||
446 | - | ||
447 | - return {"predictions": probabilities} | ||
448 | - | ||
449 | -class willow_MoeModel_moe2_noGP(models.BaseModel): | ||
450 | - """A softmax over a mixture of logistic models (with L2 regularization).""" | ||
451 | - | ||
452 | - def create_model(self, | ||
453 | - model_input, | ||
454 | - vocab_size, | ||
455 | - is_training, | ||
456 | - num_mixtures=None, | ||
457 | - l2_penalty=1e-8, | ||
458 | - **unused_params): | ||
459 | - """Creates a Mixture of (Logistic) Experts model. | ||
460 | - It also includes the possibility of gating the probabilities | ||
461 | - The model consists of a per-class softmax distribution over a | ||
462 | - configurable number of logistic classifiers. One of the classifiers in the | ||
463 | - mixture is not trained, and always predicts 0. | ||
464 | - Args: | ||
465 | - model_input: 'batch_size' x 'num_features' matrix of input features. | ||
466 | - vocab_size: The number of classes in the dataset. | ||
467 | - is_training: Is this the training phase? | ||
468 | - num_mixtures: The number of mixtures (excluding a dummy 'expert' that | ||
469 | - always predicts the non-existence of an entity). | ||
470 | - l2_penalty: How much to penalize the squared magnitudes of parameter | ||
471 | - values. | ||
472 | - Returns: | ||
473 | - A dictionary with a tensor containing the probability predictions of the | ||
474 | - model in the 'predictions' key. The dimensions of the tensor are | ||
475 | - batch_size x num_classes. | ||
476 | - """ | ||
477 | - num_mixtures = 2 | ||
478 | - low_rank_gating = FLAGS.moe_low_rank_gating | ||
479 | - l2_penalty = FLAGS.moe_l2 | ||
480 | - gating_probabilities = False | ||
481 | - gating_input = FLAGS.moe_prob_gating_input | ||
482 | - | ||
483 | - input_size = model_input.get_shape().as_list()[1] | ||
484 | - remove_diag = False | ||
485 | - | ||
486 | - if low_rank_gating == -1: | ||
487 | - gate_activations = slim.fully_connected( | ||
488 | - model_input, | ||
489 | - vocab_size * (num_mixtures + 1), | ||
490 | - activation_fn=None, | ||
491 | - biases_initializer=None, | ||
492 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
493 | - scope="gates") | ||
494 | - else: | ||
495 | - gate_activations1 = slim.fully_connected( | ||
496 | - model_input, | ||
497 | - low_rank_gating, | ||
498 | - activation_fn=None, | ||
499 | - biases_initializer=None, | ||
500 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
501 | - scope="gates1") | ||
502 | - gate_activations = slim.fully_connected( | ||
503 | - gate_activations1, | ||
504 | - vocab_size * (num_mixtures + 1), | ||
505 | - activation_fn=None, | ||
506 | - biases_initializer=None, | ||
507 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
508 | - scope="gates2") | ||
509 | - | ||
510 | - expert_activations = slim.fully_connected( | ||
511 | - model_input, | ||
512 | - vocab_size * num_mixtures, | ||
513 | - activation_fn=None, | ||
514 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
515 | - scope="experts") | ||
516 | - | ||
517 | - gating_distribution = tf.nn.softmax(tf.reshape( | ||
518 | - gate_activations, | ||
519 | - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1) | ||
520 | - expert_distribution = tf.nn.sigmoid(tf.reshape( | ||
521 | - expert_activations, | ||
522 | - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures | ||
523 | - | ||
524 | - probabilities_by_class_and_batch = tf.reduce_sum( | ||
525 | - gating_distribution[:, :num_mixtures] * expert_distribution, 1) | ||
526 | - probabilities = tf.reshape(probabilities_by_class_and_batch, | ||
527 | - [-1, vocab_size]) | ||
528 | - | ||
529 | - return {"predictions": probabilities} | ||
530 | - | ||
531 | - | ||
532 | -class willow_MoeModel_moe2(models.BaseModel): | ||
533 | - """A softmax over a mixture of logistic models (with L2 regularization).""" | ||
534 | - | ||
535 | - def create_model(self, | ||
536 | - model_input, | ||
537 | - vocab_size, | ||
538 | - is_training, | ||
539 | - num_mixtures=None, | ||
540 | - l2_penalty=1e-8, | ||
541 | - **unused_params): | ||
542 | - """Creates a Mixture of (Logistic) Experts model. | ||
543 | - It also includes the possibility of gating the probabilities | ||
544 | - The model consists of a per-class softmax distribution over a | ||
545 | - configurable number of logistic classifiers. One of the classifiers in the | ||
546 | - mixture is not trained, and always predicts 0. | ||
547 | - Args: | ||
548 | - model_input: 'batch_size' x 'num_features' matrix of input features. | ||
549 | - vocab_size: The number of classes in the dataset. | ||
550 | - is_training: Is this the training phase? | ||
551 | - num_mixtures: The number of mixtures (excluding a dummy 'expert' that | ||
552 | - always predicts the non-existence of an entity). | ||
553 | - l2_penalty: How much to penalize the squared magnitudes of parameter | ||
554 | - values. | ||
555 | - Returns: | ||
556 | - A dictionary with a tensor containing the probability predictions of the | ||
557 | - model in the 'predictions' key. The dimensions of the tensor are | ||
558 | - batch_size x num_classes. | ||
559 | - """ | ||
560 | - num_mixtures = 2 | ||
561 | - low_rank_gating = FLAGS.moe_low_rank_gating | ||
562 | - l2_penalty = FLAGS.moe_l2 | ||
563 | - gating_probabilities = FLAGS.moe_prob_gating | ||
564 | - gating_input = FLAGS.moe_prob_gating_input | ||
565 | - | ||
566 | - input_size = model_input.get_shape().as_list()[1] | ||
567 | - remove_diag = False | ||
568 | - | ||
569 | - if low_rank_gating == -1: | ||
570 | - gate_activations = slim.fully_connected( | ||
571 | - model_input, | ||
572 | - vocab_size * (num_mixtures + 1), | ||
573 | - activation_fn=None, | ||
574 | - biases_initializer=None, | ||
575 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
576 | - scope="gates") | ||
577 | - else: | ||
578 | - gate_activations1 = slim.fully_connected( | ||
579 | - model_input, | ||
580 | - low_rank_gating, | ||
581 | - activation_fn=None, | ||
582 | - biases_initializer=None, | ||
583 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
584 | - scope="gates1") | ||
585 | - gate_activations = slim.fully_connected( | ||
586 | - gate_activations1, | ||
587 | - vocab_size * (num_mixtures + 1), | ||
588 | - activation_fn=None, | ||
589 | - biases_initializer=None, | ||
590 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
591 | - scope="gates2") | ||
592 | - | ||
593 | - expert_activations = slim.fully_connected( | ||
594 | - model_input, | ||
595 | - vocab_size * num_mixtures, | ||
596 | - activation_fn=None, | ||
597 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
598 | - scope="experts") | ||
599 | - | ||
600 | - gating_distribution = tf.nn.softmax(tf.reshape( | ||
601 | - gate_activations, | ||
602 | - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1) | ||
603 | - expert_distribution = tf.nn.sigmoid(tf.reshape( | ||
604 | - expert_activations, | ||
605 | - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures | ||
606 | - | ||
607 | - probabilities_by_class_and_batch = tf.reduce_sum( | ||
608 | - gating_distribution[:, :num_mixtures] * expert_distribution, 1) | ||
609 | - probabilities = tf.reshape(probabilities_by_class_and_batch, | ||
610 | - [-1, vocab_size]) | ||
611 | - | ||
612 | - if gating_probabilities: | ||
613 | - if gating_input == 'prob': | ||
614 | - gating_weights = tf.get_variable("gating_prob_weights", | ||
615 | - [vocab_size, vocab_size], | ||
616 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size))) | ||
617 | - gates = tf.matmul(probabilities, gating_weights) | ||
618 | - else: | ||
619 | - gating_weights = tf.get_variable("gating_prob_weights", | ||
620 | - [input_size, vocab_size], | ||
621 | - initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size))) | ||
622 | - | ||
623 | - gates = tf.matmul(model_input, gating_weights) | ||
624 | - | ||
625 | - if remove_diag: | ||
626 | - # removes diagonals coefficients | ||
627 | - diagonals = tf.matrix_diag_part(gating_weights) | ||
628 | - gates = gates - tf.multiply(diagonals, probabilities) | ||
629 | - | ||
630 | - gates = slim.batch_norm( | ||
631 | - gates, | ||
632 | - center=True, | ||
633 | - scale=True, | ||
634 | - is_training=is_training, | ||
635 | - scope="gating_prob_bn") | ||
636 | - | ||
637 | - gates = tf.sigmoid(gates) | ||
638 | - | ||
639 | - probabilities = tf.multiply(probabilities, gates) | ||
640 | - | ||
641 | - return {"predictions": probabilities} | ||
642 | - | ||
643 | -class linear_res_mix_act_MoeModel(models.BaseModel): | ||
644 | - """A softmax over a mixture of logistic models (with L2 regularization). | ||
645 | - | ||
646 | - -----linear_layers(1) + sigmoid activation------------- | ||
647 | - - - | ||
648 | - -----linear_layers(2) + relu activation---------------- | ||
649 | - - - | ||
650 | - input_features ----- -------moe-----output | ||
651 | - - - | ||
652 | - -----linear_layers(3) + elu activation----------------- | ||
653 | - - - | ||
654 | - -----linear_layers(4) + tanh activation---------------- | ||
655 | - """ | ||
656 | - def create_model(self, | ||
657 | - model_input, | ||
658 | - vocab_size, | ||
659 | - num_mixtures=None, | ||
660 | - num_hiddens=None, | ||
661 | - num_maxout=None, | ||
662 | - l2_penalty=1e-8, | ||
663 | - **unused_params): | ||
664 | - | ||
665 | - num_mixtures = num_mixtures or FLAGS.moe_num_mixtures | ||
666 | - num_hiddens = num_hiddens or FLAGS.moe_num_hiddens | ||
667 | - num_maxout = num_maxout or FLAGS.num_maxout | ||
668 | - | ||
669 | - hidden_sigmoid = slim.fully_connected( | ||
670 | - model_input, | ||
671 | - num_hiddens, | ||
672 | - activation_fn=tf.nn.sigmoid, | ||
673 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
674 | - scope='hidden_sigmoid' | ||
675 | - ) | ||
676 | - hidden_relu = slim.fully_connected( | ||
677 | - model_input, | ||
678 | - num_hiddens, | ||
679 | - activation_fn=tf.nn.relu, | ||
680 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
681 | - scope='hidden_relu' | ||
682 | - ) | ||
683 | - hidden_elu = slim.fully_connected( | ||
684 | - model_input, | ||
685 | - num_hiddens, | ||
686 | - activation_fn=tf.nn.elu, | ||
687 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
688 | - scope='hidden_elu' | ||
689 | - ) | ||
690 | - hidden_tanh = slim.fully_connected( | ||
691 | - model_input, | ||
692 | - num_hiddens, | ||
693 | - activation_fn=tf.nn.tanh, | ||
694 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
695 | - scope='hidden_tanh' | ||
696 | - ) | ||
697 | - | ||
698 | - linear_input = slim.fully_connected( | ||
699 | - model_input, | ||
700 | - num_hiddens, | ||
701 | - activation_fn=None, | ||
702 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
703 | - scope='hidden_linear' | ||
704 | - ) | ||
705 | - | ||
706 | - | ||
707 | - gate_activations = slim.fully_connected( | ||
708 | - model_input, | ||
709 | - vocab_size * (num_mixtures + 1), | ||
710 | - activation_fn=None, | ||
711 | - biases_initializer=None, | ||
712 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
713 | - scope="gates") | ||
714 | - expert_activations = slim.fully_connected( | ||
715 | - tf.concat([hidden_sigmoid+0.25*linear_input, hidden_relu+0.25*linear_input, hidden_elu+0.25*linear_input, hidden_tanh+0.25*linear_input], 1), | ||
716 | - vocab_size * num_mixtures, | ||
717 | - activation_fn=None, | ||
718 | - weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
719 | - scope="experts") | ||
720 | - | ||
721 | - gating_distribution = tf.nn.softmax(tf.reshape( | ||
722 | - gate_activations, | ||
723 | - [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1) | ||
724 | - expert_distribution = tf.nn.sigmoid(tf.reshape( | ||
725 | - expert_activations, | ||
726 | - [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures | ||
727 | - | ||
728 | - final_probabilities_by_class_and_batch = tf.reduce_sum( | ||
729 | - gating_distribution[:, :num_mixtures] * expert_distribution, 1) | ||
730 | - final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, | ||
731 | - [-1, vocab_size]) | ||
732 | - return {"predictions": final_probabilities} | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
252 | + return {"predictions": probabilities} | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
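In the last deleted model above, linear_res_mix_act_MoeModel, the gates are computed from the raw input, while the expert input concatenates four differently-activated hidden layers, each carrying a 0.25-scaled copy of a shared linear projection L (the hidden_linear layer) as a residual:

    e(x) = \big[\,\sigma(W_1 x) + \tfrac{1}{4}Lx;\;\; \mathrm{relu}(W_2 x) + \tfrac{1}{4}Lx;\;\; \mathrm{elu}(W_3 x) + \tfrac{1}{4}Lx;\;\; \tanh(W_4 x) + \tfrac{1}{4}Lx\,\big].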
This file is too large to display.
보고서/최종보고서-윤영빈.pdf
0 → 100644
No preview for this file type