Showing 3 changed files with 418 additions and 39 deletions
... | @@ -20,8 +20,6 @@ | ||
20 | ![profit_hunter](/img/profit_hunter.png) | ||
21 | * Team name **Profit Hunter** | ||
22 | * 윤영빈 (Computer Engineering, 2015104192) | ||
23 | -* 윤준현 (Computer Engineering, 2015104193) | ||
24 | -* 이현규 (Computer Engineering, 2015104209) | ||
25 | * 이태현 (Computer Engineering, 2015104208) | ||
26 | | ||
27 | ## Links | ||
... | @@ -50,42 +50,6 @@ flags.DEFINE_integer("lstm_cells", 1024, "Number of LSTM cells.") | ||
50 | flags.DEFINE_integer("lstm_layers", 2, "Number of LSTM layers.") | ||
51 | | ||
52 | | ||
53 | -class FrameLevelLogisticModel(models.BaseModel): | ||
54 | - """Creates a logistic classifier over the aggregated frame-level features.""" | ||
55 | - | ||
56 | - def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
57 | - """See base class. | ||
58 | - | ||
59 | - This class is intended to be an example for implementors of frame level | ||
60 | - models. If you want to train a model over averaged features it is more | ||
61 | - efficient to average them beforehand rather than on the fly. | ||
62 | - | ||
63 | - Args: | ||
64 | - model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
65 | - input features. | ||
66 | - vocab_size: The number of classes in the dataset. | ||
67 | - num_frames: A vector of length 'batch' which indicates the number of | ||
68 | - frames for each video (before padding). | ||
69 | - | ||
70 | - Returns: | ||
71 | - A dictionary with a tensor containing the probability predictions of the | ||
72 | - model in the 'predictions' key. The dimensions of the tensor are | ||
73 | - 'batch_size' x 'num_classes'. | ||
74 | - """ | ||
75 | - num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
76 | - feature_size = model_input.get_shape().as_list()[2] | ||
77 | - | ||
78 | - denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]), | ||
79 | - [-1, feature_size]) | ||
80 | - avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators | ||
81 | - | ||
82 | - output = slim.fully_connected(avg_pooled, | ||
83 | - vocab_size, | ||
84 | - activation_fn=tf.nn.sigmoid, | ||
85 | - weights_regularizer=slim.l2_regularizer(1e-8)) | ||
86 | - return {"predictions": output} | ||
87 | - | ||
88 | - | ||
89 | class DbofModel(models.BaseModel): | ||
90 | """Creates a Deep Bag of Frames model. | ||
91 | | ||
... | @@ -239,7 +203,6 @@ class LstmModel(models.BaseModel): | ||
239 | tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0) | ||
240 | for _ in range(number_of_layers) | ||
241 | ]) | ||
242 | - | ||
243 | _, state = tf.nn.dynamic_rnn(stacked_lstm, | ||
244 | model_input, | ||
245 | sequence_length=num_frames, | ||
... | @@ -251,3 +214,300 @@ class LstmModel(models.BaseModel): | ||
251 | return aggregated_model().create_model(model_input=state[-1].h, | ||
252 | vocab_size=vocab_size, | ||
253 | **unused_params) | ||
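For context on the hunk above: with a MultiRNNCell, tf.nn.dynamic_rnn returns one LSTMStateTuple per stacked layer, so state[-1].h is the final hidden state of the top layer. A minimal TF 1.x sketch of that structure (the shapes are illustrative assumptions, not values taken from the repo):

    import tensorflow as tf

    cells = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(1024) for _ in range(2)])
    inputs = tf.placeholder(tf.float32, [None, 300, 1152])
    _, state = tf.nn.dynamic_rnn(cells, inputs, dtype=tf.float32)

    # state is a 2-tuple of LSTMStateTuple(c, h); state[-1].h has shape
    # [batch_size, 1024] and is what gets forwarded to the video-level model.
    top_hidden = state[-1].h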
217 | + | ||
218 | +class FrameLevelLogisticModel(models.BaseModel): | ||
219 | + """Creates a logistic classifier over the aggregated frame-level features.""" | ||
220 | + def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
221 | + """See base class. | ||
222 | + | ||
223 | + This class is intended to be an example for implementors of frame level | ||
224 | + models. If you want to train a model over averaged features it is more | ||
225 | + efficient to average them beforehand rather than on the fly. | ||
226 | + | ||
227 | + Args: | ||
228 | + model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
229 | + input features. | ||
230 | + vocab_size: The number of classes in the dataset. | ||
231 | + num_frames: A vector of length 'batch' which indicates the number of | ||
232 | + frames for each video (before padding). | ||
233 | + | ||
234 | + Returns: | ||
235 | + A dictionary with a tensor containing the probability predictions of the | ||
236 | + model in the 'predictions' key. The dimensions of the tensor are | ||
237 | + 'batch_size' x 'num_classes'. | ||
238 | + """ | ||
239 | + num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
240 | + feature_size = model_input.get_shape().as_list()[2] | ||
241 | + | ||
242 | + denominators = tf.reshape(tf.tile(num_frames, [1, feature_size]), | ||
243 | + [-1, feature_size]) | ||
244 | + avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators | ||
245 | + | ||
246 | + output = slim.fully_connected(avg_pooled, | ||
247 | + vocab_size, | ||
248 | + activation_fn=tf.nn.sigmoid, | ||
249 | + weights_regularizer=slim.l2_regularizer(1e-8)) | ||
250 | + return {"predictions": output} | ||
251 | + | ||
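A note on the averaging in FrameLevelLogisticModel: because padded frames are all-zero in the YouTube-8M inputs, summing over the frame axis and dividing by the true frame count is exactly a mean over the real frames. A minimal NumPy sketch of the same arithmetic (the shapes are illustrative assumptions):

    import numpy as np

    batch_size, max_frames, num_features = 2, 4, 3
    model_input = np.zeros((batch_size, max_frames, num_features))
    model_input[0, :2] = 1.0   # video 0 has 2 real frames
    model_input[1, :3] = 2.0   # video 1 has 3 real frames
    num_frames = np.array([[2.0], [3.0]])  # shape (batch_size, 1)

    # Same computation as the model: frame-axis sum / true frame count.
    avg_pooled = model_input.sum(axis=1) / num_frames
    # Zero padding does not bias the average:
    assert np.allclose(avg_pooled[0], 1.0) and np.allclose(avg_pooled[1], 2.0)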
252 | +class CNN(models.BaseModel): | ||
253 | +  """CNN classifier over frame-level features.""" | ||
254 | + | ||
255 | +  def create_model(self, model_input, vocab_size, num_frames, **unused_params): | ||
256 | +    """Applies stacked 1-D convolutions along the frame axis, then a | ||
257 | +    sigmoid classifier, following the interface of the other frame-level | ||
258 | +    models in this file. | ||
259 | + | ||
260 | +    Args: | ||
261 | +      model_input: A 'batch_size' x 'max_frames' x 'num_features' tensor of | ||
262 | +        input features. | ||
263 | +      vocab_size: The number of classes in the dataset. | ||
264 | +      num_frames: A vector of length 'batch' with the number of frames per | ||
265 | +        video (unused here; padded frames are convolved with the rest). | ||
266 | + | ||
267 | +    Returns: | ||
268 | +      A dictionary with the 'batch_size' x 'vocab_size' probability | ||
269 | +      predictions in the 'predictions' key. | ||
270 | +    """ | ||
271 | +    drop_rate = 0.0  # no dropout by default; raise to regularize | ||
272 | + | ||
273 | +    features = model_input | ||
274 | +    for i, filters in enumerate([32, 64, 128]): | ||
275 | +      # conv1d/max_pooling1d operate along the frame (time) axis. | ||
276 | +      features = tf.layers.conv1d( | ||
277 | +          features, filters=filters, kernel_size=3, padding="same", | ||
278 | +          activation=tf.nn.relu, name="conv_%d" % (i + 1)) | ||
279 | +      features = tf.layers.max_pooling1d( | ||
280 | +          inputs=features, pool_size=2, strides=2, padding="same", | ||
281 | +          name="pool_%d" % (i + 1)) | ||
282 | + | ||
283 | +    features = tf.contrib.layers.flatten(features) | ||
284 | + | ||
285 | +    features = tf.layers.dropout(features, drop_rate) | ||
286 | +    features = tf.layers.dense(features, 512, activation=tf.nn.relu, name="dense_1") | ||
287 | + | ||
288 | +    features = tf.layers.dropout(features, drop_rate) | ||
289 | +    # Sigmoid outputs for multi-label prediction, matching the other models. | ||
290 | +    output = tf.layers.dense(features, vocab_size, activation=tf.nn.sigmoid, | ||
291 | +                             name="dense_2") | ||
292 | +    return {"predictions": output} | ||
293 | + | ||
294 | + | ||
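For reference, a hedged sketch of how a frame-level model such as the CNN above is invoked (the tensor shapes and vocab_size value are illustrative assumptions; in the repo, the training script wires this up from flags):

    import tensorflow as tf

    # Assumed YouTube-8M shapes: 300 frames, 1024 RGB + 128 audio features.
    model_input = tf.placeholder(tf.float32, [None, 300, 1152])
    num_frames = tf.placeholder(tf.int32, [None])

    model = CNN()
    outputs = model.create_model(model_input, vocab_size=3862,
                                 num_frames=num_frames)
    predictions = outputs["predictions"]  # batch_size x vocab_size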
297 | +class NetVLAD_NonLocal_types(): | ||
298 | +  """NetVLAD aggregation augmented with an embedded-Gaussian non-local block.""" | ||
299 | +  def __init__(self, feature_size, max_frames, cluster_size, add_batch_norm, is_training): | ||
299 | + self.feature_size = feature_size | ||
300 | + self.max_frames = max_frames | ||
301 | + self.is_training = is_training | ||
302 | + self.add_batch_norm = add_batch_norm | ||
303 | + self.cluster_size = cluster_size | ||
304 | + | ||
305 | +  def forward(self, reshaped_input): | ||
306 | + | ||
307 | + cluster_weights = tf.get_variable("cluster_weights", | ||
308 | + [self.feature_size, self.cluster_size], | ||
309 | + initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
310 | + | ||
311 | + tf.summary.histogram("cluster_weights", cluster_weights) | ||
312 | + activation = tf.matmul(reshaped_input, cluster_weights) | ||
313 | + | ||
314 | + if self.add_batch_norm: | ||
315 | + activation = slim.batch_norm(activation, center=True, scale=True, is_training=self.is_training, scope="cluster_bn") | ||
316 | + else: | ||
317 | + cluster_biases = tf.get_variable("cluster_biases", | ||
318 | +                                       [self.cluster_size], | ||
319 | + initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
320 | + tf.summary.histogram("cluster_biases", cluster_biases) | ||
321 | + activation += cluster_biases | ||
322 | + | ||
323 | + activation = tf.nn.softmax(activation) | ||
324 | + tf.summary.histogram("cluster_output", activation) | ||
325 | + | ||
326 | + activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size]) | ||
327 | + | ||
328 | + a_sum = tf.reduce_sum(activation,-2,keep_dims=True) | ||
329 | + | ||
330 | + cluster_weights2 = tf.get_variable("cluster_weights2", | ||
331 | + [1,self.feature_size, self.cluster_size], | ||
332 | + initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
333 | + | ||
334 | + a = tf.multiply(a_sum,cluster_weights2) | ||
335 | + | ||
336 | + activation = tf.transpose(activation,perm=[0,2,1]) | ||
337 | + | ||
338 | + reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size]) | ||
339 | + vlad = tf.matmul(activation,reshaped_input) | ||
340 | + vlad = tf.transpose(vlad,perm=[0,2,1]) | ||
341 | + vlad = tf.subtract(vlad,a) | ||
342 | + | ||
343 | + vlad = tf.transpose(vlad,perm=[0,2,1]) | ||
344 | + vlad = tf.reshape(vlad, [-1, self.feature_size]) | ||
345 | + | ||
346 | + vlad_softmax = self.embedgaussian_relation(vlad, 1/float(64)) | ||
347 | + | ||
348 | + | ||
349 | + nonlocal_g = tf.get_variable("nonlocal_g", | ||
350 | + [self.feature_size, self.cluster_size], | ||
351 | + initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
352 | + nonlocal_out = tf.get_variable("nonlocal_out", | ||
353 | + [self.cluster_size, self.feature_size], | ||
354 | + initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.cluster_size))) | ||
355 | + | ||
356 | + vlad_g = tf.matmul(vlad, nonlocal_g) | ||
357 | + vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.cluster_size]) | ||
358 | + vlad_g = tf.matmul(vlad_softmax, vlad_g) | ||
359 | + vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size]) | ||
360 | + | ||
361 | + vlad_g = tf.matmul(vlad_g, nonlocal_out) | ||
362 | + vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.feature_size]) | ||
363 | + vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size]) | ||
364 | + vlad = vlad + vlad_g | ||
365 | + | ||
366 | + vlad = tf.transpose(vlad,perm=[0,2,1]) | ||
367 | + vlad = tf.nn.l2_normalize(vlad,1) # [b,f,c] | ||
368 | + | ||
369 | + vlad = tf.reshape(vlad,[-1,self.cluster_size*self.feature_size]) | ||
370 | + vlad = tf.nn.l2_normalize(vlad,1) | ||
371 | + | ||
372 | + return vlad | ||
373 | + | ||
374 | + def embedgaussian_relation(self, input_, temp=1/float(32)): | ||
375 | + nonlocal_theta = tf.get_variable("nonlocal_theta", | ||
376 | + [self.feature_size, self.cluster_size], | ||
377 | + initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
378 | + nonlocal_phi = tf.get_variable("nonlocal_phi", | ||
379 | + [self.feature_size, self.cluster_size], | ||
380 | + initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size))) | ||
381 | + | ||
382 | + vlad_theta = tf.matmul(input_, nonlocal_theta) | ||
383 | + vlad_phi = tf.matmul(input_, nonlocal_phi) | ||
384 | + vlad_theta = tf.reshape(vlad_theta, [-1, self.cluster_size, self.cluster_size]) | ||
385 | + vlad_phi = tf.reshape(vlad_phi, [-1, self.cluster_size, self.cluster_size]) | ||
386 | + vlad_softmax = tf.nn.softmax(temp * tf.matmul(vlad_theta, tf.transpose(vlad_phi,perm=[0,2,1]))) | ||
387 | + return vlad_softmax | ||
388 | + | ||
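In formula form, embedgaussian_relation computes the pairwise relation of an embedded-Gaussian non-local block across the cluster axis of the per-video VLAD matrix, and forward adds the transformed output back as a residual. A sketch under that standard non-local formulation (i, j index the cluster_size rows; T is the temperature, 1/64 in the call above):

    f(x_i, x_j) = \mathrm{softmax}_j\big(T\,(W_\theta x_i)^\top (W_\phi x_j)\big)
    y_i = \sum_j f(x_i, x_j)\,(W_g x_j)
    \mathrm{vlad}_i \leftarrow \mathrm{vlad}_i + W_{\mathrm{out}}\, y_i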
389 | +class NetVLADModelLF(models.BaseModel): | ||
390 | + """Creates a NetVLAD based model. | ||
391 | + Args: | ||
392 | + model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of | ||
393 | + input features. | ||
394 | + vocab_size: The number of classes in the dataset. | ||
395 | + num_frames: A vector of length 'batch' which indicates the number of | ||
396 | + frames for each video (before padding). | ||
397 | + Returns: | ||
398 | + A dictionary with a tensor containing the probability predictions of the | ||
399 | + model in the 'predictions' key. The dimensions of the tensor are | ||
400 | + 'batch_size' x 'num_classes'. | ||
401 | + """ | ||
402 | + | ||
403 | + | ||
404 | +  def create_model(self, model_input, vocab_size, num_frames, | ||
405 | +                   iterations=None, add_batch_norm=None, | ||
406 | +                   sample_random_frames=None, cluster_size=None, | ||
407 | +                   hidden_size=None, is_training=True, **unused_params): | ||
408 | +    # The hyperparameters below are hard-coded and override the arguments above. | ||
409 | +    iterations = 300 | ||
406 | + add_batch_norm = True | ||
407 | + random_frames = True | ||
408 | + cluster_size = 64 | ||
409 | + hidden1_size = 1024 | ||
410 | + relu = False | ||
411 | + dimred = -1 | ||
412 | + gating = True | ||
413 | + remove_diag = False | ||
414 | + | ||
415 | + num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) | ||
416 | + if random_frames: | ||
417 | + model_input = utils.SampleRandomFrames(model_input, num_frames, | ||
418 | + iterations) | ||
419 | + else: | ||
420 | + model_input = utils.SampleRandomSequence(model_input, num_frames, | ||
421 | + iterations) | ||
422 | + | ||
423 | + | ||
424 | + max_frames = model_input.get_shape().as_list()[1] | ||
425 | + feature_size = model_input.get_shape().as_list()[2] | ||
426 | + reshaped_input = tf.reshape(model_input, [-1, feature_size]) | ||
427 | + | ||
428 | + | ||
429 | + video_NetVLAD = NetVLAD_NonLocal_types(1024,int(max_frames),int(cluster_size), add_batch_norm, is_training) | ||
430 | + audio_NetVLAD = NetVLAD_NonLocal_types(128,int(max_frames),int(cluster_size/2), add_batch_norm, is_training) | ||
431 | + | ||
432 | + | ||
433 | +    if add_batch_norm: | ||
434 | + reshaped_input = slim.batch_norm( | ||
435 | + reshaped_input, | ||
436 | + center=True, | ||
437 | + scale=True, | ||
438 | + is_training=is_training, | ||
439 | + scope="input_bn") | ||
440 | + | ||
441 | + with tf.variable_scope("video_VLAD"): | ||
442 | + vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024]) | ||
443 | + | ||
444 | + with tf.variable_scope("audio_VLAD"): | ||
445 | + vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:]) | ||
446 | + | ||
447 | + vlad = tf.concat([vlad_video, vlad_audio],1) | ||
448 | + | ||
449 | + vlad_dim = vlad.get_shape().as_list()[1] | ||
450 | + hidden1_weights = tf.get_variable("hidden1_weights", | ||
451 | + [vlad_dim, hidden1_size], | ||
452 | + initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) | ||
453 | + | ||
454 | + activation = tf.matmul(vlad, hidden1_weights) | ||
455 | + | ||
456 | + if add_batch_norm and relu: | ||
457 | + activation = slim.batch_norm( | ||
458 | + activation, | ||
459 | + center=True, | ||
460 | + scale=True, | ||
461 | + is_training=is_training, | ||
462 | + scope="hidden1_bn") | ||
463 | + | ||
464 | + else: | ||
465 | + hidden1_biases = tf.get_variable("hidden1_biases", | ||
466 | + [hidden1_size], | ||
467 | + initializer = tf.random_normal_initializer(stddev=0.01)) | ||
468 | + tf.summary.histogram("hidden1_biases", hidden1_biases) | ||
469 | + activation += hidden1_biases | ||
470 | + | ||
471 | + if relu: | ||
472 | + activation = tf.nn.relu6(activation) | ||
473 | + | ||
474 | + | ||
475 | + if gating: | ||
476 | + gating_weights = tf.get_variable("gating_weights_2", | ||
477 | + [hidden1_size, hidden1_size], | ||
478 | + initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size))) | ||
479 | + | ||
480 | + gates = tf.matmul(activation, gating_weights) | ||
481 | + | ||
482 | + if remove_diag: | ||
483 | + #removes diagonals coefficients | ||
484 | + diagonals = tf.matrix_diag_part(gating_weights) | ||
485 | + gates = gates - tf.multiply(diagonals,activation) | ||
486 | + | ||
487 | + | ||
488 | + if add_batch_norm: | ||
489 | + gates = slim.batch_norm( | ||
490 | + gates, | ||
491 | + center=True, | ||
492 | + scale=True, | ||
493 | + is_training=is_training, | ||
494 | + scope="gating_bn") | ||
495 | + else: | ||
496 | + gating_biases = tf.get_variable("gating_biases", | ||
497 | +                                       [hidden1_size], | ||
498 | +                                       initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))) | ||
499 | + gates += gating_biases | ||
500 | + | ||
501 | + gates = tf.sigmoid(gates) | ||
502 | + | ||
503 | + activation = tf.multiply(activation,gates) | ||
504 | + | ||
505 | + aggregated_model = getattr(video_level_models, | ||
506 | + 'willow_MoeModel') | ||
507 | + | ||
508 | + | ||
509 | + return aggregated_model().create_model( | ||
510 | + model_input=activation, | ||
511 | + vocab_size=vocab_size, | ||
512 | + is_training=is_training, | ||
513 | + **unused_params) | ||
... | \ No newline at end of file | ||
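Taken together, a hedged end-to-end sketch of how NetVLADModelLF consumes a batch (the shapes are the usual YouTube-8M ones and are assumptions here; in the repo the training loop supplies these tensors):

    import tensorflow as tf

    model_input = tf.placeholder(tf.float32, [None, 300, 1152])  # 1024 RGB + 128 audio
    num_frames = tf.placeholder(tf.int32, [None])

    model = NetVLADModelLF()
    out = model.create_model(model_input, vocab_size=3862,
                             num_frames=num_frames, is_training=True)
    probabilities = out["predictions"]  # batch_size x vocab_size, via willow_MoeModel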
... | @@ -25,6 +25,21 @@ FLAGS = flags.FLAGS | ||
25 | flags.DEFINE_integer( | ||
26 | "moe_num_mixtures", 2, | ||
27 | "The number of mixtures (excluding the dummy 'expert') used for MoeModel.") | ||
31 | +flags.DEFINE_float( | ||
32 | + "moe_l2", 1e-8, | ||
33 | + "L2 penalty for MoeModel.") | ||
34 | +flags.DEFINE_integer( | ||
35 | + "moe_low_rank_gating", -1, | ||
36 | + "Low rank gating for MoeModel.") | ||
37 | +flags.DEFINE_bool( | ||
38 | + "moe_prob_gating", True, | ||
39 | + "Prob gating for MoeModel.") | ||
40 | +flags.DEFINE_string( | ||
41 | + "moe_prob_gating_input", "prob", | ||
42 | + "input Prob gating for MoeModel.") | ||
28 | | ||
29 | | ||
30 | class LogisticModel(models.BaseModel): | ||
... | @@ -111,3 +126,109 @@ class MoeModel(models.BaseModel): | ||
111 | final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, | ||
112 | [-1, vocab_size]) | ||
113 | return {"predictions": final_probabilities} | ||
129 | + | ||
130 | + | ||
131 | +class willow_MoeModel(models.BaseModel): | ||
132 | + """A softmax over a mixture of logistic models (with L2 regularization).""" | ||
133 | + | ||
134 | +  def create_model(self, model_input, vocab_size, is_training, | ||
135 | +                   num_mixtures=None, l2_penalty=1e-8, **unused_params): | ||
135 | + """Creates a Mixture of (Logistic) Experts model. | ||
136 | +    It also includes the possibility of gating the probabilities. | ||
137 | + The model consists of a per-class softmax distribution over a | ||
138 | + configurable number of logistic classifiers. One of the classifiers in the | ||
139 | + mixture is not trained, and always predicts 0. | ||
140 | + Args: | ||
141 | + model_input: 'batch_size' x 'num_features' matrix of input features. | ||
142 | + vocab_size: The number of classes in the dataset. | ||
143 | +      is_training: Is this the training phase? | ||
144 | + num_mixtures: The number of mixtures (excluding a dummy 'expert' that | ||
145 | + always predicts the non-existence of an entity). | ||
146 | + l2_penalty: How much to penalize the squared magnitudes of parameter | ||
147 | + values. | ||
148 | + Returns: | ||
149 | + A dictionary with a tensor containing the probability predictions of the | ||
150 | + model in the 'predictions' key. The dimensions of the tensor are | ||
151 | + batch_size x num_classes. | ||
152 | + """ | ||
153 | +    num_mixtures = 8  # hard-coded; overrides the argument and FLAGS.moe_num_mixtures | ||
154 | + low_rank_gating = FLAGS.moe_low_rank_gating | ||
155 | + l2_penalty = FLAGS.moe_l2 | ||
156 | + gating_probabilities = FLAGS.moe_prob_gating | ||
157 | + gating_input = FLAGS.moe_prob_gating_input | ||
158 | + | ||
159 | + input_size = model_input.get_shape().as_list()[1] | ||
160 | + remove_diag = False | ||
161 | + | ||
162 | + if low_rank_gating == -1: | ||
163 | + gate_activations = slim.fully_connected( | ||
164 | + model_input, | ||
165 | + vocab_size * (num_mixtures + 1), | ||
166 | + activation_fn=None, | ||
167 | + biases_initializer=None, | ||
168 | + weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
169 | + scope="gates") | ||
170 | + else: | ||
171 | + gate_activations1 = slim.fully_connected( | ||
172 | + model_input, | ||
173 | + low_rank_gating, | ||
174 | + activation_fn=None, | ||
175 | + biases_initializer=None, | ||
176 | + weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
177 | + scope="gates1") | ||
178 | + gate_activations = slim.fully_connected( | ||
179 | + gate_activations1, | ||
180 | + vocab_size * (num_mixtures + 1), | ||
181 | + activation_fn=None, | ||
182 | + biases_initializer=None, | ||
183 | + weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
184 | + scope="gates2") | ||
185 | + | ||
186 | + expert_activations = slim.fully_connected( | ||
187 | + model_input, | ||
188 | + vocab_size * num_mixtures, | ||
189 | + activation_fn=None, | ||
190 | + weights_regularizer=slim.l2_regularizer(l2_penalty), | ||
191 | + scope="experts") | ||
192 | + | ||
193 | + gating_distribution = tf.nn.softmax(tf.reshape( | ||
194 | + gate_activations, | ||
195 | + [-1, num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1) | ||
196 | + expert_distribution = tf.nn.sigmoid(tf.reshape( | ||
197 | + expert_activations, | ||
198 | + [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures | ||
199 | + | ||
200 | + probabilities_by_class_and_batch = tf.reduce_sum( | ||
201 | + gating_distribution[:, :num_mixtures] * expert_distribution, 1) | ||
202 | + probabilities = tf.reshape(probabilities_by_class_and_batch, | ||
203 | + [-1, vocab_size]) | ||
204 | + | ||
205 | + if gating_probabilities: | ||
206 | + if gating_input == 'prob': | ||
207 | + gating_weights = tf.get_variable("gating_prob_weights", | ||
208 | + [vocab_size, vocab_size], | ||
209 | + initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size))) | ||
210 | + gates = tf.matmul(probabilities, gating_weights) | ||
211 | + else: | ||
212 | + gating_weights = tf.get_variable("gating_prob_weights", | ||
213 | + [input_size, vocab_size], | ||
214 | + initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size))) | ||
215 | + | ||
216 | + gates = tf.matmul(model_input, gating_weights) | ||
217 | + | ||
218 | + if remove_diag: | ||
219 | + # removes diagonals coefficients | ||
220 | + diagonals = tf.matrix_diag_part(gating_weights) | ||
221 | + gates = gates - tf.multiply(diagonals, probabilities) | ||
222 | + | ||
223 | + gates = slim.batch_norm( | ||
224 | + gates, | ||
225 | + center=True, | ||
226 | + scale=True, | ||
227 | + is_training=is_training, | ||
228 | + scope="gating_prob_bn") | ||
229 | + | ||
230 | + gates = tf.sigmoid(gates) | ||
231 | + | ||
232 | + probabilities = tf.multiply(probabilities, gates) | ||
233 | + | ||
234 | + return {"predictions": probabilities} | ||
... | \ No newline at end of file | ||
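In formula form, willow_MoeModel computes, per class c, a softmax-gated mixture of M sigmoid experts (the softmax runs over M+1 gates, with the extra dummy gate contributing probability 0):

    p_c = \sum_{k=1}^{M} \mathrm{softmax}_k(U_c^\top x)\, \sigma(w_{c,k}^\top x)

and, with moe_prob_gating enabled and moe_prob_gating_input = 'prob', re-gates the class probabilities through a learned, batch-normalized sigmoid gate:

    p \leftarrow p \odot \sigma(\mathrm{BN}(W_g\, p))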