Showing 3 changed files with 121 additions and 2 deletions
@@ -20,8 +20,6 @@
 
 * Team name: **Profit Hunter**
 * 윤영빈 (Dept. of Computer Engineering, 2015104192)
-* 윤준현 (Dept. of Computer Engineering, 2015104193)
-* 이현규 (Dept. of Computer Engineering, 2015104209)
 * 이태현 (Dept. of Computer Engineering, 2015104208)
 
 ## Links
@@ -25,6 +25,21 @@ FLAGS = flags.FLAGS
 flags.DEFINE_integer(
     "moe_num_mixtures", 2,
     "The number of mixtures (excluding the dummy 'expert') used for MoeModel.")
+flags.DEFINE_float(
+    "moe_l2", 1e-8,
+    "L2 penalty for MoeModel.")
+flags.DEFINE_integer(
+    "moe_low_rank_gating", -1,
+    "Rank of the low-rank gating projection for MoeModel (-1 disables it).")
+flags.DEFINE_bool(
+    "moe_prob_gating", True,
+    "Whether to apply probability (context) gating in MoeModel.")
+flags.DEFINE_string(
+    "moe_prob_gating_input", "prob",
+    "Input to the probability gating: 'prob' (class probabilities) or the raw features.")
 
 
 class LogisticModel(models.BaseModel):
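For reference, here is a minimal standalone sketch of how these new flags behave at parse time. It assumes absl-py (which `flags` wraps in recent TF 1.x); it is illustrative only and not part of this commit.

```python
# Sketch: parse the new MoeModel flags with absl-py (assumption: absl-py
# is installed; the repository itself gets `flags` through TensorFlow).
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_float("moe_l2", 1e-8, "L2 penalty for MoeModel.")
flags.DEFINE_integer("moe_low_rank_gating", -1,
                     "Rank of the low-rank gating projection (-1 disables it).")
flags.DEFINE_bool("moe_prob_gating", True, "Apply probability gating.")
flags.DEFINE_string("moe_prob_gating_input", "prob",
                    "Gating input: 'prob' or raw features.")

# Command-line overrides win over the defaults above.
FLAGS(["prog", "--moe_l2=1e-6", "--moe_prob_gating=false"])
print(FLAGS.moe_l2, FLAGS.moe_low_rank_gating,
      FLAGS.moe_prob_gating, FLAGS.moe_prob_gating_input)
# -> 1e-06 -1 False prob
```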
@@ -111,3 +126,109 @@ class MoeModel(models.BaseModel):
     final_probabilities = tf.reshape(final_probabilities_by_class_and_batch,
                                      [-1, vocab_size])
     return {"predictions": final_probabilities}
+
+
+class willow_MoeModel(models.BaseModel):
+  """A softmax over a mixture of logistic models (with L2 regularization)."""
+
+  def create_model(self, model_input, vocab_size, is_training,
+                   num_mixtures=None, l2_penalty=1e-8, **unused_params):
+    """Creates a Mixture of (Logistic) Experts model.
+
+    It also supports gating of the output probabilities.
+    The model consists of a per-class softmax distribution over a
+    configurable number of logistic classifiers. One of the classifiers in the
+    mixture is not trained, and always predicts 0.
+
+    Args:
+      model_input: 'batch_size' x 'num_features' matrix of input features.
+      vocab_size: The number of classes in the dataset.
+      is_training: Whether this is the training phase.
+      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
+        always predicts the non-existence of an entity).
+      l2_penalty: How much to penalize the squared magnitudes of parameter
+        values.
+
+    Returns:
+      A dictionary with a tensor containing the probability predictions of the
+      model in the 'predictions' key. The dimensions of the tensor are
+      batch_size x num_classes.
+    """
+    num_mixtures = 8  # Fixed number of experts; overrides the argument above.
+    low_rank_gating = FLAGS.moe_low_rank_gating
+    l2_penalty = FLAGS.moe_l2
+    gating_probabilities = FLAGS.moe_prob_gating
+    gating_input = FLAGS.moe_prob_gating_input
+
+    input_size = model_input.get_shape().as_list()[1]
+    remove_diag = False
+
+    if low_rank_gating == -1:
+      gate_activations = slim.fully_connected(
+          model_input,
+          vocab_size * (num_mixtures + 1),
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates")
+    else:
+      # Factorize the gating layer through a low-rank bottleneck.
+      gate_activations1 = slim.fully_connected(
+          model_input,
+          low_rank_gating,
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates1")
+      gate_activations = slim.fully_connected(
+          gate_activations1,
+          vocab_size * (num_mixtures + 1),
+          activation_fn=None,
+          biases_initializer=None,
+          weights_regularizer=slim.l2_regularizer(l2_penalty),
+          scope="gates2")
+
+    expert_activations = slim.fully_connected(
+        model_input,
+        vocab_size * num_mixtures,
+        activation_fn=None,
+        weights_regularizer=slim.l2_regularizer(l2_penalty),
+        scope="experts")
+
+    gating_distribution = tf.nn.softmax(tf.reshape(
+        gate_activations,
+        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
+    expert_distribution = tf.nn.sigmoid(tf.reshape(
+        expert_activations,
+        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures
+
+    probabilities_by_class_and_batch = tf.reduce_sum(
+        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
+    probabilities = tf.reshape(probabilities_by_class_and_batch,
+                               [-1, vocab_size])
+
+    if gating_probabilities:
+      if gating_input == 'prob':
+        # Context gating: gate on the class probabilities themselves.
+        gating_weights = tf.get_variable(
+            "gating_prob_weights", [vocab_size, vocab_size],
+            initializer=tf.random_normal_initializer(
+                stddev=1 / math.sqrt(vocab_size)))
+        gates = tf.matmul(probabilities, gating_weights)
+      else:
+        # Gate on the raw input features instead.
+        gating_weights = tf.get_variable(
+            "gating_prob_weights", [input_size, vocab_size],
+            initializer=tf.random_normal_initializer(
+                stddev=1 / math.sqrt(vocab_size)))
+        gates = tf.matmul(model_input, gating_weights)
+
+      if remove_diag:
+        # Remove diagonal coefficients so a class does not gate itself.
+        diagonals = tf.matrix_diag_part(gating_weights)
+        gates = gates - tf.multiply(diagonals, probabilities)
+
+      gates = slim.batch_norm(
+          gates,
+          center=True,
+          scale=True,
+          is_training=is_training,
+          scope="gating_prob_bn")
+
+      gates = tf.sigmoid(gates)
+
+      probabilities = tf.multiply(probabilities, gates)
+
+    return {"predictions": probabilities}
\ No newline at end of file
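To make the data flow in willow_MoeModel concrete, here is a small NumPy sketch of the same forward pass: a per-(example, class) softmax over the experts plus one dummy expert, followed by the probability ("context") gating. It is illustrative only (batch normalization omitted, shapes made up), not the repository's code.

```python
import numpy as np

rng = np.random.default_rng(0)
batch, input_size, vocab_size, num_mixtures = 4, 16, 10, 8

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

model_input = rng.normal(size=(batch, input_size))
w_gates = rng.normal(size=(input_size, vocab_size * (num_mixtures + 1)))
w_experts = rng.normal(size=(input_size, vocab_size * num_mixtures))

# Per-(example, class) softmax over num_mixtures experts + one dummy expert.
gating = softmax((model_input @ w_gates).reshape(-1, num_mixtures + 1))
experts = sigmoid((model_input @ w_experts).reshape(-1, num_mixtures))

# The dummy expert always predicts 0, so only the first num_mixtures
# gating columns enter the weighted sum.
probs = (gating[:, :num_mixtures] * experts).sum(axis=1)
probs = probs.reshape(-1, vocab_size)

# Probability ("context") gating: every class's score modulates every
# other class's output (the real model also applies batch norm here).
w_gating = rng.normal(size=(vocab_size, vocab_size)) / np.sqrt(vocab_size)
probs = probs * sigmoid(probs @ w_gating)

print(probs.shape)  # (4, 10)
```

The dummy expert lets the softmax assign gating mass to "no expert fires", which drives a class's probability toward zero; the gating step then captures correlations between class predictions.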