Showing
5 changed files
with
384 additions
and
0 deletions
No preview for this file type
code/.DS_Store
0 → 100644
No preview for this file type
code/morphemeMerger.py
0 → 100644
1 | +# need pip install jamo | ||
2 | +import sys, re | ||
3 | +from jamo import h2j, j2hcj | ||
4 | +from seq2seq.merger.unicode import join_jamos | ||
5 | + | ||
def isHangul(text):
    """Return True when *text* contains at least one Hangul character.

    Characters in the ranges U+3130-U+318F and U+AC00-U+D7A3 are treated
    as Hangul.

    Args:
        text: The string to inspect.  A UTF-8 encoded byte string is also
            accepted and is decoded before matching.

    Returns:
        bool: True if any Hangul character is present, otherwise False.
    """
    # Byte strings (Python 2 ``str`` or Python 3 ``bytes``) must be decoded
    # first: a text pattern cannot be matched against raw bytes.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # One hit is enough, so stop at the first match instead of counting all.
    return re.search(u'[\u3130-\u318F\uAC00-\uD7A3]', text) is not None
20 | + | ||
21 | +# jamo-level | ||
def decomposition(sentence):
    """Break *sentence* down to jamo level.

    The text is passed through ``h2j`` and then ``j2hcj``.  If the result
    contains an alphabetic character that is not Hangul, the text is cut at
    the first such character: the single character right before it is
    dropped, and all whitespace is removed from the remainder.

    Args:
        sentence: Text to decompose.

    Returns:
        str: The jamo-level string.
    """
    sentence = j2hcj(h2j(sentence))

    # Position of the first alphabetic, non-Hangul character (if any).
    first_foreign = next(
        (pos for pos, ch in enumerate(sentence)
         if not isHangul(ch) and ch.isalpha()),
        None,
    )
    if first_foreign is None:
        return sentence

    # NOTE(review): the character just before the foreign run is discarded;
    # presumably it is a separator such as a space -- confirm with callers.
    # Clamp at 0: when the foreign character is the very first one, a start
    # of -1 would slice ``sentence[:-1]`` and duplicate most of the text.
    head = sentence[:max(first_foreign - 1, 0)]
    tail = ''.join(sentence[first_foreign:].split())
    return head + tail
35 | + | ||
def reconstructor(decom):
    """Concatenate the pieces in *decom* and pass them to ``join_jamos``."""
    jamo_text = ''.join(decom)
    return join_jamos(jamo_text)
38 | + | ||
39 | +''' | ||
40 | + This section defines the merge rules. | ||
41 | + Each rule returns two values: the rewritten front part and the rewritten back part. | ||
42 | + The second value is kept so that a complete 한글 syllable can be generated, | ||
43 | + because it becomes the first value (the front) in the next iteration of the for-loop in the 'composer' function below. | ||
44 | +''' | ||
45 | + | ||
46 | +# rule about '었' | ||
def ruleOfEot(front, back):
    """Merge rule used when the following morpheme starts with '었'.

    Both arguments are decomposed to jamo first.  A three-jamo front (one
    with a final consonant) is checked against the irregular stems; any
    other front is handled by its vowel.

    Returns:
        tuple: ``(front_jamo, back_jamo)`` after the rule is applied; the
        pair is returned unchanged when no case matches.
    """
    stem = decomposition(front)
    ending = decomposition(back)
    keep = (stem, ending)

    if len(stem) == 3:
        # Stems whose final 'ㄷ' turns into 'ㄹ': 듣, 싣, 긷, 눋, 붇, 묻.
        d_irregular = ('ㄷㅡㄷ', 'ㅅㅣㄷ', 'ㄱㅣㄷ', 'ㄴㅜㄷ', 'ㅂㅜㄷ', 'ㅁㅜㄷ')
        # Stems whose final 'ㅅ' is dropped: 긋, 젓, 잇, 짓.
        s_irregular = ('ㄱㅡㅅ', 'ㅈㅓㅅ', 'ㅇㅣㅅ', 'ㅈㅣㅅ')

        # e.g. 어떻 + 었 => 어땠
        if stem[1:] == 'ㅓㅎ':
            return '', stem[0] + 'ㅐ' + ending[-1]
        if stem in d_irregular:
            return stem[:-1] + 'ㄹ', ending
        # '업' and '접' keep their final 'ㅂ'.
        if stem in ('ㅇㅓㅂ', 'ㅈㅓㅂ'):
            return keep
        # e.g. 뜨겁 + 었: final 'ㅂ' is dropped and the ending takes 'ㅝ'.
        if stem[2] == 'ㅂ':
            return stem[:2], ending[0] + 'ㅝ' + ending[-1]
        if stem in s_irregular:
            return stem[:-1], ending
        return keep

    second = stem[1]
    if second == 'ㅜ':
        return '', stem[0] + 'ㅝ' + ending[-1]
    if second == 'ㅣ':
        return '', stem[0] + 'ㅕ' + ending[-1]
    if second == 'ㅐ':
        return '', stem + ending[-1]

    penult, last = stem[-2], stem[-1]
    if penult == 'ㄹ' and last == 'ㅡ':
        return penult, 'ㄹ' + ending[1:]
    if penult == 'ㄹ' and last == 'ㅓ':
        return '', 'ㄹㅐ' + ending[-1]
    if last == 'ㅡ':
        return '', penult + ending[1:]
    return keep
95 | + | ||
96 | +# rule about '았' | ||
def ruleOfAt(front, back):
    """Merge rule used when the following morpheme starts with '았'.

    Returns:
        tuple: ``(front_jamo, back_jamo)`` after the rule is applied; the
        pair is returned unchanged when no case matches.
    """
    stem = decomposition(front)
    ending = decomposition(back)
    keep = (stem, ending)

    if len(stem) == 3:
        # '낫' and '잣' drop their final 'ㅅ'.
        if stem in ('ㄴㅏㅅ', 'ㅈㅏㅅ'):
            return stem[:-1], ending
        return keep

    if stem[1] == 'ㅏ':
        return '', stem + ending[-1]

    penult, last = stem[-2], stem[-1]
    if last == 'ㅗ':
        return '', penult + 'ㅘ' + ending[-1]
    if last == 'ㅡ':
        if penult == 'ㄹ':
            return penult, 'ㄹ' + ending[1:]
        return '', penult + ending[1:]
    return keep
117 | + | ||
118 | +# rule about '시' | ||
def ruleOfSi(front, back):
    """Merge rule used when the following morpheme starts with '시'.

    A three-jamo front ending in 'ㅎ' or 'ㄹ' loses that final consonant;
    everything else is returned as decomposed.

    Returns:
        tuple: ``(front_jamo, back_jamo)``.
    """
    stem = decomposition(front)
    ending = decomposition(back)
    if len(stem) == 3 and stem[-1] in ('ㅎ', 'ㄹ'):
        stem = stem[:2]
    return stem, ending
129 | + | ||
130 | +# rule about '였' | ||
def ruleOfYeot(front, back):
    """Merge rule used when the following morpheme starts with '였'.

    Returns:
        tuple: ``(front_jamo, back_jamo)`` after the rule is applied; the
        pair is returned unchanged when no case matches.
    """
    stem = decomposition(front)
    ending = decomposition(back)

    penult = stem[-2]
    last = stem[-1]
    if penult == 'ㅎ' and last == 'ㅏ':
        return '', penult + 'ㅐ' + ending[-1]
    if penult == 'ㄷ' and last == 'ㅐ':
        return 'ㄷㅏ', 'ㅎㅐㅆ'
    return stem, ending
140 | + | ||
141 | +# rule about '아' | ||
def ruleOfAh(front, back):
    """Merge rule used when the following morpheme starts with '아'.

    Returns:
        tuple: ``(front_jamo, back_jamo)`` after the rule is applied; the
        pair is returned unchanged when no case matches.
    """
    stem = decomposition(front)
    ending = decomposition(back)
    keep = (stem, ending)

    if len(stem) == 3:
        # '돕': final 'ㅂ' is dropped and the ending vowel becomes 'ㅘ'.
        if stem == 'ㄷㅗㅂ':
            return stem[:-1], ending[0] + 'ㅘ'
        # '낫' and '잣' drop their final 'ㅅ'.
        if stem in ('ㄴㅏㅅ', 'ㅈㅏㅅ'):
            return stem[:-1], ending
        return keep

    penult = stem[-2]
    last = stem[-1]
    if penult == 'ㅎ' and last == 'ㅏ':
        return '', stem[0] + 'ㅐ'
    # A front already ending in 'ㅏ' or 'ㅐ' absorbs the ending.
    if last in ('ㅏ', 'ㅐ'):
        return '', stem
    if penult == 'ㄹ' and last == 'ㅡ':
        return stem[0], 'ㄹ' + ending[-1]
    if last == 'ㅗ':
        if penult == 'ㄷ':
            return stem, ending[0] + 'ㅘ'
        return '', penult + 'ㅘ'
    return keep
168 | + | ||
169 | +# rule about '어' | ||
def ruleOfEo(front, back):
    """Merge rule used when the following morpheme starts with '어'.

    Returns:
        tuple: ``(front_jamo, back_jamo)`` after the rule is applied; the
        pair is returned unchanged when no case matches.
    """
    stem = decomposition(front)
    ending = decomposition(back)
    keep = (stem, ending)

    if len(stem) == 3:
        # Stems whose final 'ㄷ' turns into 'ㄹ': 듣, 싣, 긷, 눋, 붇, 묻.
        d_irregular = ('ㄷㅡㄷ', 'ㅅㅣㄷ', 'ㄱㅣㄷ', 'ㄴㅜㄷ', 'ㅂㅜㄷ', 'ㅁㅜㄷ')
        # Stems whose final 'ㅅ' is dropped: 긋, 젓, 잇, 짓.
        s_irregular = ('ㄱㅡㅅ', 'ㅈㅓㅅ', 'ㅇㅣㅅ', 'ㅈㅣㅅ')
        final = stem[-1]

        # '업' and '접' keep their final 'ㅂ'.
        if stem in ('ㅇㅓㅂ', 'ㅈㅓㅂ'):
            return keep
        if final == 'ㅂ':
            return stem[:2], ending[0] + 'ㅝ'
        if final == 'ㅎ':
            return '', stem[-3] + 'ㅐ'
        if stem in d_irregular:
            return stem[:-1] + 'ㄹ', ending
        if stem in s_irregular:
            return stem[:-1], ending
        return keep

    last = stem[-1]
    if last == 'ㅜ':
        return '', stem[0] + 'ㅝ'
    # Read the second-to-last jamo only after the 'ㅜ' case, as that case
    # does not need it.
    penult = stem[-2]
    if penult == 'ㅂ' and last == 'ㅣ':
        return keep
    if stem[1] == 'ㅣ':
        return '', stem[0] + 'ㅕ'
    if penult == 'ㄹ':
        if last == 'ㅓ':
            return '', penult + 'ㅐ'
        if last == 'ㅡ':
            return penult, penult + ending[-1]
    if last == 'ㅡ':
        return '', penult + ending[-1]
    return keep
216 | + | ||
217 | +# rule about '여' | ||
def ruleOfYeo(front, back):
    """Merge rule used when the following morpheme starts with '여'.

    Only the front '하' is rewritten: it collapses into the back part with
    the vowel 'ㅐ'.  Any other front is returned as decomposed.

    Returns:
        tuple: ``(front_jamo, back_jamo)``.
    """
    stem = decomposition(front)
    ending = decomposition(back)
    if front != '하':
        return stem, ending
    return '', stem[0] + 'ㅐ'
225 | + | ||
226 | +# rule about 'ㄴ', 'ㄹ', 'ㅂ' | ||
def ruleOfConnectedBadchim(front, back):
    """Merge rule used when the following morpheme starts with 'ㄴ', 'ㄹ' or 'ㅂ'.

    A three-jamo front loses its last jamo (its final consonant); shorter
    or longer fronts are returned as decomposed.

    Returns:
        tuple: ``(front_jamo, back_jamo)``.
    """
    stem = decomposition(front)
    ending = decomposition(back)
    trimmed = stem[:-1] if len(stem) == 3 else stem
    return trimmed, ending
234 | + | ||
235 | +# rule about '은' | ||
def ruleOfEun(front, back):
    """Merge rule used when the following morpheme starts with '은'.

    Only three-jamo fronts are rewritten; all others are returned as
    decomposed.

    Returns:
        tuple: ``(front_jamo, back_jamo)``.
    """
    stem = decomposition(front)
    ending = decomposition(back)

    if len(stem) != 3:
        return stem, ending

    # Stems whose final 'ㄷ' turns into 'ㄹ': 듣, 싣, 긷, 눋, 붇, 묻.
    d_irregular = ('ㄷㅡㄷ', 'ㅅㅣㄷ', 'ㄱㅣㄷ', 'ㄴㅜㄷ', 'ㅂㅜㄷ', 'ㅁㅜㄷ')
    final = stem[-1]
    if final == 'ㅅ':
        return stem[:-1], ending
    if final == 'ㅂ':
        # Final 'ㅂ' is dropped and the ending vowel becomes 'ㅜ'.
        return stem[:-1], ending[0] + 'ㅜ' + ending[-1]
    if stem in d_irregular:
        return stem[:-1] + 'ㄹ', ending
    return stem, ending
257 | + | ||
258 | +# rule about '는' | ||
def ruleOfNeun(front, back):
    """Merge rule used when the following morpheme starts with '는'.

    A three-jamo front ending in 'ㄹ' loses that final consonant.

    Returns:
        tuple: ``(front_jamo, back_jamo)``.
    """
    stem = decomposition(front)
    ending = decomposition(back)
    drops_final = len(stem) == 3 and stem[-1] == 'ㄹ'
    if drops_final:
        stem = stem[:-1]
    return stem, ending
266 | + | ||
267 | +# rule about '으' | ||
def ruleOfNEu(front, back):
    """Merge rule used when the following morpheme starts with '으'.

    Only three-jamo fronts are rewritten; all others are returned as
    decomposed.

    Returns:
        tuple: ``(front_jamo, back_jamo)``.
    """
    stem = decomposition(front)
    ending = decomposition(back)

    if len(stem) != 3:
        return stem, ending

    # Any final 'ㄷ' turns into 'ㄹ' here.
    if stem[-1] == 'ㄷ':
        return stem[:-1] + 'ㄹ', ending

    # Stems whose final 'ㅅ' is dropped: 긋, 젓, 낫, 잣, 잇, 짓.
    s_irregular = ('ㄱㅡㅅ', 'ㅈㅓㅅ', 'ㄴㅏㅅ', 'ㅈㅏㅅ', 'ㅇㅣㅅ', 'ㅈㅣㅅ')
    if stem in s_irregular:
        return stem[:-1], ending
    return stem, ending
290 | + | ||
291 | +# router of rule | ||
def ruleSet(front, back):
    """Route a (front, back) pair to the rule that matches *back*.

    Args:
        front: Trailing part of the current morpheme.
        back: First character of the following morpheme.

    Returns:
        tuple: ``(front_jamo, back_jamo)`` from the selected rule, or the
        plain decomposition of both arguments when no rule matches.
    """
    if back in ['ㄴ', 'ㄹ', 'ㅂ']:
        return ruleOfConnectedBadchim(front, back)

    # Checked in order; the first equal entry wins.
    routes = (
        ('었', ruleOfEot),
        ('았', ruleOfAt),
        ('시', ruleOfSi),
        ('였', ruleOfYeot),
        ('아', ruleOfAh),
        ('어', ruleOfEo),
        ('여', ruleOfYeo),
        ('은', ruleOfEun),
        ('는', ruleOfNeun),
        ('으', ruleOfNEu),
    )
    for trigger, rule in routes:
        if back == trigger:
            return rule(front, back)

    return decomposition(front), decomposition(back)
319 | + | ||
320 | +# special cases | ||
# Whole first morphemes that need special handling ...
fullMorpheme = ['깨닫', '따르']
# ... when the following morpheme starts with one of these characters.
backCase = ['아', '았', '으']


def isSpecial(front, back):
    """Return True when *front* is in ``fullMorpheme`` and *back* is in ``backCase``."""
    return front in fullMorpheme and back in backCase
328 | + | ||
def specialRuleSet(front, back):
    """Apply the rules for the special morphemes ending in '닫' or '르'.

    Returns:
        tuple: ``(front_jamo, back_jamo)``; for any other *front* both
        arguments are returned as decomposed.
    """
    stem = decomposition(front)
    ending = decomposition(back)

    if front == '닫':
        # The last jamo of the front is replaced by 'ㄹ'.
        return stem[:-1] + 'ㄹ', ending
    if front == '르':
        # The front is absorbed; the back's first jamo becomes 'ㄹ'.
        return '', 'ㄹ' + ending[1:]
    return stem, ending
338 | + | ||
def specialChunkToChange(sentence):
    """Replace every '셔요' and '시어요' in *sentence* with '세요'.

    Both forms are always replaced, so a sentence that contains each of
    them is fully normalised rather than only having the first form fixed.

    Args:
        sentence: Composed text to post-process.

    Returns:
        str: *sentence* with both chunks replaced; unchanged if neither
        occurs.
    """
    return sentence.replace('셔요', '세요').replace('시어요', '세요')
346 | + | ||
def composer(morphemeList):
    """Merge a list of morphemes into a single composed string.

    Each pair of neighbouring morphemes is passed through the merge rules;
    the rewritten jamo are then joined, recomposed and post-processed by
    ``specialChunkToChange``.

    Example inputs:
        ['알리', '어드리', 'ㄹ_것이', 'ㅂ니다']
        ['뜨겁', '은데요']

    Args:
        morphemeList: Sequence of morpheme strings.  It is not modified.

    Returns:
        The value produced by ``reconstructor`` for the merged text.
    """
    # Work on a copy so the caller's list is not rewritten in place.
    morphemes = list(morphemeList)

    # Number of trailing characters of the current morpheme handed to the
    # rules as "front".  It starts at one character; after a one-character
    # morpheme has been replaced by its jamo, the whole replacement is used.
    final = 1

    # Only the junction between neighbouring morphemes is examined.
    for i in range(len(morphemes) - 1):
        front = morphemes[i][-final:]
        back = morphemes[i + 1][0]
        lenBack = len(morphemes[i + 1])

        # The special-case table applies to the first morpheme only.
        if i == 0 and isSpecial(morphemes[i], back):
            dF, dB = specialRuleSet(front, back)
        else:
            dF, dB = ruleSet(front, back)

        # dF / dB are the jamo-level front and back returned by the rule.
        morphemes[i] = morphemes[i][:-final] + dF
        morphemes[i + 1] = dB + morphemes[i + 1][1:]

        final = len(dB) if lenBack == 1 else 1

    construct = reconstructor(''.join(morphemes))
    result = specialChunkToChange(construct)
    # Passed through reconstructor once more after the chunk replacement.
    return reconstructor(result)
code/seq2seqImprovedEmbedding.ipynb
0 → 100644
This diff could not be displayed because it is too large.
No preview for this file type
-
Please register or login to post a comment