이예준

FastText + seq2seq model & Morpheme merger

No preview for this file type
No preview for this file type
1 +# need pip install jamo
2 +import sys, re
3 +from jamo import h2j, j2hcj
4 +from seq2seq.merger.unicode import join_jamos
5 +
def isHangul(text):
    """Return True if *text* contains at least one Hangul character.

    A character counts as Hangul when it lies in U+3130-U+318F or in
    U+AC00-U+D7A3, the two ranges matched by the pattern below.

    Args:
        text: A text string, or UTF-8 encoded bytes (which includes the
            Python 2 ``str`` type).

    Returns:
        bool: True when at least one character falls in those ranges.
    """
    # Byte strings must be decoded before matching against a text pattern.
    # On Python 2 ``bytes`` is ``str``, so this also covers the old 2.x path
    # without referring to the ``unicode`` name, which Python 3 lacks.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # One match is enough; there is no need to collect every run.
    return re.search(u'[\u3130-\u318F\uAC00-\uD7A3]', text) is not None
20 +
# jamo-level
def decomposition(sentence):
    """Decompose *sentence* into a jamo-level string.

    The text is passed through ``h2j`` and then ``j2hcj``.  If the result
    contains an alphabetic character that is not Hangul, the string is
    rewritten around the first such character:

    * everything before it is kept, except the single character directly
      in front of it (presumably a separator - TODO confirm with callers);
    * everything from it onwards is kept with all whitespace removed.

    Args:
        sentence: Text to decompose.

    Returns:
        str: The jamo-level string.
    """
    sentence = j2hcj(h2j(sentence))

    # Position of the first alphabetic, non-Hangul character (None if absent).
    firstForeign = next(
        (pos for pos, ch in enumerate(sentence)
         if not isHangul(ch) and ch.isalpha()),
        None
    )
    if firstForeign is None:
        return sentence

    # Clamp at 0: with the foreign character at index 0 the unclamped slice
    # would be ``sentence[:-1]`` and duplicate almost the whole string.
    head = sentence[:max(firstForeign - 1, 0)]
    tail = ''.join(sentence[firstForeign:].split())
    return head + tail
35 +
def reconstructor(decom):
    """Concatenate the pieces in *decom* and pass the result to ``join_jamos``.

    Args:
        decom: A string, or an iterable of strings, of jamo-level text.

    Returns:
        Whatever ``join_jamos`` returns for the concatenated text.
    """
    flattened = ''.join(decom)
    return join_jamos(flattened)
38 +
'''
    This section defines the contraction rules.
    Each rule returns two values: the rewritten front part and the rewritten back part.
    The second value is kept separate so that complete Hangul syllables can be generated:
    it becomes the front part on the next iteration of the for-loop in the 'composer' function below.
'''
45 +
# rule about '었'
def ruleOfEot(front, back):
    """Apply the contraction rules for a front part followed by '었'.

    Both arguments are decomposed into jamo first.

    Args:
        front: The text in front of the boundary.
        back: The first character of the following morpheme.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.  ``newFront`` is
        ``''`` when the front has been merged into ``newBack``.  When no
        rule matches, both decomposed parts are returned unchanged.
    """
    deFront = decomposition(front)
    deBack = decomposition(back)

    # Nothing to contract; the positional lookups below need two jamo and
    # would otherwise raise IndexError.
    if len(deFront) < 2:
        return deFront, deBack

    if len(deFront) == 3:
        # Three jamo: treated as onset, vowel and final consonant (batchim).
        # 어떻었다 ㄸㅓㅎ ㅇㅓㅆ => 어땠다
        if deFront[1:] == 'ㅓㅎ':
            return '', deFront[0] + 'ㅐ' + deBack[-1]
        # Irregular 'ㄷ' batchim of '듣', '싣', '긷', '눋', '붇', '묻':
        # the final 'ㄷ' is replaced by 'ㄹ'.
        if deFront in ('ㄷㅡㄷ', 'ㅅㅣㄷ', 'ㄱㅣㄷ', 'ㄴㅜㄷ', 'ㅂㅜㄷ', 'ㅁㅜㄷ'):
            return deFront[:-1] + 'ㄹ', deBack
        # '업', '접' keep their batchim (no dropping).
        if deFront in ('ㅇㅓㅂ', 'ㅈㅓㅂ'):
            return deFront, deBack
        # Any other final 'ㅂ' is dropped and the vowel of the back part
        # becomes 'ㅝ'.  뜨겁었다 ㄱㅓㅂ ㅇㅓㅆ
        if deFront[2] == 'ㅂ':
            return deFront[:2], deBack[0] + 'ㅝ' + deBack[-1]
        # Irregular 'ㅅ' batchim of '긋', '젓', '잇', '짓': the 'ㅅ' is dropped.
        if deFront in ('ㄱㅡㅅ', 'ㅈㅓㅅ', 'ㅇㅣㅅ', 'ㅈㅣㅅ'):
            return deFront[:-1], deBack
        return deFront, deBack

    # Any other length: the front is merged into the back part where a
    # rule matches.  Order matters - the checks on index 1 come first.
    if deFront[1] == 'ㅜ':
        return '', deFront[0] + 'ㅝ' + deBack[-1]
    if deFront[1] == 'ㅣ':
        return '', deFront[0] + 'ㅕ' + deBack[-1]
    if deFront[1] == 'ㅐ':
        return '', deFront + deBack[-1]
    if deFront[-2:] == 'ㄹㅡ':
        # The 'ㄹ' stays in front and a second 'ㄹ' replaces the back onset.
        return deFront[-2], 'ㄹ' + deBack[1:]
    if deFront[-2:] == 'ㄹㅓ':
        return '', 'ㄹㅐ' + deBack[-1]
    if deFront[-1] == 'ㅡ':
        # The 'ㅡ' is dropped and the front onset replaces the back onset.
        return '', deFront[-2] + deBack[1:]
    return deFront, deBack
95 +
# rule about '았'
def ruleOfAt(front, back):
    """Apply the contraction rules for a front part followed by '았'.

    Both arguments are decomposed into jamo first.

    Args:
        front: The text in front of the boundary.
        back: The first character of the following morpheme.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.  ``newFront`` is
        ``''`` when the front has been merged into ``newBack``.  When no
        rule matches, both decomposed parts are returned unchanged.
    """
    deFront = decomposition(front)
    deBack = decomposition(back)

    # Nothing to contract; the positional lookups below need two jamo and
    # would otherwise raise IndexError.
    if len(deFront) < 2:
        return deFront, deBack

    if len(deFront) == 3:
        # Irregular 'ㅅ' batchim of '낫', '잣': the 'ㅅ' is dropped.
        if deFront in ('ㄴㅏㅅ', 'ㅈㅏㅅ'):
            return deFront[:-1], deBack
        return deFront, deBack

    if deFront[1] == 'ㅏ':
        # Same vowel: the front absorbs the final consonant of the back.
        return '', deFront + deBack[-1]
    if deFront[-1] == 'ㅗ':
        return '', deFront[-2] + 'ㅘ' + deBack[-1]
    if deFront[-2:] == 'ㄹㅡ':
        # The 'ㄹ' stays in front and a second 'ㄹ' replaces the back onset.
        return deFront[-2], 'ㄹ' + deBack[1:]
    if deFront[-1] == 'ㅡ':
        # The 'ㅡ' is dropped and the front onset replaces the back onset.
        return '', deFront[-2] + deBack[1:]
    return deFront, deBack
117 +
# rule about '시'
def ruleOfSi(front, back):
    """Apply the rule for a front part followed by '시'.

    When the decomposed front has exactly three jamo and ends in 'ㅎ' or
    'ㄹ', that last jamo is dropped; otherwise the front is kept as is.
    The decomposed back part is always returned unchanged.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.
    """
    stemJamo = decomposition(front)
    endingJamo = decomposition(back)

    dropsFinal = len(stemJamo) == 3 and stemJamo[-1] in ('ㅎ', 'ㄹ')
    if dropsFinal:
        stemJamo = stemJamo[:2]
    return stemJamo, endingJamo
129 +
# rule about '였'
def ruleOfYeot(front, back):
    """Apply the contraction rules for a front part followed by '였'.

    Both arguments are decomposed into jamo first.

    Args:
        front: The text in front of the boundary.
        back: The first character of the following morpheme.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.  When no rule
        matches, both decomposed parts are returned unchanged.
    """
    deFront = decomposition(front)
    deBack = decomposition(back)

    # The rules inspect the last two jamo; a shorter front has nothing to
    # contract and would otherwise raise IndexError.
    if len(deFront) < 2:
        return deFront, deBack

    if deFront[-2:] == 'ㅎㅏ':
        # The front is merged into the back: 'ㅎ' + 'ㅐ' + final of the back.
        return '', deFront[-2] + 'ㅐ' + deBack[-1]
    if deFront[-2:] == 'ㄷㅐ':
        return 'ㄷㅏ', 'ㅎㅐㅆ'
    return deFront, deBack
140 +
# rule about '아'
def ruleOfAh(front, back):
    """Apply the contraction rules for a front part followed by '아'.

    Both arguments are decomposed into jamo first.

    Args:
        front: The text in front of the boundary.
        back: The first character of the following morpheme.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.  ``newFront`` is
        ``''`` when the front has been merged into ``newBack``.  When no
        rule matches, both decomposed parts are returned unchanged.
    """
    deFront = decomposition(front)
    deBack = decomposition(back)

    # Nothing to contract; the positional lookups below need two jamo and
    # would otherwise raise IndexError.
    if len(deFront) < 2:
        return deFront, deBack

    if len(deFront) == 3:
        # 'ㄷㅗㅂ': the 'ㅂ' is dropped and the back vowel becomes 'ㅘ'.
        if deFront == 'ㄷㅗㅂ':
            return deFront[:-1], deBack[0] + 'ㅘ'
        # Irregular 'ㅅ' batchim of '낫', '잣': the 'ㅅ' is dropped.
        if deFront in ('ㄴㅏㅅ', 'ㅈㅏㅅ'):
            return deFront[:-1], deBack
        return deFront, deBack

    # Order matters: 'ㅎㅏ' must be tested before the general 'ㅏ' case and
    # 'ㄷㅗ' before the general 'ㅗ' case.
    if deFront[-2:] == 'ㅎㅏ':
        return '', deFront[0] + 'ㅐ'
    if deFront[-1] == 'ㅏ':
        return '', deFront
    if deFront[-1] == 'ㅐ':
        return '', deFront
    if deFront[-2:] == 'ㄹㅡ':
        return deFront[0], 'ㄹ' + deBack[-1]
    if deFront[-2:] == 'ㄷㅗ':
        return deFront, deBack[0] + 'ㅘ'
    if deFront[-1] == 'ㅗ':
        return '', deFront[-2] + 'ㅘ'
    return deFront, deBack
168 +
# rule about '어'
def ruleOfEo(front, back):
    """Apply the contraction rules for a front part followed by '어'.

    Both arguments are decomposed into jamo first.

    Args:
        front: The text in front of the boundary.
        back: The first character of the following morpheme.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.  ``newFront`` is
        ``''`` when the front has been merged into ``newBack``.  When no
        rule matches, both decomposed parts are returned unchanged.
    """
    deFront = decomposition(front)
    deBack = decomposition(back)

    # Nothing to contract; the positional lookups below need two jamo and
    # would otherwise raise IndexError.
    if len(deFront) < 2:
        return deFront, deBack

    if len(deFront) == 3:
        # '업', '접' keep their batchim (no dropping).
        if deFront in ('ㅇㅓㅂ', 'ㅈㅓㅂ'):
            return deFront, deBack
        # Any other final 'ㅂ' is dropped and the back vowel becomes 'ㅝ'.
        if deFront[-1] == 'ㅂ':
            return deFront[:2], deBack[0] + 'ㅝ'
        # Final 'ㅎ': the front is merged into onset + 'ㅐ'.
        if deFront[-1] == 'ㅎ':
            return '', deFront[-3] + 'ㅐ'
        # Irregular 'ㄷ' batchim of '듣', '싣', '긷', '눋', '붇', '묻':
        # the final 'ㄷ' is replaced by 'ㄹ'.
        if deFront in ('ㄷㅡㄷ', 'ㅅㅣㄷ', 'ㄱㅣㄷ', 'ㄴㅜㄷ', 'ㅂㅜㄷ', 'ㅁㅜㄷ'):
            return deFront[:-1] + 'ㄹ', deBack
        # Irregular 'ㅅ' batchim of '긋', '젓', '잇', '짓': the 'ㅅ' is dropped.
        if deFront in ('ㄱㅡㅅ', 'ㅈㅓㅅ', 'ㅇㅣㅅ', 'ㅈㅣㅅ'):
            return deFront[:-1], deBack
        return deFront, deBack

    # Order matters: 'ㅂㅣ' is exempted before the general 'ㅣ' case, and
    # 'ㄹㅡ' is handled before the general 'ㅡ' case.
    if deFront[-1] == 'ㅜ':
        return '', deFront[0] + 'ㅝ'
    if deFront[-2:] == 'ㅂㅣ':
        return deFront, deBack
    if deFront[1] == 'ㅣ':
        return '', deFront[0] + 'ㅕ'
    if deFront[-2:] == 'ㄹㅓ':
        return '', deFront[-2] + 'ㅐ'
    if deFront[-2:] == 'ㄹㅡ':
        return deFront[-2], deFront[-2] + deBack[-1]
    if deFront[-1] == 'ㅡ':
        return '', deFront[-2] + deBack[-1]
    return deFront, deBack
216 +
# rule about '여'
def ruleOfYeo(front, back):
    """Apply the rule for a front part followed by '여'.

    Only the front '하' is rewritten: it is merged into the back part as
    its first jamo followed by 'ㅐ'.  Any other front is returned
    decomposed but otherwise unchanged.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.
    """
    stemJamo = decomposition(front)
    endingJamo = decomposition(back)

    if front != '하':
        return stemJamo, endingJamo
    return '', stemJamo[0] + 'ㅐ'
225 +
# rule about 'ㄴ', 'ㄹ', 'ㅂ'
def ruleOfConnectedBadchim(front, back):
    """Apply the rule for a front part followed by 'ㄴ', 'ㄹ' or 'ㅂ'.

    A decomposed front of exactly three jamo loses its last jamo; any
    other front is kept.  The decomposed back part is returned unchanged.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.
    """
    stemJamo = decomposition(front)
    endingJamo = decomposition(back)

    newFront = stemJamo[:-1] if len(stemJamo) == 3 else stemJamo
    return newFront, endingJamo
234 +
# rule about '은'
def ruleOfEun(front, back):
    """Apply the rules for a front part followed by '은'.

    Only a decomposed front of exactly three jamo is rewritten:

    * final 'ㅅ' is dropped;
    * final 'ㅂ' is dropped and the back vowel becomes 'ㅜ';
    * the irregular 'ㄷ' batchim of '듣', '싣', '긷', '눋', '붇', '묻'
      is replaced by 'ㄹ'.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings; both decomposed
        parts unchanged when no rule matches.
    """
    stemJamo = decomposition(front)
    endingJamo = decomposition(back)

    if len(stemJamo) != 3:
        return stemJamo, endingJamo

    coda = stemJamo[-1]
    if coda == 'ㅅ':
        return stemJamo[:-1], endingJamo
    if coda == 'ㅂ':
        return stemJamo[:-1], endingJamo[0] + 'ㅜ' + endingJamo[-1]
    if stemJamo in ('ㄷㅡㄷ', 'ㅅㅣㄷ', 'ㄱㅣㄷ', 'ㄴㅜㄷ', 'ㅂㅜㄷ', 'ㅁㅜㄷ'):
        return stemJamo[:-1] + 'ㄹ', endingJamo
    return stemJamo, endingJamo
257 +
# rule about '는'
def ruleOfNeun(front, back):
    """Apply the rule for a front part followed by '는'.

    A decomposed front of exactly three jamo ending in 'ㄹ' loses that
    'ㄹ'; any other front is kept.  The decomposed back part is returned
    unchanged.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.
    """
    stemJamo = decomposition(front)
    endingJamo = decomposition(back)

    dropsRieul = len(stemJamo) == 3 and stemJamo[-1] == 'ㄹ'
    if dropsRieul:
        return stemJamo[:-1], endingJamo
    return stemJamo, endingJamo
266 +
# rule about '으'
def ruleOfNEu(front, back):
    """Apply the rules for a front part followed by '으'.

    Only a decomposed front of exactly three jamo is rewritten:

    * any final 'ㄷ' is replaced by 'ㄹ';
    * the irregular 'ㅅ' batchim of '긋', '젓', '낫', '잣', '잇', '짓'
      is dropped.

    The decomposed back part is always returned unchanged.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.
    """
    stemJamo = decomposition(front)
    endingJamo = decomposition(back)

    if len(stemJamo) != 3:
        return stemJamo, endingJamo

    if stemJamo[-1] == 'ㄷ':
        return stemJamo[:-1] + 'ㄹ', endingJamo
    if stemJamo in ('ㄱㅡㅅ', 'ㅈㅓㅅ', 'ㄴㅏㅅ', 'ㅈㅏㅅ', 'ㅇㅣㅅ', 'ㅈㅣㅅ'):
        return stemJamo[:-1], endingJamo
    return stemJamo, endingJamo
290 +
# router of rule
def ruleSet(front, back):
    """Route a (front, back) pair to the rule function selected by *back*.

    'ㄴ', 'ㄹ' and 'ㅂ' share one rule; each of '었', '았', '시', '였',
    '아', '어', '여', '은', '는' and '으' has its own.  For any other
    *back*, both parts are simply decomposed and returned.

    Returns:
        tuple: ``(newFront, newBack)`` as produced by the selected rule.
    """
    if back in ['ㄴ', 'ㄹ', 'ㅂ']:
        return ruleOfConnectedBadchim(front, back)

    # Checked in order with ``==``, exactly like an if/elif chain.
    routes = (
        ('었', ruleOfEot),
        ('았', ruleOfAt),
        ('시', ruleOfSi),
        ('였', ruleOfYeot),
        ('아', ruleOfAh),
        ('어', ruleOfEo),
        ('여', ruleOfYeo),
        ('은', ruleOfEun),
        ('는', ruleOfNeun),
        ('으', ruleOfNEu),
    )
    for marker, rule in routes:
        if back == marker:
            return rule(front, back)

    return decomposition(front), decomposition(back)
319 +
# special cases
# Whole morphemes that get special treatment, and the back characters
# that trigger it.
fullMorpheme = ['깨닫', '따르']
backCase = ['아', '았', '으']


def isSpecial(front, back):
    """Return True when the pair must be handled by the special rule set.

    Args:
        front: The complete front morpheme (not just its last character).
        back: The first character of the following morpheme.

    Returns:
        bool: True when *front* is listed in ``fullMorpheme`` and *back*
        is listed in ``backCase``.
    """
    return front in fullMorpheme and back in backCase
328 +
def specialRuleSet(front, back):
    """Apply the special-case rules selected by the front character.

    * front '닫': the last jamo of the decomposed front is replaced by 'ㄹ'.
    * front '르': the front is merged into the back, whose first jamo is
      replaced by 'ㄹ'.
    * anything else: both parts are returned decomposed but unchanged.

    Returns:
        tuple: ``(newFront, newBack)`` as jamo strings.
    """
    stemJamo = decomposition(front)
    endingJamo = decomposition(back)

    if front == '닫':
        return stemJamo[:-1] + 'ㄹ', endingJamo
    if front == '르':
        return '', 'ㄹ' + endingJamo[1:]
    return stemJamo, endingJamo
338 +
def specialChunkToChange(sentence):
    """Rewrite the chunks '셔요' and '시어요' in *sentence* to '세요'.

    Both replacements are always applied, so a sentence that contains
    both chunks has every occurrence of each rewritten.

    Args:
        sentence: The composed text.

    Returns:
        str: The text with every '셔요' and '시어요' replaced by '세요'.
    """
    # ``str.replace`` is a no-op when the chunk is absent, so no membership
    # test is needed first.
    return sentence.replace('셔요', '세요').replace('시어요', '세요')
346 +
def composer(morphemeList):
    """Merge a list of morphemes into one composed string.

    Each boundary between neighbouring morphemes is passed through the
    rule functions, which return jamo-level replacements for the two
    sides.  The pieces are then joined, recomposed with ``reconstructor``
    and post-processed by ``specialChunkToChange``.

    Example inputs::

        ['알리', '어드리', 'ㄹ_것이', 'ㅂ니다']
        ['뜨겁', '은데요']

    Args:
        morphemeList: Sequence of morpheme strings.  It is not modified.

    Returns:
        The value returned by ``reconstructor`` for the merged text.
    """
    # Work on a private copy so the caller's list is not rewritten into
    # jamo fragments.  Empty morphemes contribute no boundary and would
    # break the ``[0]`` lookup below, so they are skipped.
    pieces = [morpheme for morpheme in morphemeList if morpheme]

    # Number of trailing characters of the current piece that form the
    # "front" of the boundary.  It starts at one character; once a piece
    # has been replaced by its jamo form, that whole jamo run is the front.
    final = 1

    # Check every boundary between neighbouring morphemes.
    for i in range(len(pieces) - 1):
        front = pieces[i][-final:]
        back = pieces[i + 1][0]
        lenBack = len(pieces[i + 1])

        # The special rules apply only to the first morpheme, matched whole.
        if i == 0 and isSpecial(pieces[i], back):
            dF, dB = specialRuleSet(front, back)
        else:
            dF, dB = ruleSet(front, back)

        # dF / dB: decomposed (jamo-level) front and back.
        pieces[i] = pieces[i][:-final] + dF
        pieces[i + 1] = dB + pieces[i + 1][1:]

        # A one-character back morpheme now consists only of dB, so all of
        # it becomes the front of the next boundary.
        final = len(dB) if lenBack == 1 else 1

    construct = reconstructor(''.join(pieces))
    result = specialChunkToChange(construct)
    return reconstructor(result)
This diff could not be displayed because it is too large.
No preview for this file type