신은섭(Shin Eun Seop)

final test1

1 +movie2/corpus/*
No preview for this file type
1 +# -*- coding: utf-8 -*-
2 +
3 +"""
4 +Copyright 2018 NAVER Corp.
5 +
6 +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
7 +associated documentation files (the "Software"), to deal in the Software without restriction, including
8 +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
10 +the following conditions:
11 +
12 +The above copyright notice and this permission notice shall be included in all copies or substantial
13 +portions of the Software.
14 +
15 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
17 +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18 +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
19 +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
20 +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 +"""
22 +
23 +import os
24 +
25 +import numpy as np
26 +
27 +from kor_char_parser import decompose_str_as_one_hot
28 +
29 +
class MovieReviewDataset:
    """
    Loads the movie-review training data and serves it by index.

    Indexing returns a ``(review, label, sequence_length)`` triple; slicing
    returns the corresponding slices of all three containers.
    """

    def __init__(self, dataset_path: str, max_length: int):
        """
        Read the review and label files and preprocess the reviews.

        :param dataset_path: dataset root path; the files are read from
            ``<dataset_path>/train/train_data`` and ``<dataset_path>/train/train_label``
        :param max_length: maximum (padded) length of an encoded review
        """
        # Paths of the review text file and the label file.
        data_review = os.path.join(dataset_path, 'train', 'train_data')
        data_label = os.path.join(dataset_path, 'train', 'train_label')

        # Read the raw reviews (one per line, trailing newline kept).
        with open(data_review, 'rt', encoding='utf-8') as f:
            raw_set = f.readlines()

        # Per-review length, bounded so it can be used as a time index into a
        # tensor that has exactly ``max_length`` steps.
        self.sequence = self._sequence_lengths(raw_set, max_length)

        self.reviews = preprocess(raw_set, max_length)

        # Read the labels, one float per line.
        with open(data_label) as f:
            self.labels = [np.float32(x) for x in f.readlines()]

    @staticmethod
    def _sequence_lengths(lines, max_length):
        """
        Return the raw character count of every line, clamped to ``[1, max_length]``.

        The upper bound prevents lengths that exceed the padded width; the
        lower bound keeps ``length - 1`` a valid index for empty lines.
        NOTE(review): the count is in raw characters (newline included), not
        in encoded jamo indices, which matches how lengths were computed before.
        """
        return [max(1, min(len(line), max_length)) for line in lines]

    def __len__(self):
        """
        :return: the total number of reviews
        """
        return len(self.reviews)

    def __getitem__(self, idx):
        """
        :param idx: index (or slice) of the requested data
        :return: the review(s), label(s) and sequence length(s) at ``idx``
        """
        return self.reviews[idx], self.labels[idx], self.sequence[idx]
74 +
def preprocess(data: list, max_length: int):
    """
    Convert raw strings into a fixed-width integer matrix.

    Each string is encoded with ``decompose_str_as_one_hot``; encodings longer
    than ``max_length`` are truncated and shorter ones are right-padded with 0.

    :param data: list of strings ([str1, str2, ...])
    :param max_length: width of every output row
    :return: int32 array of shape ``(len(data), max_length)``
    """
    encoded = [decompose_str_as_one_hot(text, warning=False) for text in data]
    padded = np.zeros((len(data), max_length), dtype=np.int32)
    for row, indices in enumerate(encoded):
        # Number of leading positions that actually receive data.
        keep = min(len(indices), max_length)
        padded[row, :keep] = np.array(indices)[:keep]
    return padded
1 +# -*- coding: utf-8 -*-
2 +
3 +"""
4 +Copyright 2018 NAVER Corp.
5 +
6 +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
7 +associated documentation files (the "Software"), to deal in the Software without restriction, including
8 +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
10 +the following conditions:
11 +
12 +The above copyright notice and this permission notice shall be included in all copies or substantial
13 +portions of the Software.
14 +
15 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
17 +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18 +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
19 +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
20 +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 +"""
22 +
# Initial consonants (19 entries).
cho = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
# Medial vowels (21 entries).
jung = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"
# Final consonants (27 entries); compound finals are two-character strings.
jong = [
    "ㄱ", "ㄲ", "ㄱㅅ", "ㄴ", "ㄴㅈ", "ㄴㅎ", "ㄷ", "ㄹ", "ㄹㄱ",
    "ㄹㅁ", "ㄹㅂ", "ㄹㅅ", "ㄹㅌ", "ㄹㅍ", "ㄹㅎ", "ㅁ", "ㅂ", "ㅂㅅ",
    "ㅅ", "ㅆ", "ㅇ", "ㅈ", "ㅊ", "ㅋ", "ㅌ", "ㅍ", "ㅎ",
]
# All jamo above concatenated into one string (used for membership checks).
test = ''.join([cho, jung] + jong)

# Number of distinct index slots for decomposed syllables: 19 + 21 + 27 = 67.
hangul_length = sum(map(len, (cho, jung, jong)))
31 +
32 +
def is_valid_decomposition_atom(x):
    """Return True when ``x`` occurs as a substring of the concatenated jamo table ``test``."""
    found = x in test
    return found
35 +
36 +
def decompose(x):
    """
    Decompose one code point into its jamo string.

    :param x: integer code point
    :return: ``chr(x)`` unchanged when ``x`` is outside the range '가'..'힣';
        otherwise the initial + medial (+ final, if any) jamo as a string
    """
    first_syllable, last_syllable = ord('가'), ord('힣')
    if not (first_syllable <= x <= last_syllable):
        return chr(x)

    # Each initial/medial pair spans 28 finals; each initial spans 21 medials.
    pair_index, z = divmod(x - first_syllable, 28)
    lead, vowel = divmod(pair_index, 21)
    # z == 0 means "no final"; finals are therefore stored at index z - 1.
    tail = '' if z <= 0 else jong[z - 1]
    if lead >= len(cho):
        print('Unknown Exception: ', x, chr(x), lead, vowel, z, tail)
    return cho[lead] + jung[vowel] + tail
51 +
52 +
def decompose_as_one_hot(in_char, warning=True):
    """
    Map one code point to a list of integer indices.

    Index layout (``hangul_length`` is 67):
      * 0..66    decomposed syllable jamo (initial, medial, final)
      * 67..194  code points below 128, offset by ``hangul_length``
      * 195..245 the 51 standalone jamo 'ㄱ'..'ㅣ'
      * 246..249 the symbols ♡, ♥, ★, ☆ (in that order)
      * 250      any other character

    :param in_char: integer code point
    :param warning: when True, print a message for unhandled characters
    :return: two or three indices for a syllable in '가'..'힣', else one index
    """
    if ord('가') <= in_char <= ord('힣'):  # 44032..55203
        pair_index, z = divmod(in_char - 44032, 28)
        x, y = divmod(pair_index, 21)
        # z == 0 means "no final"; finals are stored at index z - 1.
        zz = jong[z - 1] if z > 0 else ''
        if x >= len(cho) and warning:
            print('Unknown Exception: ', in_char,
                  chr(in_char), x, y, z, zz)

        one_hot = [x, len(cho) + y]
        if z > 0:
            one_hot.append(len(cho) + len(jung) + (z - 1))
        return one_hot

    jamo_base = hangul_length + 128
    symbol_base = hangul_length + 128 + 51
    symbol_offsets = {ord('♡'): 0, ord('♥'): 1, ord('★'): 2, ord('☆'): 3}

    if in_char < 128:
        return [hangul_length + in_char]
    if ord('ㄱ') <= in_char <= ord('ㅣ'):  # 12593..12643
        return [jamo_base + (in_char - 12593)]
    if in_char in symbol_offsets:
        return [symbol_base + symbol_offsets[in_char]]

    if warning:
        print('Unhandled character:', chr(in_char), in_char)
    # Shared slot for every unknown character.
    return [symbol_base + 4]
97 +
98 +
def decompose_str(string):
    """Return ``string`` with every character replaced by its ``decompose`` result."""
    return ''.join(map(decompose, map(ord, string)))
101 +
102 +
def decompose_str_as_one_hot(string, warning=True):
    """
    Encode a string as one flat list of integer indices.

    :param string: text to encode
    :param warning: forwarded to ``decompose_as_one_hot``
    :return: concatenation of the per-character index lists, in order
    """
    return [
        index
        for char in string
        for index in decompose_as_one_hot(ord(char), warning=warning)
    ]
1 +# -*- coding: utf-8 -*-
2 +
3 +"""
4 +Copyright 2018 NAVER Corp.
5 +
6 +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
7 +associated documentation files (the "Software"), to deal in the Software without restriction, including
8 +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
10 +the following conditions:
11 +
12 +The above copyright notice and this permission notice shall be included in all copies or substantial
13 +portions of the Software.
14 +
15 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
17 +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18 +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
19 +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
20 +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 +"""
22 +
23 +import argparse
24 +import os
25 +
26 +import numpy as np
27 +
28 +import tensorflow as tf
29 +
30 +import nsml
31 +from dataset import MovieReviewDataset, preprocess
32 +from nsml import DATASET_PATH, HAS_DATASET, GPU_NUM, IS_ON_NSML
33 +
34 +
35 +# DONOTCHANGE: They are reserved for nsml
36 +# This is for nsml leaderboard
def bind_model(sess, config):
    """
    Register the save / load / infer callbacks with nsml.

    :param sess: TensorFlow session that holds the model variables
    :param config: parsed command-line options; ``config.strmaxlen`` is read by ``infer``
    """

    def save(dir_name, *args):
        """Write a checkpoint of the session to ``<dir_name>/model``."""
        os.makedirs(dir_name, exist_ok=True)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(dir_name, 'model'))

    def load(dir_name, *args):
        """
        Restore the session from the checkpoint found in ``dir_name``.

        :raises NotImplementedError: when no checkpoint is found
        """
        saver = tf.train.Saver()
        # Find the checkpoint.
        ckpt = tf.train.get_checkpoint_state(dir_name)
        if ckpt and ckpt.model_checkpoint_path:
            checkpoint = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(dir_name, checkpoint))
        else:
            # ``NotImplemented`` is a constant, not an exception class; calling
            # it raised a TypeError instead of the intended error.
            raise NotImplementedError('No checkpoint!')
        print('Model loaded')

    def infer(raw_data, **kwargs):
        """
        Predict a score for every raw input string.

        :param raw_data: sequence of raw review strings
        :param kwargs: unused
        :return: list of ``(0.0, prediction)`` pairs, one per input
        """
        # Per-review length, clamped to [1, strmaxlen] so that ``length - 1``
        # is always a valid time index into the padded input.
        sequence = [max(1, min(len(review), config.strmaxlen)) for review in raw_data]

        # Convert the strings to index vectors with dataset.preprocess.
        preprocessed_data = preprocess(raw_data, config.strmaxlen)
        # Run the model on the inputs (dropout disabled via keep_prob = 1.0).
        pred = sess.run(output_prediction,
                        feed_dict={x: preprocessed_data, keep_prob: 1.0, sequence_list: sequence})
        # Flatten with NumPy; building a tf.reshape op here would add a new
        # node to the graph on every call.
        point = np.reshape(pred, [len(pred)])
        # DONOTCHANGE: They are reserved for nsml
        # The result must have the form [(confidence interval, point)]; the
        # confidence-interval value does not affect the leaderboard result.
        return list(zip(np.zeros(point.shape[0]), point))

    # DONOTCHANGE: They are reserved for nsml
    # Makes the functions above reachable from nsml.
    nsml.bind(save=save, load=load, infer=infer)
81 +
82 +
83 +def _batch_loader(iterable, n=1):
84 + """
85 + 데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다
86 +
87 + :param iterable: 데이터 list, 혹은 다른 포맷
88 + :param n: 배치 사이즈
89 + :return:
90 + """
91 + length = len(iterable)
92 + for n_idx in range(0, length, n):
93 + yield iterable[n_idx:min(n_idx + n, length)]
94 +
95 +
def weight_variable(shape):
    """Return a tf.Variable of the given shape initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
99 +
100 +
def bias_variable(shape):
    """Return a tf.Variable of the given shape with every element set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
104 +
def lstm_cell(num_units, keep_prob):
    """
    Build one BasicLSTMCell with softsign activation, wrapped with output dropout.

    :param num_units: number of units in the LSTM cell
    :param keep_prob: keep probability applied to the cell output
    :return: the dropout-wrapped cell
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units, activation=tf.nn.softsign)
    return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)
109 +
110 +
# Script entry point: builds a stacked bidirectional LSTM regressor, then trains
# it or runs a local inference pass. Everything stays at module level because
# the ``infer`` callback reads the graph tensors (x, keep_prob, ...) as globals.
if __name__ == '__main__':
    args = argparse.ArgumentParser()
    # DONOTCHANGE: They are reserved for nsml
    args.add_argument('--mode', type=str, default='train')
    args.add_argument('--pause', type=int, default=0)
    args.add_argument('--iteration', type=str, default='0')

    # User options
    args.add_argument('--output', type=int, default=1)
    args.add_argument('--epochs', type=int, default=60)
    args.add_argument('--batch', type=int, default=2000)
    args.add_argument('--strmaxlen', type=int, default=200)
    args.add_argument('--embedding', type=int, default=64)
    config = args.parse_args()

    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
        DATASET_PATH = '../sample_data/movie_review/'

    # Model specification
    output_dim = 1
    hidden_dim = 128
    stack_num = 3
    learning_rate = 0.0001
    # Size of the embedding table.
    # NOTE(review): presumably the number of indices kor_char_parser can emit — confirm.
    character_size = 251

    # Placeholders: encoded reviews, targets, dropout keep probability and
    # per-review sequence lengths.
    x = tf.placeholder(tf.int32, [None, config.strmaxlen])
    y_ = tf.placeholder(tf.float32, [None, output_dim])
    keep_prob = tf.placeholder(tf.float32)
    sequence_list = tf.placeholder(tf.int32, [None])

    # Embedding
    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
    embedded = tf.nn.embedding_lookup(char_embedding, x)

    # LSTM layers: stack_num forward and backward cells.
    cells_fw = [lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)]
    cells_bw = [lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)]

    output, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
        cells_fw=cells_fw,
        cells_bw=cells_bw,
        inputs=embedded,
        sequence_length=sequence_list,
        dtype=tf.float32)

    # Pick, for every row i, the RNN output at time step sequence_list[i] - 1.
    range1 = tf.range(tf.shape(sequence_list)[0])
    output2 = tf.gather_nd(output, tf.stack((range1, sequence_list - 1), -1))
    output3 = tf.contrib.layers.fully_connected(output2, output_dim, activation_fn=tf.identity)
    # Squash the raw output into the open interval (1, 10).
    output_prediction = (tf.sigmoid(output3) * 9) + 1

    # Loss (mean squared error) and optimizer
    linear_regression = tf.reduce_mean(tf.square(output_prediction - y_))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(linear_regression)

    # Session
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # DONOTCHANGE: Reserved for nsml use
    bind_model(sess=sess, config=config)

    # DONOTCHANGE: They are reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    # Training mode (default)
    if config.mode == 'train':

        # Load the data.
        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
        dataset_len = len(dataset)
        # Number of batches per epoch (rounded up); used for progress output.
        one_batch_size = dataset_len // config.batch
        if dataset_len % config.batch != 0:
            one_batch_size += 1

        # Train for the configured number of epochs.
        for epoch in range(config.epochs):
            avg_loss = 0.0
            total_batch = 0

            for i, (data, labels, sequence) in enumerate(_batch_loader(dataset, config.batch)):
                labels = np.reshape(labels, (len(labels), output_dim))

                _, loss = sess.run([train_step, linear_regression],
                                   feed_dict={x: data, y_: labels, keep_prob: 0.7, sequence_list: sequence})

                # The loss is a mean squared error, so it is labelled MSE.
                print('Batch : ', i + 1, '/', one_batch_size,
                      ', MSE in this minibatch: ', float(loss))
                avg_loss += float(loss)
                total_batch = total_batch + 1

            # Mean loss over the batches actually run; the guard avoids a
            # ZeroDivisionError when the dataset yields no batch.
            train_loss = float(avg_loss / max(total_batch, 1))
            print('epoch:', epoch, ' train_loss:', train_loss)

            # Report the values for this epoch to nsml.
            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
                        train__loss=train_loss, step=epoch)
            # DONOTCHANGE (You can decide how often you want to save the model)
            nsml.save(epoch)

    # Local test mode.
    # If the result looks like the following, it can be submitted with nsml submit:
    # [(0.0, 9.045), (0.0, 5.91), ... ]
    elif config.mode == 'test_local':

        with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
            reviews = f.readlines()

        res = []
        for batch in _batch_loader(reviews, config.batch):
            temp_res = nsml.infer(batch)
            res += temp_res
        print(res)
1 +# -*- coding: utf-8 -*-
2 +
3 +"""
4 +Copyright 2018 NAVER Corp.
5 +
6 +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
7 +associated documentation files (the "Software"), to deal in the Software without restriction, including
8 +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
10 +the following conditions:
11 +
12 +The above copyright notice and this permission notice shall be included in all copies or substantial
13 +portions of the Software.
14 +
15 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
17 +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18 +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
19 +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
20 +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 +"""
22 +
23 +import argparse
24 +import os
25 +
26 +import numpy as np
27 +
28 +import tensorflow as tf
29 +
30 +import nsml
31 +from dataset import MovieReviewDataset, preprocess
32 +from nsml import DATASET_PATH, HAS_DATASET, GPU_NUM, IS_ON_NSML
33 +
34 +
35 +# DONOTCHANGE: They are reserved for nsml
36 +# This is for nsml leaderboard
def bind_model(sess, config):
    """
    Register the save / load / infer callbacks with nsml.

    :param sess: TensorFlow session that holds the model variables
    :param config: parsed command-line options; ``config.strmaxlen`` is read by ``infer``
    """

    def save(dir_name, *args):
        """Write a checkpoint of the session to ``<dir_name>/model``."""
        os.makedirs(dir_name, exist_ok=True)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(dir_name, 'model'))

    def load(dir_name, *args):
        """
        Restore the session from the checkpoint found in ``dir_name``.

        :raises NotImplementedError: when no checkpoint is found
        """
        saver = tf.train.Saver()
        # Find the checkpoint.
        ckpt = tf.train.get_checkpoint_state(dir_name)
        if ckpt and ckpt.model_checkpoint_path:
            checkpoint = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(dir_name, checkpoint))
        else:
            # ``NotImplemented`` is a constant, not an exception class; calling
            # it raised a TypeError instead of the intended error.
            raise NotImplementedError('No checkpoint!')
        print('Model loaded')

    def infer(raw_data, **kwargs):
        """
        Predict a score for every raw input string.

        :param raw_data: sequence of raw review strings
        :param kwargs: unused
        :return: list of ``(0.0, prediction)`` pairs, one per input
        """
        # Per-review length, clamped to [1, strmaxlen] so that ``length - 1``
        # is always a valid time index into the padded input.
        sequence = [max(1, min(len(review), config.strmaxlen)) for review in raw_data]

        # Convert the strings to index vectors with dataset.preprocess.
        preprocessed_data = preprocess(raw_data, config.strmaxlen)
        # Run the model on the inputs (dropout disabled via keep_prob = 1.0).
        pred = sess.run(output_prediction,
                        feed_dict={x: preprocessed_data, keep_prob: 1.0, sequence_list: sequence})
        # Flatten with NumPy; building a tf.reshape op here would add a new
        # node to the graph on every call.
        point = np.reshape(pred, [len(pred)])
        # DONOTCHANGE: They are reserved for nsml
        # The result must have the form [(confidence interval, point)]; the
        # confidence-interval value does not affect the leaderboard result.
        return list(zip(np.zeros(point.shape[0]), point))

    # DONOTCHANGE: They are reserved for nsml
    # Makes the functions above reachable from nsml.
    nsml.bind(save=save, load=load, infer=infer)
81 +
82 +
83 +def _batch_loader(iterable, n=1):
84 + """
85 + 데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다
86 +
87 + :param iterable: 데이터 list, 혹은 다른 포맷
88 + :param n: 배치 사이즈
89 + :return:
90 + """
91 + length = len(iterable)
92 + for n_idx in range(0, length, n):
93 + yield iterable[n_idx:min(n_idx + n, length)]
94 +
95 +
def weight_variable(shape):
    """Return a tf.Variable of the given shape initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
99 +
100 +
def bias_variable(shape):
    """Return a tf.Variable of the given shape with every element set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
104 +
def lstm_cell(num_units, keep_prob):
    """
    Build one BasicLSTMCell with softsign activation, wrapped with output dropout.

    :param num_units: number of units in the LSTM cell
    :param keep_prob: keep probability applied to the cell output
    :return: the dropout-wrapped cell
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units, activation=tf.nn.softsign)
    return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)
109 +
110 +
# Script entry point: builds a stacked unidirectional LSTM regressor, then
# trains it or runs a local inference pass. Everything stays at module level
# because the ``infer`` callback reads the graph tensors (x, keep_prob, ...) as
# globals.
if __name__ == '__main__':
    args = argparse.ArgumentParser()
    # DONOTCHANGE: They are reserved for nsml
    args.add_argument('--mode', type=str, default='train')
    args.add_argument('--pause', type=int, default=0)
    args.add_argument('--iteration', type=str, default='0')

    # User options
    args.add_argument('--output', type=int, default=1)
    args.add_argument('--epochs', type=int, default=60)
    args.add_argument('--batch', type=int, default=2000)
    args.add_argument('--strmaxlen', type=int, default=200)
    args.add_argument('--embedding', type=int, default=64)
    config = args.parse_args()

    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
        DATASET_PATH = '../sample_data/movie_review/'

    # Model specification
    output_dim = 1
    hidden_dim = 256
    stack_num = 3
    learning_rate = 0.0001
    # Size of the embedding table.
    # NOTE(review): presumably the number of indices kor_char_parser can emit — confirm.
    character_size = 251

    # Placeholders: encoded reviews, targets, dropout keep probability and
    # per-review sequence lengths.
    x = tf.placeholder(tf.int32, [None, config.strmaxlen])
    y_ = tf.placeholder(tf.float32, [None, output_dim])
    keep_prob = tf.placeholder(tf.float32)
    sequence_list = tf.placeholder(tf.int32, [None])

    # Embedding
    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
    embedded = tf.nn.embedding_lookup(char_embedding, x)

    # LSTM layers: a MultiRNNCell when more than one layer is requested,
    # otherwise a single cell.
    if stack_num > 1:
        multi_cells = tf.contrib.rnn.MultiRNNCell([lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)],
                                                  state_is_tuple=True)
    else:
        multi_cells = lstm_cell(hidden_dim, keep_prob)

    output, _states = tf.nn.dynamic_rnn(multi_cells, embedded, dtype=tf.float32, sequence_length=sequence_list)
    # Pick, for every row i, the RNN output at time step sequence_list[i] - 1.
    range1 = tf.range(tf.shape(sequence_list)[0])
    output2 = tf.gather_nd(output, tf.stack((range1, sequence_list - 1), -1))
    output3 = tf.contrib.layers.fully_connected(output2, output_dim, activation_fn=tf.identity)
    # Squash the raw output into the open interval (1, 10).
    output_prediction = (tf.sigmoid(output3) * 9) + 1

    # Loss (mean squared error) and optimizer
    linear_regression = tf.reduce_mean(tf.square(output_prediction - y_))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(linear_regression)

    # Session
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # DONOTCHANGE: Reserved for nsml use
    bind_model(sess=sess, config=config)

    # DONOTCHANGE: They are reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    # Training mode (default)
    if config.mode == 'train':

        # Load the data.
        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
        dataset_len = len(dataset)
        # Number of batches per epoch (rounded up); used for progress output.
        one_batch_size = dataset_len // config.batch
        if dataset_len % config.batch != 0:
            one_batch_size += 1

        # Train for the configured number of epochs.
        for epoch in range(config.epochs):
            avg_loss = 0.0
            total_batch = 0

            for i, (data, labels, sequence) in enumerate(_batch_loader(dataset, config.batch)):
                labels = np.reshape(labels, (len(labels), output_dim))

                _, loss = sess.run([train_step, linear_regression],
                                   feed_dict={x: data, y_: labels, keep_prob: 0.7, sequence_list: sequence})

                # The loss is a mean squared error, so it is labelled MSE.
                print('Batch : ', i + 1, '/', one_batch_size,
                      ', MSE in this minibatch: ', float(loss))
                avg_loss += float(loss)
                total_batch = total_batch + 1

            # Mean loss over the batches actually run; the guard avoids a
            # ZeroDivisionError when the dataset yields no batch.
            train_loss = float(avg_loss / max(total_batch, 1))
            print('epoch:', epoch, ' train_loss:', train_loss)

            # Report the values for this epoch to nsml.
            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
                        train__loss=train_loss, step=epoch)
            # DONOTCHANGE (You can decide how often you want to save the model)
            nsml.save(epoch)

    # Local test mode.
    # If the result looks like the following, it can be submitted with nsml submit:
    # [(0.0, 9.045), (0.0, 5.91), ... ]
    elif config.mode == 'test_local':

        with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
            reviews = f.readlines()

        res = []
        for batch in _batch_loader(reviews, config.batch):
            temp_res = nsml.infer(batch)
            res += temp_res
        print(res)
1 +# -*- coding: utf-8 -*-
2 +
3 +"""
4 +Copyright 2018 NAVER Corp.
5 +
6 +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
7 +associated documentation files (the "Software"), to deal in the Software without restriction, including
8 +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
10 +the following conditions:
11 +
12 +The above copyright notice and this permission notice shall be included in all copies or substantial
13 +portions of the Software.
14 +
15 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16 +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
17 +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18 +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
19 +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
20 +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 +"""
22 +
23 +import argparse
24 +import os
25 +
26 +import numpy as np
27 +
28 +import tensorflow as tf
29 +
30 +import nsml
31 +from dataset import MovieReviewDataset, preprocess
32 +from nsml import DATASET_PATH, HAS_DATASET, GPU_NUM, IS_ON_NSML
33 +
34 +
35 +# DONOTCHANGE: They are reserved for nsml
36 +# This is for nsml leaderboard
def bind_model(sess, config):
    """
    Register the save / load / infer callbacks with nsml.

    :param sess: TensorFlow session that holds the model variables
    :param config: parsed command-line options; ``config.strmaxlen`` is read by ``infer``
    """

    def save(dir_name, *args):
        """Write a checkpoint of the session to ``<dir_name>/model``."""
        os.makedirs(dir_name, exist_ok=True)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(dir_name, 'model'))

    def load(dir_name, *args):
        """
        Restore the session from the checkpoint found in ``dir_name``.

        :raises NotImplementedError: when no checkpoint is found
        """
        saver = tf.train.Saver()
        # Find the checkpoint.
        ckpt = tf.train.get_checkpoint_state(dir_name)
        if ckpt and ckpt.model_checkpoint_path:
            checkpoint = os.path.basename(ckpt.model_checkpoint_path)
            saver.restore(sess, os.path.join(dir_name, checkpoint))
        else:
            # ``NotImplemented`` is a constant, not an exception class; calling
            # it raised a TypeError instead of the intended error.
            raise NotImplementedError('No checkpoint!')
        print('Model loaded')

    def infer(raw_data, **kwargs):
        """
        Predict a score for every raw input string.

        :param raw_data: sequence of raw review strings
        :param kwargs: unused
        :return: list of ``(0.0, prediction)`` pairs, one per input
        """
        # Per-review length, clamped to [1, strmaxlen] so that ``length - 1``
        # is always a valid time index into the padded input.
        sequence = [max(1, min(len(review), config.strmaxlen)) for review in raw_data]

        # Convert the strings to index vectors with dataset.preprocess.
        preprocessed_data = preprocess(raw_data, config.strmaxlen)
        # Run the model on the inputs (dropout disabled via keep_prob = 1.0).
        pred = sess.run(output_prediction,
                        feed_dict={x: preprocessed_data, keep_prob: 1.0, sequence_list: sequence})
        # Flatten with NumPy; building a tf.reshape op here would add a new
        # node to the graph on every call.
        point = np.reshape(pred, [len(pred)])
        # DONOTCHANGE: They are reserved for nsml
        # The result must have the form [(confidence interval, point)]; the
        # confidence-interval value does not affect the leaderboard result.
        return list(zip(np.zeros(point.shape[0]), point))

    # DONOTCHANGE: They are reserved for nsml
    # Makes the functions above reachable from nsml.
    nsml.bind(save=save, load=load, infer=infer)
81 +
82 +
83 +def _batch_loader(iterable, n=1):
84 + """
85 + 데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다
86 +
87 + :param iterable: 데이터 list, 혹은 다른 포맷
88 + :param n: 배치 사이즈
89 + :return:
90 + """
91 + length = len(iterable)
92 + for n_idx in range(0, length, n):
93 + yield iterable[n_idx:min(n_idx + n, length)]
94 +
95 +
def weight_variable(shape):
    """Return a tf.Variable of the given shape initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
99 +
100 +
def bias_variable(shape):
    """Return a tf.Variable of the given shape with every element set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
104 +
def lstm_cell(num_units, keep_prob):
    """
    Build one BasicLSTMCell with softsign activation, wrapped with output dropout.

    :param num_units: number of units in the LSTM cell
    :param keep_prob: keep probability applied to the cell output
    :return: the dropout-wrapped cell
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units, activation=tf.nn.softsign)
    return tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)
109 +
110 +
# Script entry point: builds three chained bidirectional LSTM layers, then
# trains the regressor or runs a local inference pass. Everything stays at
# module level because the ``infer`` callback reads the graph tensors
# (x, keep_prob, ...) as globals.
if __name__ == '__main__':
    args = argparse.ArgumentParser()
    # DONOTCHANGE: They are reserved for nsml
    args.add_argument('--mode', type=str, default='train')
    args.add_argument('--pause', type=int, default=0)
    args.add_argument('--iteration', type=str, default='0')

    # User options
    args.add_argument('--output', type=int, default=1)
    args.add_argument('--epochs', type=int, default=120)
    args.add_argument('--batch', type=int, default=2000)
    args.add_argument('--strmaxlen', type=int, default=200)
    args.add_argument('--embedding', type=int, default=64)
    config = args.parse_args()

    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
        DATASET_PATH = '../sample_data/movie_review/'

    # Model specification
    output_dim = 1
    hidden_dim = 128
    stack_num = 3
    learning_rate = 0.0001
    # Size of the embedding table.
    # NOTE(review): presumably the number of indices kor_char_parser can emit — confirm.
    character_size = 251

    # Placeholders: encoded reviews, targets, dropout keep probability and
    # per-review sequence lengths.
    x = tf.placeholder(tf.int32, [None, config.strmaxlen])
    y_ = tf.placeholder(tf.float32, [None, output_dim])
    keep_prob = tf.placeholder(tf.float32)
    sequence_list = tf.placeholder(tf.int32, [None])

    # Embedding
    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
    embedded = tf.nn.embedding_lookup(char_embedding, x)

    # LSTM layers: one forward and one backward cell per layer.
    cell_fw = [lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)]
    cell_bw = [lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)]

    # Each layer runs in its own variable scope and feeds the concatenated
    # forward/backward outputs to the next layer.
    with tf.variable_scope("L1"):
        (output_fw0, output_bw0), last_state0 = tf.nn.bidirectional_dynamic_rnn(
            cell_fw[0], cell_bw[0], embedded,
            dtype=tf.float32, sequence_length=sequence_list)
        output0_0 = tf.concat([output_fw0, output_bw0], axis=2)

    with tf.variable_scope("L2"):
        (output_fw1, output_bw1), last_state1 = tf.nn.bidirectional_dynamic_rnn(
            cell_fw[1], cell_bw[1], output0_0,
            dtype=tf.float32, sequence_length=sequence_list)
        output0_1 = tf.concat([output_fw1, output_bw1], axis=2)

    with tf.variable_scope("L3"):
        (output_fw2, output_bw2), last_state2 = tf.nn.bidirectional_dynamic_rnn(
            cell_fw[2], cell_bw[2], output0_1,
            dtype=tf.float32, sequence_length=sequence_list)
        output = tf.concat([output_fw2, output_bw2], axis=2)

    # Pick, for every row i, the RNN output at time step sequence_list[i] - 1.
    range1 = tf.range(tf.shape(sequence_list)[0])
    output2 = tf.gather_nd(output, tf.stack((range1, sequence_list - 1), -1))
    output3 = tf.contrib.layers.fully_connected(output2, output_dim, activation_fn=tf.identity)
    # Squash the raw output into the open interval (1, 10).
    output_prediction = (tf.sigmoid(output3) * 9) + 1

    # Loss (mean squared error) and optimizer
    linear_regression = tf.reduce_mean(tf.square(output_prediction - y_))
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(linear_regression)

    # Session
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # DONOTCHANGE: Reserved for nsml use
    bind_model(sess=sess, config=config)

    # DONOTCHANGE: They are reserved for nsml
    if config.pause:
        nsml.paused(scope=locals())

    # Training mode (default)
    if config.mode == 'train':

        # Load the data.
        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
        dataset_len = len(dataset)
        # Number of batches per epoch (rounded up); used for progress output.
        one_batch_size = dataset_len // config.batch
        if dataset_len % config.batch != 0:
            one_batch_size += 1

        # Train for the configured number of epochs.
        for epoch in range(config.epochs):
            avg_loss = 0.0
            total_batch = 0

            for i, (data, labels, sequence) in enumerate(_batch_loader(dataset, config.batch)):
                labels = np.reshape(labels, (len(labels), output_dim))

                _, loss = sess.run([train_step, linear_regression],
                                   feed_dict={x: data, y_: labels, keep_prob: 0.7, sequence_list: sequence})

                # The loss is a mean squared error, so it is labelled MSE.
                print('Batch : ', i + 1, '/', one_batch_size,
                      ', MSE in this minibatch: ', float(loss))
                avg_loss += float(loss)
                total_batch = total_batch + 1

            # Mean loss over the batches actually run; the guard avoids a
            # ZeroDivisionError when the dataset yields no batch.
            train_loss = float(avg_loss / max(total_batch, 1))
            print('epoch:', epoch, ' train_loss:', train_loss)

            # Report the values for this epoch to nsml.
            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
                        train__loss=train_loss, step=epoch)
            # DONOTCHANGE (You can decide how often you want to save the model)
            nsml.save(epoch)

    # Local test mode.
    # If the result looks like the following, it can be submitted with nsml submit:
    # [(0.0, 9.045), (0.0, 5.91), ... ]
    elif config.mode == 'test_local':

        with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
            reviews = f.readlines()

        res = []
        for batch in _batch_loader(reviews, config.batch):
            temp_res = nsml.infer(batch)
            res += temp_res
        print(res)
1 +"""
2 +Copyright 2018 NAVER Corp.
3 +
4 +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
5 +associated documentation files (the "Software"), to deal in the Software without restriction, including
6 +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
8 +the following conditions:
9 +
10 +The above copyright notice and this permission notice shall be included in all copies or substantial
11 +portions of the Software.
12 +
13 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
14 +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
15 +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
16 +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
17 +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
18 +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 +"""
20 +
21 +from distutils.core import setup
# Packaging metadata for the movie review project.
setup(
    name="nsml movie review",
    version="1.0",
    description="",
    # No additional packages are requested.
    install_requires=[],
)
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +우리 집에는 한동안 햇닭 세 마리가 있었다
2 +나는 부엌칼을 장 항아리에 갖다 대고 잠깐 갈았다
3 +붉은 녹이 없어지고 시퍼렇게 날이 섰다
4 +작은 공기 하나를 가지고 대문간으로 갔다
5 +한편 발로 붙들려 매인 두 발을 곽 밟았다
6 +칼로 거기를 몇 번 베었다
7 +몹시 아프고 괴로운지 펼떡펼떡 두 발을 놀리고 온몸을 푸덕푸덕한다
8 +나는 더욱 발에다 힘을 주고 손에 힘을 주어 목을 곽 붙잡고 또 몇 차례 베었다
9 +닭의 목에서는 붉은 피가 줄줄 흘러서 공기에 방울방울 떨어진다
10 +한참 붙들고 피가 나오고 죽기를 기다렸다
11 +나는 잊어버렸던 듯이 얼른 숨구멍을 찾아서 베었다
12 +씨르륵 소리가 나고 한 번 푸르르 떨더니 그만 늘어진다
13 +먼 훗날 당신이 찾으시면
14 +그때에 내 말이 잊었노라
15 +당신이 속으로 나무라면
16 +「무척 그리다가 잊었노라」
17 +그래도 당신이 나무라면
18 +「믿기지 않아서 잊었노라」
19 +오늘도 어제도 아니 잊고
20 +먼 훗날 그때에 「잊었노라」
21 +우리 집 뒷산에는 풀이 푸르고
22 +숲사이의 시냇물 모래 바닥은
23 +파아란 풀 그림자 떠서 흘러요
24 +그리운 우리 임은 어디 계신고
25 +날마다 피어나는 우리 임 생각
26 +날마다 뒷산에 홀로 앉아서
27 +날마다 풀을 따서 물에 던져요
28 +흘러가는 시내의 물에 흘러서
29 +내어던진 풀잎은 엷게 떠갈제
30 +물살이 헤적헤적 품을 헤쳐요
31 +가엾은 이내 속을 둘 곳 없어서
32 +날마다 풀을 따서 물에 던지고
33 +흘러가는 잎이나 맘 헤보아요
34 +산(山) 위에 올라서서 바라다보면
35 +가로막힌 바다를 마주 건너서
36 +님 계시는 마을이 내 눈앞으로
37 +꿈 하늘 하늘같이 떠오릅니다
38 +흰 모래 모래 비낀 선창(船倉)가에는
39 +한가한 뱃노래가 멀리 잦으며
40 +날 저물고 안개는 깊이 덮여서
41 +흩어지는 물꽃뿐 안득입니다
42 +이윽고 밤 어두운 물새가 울면
43 +물결조차 하나 둘 배는 떠나서
44 +저 멀리 한바다로 아주 바다로
45 +마치 가랑잎같이 떠나갑니다
46 +나는 혼자 산(山)에서 밤을 새우고
47 +아침해 붉은 볕에 몸을 씻으며
48 +귀 기울고 솔곳이 엿듣노라면
49 +님 계신 창(窓) 아래로 가는 물노래
50 +흔들어 깨우치는 물노래에는
51 +내 님이 놀라 일어나 찾으신대도
52 +내 몸은 산(山) 위에서 그 산(山) 위에서
53 +고이 깊이 잠들어 다 모릅니다
54 +고요하고 어두운 밤이 오면은
55 +어스러한 등(燈)불에 밤이 오면은
56 +외로움에 아픔에 다만 혼자서
57 +하염없는 눈물에 저는 웁니다
58 +제 한 몸도 예전엔 눈물 모르고
59 +조그만한 세상(世上)을 보냈습니다
60 +그때는 지난날의 옛이야기도
61 +아무 설움 모르고 외웠습니다
62 +그런데 우리 님이 가신 뒤에는
63 +아주 저를 버리고 가신 뒤에는
64 +전(前)날에 제게 있던 모든 것들이
65 +가지가지 없어지고 말았습니다
66 +그러나 그 한때에 외워 두었던
67 +옛이야기뿐만은 남았습니다
68 +나날이 짙어가는 옛이야기는
69 +부질없이 제 몸을 울려 줍니다
70 +그리운 우리 님의 맑은 노래는
71 +언제나 제 가슴에 젖어 있어요
72 +긴 날을 문 밖에서 서서 들어도
73 +그리운 우리 님의 고운 노래는
74 +해지고 저무도록 귀에 들려요
75 +밤들고 잠드도록 귀에 들려요
76 +고이도 흔들리는 노랫가락에
77 +내 잠은 그만이나 깊이 들어요
78 +고적한 잠자리에 홀로 누워도
79 +내 잠은 포스근히 깊이 들어요
80 +그러나 자다깨면 님의 노래는
81 +하나도 남김 없이 잃어버려요
82 +들으며 듣는 대로 님의 노래는
83 +하나도 남김없이 잊고 말아요.
84 +세월이 물과 같이 흐른 두 달은
85 +길어둔 독엣 물도 찌었지마는
86 +가면서 함께 가자 하던 말씀은
87 +살아서 살을 맞는 표적이외다.
88 +봄 풀은 봄이 되면 돋아나지만
89 +"나무는 밑구루를 꺾은 셈이요,"
90 +새라면 두 죽지가 상한 셈이라
91 +내 몸에 꽃필 날은 다시 없구나.
92 +밤마다 닭소리라 날이 첫시면
93 +"당신의 넋맞이로 나가 볼 때요,"
94 +그믐에 지는 달이 산에 걸리면
95 +당신의 길신 가리 차릴 때외다.
96 +세월은 물과 같이 흘러 가지만
97 +당신을 아주 잊던 말씀이지만
98 +동무들 보십시오 해가 집니다
99 +세상의 모든 것은 빛이 납니다
100 +이제는 주춤주춤 어둡습니다
101 +예서 더 저문 때를 밤이랍니다
102 +물 스치던 돌 위엔 물때 뿐이라
103 +물때 묻은 조약돌 마른 갈숲이
104 +이제라고 강(江)물의 터야 아니랴
105 +빨래 소리 물소리 선녀(仙女)의 노래
106 +잎새 위에 밤마다 우는 달빛이
107 +"때린다, 부순다, 무너 버린다."
1 +5
2 +7
3 +10
4 +10
5 +10
6 +10
7 +10
8 +10
9 +10
10 +10
11 +10
12 +10
13 +10
14 +10
15 +10
16 +10
17 +10
18 +10
19 +10
20 +10
21 +10
22 +10
23 +10
24 +10
25 +10
26 +10
27 +10
28 +10
29 +10
30 +10
31 +10
32 +10
33 +10
34 +10
35 +10
36 +10
37 +10
38 +10
39 +10
40 +10
41 +10
42 +10
43 +10
44 +1
45 +6
46 +9
47 +10
48 +10
49 +10
50 +10
51 +10
52 +10
53 +10
54 +10
55 +10
56 +10
57 +10
58 +10
59 +10
60 +10
61 +10
62 +10
63 +10
64 +10
65 +10
66 +10
67 +1
68 +8
69 +8
70 +10
71 +10
72 +10
73 +10
74 +10
75 +10
76 +9
77 +10
78 +10
79 +0
80 +1
81 +1
82 +1
83 +1
84 +3
85 +5
86 +7
87 +8
88 +10
89 +10
90 +10
91 +10
92 +10
93 +10
94 +10
95 +10
96 +10
97 +10
98 +10
99 +10
100 +1
101 +10
102 +10
103 +10
104 +10
105 +10
106 +10
107 +10
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 -from konlpy.corpus import kolaw 2 +
3 +import json
4 +
def read_data(filename):
    """Load a tab-separated text file as a list of rows, without the header.

    :param filename: path of the tab-separated file to read
    :return: list of rows, where each row is the list of tab-separated
        fields of one line; the first line of the file is dropped
    """
    # Decode explicitly as UTF-8 so that reading Korean text does not depend
    # on the platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        rows = [line.split('\t') for line in f.read().splitlines()]
    return rows[1:]  # skip the header row
8 10
9 -train_data = kolaw.open('constitution.txt').read() 11 +print(len(train_data))
10 - 12 +print(train_data[0])
11 -print(len(train_data)) # nrows: 150000
12 -print(len(train_data[0]))
13 13
14 from konlpy.tag import Twitter 14 from konlpy.tag import Twitter
15 pos_tagger = Twitter() 15 pos_tagger = Twitter()
...@@ -20,12 +20,12 @@ def tokenize(doc): ...@@ -20,12 +20,12 @@ def tokenize(doc):
20 20
21 train_docs = [] 21 train_docs = []
22 for row in train_data: 22 for row in train_data:
23 - train_docs.append((tokenize(row[0]), '0')) 23 + train_docs.append((tokenize(row), '0'))
24 # train_docs.append((tokenize(row[1]), '0')) 24 # train_docs.append((tokenize(row[1]), '0'))
25 25
26 # 잘 들어갔는지 확인 26 # 잘 들어갔는지 확인
27 from pprint import pprint 27 from pprint import pprint
28 -pprint(train_docs[0]) 28 +pprint(train_docs[0:2])
29 29
30 from gensim.models.doc2vec import TaggedDocument 30 from gensim.models.doc2vec import TaggedDocument
31 tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs] 31 tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
......
1 +import json
2 +
filename = 'corpus/namuwiki_20160229.json'
# Read the whole dump into memory; this takes some time.
# JSON interchange text is UTF-8, so decode it explicitly instead of relying
# on the platform's default locale encoding.
with open(filename, encoding='utf-8') as data_file:
    data = json.load(data_file)

# Titles of articles that do not contain natural-language content.
black_list_title = ['공지사항/차단 내역/통합본']

# Each article contains a title, a text body, and other fields.
# Print the title and text of the first few articles as a sanity check.
for i in range(3):
    print(data[i]['title'])
    print(data[i]['text'])
    print()

# Regular expressions can strip some of the wiki markup; a quick demo follows.
import re
text = "딴 사람도 아니고 프로팀 [[Counter Logic Gaming|CLG]] 소속 전 서포터 [[스티브 차우|차우스터]]가 남긴 말이다."
# First attempt: only unwraps links that have no "|label" part, so the
# aliased links in the sample sentence are left untouched.
t1 = re.sub(r"\[\[([^\]|]*)\]\]", r'\1', text)
print(t1)
# Second attempt: also handles [[target|label]] links and keeps only the label.
t2 = re.sub(r"\[\[(?:[^\]|]*\|)?([^\]|]+)\]\]", r'\1', text)
print(t2)
25 +
# Character classes removed from article text by strip(). The script used
# these two names without ever defining them, so the first call failed with
# NameError.
# NOTE(review): the exact ranges are an assumption (CJK radical/ideograph
# blocks and the kana blocks) — confirm against the intended corpus filtering.
chinese = re.compile(
    r"[\u2e80-\u2eff\u2f00-\u2fdf\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]"
)
japanese = re.compile(r"[\u3040-\u309f\u30a0-\u30ff\uff66-\uff9f]")


def strip(text):
    """Remove wiki markup from an article body and return the remaining text.

    The substitutions are applied in a fixed order, because later patterns
    rely on earlier ones having already removed their markup.

    :param text: raw article text containing wiki markup
    :return: the text with the recognised markup removed or unwrapped
    """
    # Embedded raw HTML blocks: {{{#!html ... }}}
    text = re.sub(r"\{\{\{#\!html[^\}]*\}\}\}", '', text, flags=re.IGNORECASE|re.MULTILINE|re.DOTALL)
    # Redirect directives
    text = re.sub(r"#redirect .*", '', text, flags=re.IGNORECASE)
    # Category tags ([[분류:...) and file embeds ([[파일:...), to end of line
    text = re.sub(r"\[\[분류:.*", '', text)
    text = re.sub(r"\[\[파일:.*", '', text)
    # "Parent document" bullet lines
    text = re.sub(r"\* 상위 문서 ?:.*", '', text)
    # Embedded YouTube macros
    text = re.sub(r"\[youtube\(\w+\)\]", '', text, flags=re.IGNORECASE)
    # Include macros: keep only the first captured group
    text = re.sub(r"\[include\(([^\]|]*)(\|[^]]*)?\]", r'\1', text, flags=re.IGNORECASE)
    # Links: [[target]] -> target, [[target|label]] -> label
    text = re.sub(r"\[\[(?:[^\]|]*\|)?([^\]|]+)\]\]", r'\1', text)
    # Footnotes: [* ...]
    text = re.sub(r"\[\*([^\]]*)\]", '', text)
    # Text colour/size wrappers: {{{#attr content}}} -> content
    text = re.sub(r"\{\{\{([^\ }|]*) ([^\}|]*)\}\}\}", r'\2', text)
    # Bold: '''content''' -> content
    text = re.sub(r"'''([^']*)'''", r'\1', text)
    # Strike-through: drop ~~...~~ and --...-- spans. The match is lazy and
    # the closing marker must equal the opening one, so only the struck span
    # on a line is removed; a greedy match would delete everything between
    # the first and the last marker of the article.
    text = re.sub(r"(~~|--)(.*?)\1", '', text)

    # Table rows: ||...||
    text = re.sub(r"\|\|(.*)\|\|", '', text)

    text = chinese.sub('', text)   # remove Chinese characters
    text = japanese.sub('', text)  # remove Japanese kana
    return text
45 +
# Show the title and the cleaned-up text of the first two articles.
for idx in range(2):
    article = data[idx]
    print(article['title'])
    print(strip(article['text']))
    print()
51 +
# Generate raw text corpus

MIN_TEXT_SIZE = 5000
# NOTE(review): MAX_ARTICLE_SIZE was used below but never defined in this
# script, which raised NameError on the first article. The value here is a
# placeholder upper bound — confirm the intended limit.
MAX_ARTICLE_SIZE = 50 * MIN_TEXT_SIZE

count = 10  # NOTE(review): not used by the code visible in this script
# Write the corpus explicitly as UTF-8 so the output does not depend on the
# platform's default locale encoding.
with open('input.txt', 'w', encoding='utf-8') as f:
    for article in data:
        text_len = len(article['text'])
        if text_len < MIN_TEXT_SIZE or text_len >= MAX_ARTICLE_SIZE:
            continue  # skip too small, too large articles

        text = strip(article['text'])
        # One record per article: title line, cleaned text, blank separator lines.
        f.write("%s\n%s\n\n\n" % (article['title'], text))
...\ No newline at end of file ...\ No newline at end of file