final test1

신은섭(Shin Eun Seop)
Commit eef2c91bf976c40c4f031a1f1d939c5f9ac2814a eef2c91b 1 parent 305a0a0f
Showing 17 changed files with 1225 additions and 7 deletions
.gitignore
ko_word2vec_e.model
movie-review/LSTM/__pycache__/dataset.cpython-35.pyc
movie-review/LSTM/__pycache__/kor_char_parser.cpython-35.pyc
movie-review/LSTM/dataset.py
movie-review/LSTM/kor_char_parser.py
movie-review/LSTM/main_Bi(Stacked_Lstm)).py
movie-review/LSTM/main_Lstm.py
movie-review/LSTM/main_Stacked(Bi_Lstm).py
movie-review/LSTM/setup.py
movie-review/LSTM/training_log/main_Bi(Stacked_Lstm))_movie_phase1_log
movie-review/LSTM/training_log/main_Lstm_movie_phase1_log
movie-review/LSTM/training_log/main_Stacked(Bi_Lstm)_movie_phase1_log
movie-review/sample_data/movie_review/train/train_data
movie-review/sample_data/movie_review/train/train_label
movie2/embadding.py
movie2/namu.py
--- a/.gitignore 0 → 100644
View file @eef2c91
+++ b/.gitignore 0 → 100644
View file @eef2c91
+movie2/corpus/*
--- a/ko_word2vec_e.model 0 → 100644
View file @eef2c91
+++ b/ko_word2vec_e.model 0 → 100644
View file @eef2c91
--- a/movie-review/LSTM/__pycache__/dataset.cpython-35.pyc 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/__pycache__/dataset.cpython-35.pyc 0 → 100644
View file @eef2c91
--- a/movie-review/LSTM/__pycache__/kor_char_parser.cpython-35.pyc 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/__pycache__/kor_char_parser.cpython-35.pyc 0 → 100644
View file @eef2c91
--- a/movie-review/LSTM/dataset.py 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/dataset.py 0 → 100644
View file @eef2c91
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 NAVER Corp.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+import os
+
+import numpy as np
+
+from kor_char_parser import decompose_str_as_one_hot
+
+
+class MovieReviewDataset:
+    """
+    영화리뷰 데이터를 읽어서, tuple (데이터, 레이블)의 형태로 리턴하는 파이썬 오브젝트 입니다.
+    """
+    def __init__(self, dataset_path: str, max_length: int):
+        """
+        initializer
+
+        :param dataset_path: 데이터셋 root path
+        :param max_length: 문자열의 최대 길이
+        """
+        # 데이터, 레이블 각각의 경로
+        data_review = os.path.join(dataset_path, 'train', 'train_data')
+        data_label = os.path.join(dataset_path, 'train', 'train_label')
+
+        # 영화리뷰 데이터를 읽고 preprocess까지 진행합니다
+        with open(data_review, 'rt', encoding='utf-8') as f:
+            raw_set = f.readlines()
+            
+            self.sequence = []
+            for i in range(len(raw_set)):
+                self.sequence.append(len(raw_set[i]))
+                #len(raw_set[i]) - 1
+                
+            self.reviews = preprocess(raw_set, max_length)
+        # 영화리뷰 레이블을 읽고 preprocess까지 진행합니다.
+        with open(data_label) as f:
+            self.labels = [np.float32(x) for x in f.readlines()]
+
+    def __len__(self):
+        """
+
+        :return: 전체 데이터의 수를 리턴합니다
+        """
+        return len(self.reviews)
+
+    def __getitem__(self, idx):
+        """
+
+        :param idx: 필요한 데이터의 인덱스
+        :return: 인덱스에 맞는 데이터, 레이블 pair를 리턴합니다
+        """
+        return self.reviews[idx], self.labels[idx], self.sequence[idx]
+
+
+def preprocess(data: list, max_length: int):
+    """
+     입력을 받아서 딥러닝 모델이 학습 가능한 포맷으로 변경하는 함수입니다.
+     기본 제공 알고리즘은 char2vec이며, 기본 모델이 MLP이기 때문에, 입력 값의 크기를 모두 고정한 벡터를 리턴합니다.
+     문자열의 길이가 고정값보다 길면 긴 부분을 제거하고, 짧으면 0으로 채웁니다.
+
+    :param data: 문자열 리스트 ([문자열1, 문자열2, ...])
+    :param max_length: 문자열의 최대 길이
+    :return: 벡터 리스트 ([[0, 1, 5, 6], [5, 4, 10, 200], ...]) max_length가 4일 때
+    """
+    vectorized_data = [decompose_str_as_one_hot(datum, warning=False) for datum in data]
+    zero_padding = np.zeros((len(data), max_length), dtype=np.int32)
+    for idx, seq in enumerate(vectorized_data):
+        length = len(seq)
+        if length >= max_length:
+            length = max_length
+            zero_padding[idx, :length] = np.array(seq)[:length]
+        else:
+            zero_padding[idx, :length] = np.array(seq)
+    return zero_padding
--- a/movie-review/LSTM/kor_char_parser.py 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/kor_char_parser.py 0 → 100644
View file @eef2c91
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 NAVER Corp.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+# Jamo tables used to split precomposed Hangul syllables.
+# The position of each entry is the index used by the decomposition helpers,
+# so the order of these tables must not change.
+# Initial consonants
+cho = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"  # len = 19
+# Medial vowels
+jung = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"  # len = 21
+# Final consonants; compound finals are spelled as their two component consonants
+# len = 27
+jong = "ㄱ/ㄲ/ㄱㅅ/ㄴ/ㄴㅈ/ㄴㅎ/ㄷ/ㄹ/ㄹㄱ/ㄹㅁ/ㄹㅂ/ㄹㅅ/ㄹㅌ/ㄹㅍ/ㄹㅎ/ㅁ/ㅂ/ㅂㅅ/ㅅ/ㅆ/ㅇ/ㅈ/ㅊ/ㅋ/ㅌ/ㅍ/ㅎ".split(
+    '/')
+# Every jamo above concatenated into one string (used for membership checks)
+test = cho + jung + ''.join(jong)
+
+# Number of index slots taken by the three tables
+hangul_length = len(cho) + len(jung) + len(jong)  # 67
+
+
+def is_valid_decomposition_atom(x):
+    return x in test
+
+
+def decompose(x):
+    in_char = x
+    if x < ord('가') or x > ord('힣'):
+        return chr(x)
+    x = x - ord('가')
+    y = x // 28
+    z = x % 28
+    x = y // 21
+    y = y % 21
+    # if there is jong, then is z > 0. So z starts from 1 index.
+    zz = jong[z - 1] if z > 0 else ''
+    if x >= len(cho):
+        print('Unknown Exception: ', in_char, chr(in_char), x, y, z, zz)
+    return cho[x] + jung[y] + zz
+
+
+def decompose_as_one_hot(in_char, warning=True):
+    one_hot = []
+    # print(ord('ㅣ'), chr(0xac00))
+    # [0,66]: hangul / [67,194]: ASCII / [195,245]: hangul danja,danmo / [246,249]: special characters
+    # Total 250 dimensions.
+    if ord('가') <= in_char <= ord('힣'):  # 가:44032 , 힣: 55203
+        x = in_char - 44032  # in_char - ord('가')
+        y = x // 28
+        z = x % 28
+        x = y // 21
+        y = y % 21
+        # if there is jong, then is z > 0. So z starts from 1 index.
+        zz = jong[z - 1] if z > 0 else ''
+        if x >= len(cho):
+            if warning:
+                print('Unknown Exception: ', in_char,
+                      chr(in_char), x, y, z, zz)
+
+        one_hot.append(x)
+        one_hot.append(len(cho) + y)
+        if z > 0:
+            one_hot.append(len(cho) + len(jung) + (z - 1))
+        return one_hot
+    else:
+        if in_char < 128:
+            result = hangul_length + in_char  # 67~
+        elif ord('ㄱ') <= in_char <= ord('ㅣ'):
+            # 194~ # [ㄱ:12593]~[ㅣ:12643] (len = 51)
+            result = hangul_length + 128 + (in_char - 12593)
+        elif in_char == ord('♡'):
+            result = hangul_length + 128 + 51  # 245~ # ♡
+        elif in_char == ord('♥'):
+            result = hangul_length + 128 + 51 + 1  # ♥
+        elif in_char == ord('★'):
+            result = hangul_length + 128 + 51 + 2  # ★
+        elif in_char == ord('☆'):
+            result = hangul_length + 128 + 51 + 3  # ☆
+        else:
+            if warning:
+                print('Unhandled character:', chr(in_char), in_char)
+            # unknown character
+            result = hangul_length + 128 + 51 + 4  # for unknown character
+
+        return [result]
+
+
+def decompose_str(string):
+    return ''.join([decompose(ord(x)) for x in string])
+
+
+def decompose_str_as_one_hot(string, warning=True):
+    tmp_list = []
+    for x in string:
+        da = decompose_as_one_hot(ord(x), warning=warning)
+        tmp_list.extend(da)
+    return tmp_list
--- a/movie-review/LSTM/main_Bi(Stacked_Lstm)).py 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/main_Bi(Stacked_Lstm)).py 0 → 100644
View file @eef2c91
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 NAVER Corp.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+import argparse
+import os
+
+import numpy as np
+
+import tensorflow as tf
+
+import nsml
+from dataset import MovieReviewDataset, preprocess
+from nsml import DATASET_PATH, HAS_DATASET, GPU_NUM, IS_ON_NSML
+
+
+# DONOTCHANGE: They are reserved for nsml
+# This is for nsml leaderboard
+def bind_model(sess, config):
+    # 학습한 모델을 저장하는 함수입니다.
+    def save(dir_name, *args):
+        # directory
+        os.makedirs(dir_name, exist_ok=True)
+        saver = tf.train.Saver()
+        saver.save(sess, os.path.join(dir_name, 'model'))
+
+    # 저장한 모델을 불러올 수 있는 함수입니다.
+    def load(dir_name, *args):
+        saver = tf.train.Saver()
+        # find checkpoint
+        ckpt = tf.train.get_checkpoint_state(dir_name)
+        if ckpt and ckpt.model_checkpoint_path:
+            checkpoint = os.path.basename(ckpt.model_checkpoint_path)
+            saver.restore(sess, os.path.join(dir_name, checkpoint))
+        else:
+            raise NotImplemented('No checkpoint!')
+        print('Model loaded')
+
+    def infer(raw_data, **kwargs):
+        """
+
+        :param raw_data: raw input (여기서는 문자열)을 입력받습니다
+        :param kwargs:
+        :return:
+        """
+        
+        sequence = []
+        for i in range(len(raw_data)):
+            sequence.append(len(raw_data[i]))
+        
+        # dataset.py에서 작성한 preprocess 함수를 호출하여, 문자열을 벡터로 변환합니다
+        preprocessed_data = preprocess(raw_data, config.strmaxlen)
+        # 저장한 모델에 입력값을 넣고 prediction 결과를 리턴받습니다
+        pred = sess.run(output_prediction, feed_dict={x: preprocessed_data, keep_prob: 1.0, sequence_list: sequence})
+        point = tf.reshape(pred, [len(pred)])
+        # DONOTCHANGE: They are reserved for nsml
+        # 리턴 결과는 [(confidence interval, 포인트)] 의 형태로 보내야만 리더보드에 올릴 수 있습니다. 리더보드 결과에 confidence interval의 값은 영향을 미치지 않습니다
+        return list(zip(np.zeros(point.shape[0]), point.eval()))
+
+    # DONOTCHANGE: They are reserved for nsml
+    # nsml에서 지정한 함수에 접근할 수 있도록 하는 함수입니다.
+    nsml.bind(save=save, load=load, infer=infer)
+
+
+def _batch_loader(iterable, n=1):
+    """
+    데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다
+
+    :param iterable: 데이터 list, 혹은 다른 포맷
+    :param n: 배치 사이즈
+    :return:
+    """
+    length = len(iterable)
+    for n_idx in range(0, length, n):
+        yield iterable[n_idx:min(n_idx + n, length)]
+
+
+def weight_variable(shape):
+    initial = tf.truncated_normal(shape, stddev=0.1)
+    return tf.Variable(initial)
+
+
+def bias_variable(shape):
+    initial = tf.constant(0.1, shape=shape)
+    return tf.Variable(initial)
+
+def lstm_cell(num_units, keep_prob):
+    cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units, activation=tf.nn.softsign)
+    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
+    return cell
+
+
+# Script entry point: parses the options, builds the stacked bidirectional LSTM
+# regression graph, then either trains it or runs a local inference pass.
+if __name__ == '__main__':
+    args = argparse.ArgumentParser()
+    # DONOTCHANGE: They are reserved for nsml
+    args.add_argument('--mode', type=str, default='train')
+    args.add_argument('--pause', type=int, default=0)
+    args.add_argument('--iteration', type=str, default='0')
+
+    # User options
+    args.add_argument('--output', type=int, default=1)
+    args.add_argument('--epochs', type=int, default=60)
+    args.add_argument('--batch', type=int, default=2000)
+    args.add_argument('--strmaxlen', type=int, default=200)
+    args.add_argument('--embedding', type=int, default=64)
+    config = args.parse_args()
+
+    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
+        DATASET_PATH = '../sample_data/movie_review/'
+
+
+    # Model specification
+    output_dim = 1
+    hidden_dim = 128
+    stack_num = 3
+    learning_rate = 0.0001
+    # Number of rows of the embedding table (one per character code)
+    character_size = 251
+
+    # placeholder
+    x = tf.placeholder(tf.int32, [None, config.strmaxlen])
+    y_ = tf.placeholder(tf.float32, [None, output_dim])
+    keep_prob = tf.placeholder(tf.float32)
+    sequence_list = tf.placeholder(tf.int32, [None])
+    
+    # Embedding
+    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
+    embedded = tf.nn.embedding_lookup(char_embedding, x)
+
+    # LSTM layer
+
+    # stack_num forward cells and stack_num backward cells
+    cells_fw = [lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)]
+    cells_bw = [lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)]
+    
+    output, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
+        cells_fw=cells_fw,
+        cells_bw=cells_bw,
+        inputs=embedded,
+        sequence_length=sequence_list,
+        dtype=tf.float32)
+    
+    # Pick, for every example, the RNN output at step (sequence length - 1)
+    range1 = tf.range(tf.shape(sequence_list)[0])
+    output2 = tf.gather_nd(output, tf.stack((range1, sequence_list - 1), -1))
+    output3 = tf.contrib.layers.fully_connected(output2, output_dim, activation_fn=tf.identity)
+    #output_prediction = output3
+    # The sigmoid is scaled and shifted so that predictions lie between 1 and 10
+    output_prediction = (tf.sigmoid(output3) * 9) + 1
+
+    # Loss and optimizer (mean squared error minimised with Adam)
+    linear_regression = tf.reduce_mean(tf.square(output_prediction - y_))
+    train_step = tf.train.AdamOptimizer(learning_rate).minimize(linear_regression)
+    
+    
+    # Session
+    sess = tf.InteractiveSession()
+    tf.global_variables_initializer().run()
+
+    
+    # DONOTCHANGE: Reserved for nsml use
+    bind_model(sess=sess, config=config)
+
+    
+    # DONOTCHANGE: They are reserved for nsml
+    if config.pause:
+        nsml.paused(scope=locals())
+
+    
+    # Used in training mode (the default).
+    if config.mode == 'train':
+        
+        # Load the data.
+        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
+        dataset_len = len(dataset)
+        # Number of batches per epoch, rounded up
+        one_batch_size = dataset_len//config.batch
+        if dataset_len % config.batch != 0:
+            one_batch_size += 1
+
+        
+        # Train once per epoch.
+        for epoch in range(config.epochs):
+            avg_loss = 0.0
+
+            total_batch = 0
+
+            for i, (data, labels, sequence) in enumerate(_batch_loader(dataset, config.batch)):
+
+                labels = np.reshape(labels, (len(labels), output_dim))
+                                
+                _, loss = sess.run([train_step, linear_regression],
+                                   feed_dict={x: data, y_: labels, keep_prob: 0.7, sequence_list: sequence})
+
+                # NOTE(review): the message says "BCE" but the loss above is a mean squared error
+                print('Batch : ', i + 1, '/', one_batch_size,
+                      ', BCE in this minibatch: ', float(loss))
+                avg_loss += float(loss)
+                total_batch = total_batch + 1
+                
+            print('epoch:', epoch, ' train_loss:', float(avg_loss/one_batch_size))
+
+            # Reports the values shown by "nsml ps" or on the web tensorboard.
+            #
+            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
+                        train__loss=float(avg_loss/total_batch), step=epoch)
+            # DONOTCHANGE (You can decide how often you want to save the model)
+            nsml.save(epoch)
+
+    # Used in local test mode.
+    # If the result looks like the line below, it can be submitted with "nsml submit".
+    # [(0.0, 9.045), (0.0, 5.91), ... ]
+    elif config.mode == 'test_local':
+        
+        with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
+            reviews = f.readlines()
+
+        res = []
+        for batch in _batch_loader(reviews, config.batch):
+            temp_res = nsml.infer(batch)
+            res += temp_res
+        print(res)
--- a/movie-review/LSTM/main_Lstm.py 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/main_Lstm.py 0 → 100644
View file @eef2c91
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 NAVER Corp.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+import argparse
+import os
+
+import numpy as np
+
+import tensorflow as tf
+
+import nsml
+from dataset import MovieReviewDataset, preprocess
+from nsml import DATASET_PATH, HAS_DATASET, GPU_NUM, IS_ON_NSML
+
+
+# DONOTCHANGE: They are reserved for nsml
+# This is for nsml leaderboard
+def bind_model(sess, config):
+    # 학습한 모델을 저장하는 함수입니다.
+    def save(dir_name, *args):
+        # directory
+        os.makedirs(dir_name, exist_ok=True)
+        saver = tf.train.Saver()
+        saver.save(sess, os.path.join(dir_name, 'model'))
+
+    # 저장한 모델을 불러올 수 있는 함수입니다.
+    def load(dir_name, *args):
+        saver = tf.train.Saver()
+        # find checkpoint
+        ckpt = tf.train.get_checkpoint_state(dir_name)
+        if ckpt and ckpt.model_checkpoint_path:
+            checkpoint = os.path.basename(ckpt.model_checkpoint_path)
+            saver.restore(sess, os.path.join(dir_name, checkpoint))
+        else:
+            raise NotImplemented('No checkpoint!')
+        print('Model loaded')
+
+    def infer(raw_data, **kwargs):
+        """
+
+        :param raw_data: raw input (여기서는 문자열)을 입력받습니다
+        :param kwargs:
+        :return:
+        """
+        
+        sequence = []
+        for i in range(len(raw_data)):
+            sequence.append(len(raw_data[i]))
+        
+        # dataset.py에서 작성한 preprocess 함수를 호출하여, 문자열을 벡터로 변환합니다
+        preprocessed_data = preprocess(raw_data, config.strmaxlen)
+        # 저장한 모델에 입력값을 넣고 prediction 결과를 리턴받습니다
+        pred = sess.run(output_prediction, feed_dict={x: preprocessed_data, keep_prob: 1.0, sequence_list: sequence})
+        point = tf.reshape(pred, [len(pred)])
+        # DONOTCHANGE: They are reserved for nsml
+        # 리턴 결과는 [(confidence interval, 포인트)] 의 형태로 보내야만 리더보드에 올릴 수 있습니다. 리더보드 결과에 confidence interval의 값은 영향을 미치지 않습니다
+        return list(zip(np.zeros(point.shape[0]), point.eval()))
+
+    # DONOTCHANGE: They are reserved for nsml
+    # nsml에서 지정한 함수에 접근할 수 있도록 하는 함수입니다.
+    nsml.bind(save=save, load=load, infer=infer)
+
+
+def _batch_loader(iterable, n=1):
+    """
+    데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다
+
+    :param iterable: 데이터 list, 혹은 다른 포맷
+    :param n: 배치 사이즈
+    :return:
+    """
+    length = len(iterable)
+    for n_idx in range(0, length, n):
+        yield iterable[n_idx:min(n_idx + n, length)]
+
+
+def weight_variable(shape):
+    initial = tf.truncated_normal(shape, stddev=0.1)
+    return tf.Variable(initial)
+
+
+def bias_variable(shape):
+    initial = tf.constant(0.1, shape=shape)
+    return tf.Variable(initial)
+
+def lstm_cell(num_units, keep_prob):
+    cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units, activation=tf.nn.softsign)
+    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
+    return cell
+
+
+# Script entry point: parses the options, builds the (optionally stacked) LSTM
+# regression graph, then either trains it or runs a local inference pass.
+if __name__ == '__main__':
+    args = argparse.ArgumentParser()
+    # DONOTCHANGE: They are reserved for nsml
+    args.add_argument('--mode', type=str, default='train')
+    args.add_argument('--pause', type=int, default=0)
+    args.add_argument('--iteration', type=str, default='0')
+
+    # User options
+    args.add_argument('--output', type=int, default=1)
+    args.add_argument('--epochs', type=int, default=60)
+    args.add_argument('--batch', type=int, default=2000)
+    args.add_argument('--strmaxlen', type=int, default=200)
+    args.add_argument('--embedding', type=int, default=64)
+    config = args.parse_args()
+
+    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
+        DATASET_PATH = '../sample_data/movie_review/'
+
+
+    # Model specification
+    output_dim = 1
+    hidden_dim = 256
+    stack_num = 3
+    learning_rate = 0.0001
+    # Number of rows of the embedding table (one per character code)
+    character_size = 251
+
+    # placeholder
+    x = tf.placeholder(tf.int32, [None, config.strmaxlen])
+    y_ = tf.placeholder(tf.float32, [None, output_dim])
+    keep_prob = tf.placeholder(tf.float32)
+    sequence_list = tf.placeholder(tf.int32, [None])
+    
+    # Embedding
+    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
+    embedded = tf.nn.embedding_lookup(char_embedding, x)
+
+    # LSTM layer
+    # More than one layer: wrap stack_num cells in a MultiRNNCell; otherwise use a single cell
+    if stack_num > 1:
+        multi_cells = tf.contrib.rnn.MultiRNNCell([lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)],
+                                                   state_is_tuple=True)
+    else:
+        multi_cells = lstm_cell(hidden_dim, keep_prob)
+
+    output, _states = tf.nn.dynamic_rnn(multi_cells, embedded, dtype=tf.float32, sequence_length=sequence_list)
+    # Pick, for every example, the RNN output at step (sequence length - 1)
+    range1 = tf.range(tf.shape(sequence_list)[0])
+    output2 = tf.gather_nd(output, tf.stack((range1, sequence_list - 1), -1))
+    output3 = tf.contrib.layers.fully_connected(output2, output_dim, activation_fn=tf.identity)
+    #output_prediction = output3
+    # The sigmoid is scaled and shifted so that predictions lie between 1 and 10
+    output_prediction = (tf.sigmoid(output3) * 9) + 1
+
+    # Loss and optimizer (mean squared error minimised with Adam)
+    linear_regression = tf.reduce_mean(tf.square(output_prediction - y_))
+    train_step = tf.train.AdamOptimizer(learning_rate).minimize(linear_regression)
+    
+    
+    # Session
+    sess = tf.InteractiveSession()
+    tf.global_variables_initializer().run()
+
+    
+    # DONOTCHANGE: Reserved for nsml use
+    bind_model(sess=sess, config=config)
+
+    
+    # DONOTCHANGE: They are reserved for nsml
+    if config.pause:
+        nsml.paused(scope=locals())
+
+    
+    # Used in training mode (the default).
+    if config.mode == 'train':
+        
+        # Load the data.
+        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
+        dataset_len = len(dataset)
+        # Number of batches per epoch, rounded up
+        one_batch_size = dataset_len//config.batch
+        if dataset_len % config.batch != 0:
+            one_batch_size += 1
+
+        
+        # Train once per epoch.
+        for epoch in range(config.epochs):
+            avg_loss = 0.0
+
+            total_batch = 0
+
+            for i, (data, labels, sequence) in enumerate(_batch_loader(dataset, config.batch)):
+
+                labels = np.reshape(labels, (len(labels), output_dim))
+                                
+                _, loss = sess.run([train_step, linear_regression],
+                                   feed_dict={x: data, y_: labels, keep_prob: 0.7, sequence_list: sequence})
+
+                # NOTE(review): the message says "BCE" but the loss above is a mean squared error
+                print('Batch : ', i + 1, '/', one_batch_size,
+                      ', BCE in this minibatch: ', float(loss))
+                avg_loss += float(loss)
+                total_batch = total_batch + 1
+                
+            print('epoch:', epoch, ' train_loss:', float(avg_loss/one_batch_size))
+
+            # Reports the values shown by "nsml ps" or on the web tensorboard.
+            #
+            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
+                        train__loss=float(avg_loss/total_batch), step=epoch)
+            # DONOTCHANGE (You can decide how often you want to save the model)
+            nsml.save(epoch)
+
+    # Used in local test mode.
+    # If the result looks like the line below, it can be submitted with "nsml submit".
+    # [(0.0, 9.045), (0.0, 5.91), ... ]
+    elif config.mode == 'test_local':
+        
+        with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
+            reviews = f.readlines()
+
+        res = []
+        for batch in _batch_loader(reviews, config.batch):
+            temp_res = nsml.infer(batch)
+            res += temp_res
+        print(res)
--- a/movie-review/LSTM/main_Stacked(Bi_Lstm).py 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/main_Stacked(Bi_Lstm).py 0 → 100644
View file @eef2c91
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 NAVER Corp.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+import argparse
+import os
+
+import numpy as np
+
+import tensorflow as tf
+
+import nsml
+from dataset import MovieReviewDataset, preprocess
+from nsml import DATASET_PATH, HAS_DATASET, GPU_NUM, IS_ON_NSML
+
+
+# DONOTCHANGE: They are reserved for nsml
+# This is for nsml leaderboard
+def bind_model(sess, config):
+    # 학습한 모델을 저장하는 함수입니다.
+    def save(dir_name, *args):
+        # directory
+        os.makedirs(dir_name, exist_ok=True)
+        saver = tf.train.Saver()
+        saver.save(sess, os.path.join(dir_name, 'model'))
+
+    # 저장한 모델을 불러올 수 있는 함수입니다.
+    def load(dir_name, *args):
+        saver = tf.train.Saver()
+        # find checkpoint
+        ckpt = tf.train.get_checkpoint_state(dir_name)
+        if ckpt and ckpt.model_checkpoint_path:
+            checkpoint = os.path.basename(ckpt.model_checkpoint_path)
+            saver.restore(sess, os.path.join(dir_name, checkpoint))
+        else:
+            raise NotImplemented('No checkpoint!')
+        print('Model loaded')
+
+    def infer(raw_data, **kwargs):
+        """
+
+        :param raw_data: raw input (여기서는 문자열)을 입력받습니다
+        :param kwargs:
+        :return:
+        """
+        
+        sequence = []
+        for i in range(len(raw_data)):
+            sequence.append(len(raw_data[i]))
+        
+        # dataset.py에서 작성한 preprocess 함수를 호출하여, 문자열을 벡터로 변환합니다
+        preprocessed_data = preprocess(raw_data, config.strmaxlen)
+        # 저장한 모델에 입력값을 넣고 prediction 결과를 리턴받습니다
+        pred = sess.run(output_prediction, feed_dict={x: preprocessed_data, keep_prob: 1.0, sequence_list: sequence})
+        point = tf.reshape(pred, [len(pred)])
+        # DONOTCHANGE: They are reserved for nsml
+        # 리턴 결과는 [(confidence interval, 포인트)] 의 형태로 보내야만 리더보드에 올릴 수 있습니다. 리더보드 결과에 confidence interval의 값은 영향을 미치지 않습니다
+        return list(zip(np.zeros(point.shape[0]), point.eval()))
+
+    # DONOTCHANGE: They are reserved for nsml
+    # nsml에서 지정한 함수에 접근할 수 있도록 하는 함수입니다.
+    nsml.bind(save=save, load=load, infer=infer)
+
+
+def _batch_loader(iterable, n=1):
+    """
+    데이터를 배치 사이즈만큼 잘라서 보내주는 함수입니다. PyTorch의 DataLoader와 같은 역할을 합니다
+
+    :param iterable: 데이터 list, 혹은 다른 포맷
+    :param n: 배치 사이즈
+    :return:
+    """
+    length = len(iterable)
+    for n_idx in range(0, length, n):
+        yield iterable[n_idx:min(n_idx + n, length)]
+
+
+def weight_variable(shape):
+    initial = tf.truncated_normal(shape, stddev=0.1)
+    return tf.Variable(initial)
+
+
+def bias_variable(shape):
+    initial = tf.constant(0.1, shape=shape)
+    return tf.Variable(initial)
+
+def lstm_cell(num_units, keep_prob):
+    cell = tf.contrib.rnn.BasicLSTMCell(num_units=num_units, activation=tf.nn.softsign)
+    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
+    return cell
+
+
+if __name__ == '__main__':
+    # Entry point: builds a three-layer bidirectional LSTM regressor, then
+    # either trains it ('train' mode) or runs local inference ('test_local').
+    args = argparse.ArgumentParser()
+    # DONOTCHANGE: They are reserved for nsml
+    args.add_argument('--mode', type=str, default='train')
+    args.add_argument('--pause', type=int, default=0)
+    args.add_argument('--iteration', type=str, default='0')
+
+    # User options
+    args.add_argument('--output', type=int, default=1)
+    args.add_argument('--epochs', type=int, default=120)
+    args.add_argument('--batch', type=int, default=2000)
+    args.add_argument('--strmaxlen', type=int, default=200)
+    args.add_argument('--embedding', type=int, default=64)
+    config = args.parse_args()
+
+    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
+        DATASET_PATH = '../sample_data/movie_review/'
+
+    # Model specification
+    output_dim = 1
+    hidden_dim = 128
+    stack_num = 3
+    learning_rate = 0.0001
+    character_size = 251
+
+    # Placeholders
+    x = tf.placeholder(tf.int32, [None, config.strmaxlen])
+    y_ = tf.placeholder(tf.float32, [None, output_dim])
+    keep_prob = tf.placeholder(tf.float32)
+    sequence_list = tf.placeholder(tf.int32, [None])
+
+    # Embedding
+    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
+    embedded = tf.nn.embedding_lookup(char_embedding, x)
+
+    # LSTM layers: one forward and one backward cell per stacked layer.
+    cell_fw = [lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)]
+    cell_bw = [lstm_cell(hidden_dim, keep_prob) for _ in range(stack_num)]
+
+    # Each layer runs in its own variable scope and feeds the concatenated
+    # forward/backward outputs to the next layer.
+    with tf.variable_scope("L1"):
+        (output_fw0, output_bw0), last_state0 = tf.nn.bidirectional_dynamic_rnn(
+            cell_fw[0], cell_bw[0], embedded,
+            dtype=tf.float32, sequence_length=sequence_list)
+        output0_0 = tf.concat([output_fw0, output_bw0], axis=2)
+
+    with tf.variable_scope("L2"):
+        (output_fw1, output_bw1), last_state1 = tf.nn.bidirectional_dynamic_rnn(
+            cell_fw[1], cell_bw[1], output0_0,
+            dtype=tf.float32, sequence_length=sequence_list)
+        output0_1 = tf.concat([output_fw1, output_bw1], axis=2)
+
+    with tf.variable_scope("L3"):
+        (output_fw2, output_bw2), last_state2 = tf.nn.bidirectional_dynamic_rnn(
+            cell_fw[2], cell_bw[2], output0_1,
+            dtype=tf.float32, sequence_length=sequence_list)
+        output = tf.concat([output_fw2, output_bw2], axis=2)
+
+    # For every row pick the output at index (sequence length - 1).
+    # NOTE(review): a sequence length of 0 would index -1 here — confirm the
+    # dataset never produces empty sequences.
+    range1 = tf.range(tf.shape(sequence_list)[0])
+    output2 = tf.gather_nd(output, tf.stack((range1, sequence_list - 1), -1))
+    output3 = tf.contrib.layers.fully_connected(output2, output_dim, activation_fn=tf.identity)
+    #output_prediction = output3
+    # sigmoid * 9 + 1 maps the raw output into the range 1..10.
+    output_prediction = (tf.sigmoid(output3) * 9) + 1
+
+    # Loss and optimizer (the loss is a mean squared error).
+    linear_regression = tf.reduce_mean(tf.square(output_prediction - y_))
+    train_step = tf.train.AdamOptimizer(learning_rate).minimize(linear_regression)
+
+    # Session
+    sess = tf.InteractiveSession()
+    tf.global_variables_initializer().run()
+
+    # DONOTCHANGE: Reserved for nsml use
+    bind_model(sess=sess, config=config)
+
+    # DONOTCHANGE: They are reserved for nsml
+    if config.pause:
+        nsml.paused(scope=locals())
+
+    # Training mode (the default).
+    if config.mode == 'train':
+
+        # Load the data.
+        dataset = MovieReviewDataset(DATASET_PATH, config.strmaxlen)
+        dataset_len = len(dataset)
+        # Expected number of batches per epoch, rounded up; used for progress output.
+        one_batch_size = dataset_len//config.batch
+        if dataset_len % config.batch != 0:
+            one_batch_size += 1
+
+        # Train for the configured number of epochs.
+        for epoch in range(config.epochs):
+            avg_loss = 0.0
+
+            total_batch = 0
+
+            for i, (data, labels, sequence) in enumerate(_batch_loader(dataset, config.batch)):
+
+                labels = np.reshape(labels, (len(labels), output_dim))
+
+                _, loss = sess.run([train_step, linear_regression],
+                                   feed_dict={x: data, y_: labels, keep_prob: 0.7, sequence_list: sequence})
+
+                # The loss is a mean squared error, so it is labelled MSE.
+                print('Batch : ', i + 1, '/', one_batch_size,
+                      ', MSE in this minibatch: ', float(loss))
+                avg_loss += float(loss)
+                total_batch += 1
+
+            # Average over the batches actually processed so that the printed
+            # and the reported value agree; max() guards an empty dataset.
+            epoch_loss = avg_loss / max(total_batch, 1)
+            print('epoch:', epoch, ' train_loss:', float(epoch_loss))
+
+            # Reports the values shown by `nsml ps` and on the web TensorBoard.
+            #
+            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
+                        train__loss=float(epoch_loss), step=epoch)
+            # DONOTCHANGE (You can decide how often you want to save the model)
+            nsml.save(epoch)
+
+    # Local test mode.
+    # If the result looks like the example below, the model can be submitted with `nsml submit`.
+    # [(0.0, 9.045), (0.0, 5.91), ... ]
+    elif config.mode == 'test_local':
+
+        with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
+            reviews = f.readlines()
+
+        res = []
+        for batch in _batch_loader(reviews, config.batch):
+            temp_res = nsml.infer(batch)
+            res += temp_res
+        print(res)
--- a/movie-review/LSTM/setup.py 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/setup.py 0 → 100644
View file @eef2c91
+"""
+Copyright 2018 NAVER Corp.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+from distutils.core import setup
+# Package metadata for this project. No dependencies are declared:
+# install_requires is an empty list and description is an empty string.
+setup(
+    name='nsml movie review',
+    version='1.0',
+    description='',
+    install_requires=[
+    ]
+)
--- a/movie-review/LSTM/training_log/main_Bi(Stacked_Lstm))_movie_phase1_log 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/training_log/main_Bi(Stacked_Lstm))_movie_phase1_log 0 → 100644
View file @eef2c91
--- a/movie-review/LSTM/training_log/main_Lstm_movie_phase1_log 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/training_log/main_Lstm_movie_phase1_log 0 → 100644
View file @eef2c91
--- a/movie-review/LSTM/training_log/main_Stacked(Bi_Lstm)_movie_phase1_log 0 → 100644
View file @eef2c91
+++ b/movie-review/LSTM/training_log/main_Stacked(Bi_Lstm)_movie_phase1_log 0 → 100644
View file @eef2c91
--- a/movie-review/sample_data/movie_review/train/train_data 0 → 100644
View file @eef2c91
+++ b/movie-review/sample_data/movie_review/train/train_data 0 → 100644
View file @eef2c91
+우리 집에는 한동안 햇닭 세 마리가 있었다
+나는 부엌칼을 장 항아리에 갖다 대고 잠깐 갈았다
+붉은 녹이 없어지고 시퍼렇게 날이 섰다
+작은 공기 하나를 가지고 대문간으로 갔다
+한편 발로 붙들려 매인 두 발을 곽 밟았다
+칼로 거기를 몇 번 베었다
+몹시 아프고 괴로운지 펼떡펼떡 두 발을 놀리고 온몸을 푸덕푸덕한다
+나는 더욱 발에다 힘을 주고 손에 힘을 주어 목을 곽 붙잡고 또 몇 차례 베었다
+닭의 목에서는 붉은 피가 줄줄 흘러서 공기에 방울방울 떨어진다
+한참 붙들고 피가 나오고 죽기를 기다렸다
+나는 잊어버렸던 듯이 얼른 숨구멍을 찾아서 베었다
+씨르륵 소리가 나고 한 번 푸르르 떨더니 그만 늘어진다
+먼 훗날 당신이 찾으시면
+그때에 내 말이 잊었노라
+당신이 속으로 나무라면
+「무척 그리다가 잊었노라」
+그래도 당신이 나무라면
+「믿기지 않아서 잊었노라」
+오늘도 어제도 아니 잊고
+먼 훗날 그때에 「잊었노라」
+우리 집 뒷산에는 풀이 푸르고
+숲사이의 시냇물 모래 바닥은
+파아란 풀 그림자 떠서 흘러요
+그리운 우리 임은 어디 계신고
+날마다 피어나는 우리 임 생각
+날마다 뒷산에 홀로 앉아서
+날마다 풀을 따서 물에 던져요
+흘러가는 시내의 물에 흘러서
+내어던진 풀잎은 엷게 떠갈제
+물살이 헤적헤적 품을 헤쳐요
+가엾은 이내 속을 둘 곳 없어서
+날마다 풀을 따서 물에 던지고
+흘러가는 잎이나 맘 헤보아요
+산(山) 위에 올라서서 바라다보면
+가로막힌 바다를 마주 건너서
+님 계시는 마을이 내 눈앞으로
+꿈 하늘 하늘같이 떠오릅니다
+흰 모래 모래 비낀 선창(船倉)가에는
+한가한 뱃노래가 멀리 잦으며
+날 저물고 안개는 깊이 덮여서
+흩어지는 물꽃뿐 안득입니다
+이윽고 밤 어두운 물새가 울면
+물결조차 하나 둘 배는 떠나서
+저 멀리 한바다로 아주 바다로
+마치 가랑잎같이 떠나갑니다
+나는 혼자 산(山)에서 밤을 새우고
+아침해 붉은 볕에 몸을 씻으며
+귀 기울고 솔곳이 엿듣노라면
+님 계신 창(窓) 아래로 가는 물노래
+흔들어 깨우치는 물노래에는
+내 님이 놀라 일어나 찾으신대도
+내 몸은 산(山) 위에서 그 산(山) 위에서
+고이 깊이 잠들어 다 모릅니다
+고요하고 어두운 밤이 오면은
+어스러한 등(燈)불에 밤이 오면은
+외로움에 아픔에 다만 혼자서
+하염없는 눈물에 저는 웁니다
+제 한 몸도 예전엔 눈물 모르고
+조그만한 세상(世上)을 보냈습니다
+그때는 지난날의 옛이야기도
+아무 설움 모르고 외웠습니다
+그런데 우리 님이 가신 뒤에는
+아주 저를 버리고 가신 뒤에는
+전(前)날에 제게 있던 모든 것들이
+가지가지 없어지고 말았습니다
+그러나 그 한때에 외워 두었던
+옛이야기뿐만은 남았습니다
+나날이 짙어가는 옛이야기는
+부질없이 제 몸을 울려 줍니다
+그리운 우리 님의 맑은 노래는
+언제나 제 가슴에 젖어 있어요
+긴 날을 문 밖에서 서서 들어도
+그리운 우리 님의 고운 노래는
+해지고 저무도록 귀에 들려요
+밤들고 잠드도록 귀에 들려요
+고이도 흔들리는 노랫가락에
+내 잠은 그만이나 깊이 들어요
+고적한 잠자리에 홀로 누워도
+내 잠은 포스근히 깊이 들어요
+그러나 자다깨면 님의 노래는
+하나도 남김 없이 잃어버려요
+들으며 듣는 대로 님의 노래는
+하나도 남김없이 잊고 말아요.
+세월이 물과 같이 흐른 두 달은
+길어둔 독엣 물도 찌었지마는
+가면서 함께 가자 하던 말씀은
+살아서 살을 맞는 표적이외다.
+봄 풀은 봄이 되면 돋아나지만
+"나무는 밑구루를 꺾은 셈이요,"
+새라면 두 죽지가 상한 셈이라
+내 몸에 꽃필 날은 다시 없구나.
+밤마다 닭소리라 날이 첫시면
+"당신의 넋맞이로 나가 볼 때요,"
+그믐에 지는 달이 산에 걸리면
+당신의 길신 가리 차릴 때외다.
+세월은 물과 같이 흘러 가지만
+당신을 아주 잊던 말씀이지만
+동무들 보십시오 해가 집니다
+세상의 모든 것은 빛이 납니다
+이제는 주춤주춤 어둡습니다
+예서 더 저문 때를 밤이랍니다
+물 스치던 돌 위엔 물때 뿐이라
+물때 묻은 조약돌 마른 갈숲이
+이제라고 강(江)물의 터야 아니랴
+빨래 소리 물소리 선녀(仙女)의 노래
+잎새 위에 밤마다 우는 달빛이
+"때린다, 부순다, 무너 버린다."
--- a/movie-review/sample_data/movie_review/train/train_label 0 → 100644
View file @eef2c91
+++ b/movie-review/sample_data/movie_review/train/train_label 0 → 100644
View file @eef2c91
+5
+7
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+1
+6
+9
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+1
+8
+8
+10
+10
+10
+10
+10
+10
+9
+10
+10
+0
+1
+1
+1
+1
+3
+5
+7
+8
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+10
+1
+10
+10
+10
+10
+10
+10
+10
--- a/movie2/embadding.py
View file @eef2c91
+++ b/movie2/embadding.py
View file @eef2c91
 # -*- coding: utf-8 -*-
-from konlpy.corpus import kolaw
+
+import json
+
 def read_data(filename):
     with open(filename, 'r') as f:
         data = [line.split('\t') for line in f.read().splitlines()]
         data = data[1:]   # header 제외
     return data
-train_data = kolaw.open('constitution.txt').read()
+print(len(train_data))
-
+print(train_data[0])
-print(len(train_data))      # nrows: 150000
-print(len(train_data[0]))
 from konlpy.tag import Twitter
 pos_tagger = Twitter()
@@ -20,12 +20,12 @@ def tokenize(doc):
 train_docs = []
 for row in train_data:
-    train_docs.append((tokenize(row[0]), '0'))
+    train_docs.append((tokenize(row), '0'))
     # train_docs.append((tokenize(row[1]), '0'))
 # 잘 들어갔는지 확인
 from pprint import pprint
-pprint(train_docs[0])
+pprint(train_docs[0:2])
 from gensim.models.doc2vec import TaggedDocument
 tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
--- a/movie2/namu.py 0 → 100644
View file @eef2c91
+++ b/movie2/namu.py 0 → 100644
View file @eef2c91
+import json
+
+filename = 'corpus/namuwiki_20160229.json'
+# Read the whole dump into memory; this takes some time.
+# The encoding is given explicitly so decoding does not depend on the
+# platform's locale default (assumes the dump is UTF-8 — TODO confirm).
+with open(filename, encoding='utf-8') as data_file:
+    data = json.load(data_file)
+
+# Titles of articles that do not contain natural-language content.
+black_list_title = ['공지사항/차단 내역/통합본']
+
+# Each article contains a title, a text body and other fields.
+# Print the title and text of the first few articles; slicing (rather than
+# indexing 0..2) also works when the dump holds fewer than three articles.
+for article in data[:3]:
+    print(article['title'])
+    print(article['text'])
+    print()
+
+# Regular expressions can strip some of the wiki markup; a short demonstration.
+import re
+text = "딴 사람도 아니고 프로팀 [[Counter Logic Gaming|CLG]] 소속 전 서포터 [[스티브 차우|차우스터]]가 남긴 말이다."
+t1 = re.sub(r"\[\[([^\]|]*)\]\]", r'\1', text) # unwraps only links without a label: [[target]] -> target
+print(t1)
+t2 = re.sub(r"\[\[(?:[^\]|]*\|)?([^\]|]+)\]\]", r'\1', text) # also unwraps [[target|label]] -> label
+print(t2)
+
+# Character classes removed by strip(). They were referenced but never defined,
+# which made every call to strip() fail with NameError.
+# Han ideographs: CJK/Kangxi radicals, Extension A, Unified and Compatibility Ideographs.
+chinese = re.compile('[\u2e80-\u2fdf\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]')
+# Kana: Hiragana, Katakana, Katakana Phonetic Extensions, half-width Katakana.
+japanese = re.compile('[\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff\uff66-\uff9f]')
+
+
+def strip(text):
+    """Return text with wiki markup and Chinese/Japanese characters removed.
+
+    The substitutions run in a fixed order: whole constructs (html blocks,
+    redirects, category/file lines, youtube/include macros) are handled
+    first, then links, footnotes and inline formatting are unwrapped or
+    dropped, and finally Han and kana characters are deleted.
+    """
+    text = re.sub(r"\{\{\{#\!html[^\}]*\}\}\}", '', text, flags=re.IGNORECASE|re.MULTILINE|re.DOTALL) # remove html
+    text = re.sub(r"#redirect .*", '', text, flags=re.IGNORECASE) # remove redirect
+    text = re.sub(r"\[\[분류:.*", '', text) # remove category lines
+    text = re.sub(r"\[\[파일:.*", '', text) # remove file lines
+    text = re.sub(r"\* 상위 문서 ?:.*", '', text) # remove parent-document lines
+    text = re.sub(r"\[youtube\(\w+\)\]", '', text, flags=re.IGNORECASE) # remove youtube
+    text = re.sub(r"\[include\(([^\]|]*)(\|[^]]*)?\]", r'\1', text, flags=re.IGNORECASE) # remove include
+    text = re.sub(r"\[\[(?:[^\]|]*\|)?([^\]|]+)\]\]", r'\1', text) # remove link, keep its label
+    text = re.sub(r"\[\*([^\]]*)\]", '', text) # remove footnotes
+    text = re.sub(r"\{\{\{([^\ }|]*) ([^\}|]*)\}\}\}", r'\2', text) # remove text color/size
+    text = re.sub(r"'''([^']*)'''", r'\1', text) # remove text bold
+    # Strike-through: non-greedy and closed by the SAME delimiter that opened
+    # it, so text between two separate struck spans is kept.
+    text = re.sub(r"(~~|--).*?\1", '', text) # remove strike-through
+
+    text = re.sub(r"\|\|(.*)\|\|", '', text) # remove table
+
+    text = chinese.sub('', text) # remove chinese
+    text = japanese.sub('', text) # remove japanese
+    return text
+
+# Print the stripped text of the first couple of articles as a sanity check.
+for article in data[:2]:
+    print(article['title'])
+    print(strip(article['text']))
+    print()
+
+# Generate raw text corpus
+
+MIN_TEXT_SIZE = 5000
+# The upper bound was referenced below but never defined (NameError).
+# NOTE(review): 50000 is an assumed value — confirm the intended limit.
+MAX_ARTICLE_SIZE = 50000
+
+# Written as UTF-8 explicitly so the Korean text does not depend on the
+# platform's locale default.
+with open('input.txt', 'w', encoding='utf-8') as f:
+    for article in data:
+        if article['title'] in black_list_title:
+            continue # skip articles listed as having no natural-language content
+        if not MIN_TEXT_SIZE <= len(article['text']) < MAX_ARTICLE_SIZE:
+            continue # skip too small, too large articles
+
+        text = strip(article['text'])
+        f.write("%s\n%s\n\n\n" % (article['title'], text))
\ No newline at end of file