fianl 1st

신은섭(Shin Eun Seop)
Commit 305a0a0f0d0f143f30c98690d2d5c68f08da4778 305a0a0f 1 parent 6c0ef0fd
Showing 8 changed files with 645 additions and 0 deletions
movie2/dataset.py
movie2/doc2vec.model
movie2/embadding.py
movie2/kor_char_parser.py
movie2/main.py
movie2/setup.py
movie2/test.txt
movie2/text_helpers.py
--- a/movie2/dataset.py 0 → 100644
View file @305a0a0
+++ b/movie2/dataset.py 0 → 100644
View file @305a0a0
+"""
+kin dataset 
+"""
+
+import os
+import numpy as np
+# from kor_char_parser import decompose_str_as_one_hot
+
+import text_helpers
+from konlpy.tag import Twitter
+pos_tagger = Twitter()
+
+class KinQueryDataset:
+    """
+    Reads the Knowledge-iN (kin) training data and exposes it as an indexable
+    collection of (data, label) pairs.
+    """
+    def __init__(self, dataset_path: str, max_length: int):
+        """
+        :param dataset_path: root path of the dataset
+        :param max_length: maximum string length, forwarded to preprocess()
+        """
+        # Both training files live under <dataset_path>/train
+        train_dir = os.path.join(dataset_path, 'train')
+        data_file = os.path.join(train_dir, 'train_data')
+        label_file = os.path.join(train_dir, 'train_label')
+
+        # Read the raw queries and hand them to preprocess()
+        with open(data_file, 'rt', encoding='utf8') as query_stream:
+            raw_queries = query_stream.readlines()
+            self.queries = preprocess(raw_queries, max_length)
+
+        # Read the labels: one float32 per line, each wrapped in its own row
+        with open(label_file) as label_stream:
+            label_rows = [[np.float32(line)] for line in label_stream.readlines()]
+            self.labels = np.array(label_rows)
+
+    def __len__(self):
+        """
+        :return: total number of samples
+        """
+        sample_count = len(self.queries)
+        return sample_count
+
+    def __getitem__(self, idx):
+        """
+        :param idx: index (or slice) of the wanted sample(s)
+        :return: the (data, label) pair stored at that index
+        """
+        data, label = self.queries[idx], self.labels[idx]
+        return data, label
+
+def tokenize(doc):
+    """Return the POS-tagged tokens of ``doc`` as 'token/tag' strings."""
+    # norm and stem are optional switches of the tagger
+    tagged_tokens = pos_tagger.pos(doc, norm=True, stem=True)
+    return ['/'.join(pair) for pair in tagged_tokens]
+
+def preprocess(data: list, max_length: int):
+    """
+    Tokenize raw question-pair lines.
+
+    :param data: list of raw text lines; each line is expected to hold two
+                 questions separated by a tab (NOTE(review): confirm against the
+                 real train_data format)
+    :param max_length: maximum string length; not used yet, kept so existing
+                       callers keep working
+    :return: list of (tokens_of_first_question, tokens_of_second_question) tuples
+    """
+    train_docs = []
+    for row in data:
+        # Indexing the raw line (row[0], row[1]) would pick its first two
+        # characters, so split the line into its two questions first.
+        first, _, second = row.rstrip('\r\n').partition('\t')
+        train_docs.append((tokenize(first), tokenize(second)))
+    # Hand the result back so callers do not receive None.
+    # NOTE(review): callers that feed this into a fixed-width integer tensor
+    # still need a token -> index / padding step that is not implemented here.
+    return train_docs
+
--- a/movie2/doc2vec.model 0 → 100644
View file @305a0a0
+++ b/movie2/doc2vec.model 0 → 100644
View file @305a0a0
--- a/movie2/embadding.py 0 → 100644
View file @305a0a0
+++ b/movie2/embadding.py 0 → 100644
View file @305a0a0
+# -*- coding: utf-8 -*-
+from konlpy.corpus import kolaw
+def read_data(filename, encoding='utf-8'):
+    """
+    Read a tab-separated text file and return its rows without the header.
+
+    :param filename: path of the file to read
+    :param encoding: text encoding of the file; defaults to UTF-8 so the result
+                     does not depend on the platform's default encoding
+    :return: list of rows, each row being the list of tab-separated fields
+    """
+    with open(filename, 'r', encoding=encoding) as f:
+        rows = [line.split('\t') for line in f.read().splitlines()]
+    # Drop the header row
+    return rows[1:]
+
+# Load the constitution text from KoNLPy's kolaw corpus.
+train_data = kolaw.open('constitution.txt').read()
+
+# NOTE(review): read() presumably returns the whole corpus as a single string, so
+# these two lines print a character count and the length of the first character
+# rather than a row count / column count — confirm.
+print(len(train_data))
+print(len(train_data[0]))
+
+from konlpy.tag import Twitter
+pos_tagger = Twitter()
+
+def tokenize(doc):
+    """Tag ``doc`` with the module-level tagger and join each (token, tag) pair with '/'."""
+    # norm and stem are optional
+    joined = []
+    for token_and_tag in pos_tagger.pos(doc, norm=True, stem=True):
+        joined.append('/'.join(token_and_tag))
+    return joined
+
+# Build the (tokens, tag) training documents, one per non-empty line of the corpus.
+# train_data is one long string, so iterating it directly would yield single
+# characters; split it into lines and tokenize each whole line instead.
+train_docs = []
+for row in train_data.splitlines():
+    if not row.strip():
+        # Blank lines carry no tokens
+        continue
+    # Every document gets the same placeholder tag '0'
+    train_docs.append((tokenize(row), '0'))
+
+# Check that the documents were loaded as expected
+from pprint import pprint
+pprint(train_docs[0])
+
+# Wrap every (tokens, tag) pair in the TaggedDocument type expected by Doc2Vec.
+from gensim.models.doc2vec import TaggedDocument
+tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
+
+from gensim.models import doc2vec
+import multiprocessing
+# Use one worker per CPU reported by the OS.
+cores = multiprocessing.cpu_count()
+
+# Build the vocabulary, then train.
+# NOTE(review): alpha and min_alpha are both 0.025, so presumably the learning
+# rate never decays during training — confirm this is intended.
+doc_vectorizer = doc2vec.Doc2Vec(vector_size=1000, alpha=0.025, min_alpha=0.025, seed=1234, epochs=100, workers=cores, hs=1)
+doc_vectorizer.build_vocab(tagged_train_docs)
+doc_vectorizer.train(tagged_train_docs, epochs=doc_vectorizer.epochs, total_examples=doc_vectorizer.corpus_count)
+
+# Save the trained model to the current working directory
+doc_vectorizer.save('doc2vec.model')
+
+# Reload the saved model and print the neighbours of one token as a sanity check
+doc_vectorizer = doc2vec.Doc2Vec.load('doc2vec.model')
+pprint(doc_vectorizer.wv.most_similar('한국/Noun'))
--- a/movie2/kor_char_parser.py 0 → 100644
View file @305a0a0
+++ b/movie2/kor_char_parser.py 0 → 100644
View file @305a0a0
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 NAVER Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+# Jamo tables used to decompose a precomposed Hangul syllable.
+# cho is indexed by the leading-consonant number, jung by the vowel number.
+cho = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"  # len = 19
+jung = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"  # len = 21
+# len = 27; a list (not a string) because some final consonants are two jamo long
+jong = "ㄱ/ㄲ/ㄱㅅ/ㄴ/ㄴㅈ/ㄴㅎ/ㄷ/ㄹ/ㄹㄱ/ㄹㅁ/ㄹㅂ/ㄹㅅ/ㄹㅌ/ㄹㅍ/ㄹㅎ/ㅁ/ㅂ/ㅂㅅ/ㅅ/ㅆ/ㅇ/ㅈ/ㅊ/ㅋ/ㅌ/ㅍ/ㅎ".split(
+    '/')
+# Single string holding every jamo above; used for the membership check below
+test = cho + jung + ''.join(jong)
+
+# Number of index slots reserved for decomposed Hangul jamo
+hangul_length = len(cho) + len(jung) + len(jong)  # 67
+
+
+def is_valid_decomposition_atom(x):
+    """Return True when the string ``x`` occurs inside the combined jamo string ``test``."""
+    # Substring search: equivalent to the ``in`` operator on two strings
+    position = test.find(x)
+    return position != -1
+
+
+def decompose(x):
+    """
+    Decompose one code point into its jamo string.
+
+    :param x: integer code point
+    :return: ``chr(x)`` unchanged when x is outside '가'..'힣'; otherwise the
+             concatenation of leading consonant, vowel and (optional) final consonant
+    """
+    code_point = x
+    if not (ord('가') <= code_point <= ord('힣')):
+        return chr(code_point)
+
+    # Offset inside the syllable block -> (leading, vowel, final) indices
+    syllable_offset = code_point - ord('가')
+    lead_and_vowel, final_idx = divmod(syllable_offset, 28)
+    lead_idx, vowel_idx = divmod(lead_and_vowel, 21)
+
+    # final_idx == 0 means "no final consonant", so the jong table is 1-based here
+    final_jamo = '' if final_idx <= 0 else jong[final_idx - 1]
+    if lead_idx >= len(cho):
+        print('Unknown Exception: ', code_point, chr(code_point),
+              lead_idx, vowel_idx, final_idx, final_jamo)
+    return cho[lead_idx] + jung[vowel_idx] + final_jamo
+
+
+def decompose_as_one_hot(in_char, warning=True):
+    """
+    Map one code point to a list of integer indices.
+
+    Index layout produced by this function:
+      [0, 66]    jamo of a decomposed Hangul syllable (19 cho + 21 jung + 27 jong)
+      [67, 194]  code points below 128 (ASCII)
+      [195, 245] stand-alone jamo 'ㄱ'..'ㅣ' (51 values)
+      [246, 249] the special characters ♡ ♥ ★ ☆
+      250        every other (unknown) character
+    i.e. 251 distinct indices in total.
+
+    :param in_char: integer code point
+    :param warning: when True, print a message for unexpected input
+    :return: two or three indices for a Hangul syllable, otherwise a one-element list
+    """
+    if ord('가') <= in_char <= ord('힣'):  # 가: 44032, 힣: 55203
+        offset = in_char - 44032
+        pair_idx, z = divmod(offset, 28)
+        x, y = divmod(pair_idx, 21)
+        # z == 0 means "no final consonant", so jong is addressed with z - 1
+        zz = '' if z <= 0 else jong[z - 1]
+        if x >= len(cho) and warning:
+            print('Unknown Exception: ', in_char,
+                  chr(in_char), x, y, z, zz)
+
+        indices = [x, len(cho) + y]
+        if z > 0:
+            indices.append(len(cho) + len(jung) + (z - 1))
+        return indices
+
+    # Everything below yields exactly one index
+    if in_char < 128:
+        return [hangul_length + in_char]  # 67..194
+
+    after_ascii = hangul_length + 128
+    if ord('ㄱ') <= in_char <= ord('ㅣ'):
+        # 195..245: [ㄱ:12593]~[ㅣ:12643] (len = 51)
+        return [after_ascii + (in_char - 12593)]
+
+    # 246..249, in this fixed order
+    specials = (ord('♡'), ord('♥'), ord('★'), ord('☆'))
+    if in_char in specials:
+        return [after_ascii + 51 + specials.index(in_char)]
+
+    if warning:
+        print('Unhandled character:', chr(in_char), in_char)
+    # 250: shared slot for every unknown character
+    return [after_ascii + 51 + 4]
+
+
+def decompose_str(string):
+    """Decompose every character of ``string`` and concatenate the results."""
+    pieces = []
+    for ch in string:
+        pieces.append(decompose(ord(ch)))
+    return ''.join(pieces)
+
+
+def decompose_str_as_one_hot(string, warning=True):
+    """
+    Convert ``string`` to one flat list of indices, character by character.
+
+    :param string: text to convert
+    :param warning: forwarded to decompose_as_one_hot
+    :return: concatenated index lists of all characters, in order
+    """
+    return [index
+            for ch in string
+            for index in decompose_as_one_hot(ord(ch), warning=warning)]
--- a/movie2/main.py 0 → 100644
View file @305a0a0
+++ b/movie2/main.py 0 → 100644
View file @305a0a0
+# -*- coding: utf-8 -*-
+
+"""
+Copyright 2018 NAVER Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+
+import argparse
+import os
+
+import numpy as np
+import tensorflow as tf
+
+import nsml
+from nsml import DATASET_PATH, HAS_DATASET, IS_ON_NSML
+from dataset import KinQueryDataset, preprocess
+
+
+# DONOTCHANGE: They are reserved for nsml
+# This is for nsml leaderboard
+def bind_model(sess, config):
+    """
+    Register save / load / infer callbacks with nsml.
+
+    :param sess: TensorFlow session holding the model variables
+    :param config: parsed command-line options (uses ``strmaxlen`` and ``threshold``)
+    """
+    # Saves the trained model.
+    def save(dir_name, *args):
+        # directory
+        os.makedirs(dir_name, exist_ok=True)
+        saver = tf.train.Saver()
+        saver.save(sess, os.path.join(dir_name, 'model'))
+
+    # Restores a previously saved model.
+    def load(dir_name, *args):
+        saver = tf.train.Saver()
+        # find checkpoint
+        ckpt = tf.train.get_checkpoint_state(dir_name)
+        if ckpt and ckpt.model_checkpoint_path:
+            checkpoint = os.path.basename(ckpt.model_checkpoint_path)
+            saver.restore(sess, os.path.join(dir_name, checkpoint))
+        else:
+            # NotImplemented is a constant, not an exception class; calling it
+            # raises TypeError. Raise the real exception type instead.
+            raise NotImplementedError('No checkpoint!')
+        print('Model loaded')
+
+    def infer(raw_data, **kwargs):
+        """
+        :param raw_data: raw input (strings in this task)
+        :param kwargs: unused
+        :return: list of (probability, 0 or 1) tuples
+        """
+        # Convert the raw strings with the preprocess function from dataset.py
+        preprocessed_data = preprocess(raw_data, config.strmaxlen)
+        # Feed the input to the model and fetch the predictions.
+        # output_sigmoid and x are the module-level graph tensors.
+        pred = sess.run(output_sigmoid, feed_dict={x: preprocessed_data})
+        # The np.int alias was removed from NumPy; the builtin int is the same dtype.
+        clipped = np.array(pred > config.threshold, dtype=int)
+        # DONOTCHANGE: They are reserved for nsml
+        # The result must have the form [(probability, 0 or 1)] to be accepted by
+        # the leaderboard. The probability value does not affect the score.
+        return list(zip(pred.flatten(), clipped.flatten()))
+
+    # DONOTCHANGE: They are reserved for nsml
+    # Makes the functions above reachable from nsml.
+    nsml.bind(save=save, load=load, infer=infer)
+
+
+def _batch_loader(iterable, n=1):
+    """
+    Yield consecutive slices of ``iterable`` that hold at most ``n`` items each.
+    Plays the role that DataLoader plays in PyTorch.
+    :param iterable: a list or any other object supporting len() and slicing
+    :param n: batch size
+    :return: generator over the batches
+    """
+    total = len(iterable)
+    for start in range(0, total, n):
+        stop = start + n
+        # Clamp the last batch to the end of the data
+        if stop > total:
+            stop = total
+        yield iterable[start:stop]
+
+
+def weight_variable(shape):
+    """Create a variable of the given shape initialised from a truncated normal (stddev 0.1)."""
+    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
+
+
+def bias_variable(shape):
+    """Create a variable of the given shape with every element set to 0.1."""
+    return tf.Variable(tf.constant(0.1, shape=shape))
+
+
+# Script entry point: parse the options, build the graph, then train or run a
+# local inference test depending on --mode.
+if __name__ == '__main__':
+    args = argparse.ArgumentParser()
+    # DONOTCHANGE: They are reserved for nsml
+    args.add_argument('--mode', type=str, default='train')
+    args.add_argument('--pause', type=int, default=0)
+    args.add_argument('--iteration', type=str, default='0')
+
+    # User options
+    args.add_argument('--output', type=int, default=1)
+    args.add_argument('--epochs', type=int, default=10)
+    args.add_argument('--batch', type=int, default=2000)
+    args.add_argument('--strmaxlen', type=int, default=400)
+    args.add_argument('--embedding', type=int, default=8)
+    args.add_argument('--threshold', type=float, default=0.5)
+    config = args.parse_args()
+
+    if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
+        DATASET_PATH = '../sample_data/kin/'
+
+    # Model specification
+    input_size = config.embedding*config.strmaxlen
+    output_size = 1
+    hidden_layer_size = 200
+    learning_rate = 0.001
+    character_size = 251
+
+    x = tf.placeholder(tf.int32, [None, config.strmaxlen])
+    y_ = tf.placeholder(tf.float32, [None, output_size])
+    # Embedding
+    char_embedding = tf.get_variable('char_embedding', [character_size, config.embedding])
+    embedded = tf.nn.embedding_lookup(char_embedding, x)
+
+    # First layer
+    first_layer_weight = weight_variable([input_size, hidden_layer_size])
+    first_layer_bias = bias_variable([hidden_layer_size])
+    hidden_layer = tf.matmul(tf.reshape(embedded, (-1, input_size)),
+                             first_layer_weight) + first_layer_bias
+
+    # Second (output) layer
+    second_layer_weight = weight_variable([hidden_layer_size, output_size])
+    second_layer_bias = bias_variable([output_size])
+    output = tf.matmul(hidden_layer, second_layer_weight) + second_layer_bias
+    output_sigmoid = tf.sigmoid(output)
+
+    # Loss and optimizer
+    binary_cross_entropy = tf.reduce_mean(-(y_ * tf.log(output_sigmoid)) - (1-y_) * tf.log(1-output_sigmoid))
+    train_step = tf.train.AdamOptimizer(learning_rate).minimize(binary_cross_entropy)
+
+    sess = tf.InteractiveSession()
+    tf.global_variables_initializer().run()
+
+    # DONOTCHANGE: Reserved for nsml
+    bind_model(sess=sess, config=config)
+
+    # DONOTCHANGE: Reserved for nsml
+    if config.pause:
+        nsml.paused(scope=locals())
+
+    if config.mode == 'train':
+        # Load the data.
+        dataset = KinQueryDataset(DATASET_PATH, config.strmaxlen)
+        dataset_len = len(dataset)
+        # Number of minibatches per epoch (rounded up)
+        one_batch_size = dataset_len//config.batch
+        if dataset_len % config.batch != 0:
+            one_batch_size += 1
+        # Train for the configured number of epochs.
+        for epoch in range(config.epochs):
+            avg_loss = 0.0
+            for i, (data, labels) in enumerate(_batch_loader(dataset, config.batch)):
+                _, loss = sess.run([train_step, binary_cross_entropy],
+                                   feed_dict={x: data, y_: labels})
+                print('Batch : ', i + 1, '/', one_batch_size,
+                      ', BCE in this minibatch: ', float(loss))
+                avg_loss += float(loss)
+            print('epoch:', epoch, ' train_loss:', float(avg_loss/one_batch_size))
+            nsml.report(summary=True, scope=locals(), epoch=epoch, epoch_total=config.epochs,
+                        train__loss=float(avg_loss/one_batch_size), step=epoch)
+            # DONOTCHANGE (You can decide how often you want to save the model)
+            nsml.save(epoch)
+
+    # Used for local testing.
+    # If the output looks like the line below, it can be submitted with nsml submit.
+    # [(0.3, 0), (0.7, 1), ... ]
+    elif config.mode == 'test_local':
+        with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
+            queries = f.readlines()
+        res = []
+        for batch in _batch_loader(queries, config.batch):
+            temp_res = nsml.infer(batch)
+            res += temp_res
+        # res only exists in this branch; printing it after the if/elif would
+        # raise NameError at the end of every training run.
+        print(res)
--- a/movie2/setup.py 0 → 100644
View file @305a0a0
+++ b/movie2/setup.py 0 → 100644
View file @305a0a0
+"""
+Copyright 2018 NAVER Corp.
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+from distutils.core import setup
+# Package metadata and the third-party packages this example needs.
+# NOTE(review): install_requires is a setuptools keyword, while setup is imported
+# from distutils.core here; plain distutils presumably does not act on it —
+# confirm that the target platform reads this list itself.
+setup(
+    name='nsml movie review',
+    version='1.0',
+    description='',
+    install_requires=[
+        'nltk',
+        'konlpy',
+        'twython'
+
+    ]
+)
\ No newline at end of file
--- a/movie2/test.txt 0 → 100644
View file @305a0a0
+++ b/movie2/test.txt 0 → 100644
View file @305a0a0
+경제가 불안하면 국채를 왜발행하나요??	국채 발행 이유?
+경제가 불안하면 국채를 왜발행하나요??	국채를 많이 발행하면 왜 수요가 떨어져요?
+경제가 불안하면 국채를 왜발행하나요??	국채발행은 왜 필요한가
+경제가 불안하면 국채를 왜발행하나요??	국채를 발행하며 왜 시장 금리가 올라가나요?
+경제가 불안하면 국채를 왜발행하나요??	나라가 국채를 발행할때
+국채 발행 이유?	국채를 많이 발행하면 왜 수요가 떨어져요?
+국채 발행 이유?	국채발행은 왜 필요한가
+국채 발행 이유?	국채를 발행하며 왜 시장 금리가 올라가나요?
+국채 발행 이유?	나라가 국채를 발행할때
+외환보유고 에 관한 질문	외환보유고에 대해...
+외환보유고 에 관한 질문	한국의 외환보유고에 관한 질문
+외환보유고 에 관한 질문	외환보유고에 대해서
+외환보유고 에 관한 질문	한국의 외환 보유고 (추가내공 有)
+외환보유고에 대해...	한국의 외환보유고에 관한 질문
+외환보유고에 대해...	외환보유고에 대해서
+외환보유고에 대해...	한국의 외환 보유고 (추가내공 有)
+한국의 외환보유고에 관한 질문	외환보유고에 대해서
+한국의 외환보유고에 관한 질문	한국의 외환 보유고 (추가내공 有)
+외환보유고에 대해서	한국의 외환 보유고 (추가내공 有)
+외환보유고 에 관한 질문	외환보유액에 대해서
+외환보유고에 대해...	외환보유액에 대해서
+한국의 외환보유고에 관한 질문	외환보유액에 대해서
+외환보유고에 대해서	외환보유액에 대해서
+한국의 외환 보유고 (추가내공 有)	외환보유액에 대해서
+중국은 달러 보유가 많아서 미국 금융위기에 영향을 가장 적게 받는다고 하는데 사실인가요?	중국에서 미국달러가치가 높아짐에따라 중국은 어떠한영향을 받는지에대해
+중국은 달러 보유가 많아서 미국 금융위기에 영향을 가장 적게 받는다고 하는데 사실인가요?	미국 중국 금리변화가 끼치는 영향?
+중국은 달러 보유가 많아서 미국 금융위기에 영향을 가장 적게 받는다고 하는데 사실인가요?	중국이 미국을 제치고 1위의 경제대국이 되면 달러의 가치는 어떻게 될까요?
+중국은 달러 보유가 많아서 미국 금융위기에 영향을 가장 적게 받는다고 하는데 사실인가요?	미국국채를 일본과 중국이 구입할때
+중국은 달러 보유가 많아서 미국 금융위기에 영향을 가장 적게 받는다고 하는데 사실인가요?	미국 금리인상이 우리나라 환율에 어떤 영향을 미치나요?
+중국에서 미국달러가치가 높아짐에따라 중국은 어떠한영향을 받는지에대해	미국 중국 금리변화가 끼치는 영향?
+중국에서 미국달러가치가 높아짐에따라 중국은 어떠한영향을 받는지에대해	중국이 미국을 제치고 1위의 경제대국이 되면 달러의 가치는 어떻게 될까요?
+중국에서 미국달러가치가 높아짐에따라 중국은 어떠한영향을 받는지에대해	미국국채를 일본과 중국이 구입할때
+중국에서 미국달러가치가 높아짐에따라 중국은 어떠한영향을 받는지에대해	미국 금리인상이 우리나라 환율에 어떤 영향을 미치나요?
+한-미 통화 스와프체결이 무슨 뜻인가요?	통화스와프가 무슨 뜻이예요?
+한-미 통화 스와프체결이 무슨 뜻인가요?	유상사급이 무슨 뜻인가요?
+한-미 통화 스와프체결이 무슨 뜻인가요?	scrap yard가 무슨 뜻인가요??
+한-미 통화 스와프체결이 무슨 뜻인가요?	컨소시엄이 무슨뜻인가요?
+한-미 통화 스와프체결이 무슨 뜻인가요?	Tango가 무슨 뜻인가요
+노동시장의 유연화	노동시장의 유연성이란?
+노동시장의 유연화	노동시장알려주세요.
+노동시장의 유연화	자신의 노동에 대해 매겨지는 시장시세는 정당한가?
+노동시장의 유연화	노동시장 구조개혁
+노동시장의 유연화	노동시장에서 노동자는 수요자인가요 공급자인가요?
+노동시장의 유연성이란?	노동시장알려주세요.
+노동시장의 유연성이란?	자신의 노동에 대해 매겨지는 시장시세는 정당한가?
+노동시장의 유연성이란?	노동시장 구조개혁
+노동시장의 유연성이란?	노동시장에서 노동자는 수요자인가요 공급자인가요?
+브릭스(BRICs)에 대해서 질문좀 할께요	브릭스에 대한 질문
+브릭스(BRICs)에 대해서 질문좀 할께요	브릭스에 대해서 물어볼게있어요
+브릭스에 대한 질문	브릭스에 대해서 물어볼게있어요
+브릭스(BRICs)에 대해서 질문좀 할께요	베네룩스에 대해서 질문 좀 할게요
+브릭스(BRICs)에 대해서 질문좀 할께요	좌수에 대한 질문입니다.
+브릭스(BRICs)에 대해서 질문좀 할께요	외규장각 의궤에대한 질문
+브릭스에 대한 질문	베네룩스에 대해서 질문 좀 할게요
+브릭스에 대한 질문	좌수에 대한 질문입니다.
+브릭스에 대한 질문	외규장각 의궤에대한 질문
+브릭스에 대해서 물어볼게있어요	베네룩스에 대해서 질문 좀 할게요
+브릭스에 대해서 물어볼게있어요	좌수에 대한 질문입니다.
+브릭스에 대해서 물어볼게있어요	외규장각 의궤에대한 질문
+"전화국에서 교환기,기지국이 하는 역할이 무엇인지?	기지국이뭐고 전화국이뭔가요?"
+"전화국에서 교환기,기지국이 하는 역할이 무엇인지?	PCS와 위성 기지국이 머에요?"
+"전화국에서 교환기,기지국이 하는 역할이 무엇인지?	PCS기지국이 SKT에서 쓰는 기지국보다 좋지 않나요?"
+"전화국에서 교환기,기지국이 하는 역할이 무엇인지?	통화란 무엇이며, 통화의 역할은 무엇인가요?"
+"전화국에서 교환기,기지국이 하는 역할이 무엇인지?	캐치콜의 원리는 무엇인가요?"
+기지국이뭐고 전화국이뭔가요?	PCS와 위성 기지국이 머에요?
+기지국이뭐고 전화국이뭔가요?	PCS기지국이 SKT에서 쓰는 기지국보다 좋지 않나요?
+"기지국이뭐고 전화국이뭔가요?	통화란 무엇이며, 통화의 역할은 무엇인가요?"
+기지국이뭐고 전화국이뭔가요?	캐치콜의 원리는 무엇인가요?
+가등록이 되었던 폰이라는건 뭔가요	자급제폰이 뭔가요?
+가등록이 되었던 폰이라는건 뭔가요	일시구입폰이뭔가요
+가등록이 되었던 폰이라는건 뭔가요	부품용 폰이 뭔가요?
+가등록이 되었던 폰이라는건 뭔가요	제일 최근에 나온 폰이 뭐에여?
+가등록이 되었던 폰이라는건 뭔가요	자급제폰이 뭐죠
+휴대폰 불법복제란 무엇인가요?	휴대폰복제에대해서궁금합니다.
+휴대폰 불법복제란 무엇인가요?	휴대폰 복제란?
+휴대폰복제에대해서궁금합니다.	휴대폰 복제란?
+휴대폰 불법복제란 무엇인가요?	불법 복제가 뭔가요??
+휴대폰 불법복제란 무엇인가요?	복제란 무엇인가?
+휴대폰 불법복제란 무엇인가요?	복제란 무엇인가요 ?
+휴대폰복제에대해서궁금합니다.	불법 복제가 뭔가요??
+휴대폰복제에대해서궁금합니다.	복제란 무엇인가?
+휴대폰복제에대해서궁금합니다.	복제란 무엇인가요 ?
+휴대폰 복제란?	불법 복제가 뭔가요??
+휴대폰 복제란?	복제란 무엇인가?
+휴대폰 복제란?	복제란 무엇인가요 ?
+왜 휴대폰으로 전화 할땐 지역번호를 눌러야 할까요?	집전화는 지역번호를 누르지않아도되고 휴대폰은 눌러야하는 이유?
+왜 휴대폰으로 전화 할땐 지역번호를 눌러야 할까요?	인터넷에는 왜 휴대폰번호나 집전화번호를 입력하는 것은 안 될까요?
+왜 휴대폰으로 전화 할땐 지역번호를 눌러야 할까요?	휴대폰전화번호 뒷자리 4번과 자기 집 전화번호 뒷자리 4번과 같은 이유?
+왜 휴대폰으로 전화 할땐 지역번호를 눌러야 할까요?	한개 휴대폰으로 2개이상 전화번호사용?(내공20)
+왜 휴대폰으로 전화 할땐 지역번호를 눌러야 할까요?	119 장난전화 하면 안되는 이유(숙제)
+집전화는 지역번호를 누르지않아도되고 휴대폰은 눌러야하는 이유?	인터넷에는 왜 휴대폰번호나 집전화번호를 입력하는 것은 안 될까요?
+집전화는 지역번호를 누르지않아도되고 휴대폰은 눌러야하는 이유?	휴대폰전화번호 뒷자리 4번과 자기 집 전화번호 뒷자리 4번과 같은 이유?
+집전화는 지역번호를 누르지않아도되고 휴대폰은 눌러야하는 이유?	한개 휴대폰으로 2개이상 전화번호사용?(내공20)
+집전화는 지역번호를 누르지않아도되고 휴대폰은 눌러야하는 이유?	119 장난전화 하면 안되는 이유(숙제)
+스크랩하는 방법을 알려주세요	스크랩하는 방법 좀 알려주세요~
+스크랩하는 방법을 알려주세요	스크랩하는 방법좀 알려주세요.
+스크랩하는 방법을 알려주세요	스크랩하기 사용방법좀 알려주세요.
+스크랩하는 방법을 알려주세요	스크랩하는 방법좀 알려주세요 자세히요.
+스크랩하는 방법을 알려주세요	스크랩 방법을 알려주세요!!
+스크랩하는 방법 좀 알려주세요~	스크랩하는 방법좀 알려주세요.
+스크랩하는 방법 좀 알려주세요~	스크랩하기 사용방법좀 알려주세요.
+스크랩하는 방법 좀 알려주세요~	스크랩하는 방법좀 알려주세요 자세히요.
+스크랩하는 방법 좀 알려주세요~	스크랩 방법을 알려주세요!!
+스크랩하는 방법좀 알려주세요.	스크랩하기 사용방법좀 알려주세요.
+스크랩하는 방법좀 알려주세요.	스크랩하는 방법좀 알려주세요 자세히요.
+스크랩하는 방법좀 알려주세요.	스크랩 방법을 알려주세요!!
+스크랩하기 사용방법좀 알려주세요.	스크랩하는 방법좀 알려주세요 자세히요.
+스크랩하기 사용방법좀 알려주세요.	스크랩 방법을 알려주세요!!
+스크랩하는 방법좀 알려주세요 자세히요.	스크랩 방법을 알려주세요!!
\ No newline at end of file
--- a/movie2/text_helpers.py 0 → 100644
View file @305a0a0
+++ b/movie2/text_helpers.py 0 → 100644
View file @305a0a0
+# Text Helper Functions
+#---------------------------------------
+#
+# We pull out text helper functions to reduce redundant code
+
+import string
+import os
+import urllib.request
+import io
+import tarfile
+import collections
+import numpy as np
+
+# Normalize text
+def normalize_text(texts, stops):
+    """
+    Lower-case each text, strip ASCII punctuation and digits, drop stop words
+    and collapse whitespace.
+
+    :param texts: iterable of strings
+    :param stops: container of stop words to remove
+    :return: list of cleaned strings, in the input order
+    """
+    unwanted_chars = string.punctuation + '0123456789'
+
+    def _clean(text):
+        # Lower case, then drop punctuation and digits character by character
+        kept = ''.join(ch for ch in text.lower() if ch not in unwanted_chars)
+        # split()/join() both removes the stop words and trims extra whitespace
+        return ' '.join(word for word in kept.split() if word not in (stops))
+
+    return [_clean(text) for text in texts]
+
+
+# Build dictionary of words
+def build_dictionary(sentences, vocabulary_size):
+    """
+    Build a word -> index mapping from whitespace-separated sentences.
+
+    Index 0 is reserved for the placeholder 'RARE'; the remaining entries are the
+    ``vocabulary_size - 1`` most frequent words in descending frequency order.
+
+    :param sentences: iterable of strings
+    :param vocabulary_size: total number of entries including 'RARE'
+    :return: dict mapping each word to its index
+    """
+    # Flatten all sentences into one list of words
+    all_words = []
+    for sentence in sentences:
+        all_words.extend(sentence.split())
+
+    # Unknown-word placeholder first, then the N-1 most frequent words
+    ranked = [('RARE', -1)]
+    ranked += collections.Counter(all_words).most_common(vocabulary_size - 1)
+
+    # Each word receives the dictionary size at the moment it is inserted
+    word_dict = {}
+    for entry in ranked:
+        word_dict[entry[0]] = len(word_dict)
+    return word_dict
+    
+
+# Turn text data into lists of integers from dictionary
+def text_to_numbers(sentences, word_dict):
+    """
+    Replace every word of every sentence by its index in ``word_dict``.
+
+    :param sentences: iterable of whitespace-separated strings
+    :param word_dict: mapping word -> index
+    :return: list of index lists; words missing from ``word_dict`` become 0
+    """
+    return [
+        [word_dict[word] if word in word_dict else 0 for word in sentence.split()]
+        for sentence in sentences
+    ]
+    
+
+# Generate data randomly (N words behind, target, N words ahead)
+def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
+    """
+    Sample random sentences and turn them into one training batch.
+
+    :param sentences: list of sentences, each a list of word indices
+    :param batch_size: number of examples to return
+    :param window_size: number of context words on each side of the target
+    :param method: 'skip_gram', 'cbow' or 'doc2vec'
+    :return: (batch_data, label_data) numpy arrays; label_data is a column vector
+    :raises ValueError: for an unknown method, or when no sentence is long enough
+                        to produce a single example
+    """
+    batch_data = []
+    label_data = []
+    # Indices of sentences that turned out to be too short to yield any example
+    barren = set()
+    while len(batch_data) < batch_size:
+        # Select a random sentence; indexing the size-1 result avoids the
+        # deprecated implicit array -> int conversion.
+        rand_sentence_ix = int(np.random.choice(len(sentences), size=1)[0])
+        rand_sentence = sentences[rand_sentence_ix]
+        # Consecutive windows around every position of the sentence
+        window_sequences = [rand_sentence[max((ix-window_size), 0):(ix+window_size+1)]
+                            for ix, x in enumerate(rand_sentence)]
+        # Position of the center word inside each window
+        label_indices = [ix if ix < window_size else window_size
+                         for ix, x in enumerate(window_sequences)]
+
+        if method == 'skip_gram':
+            batch_and_labels = [(x[y], x[:y] + x[(y+1):]) for x, y in zip(window_sequences, label_indices)]
+            # Flatten into (target word, surrounding word) tuples
+            pairs = [(x, y_) for x, y in batch_and_labels for y_ in y]
+        elif method == 'cbow':
+            batch_and_labels = [(x[:y] + x[(y+1):], x[y]) for x, y in zip(window_sequences, label_indices)]
+            # Only keep windows with consistent 2*window_size
+            pairs = [(x, y) for x, y in batch_and_labels if len(x) == 2*window_size]
+        elif method == 'doc2vec':
+            # For doc2vec we keep LHS window only to predict target word
+            pairs = [(rand_sentence[i:i+window_size], rand_sentence[i+window_size])
+                     for i in range(0, len(rand_sentence)-window_size)]
+        else:
+            raise ValueError('Method {} not implmented yet.'.format(method))
+
+        if not pairs:
+            # This sentence is too short for the chosen method/window. Unpacking an
+            # empty zip() would raise, so skip it; give up with a clear error once
+            # every sentence has proved unusable instead of looping forever.
+            barren.add(rand_sentence_ix)
+            if len(barren) == len(sentences):
+                raise ValueError('No sentence is long enough to generate {} data '
+                                 'with window_size={}.'.format(method, window_size))
+            continue
+
+        batch, labels = [list(x) for x in zip(*pairs)]
+        if method == 'doc2vec':
+            # Add document index to batch!! Remember that we must extract the
+            # last index in batch for the doc-index
+            batch = [x + [rand_sentence_ix] for x in batch]
+
+        # extract batch and labels
+        batch_data.extend(batch[:batch_size])
+        label_data.extend(labels[:batch_size])
+    # Trim batch and label at the end
+    batch_data = batch_data[:batch_size]
+    label_data = label_data[:batch_size]
+
+    # Convert to numpy array
+    batch_data = np.array(batch_data)
+    label_data = np.transpose(np.array([label_data]))
+
+    return(batch_data, label_data)
\ No newline at end of file