bongminkim

chatbot_py files

# --- Styling.py ---

import torch
import csv
import hgtk
from konlpy.tag import Mecab
import random

mecab = Mecab()
positive_emo = ['ㅎㅎ', '~']
negative_emo = ['...', 'ㅠㅠ']
missing_banmal_words = []  # collects words not found in the banmal dictionary

# Morphological analysis via Mecab.
def mecab_token_pos_flat_fn(string):
    tokens_ko = mecab.pos(string)
    return [str(pos[0]) + '/' + str(pos[1]) for pos in tokens_ko]

# Helper for the rough style: find the pronoun (NP) '저' or '제' and replace it with '나' or '내'.
def exchange_NP(target, args):
    keyword = []
    ko_sp = mecab_token_pos_flat_fn(target)
    for idx, word in enumerate(ko_sp):
        if word.find('NP') > 0:
            keyword.append(word.split('/'))
            _idx = idx
            break
    if keyword == []:
        return '', -1, False

    if keyword[0][0] == '저':
        keyword[0][0] = '나'
    elif keyword[0][0] == '제':
        keyword[0][0] = '내'
    else:
        return keyword[0], _idx, False

    return keyword[0][0], _idx, True

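# Illustrative trace (editor's example; the exact Mecab tags are an assumption):
#   exchange_NP('저는 좋아요.', args)
#   -> tokens like ['저/NP', '는/JX', '좋/VA', '아요/EF', './SF']
#   -> returns ('나', 0, True): the informal replacement, the token index, and a found flag.
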
# Converts a word to the soft or rough speaking style.
def make_special_word(target, args, search_ec):
    # Split the sentence with Mecab (example output: ['오늘/MAG', '날씨/NNG', '좋/VA', '다/EF', './SF'])
    ko_sp = mecab_token_pos_flat_fn(target)

    keyword = []

    # If a token contains a final ending 'EF' (or, when search_ec is set, a
    # connective ending 'EC'), extract its index and keyword.
    for idx, word in enumerate(ko_sp):
        if word.find('EF') > 0:
            keyword.append(word.split('/'))
            _idx = idx
            break
        if search_ec:
            if ko_sp[-2].find('EC') > 0:
                keyword.append(ko_sp[-2].split('/'))
                _idx = len(ko_sp) - 1
                break
            else:
                continue

    # Return early if there is no 'EF'.
    if keyword == []:
        return '', -1
    else:
        keyword = keyword[0]

    if args.per_rough:
        return keyword[0], _idx

    # Decompose the keyword into jamo with hgtk (example output: 'ㅎㅏᴥㅅㅔᴥㅇㅛᴥ').
    h_separation = hgtk.text.decompose(keyword[0])
    total_word = ''

    for idx, word in enumerate(h_separation):
        total_word += word

    # Styling: attach the final consonant 'ㅇ' to the 'EF' ending.
    total_word = replaceRight(total_word, "ᴥ", "ㅇᴥ", 1)

    # Recompose the jamo: '하세요' -> '하세용'.
    h_combine = hgtk.text.compose(total_word)

    return h_combine, _idx

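# Illustrative trace (editor's example, with args.per_rough == False and the
# Mecab tagging shown in the comment above assumed):
#   make_special_word('오늘 날씨 좋다.', args, False)
#   -> keyword '다/EF' at index 3; decompose '다' -> 'ㄷㅏᴥ'
#   -> attach 'ㅇ' before the final 'ᴥ' -> 'ㄷㅏㅇᴥ'; compose -> '당'
#   -> returns ('당', 3): the styled ending and its token index.
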
# Builds the special tokens.
def make_special_token(args):
    # Special tokens that express sentiment.
    target_special_voca = []

    banmal_dict = get_rough_dic()

    # From the chatbot answers in the training set, extract the 'EF' endings and
    # create special tokens with the final consonant 'ㅇ' attached.
    with open('chatbot_0325_ALLLABEL_train.txt', 'r', encoding='utf-8') as f:
        rdr = csv.reader(f, delimiter='\t')
        for idx, line in enumerate(rdr):
            target = line[2]  # chatbot answer
            exchange_word, _ = make_special_word(target, args, False)
            target_special_voca.append(str(exchange_word))
    target_special_voca = list(set(target_special_voca))

    banmal_special_voca = []
    for i in range(len(target_special_voca)):
        try:
            banmal_special_voca.append(banmal_dict[target_special_voca[i]])
        except KeyError:
            if args.per_rough:
                print("not included in the banmal dictionary")

    # Add a few emoticons.
    target_special_voca.append('ㅎㅎ')
    target_special_voca.append('~')
    target_special_voca.append('ㅠㅠ')
    target_special_voca.append('...')
    target_special_voca = target_special_voca + banmal_special_voca

    # '<posi>' marks positive, '<nega>' marks negative.
    return ['<posi>', '<nega>'], target_special_voca

# Like Python's str.replace, but replacing from the right.
def replaceRight(original, old, new, count_right):
    repeat = 0
    text = original

    count_find = original.count(old)
    if count_right > count_find:  # if asked for more replacements than old occurs,
        repeat = count_find       # replace every occurrence (count_find times)
    else:
        repeat = count_right      # otherwise replace the requested count_right times

    for _ in range(repeat):
        find_index = text.rfind(old)  # rfind searches from the right
        text = text[:find_index] + new + text[find_index + len(old):]

    return text

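# Behaviour sketch (editor's example): only the rightmost occurrences change.
#   replaceRight('a-b-c', '-', '+', 1)  -> 'a-b+c'
#   replaceRight('a-b-c', '-', '+', 5)  -> 'a+b+c'  (count clamped to the matches found)
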
# Applies the styling transformation to the tensors fed into and out of the transformer.
def styling(enc_input, dec_input, dec_output, dec_outputs, enc_label, args, TEXT, LABEL):

    pad_tensor = torch.tensor([LABEL.vocab.stoi['<pad>']]).type(dtype=torch.int32).cuda()

    temp_enc = enc_input.data.cpu().numpy()
    batch_sentiment_list = []

    # Soft personality.
    if args.per_soft:
        # Rewrite the encoder input as: 나는 너를 좋아해 <posi> <pad> <pad> ...
        for i in range(len(temp_enc)):
            for j in range(args.max_len):
                if temp_enc[i][j] == 1 and enc_label[i] == 0:
                    temp_enc[i][j] = TEXT.vocab.stoi["<nega>"]
                    batch_sentiment_list.append(0)
                    break
                elif temp_enc[i][j] == 1 and enc_label[i] == 1:
                    temp_enc[i][j] = TEXT.vocab.stoi["<posi>"]
                    batch_sentiment_list.append(1)
                    break

        enc_input = torch.tensor(temp_enc, dtype=torch.int32).cuda()

        for i in range(len(dec_outputs)):
            dec_outputs[i] = torch.cat([dec_output[i], pad_tensor], dim=-1)

        temp_dec = dec_outputs.data.cpu().numpy()

        dec_outputs_sentiment_list = []  # sentiment expression appended to each decoder output

        # Rewrite the decoder outputs as: 저도 좋아용 ㅎㅎ <eos> <pad> <pad> ...
        for i in range(len(temp_dec)):  # i = batch size
            temp_sentence = ''
            sa_ = batch_sentiment_list[i]
            if sa_ == 0:
                sa_ = random.choice(negative_emo)
            elif sa_ == 1:
                sa_ = random.choice(positive_emo)
            dec_outputs_sentiment_list.append(sa_)

            for ix, token_i in enumerate(temp_dec[i]):
                if LABEL.vocab.itos[token_i] in ('<sos>', '<eos>', '<pad>'):
                    continue
                temp_sentence = temp_sentence + LABEL.vocab.itos[token_i]
            temp_sentence = temp_sentence + '.'  # the period changes the morphological analysis
            exchange_word, idx = make_special_word(temp_sentence, args, True)

            if exchange_word == '':
                for j in range(len(temp_dec[i])):
                    if temp_dec[i][j] == LABEL.vocab.stoi['<eos>']:
                        temp_dec[i][j] = LABEL.vocab.stoi[sa_]
                        temp_dec[i][j + 1] = LABEL.vocab.stoi['<eos>']
                        break
                continue

            for j in range(len(temp_dec[i])):
                if LABEL.vocab.itos[temp_dec[i][j]] == '<eos>':
                    temp_dec[i][j - 1] = LABEL.vocab.stoi[exchange_word]
                    temp_dec[i][j] = LABEL.vocab.stoi[dec_outputs_sentiment_list[i]]
                    temp_dec[i][j + 1] = LABEL.vocab.stoi['<eos>']
                    break
                elif temp_dec[i][j] != LABEL.vocab.stoi['<eos>'] and j + 1 == len(temp_dec[i]):
                    print("\t-ERROR- No <EOS> token")
                    exit()

        dec_outputs = torch.tensor(temp_dec, dtype=torch.int32).cuda()

        temp_dec_input = dec_input.data.cpu().numpy()
        # Rewrite the decoder input as: <sos> 저도 좋아용 ㅎㅎ <eos> <pad> <pad> ...
        for i in range(len(temp_dec_input)):
            temp_sentence = ''
            for ix, token_i in enumerate(temp_dec_input[i]):
                if LABEL.vocab.itos[token_i] in ('<sos>', '<eos>', '<pad>'):
                    continue
                temp_sentence = temp_sentence + LABEL.vocab.itos[token_i]
            temp_sentence = temp_sentence + '.'  # the period changes the morphological analysis
            exchange_word, idx = make_special_word(temp_sentence, args, True)

            if exchange_word == '':
                for j in range(len(temp_dec_input[i])):
                    if temp_dec_input[i][j] == LABEL.vocab.stoi['<eos>']:
                        temp_dec_input[i][j] = LABEL.vocab.stoi[dec_outputs_sentiment_list[i]]
                        temp_dec_input[i][j + 1] = LABEL.vocab.stoi['<eos>']
                        break
                continue

            for j in range(len(temp_dec_input[i])):
                if LABEL.vocab.itos[temp_dec_input[i][j]] == '<eos>':
                    temp_dec_input[i][j - 1] = LABEL.vocab.stoi[exchange_word]
                    temp_dec_input[i][j] = LABEL.vocab.stoi[dec_outputs_sentiment_list[i]]
                    temp_dec_input[i][j + 1] = LABEL.vocab.stoi['<eos>']
                    break
                elif temp_dec_input[i][j] != LABEL.vocab.stoi['<eos>'] and j + 1 == len(temp_dec_input[i]):
                    print("\t-ERROR- No <EOS> token")
                    exit()

        dec_input = torch.tensor(temp_dec_input, dtype=torch.int32).cuda()

    # Rough personality.
    elif args.per_rough:
        banmal_dic = get_rough_dic()

        for i in range(len(dec_outputs)):
            dec_outputs[i] = torch.cat([dec_output[i], pad_tensor], dim=-1)

        temp_dec = dec_outputs.data.cpu().numpy()

        # Rewrite the decoder outputs as: 나도 좋아 <eos> <pad> <pad> ...
        for i in range(len(temp_dec)):  # i = batch size
            temp_sentence = ''
            for ix, token_i in enumerate(temp_dec[i]):
                if LABEL.vocab.itos[token_i] == '<eos>':
                    break
                temp_sentence = temp_sentence + LABEL.vocab.itos[token_i]
            temp_sentence = temp_sentence + '.'  # the period changes the morphological analysis
            exchange_word, idx = make_special_word(temp_sentence, args, True)
            exchange_NP_word, NP_idx, exist = exchange_NP(temp_sentence, args)

            if exist:
                temp_dec[i][NP_idx] = LABEL.vocab.stoi[exchange_NP_word]

            if exchange_word == '':
                continue
            try:
                exchange_word = banmal_dic[exchange_word]
            except KeyError:
                missing_banmal_words.append(exchange_word)
                print("not included in the banmal dictionary")

            temp_dec[i][idx] = LABEL.vocab.stoi[exchange_word]
            temp_dec[i][idx + 1] = LABEL.vocab.stoi['<eos>']
            for k in range(idx + 2, args.max_len):
                temp_dec[i][k] = LABEL.vocab.stoi['<pad>']

            # for j in range(len(temp_dec[i])):
            #     if LABEL.vocab.itos[temp_dec[i][j]] == '<eos>':
            #         break
            #     print(LABEL.vocab.itos[temp_dec[i][j]], end='')
            # print()

        dec_outputs = torch.tensor(temp_dec, dtype=torch.int32).cuda()

        temp_dec_input = dec_input.data.cpu().numpy()
        # Rewrite the decoder input as: <sos> 나도 좋아 <eos> <pad> <pad> ...
        for i in range(len(temp_dec_input)):
            temp_sentence = ''
            for ix, token_i in enumerate(temp_dec_input[i]):
                if ix == 0:
                    continue  # skip the <sos> token
                if LABEL.vocab.itos[token_i] == '<eos>':
                    break
                temp_sentence = temp_sentence + LABEL.vocab.itos[token_i]
            temp_sentence = temp_sentence + '.'  # the period changes the morphological analysis
            exchange_word, idx = make_special_word(temp_sentence, args, True)
            exchange_NP_word, NP_idx, exist = exchange_NP(temp_sentence, args)
            idx = idx + 1  # shift by one for the <sos> token
            NP_idx = NP_idx + 1

            if exist:
                temp_dec_input[i][NP_idx] = LABEL.vocab.stoi[exchange_NP_word]

            if exchange_word == '':
                continue

            try:
                exchange_word = banmal_dic[exchange_word]
            except KeyError:
                print("not included in the banmal dictionary")

            temp_dec_input[i][idx] = LABEL.vocab.stoi[exchange_word]
            temp_dec_input[i][idx + 1] = LABEL.vocab.stoi['<eos>']

            for k in range(idx + 2, args.max_len):
                temp_dec_input[i][k] = LABEL.vocab.stoi['<pad>']

            # for j in range(len(temp_dec_input[i])):
            #     if LABEL.vocab.itos[temp_dec_input[i][j]] == '<eos>':
            #         break
            #     print(LABEL.vocab.itos[temp_dec_input[i][j]], end='')
            # print()

        dec_input = torch.tensor(temp_dec_input, dtype=torch.int32).cuda()

    return enc_input, dec_input, dec_outputs

# Dictionary that maps polite (존댓말) endings to banmal (반말, informal) endings.
def get_rough_dic():
    my_exword = {
        '돌아와요': '돌아와',
        '으세요': '으셈',
        '잊어버려요': '잊어버려',
        '나온대요': '나온대',
        '될까요': '될까',
        '할텐데': '할텐데',
        '옵니다': '온다',
        '봅니다': '본다',
        '네요': '네',
        '된답니다': '된대',
        '데요': '데',
        '봐요': '봐',
        '부러워요': '부러워',
        '바랄게요': '바랄게',
        '지나갑니다': '지나간다',
        '이뻐요': '이뻐',
        '지요': '지',
        '사세요': '사라',
        '던가요': '던가',
        '모릅니다': '몰라',
        '은가요': '은가',
        '심해요': '심해',
        '몰라요': '몰라',
        '라요': '라',
        '더라고요': '더라고',
        '입니다': '이라고',
        '는다면요': '는다면',
        '멋져요': '멋져',
        '다면요': '다면',
        '다니': '다나',
        '져요': '져',
        '만드세요': '만들어',
        '야죠': '야지',
        '죠': '지',
        '해줄게요': '해줄게',
        '대요': '대',
        '돌아갑시다': '돌아가자',
        '해보여요': '해봐',
        '라뇨': '라니',
        '편합니다': '편해',
        '합시다': '하자',
        '드세요': '먹어',
        '아름다워요': '아름답네',
        '드립니다': '줄게',
        '받아들여요': '받아들여',
        '건가요': '건가',
        '쏟아진다': '쏟아지네',
        '슬퍼요': '슬퍼',
        '해서요': '해서',
        '다릅니다': '다르다',
        '니다': '니',
        '내려요': '내려',
        '마셔요': '마셔',
        '아세요': '아냐',
        '변해요': '변해',
        '드려요': '드려',
        '아요': '아',
        '어서요': '어서',
        '뜁니다': '뛴다',
        '속상해요': '속상해',
        '래요': '래',
        '까요': '까',
        '어야죠': '어야지',
        '라니': '라니',
        '해집니다': '해진다',
        '으련만': '으련만',
        '지워져요': '지워져',
        '잘라요': '잘라',
        '고요': '고',
        '셔야죠': '셔야지',
        '다쳐요': '다쳐',
        '는구나': '는구만',
        '은데요': '은데',
        '일까요': '일까',
        '인가요': '인가',
        '아닐까요': '아닐까',
        '텐데요': '텐데',
        '할게요': '할게',
        '보입니다': '보이네',
        '에요': '야',
        '걸요': '걸',
        '한답니다': '한대',
        '을까요': '을까',
        '못해요': '못해',
        '베푸세요': '베풀어',
        '어때요': '어때',
        '더라구요': '더라구',
        '노라': '노라',
        '반가워요': '반가워',
        '군요': '군',
        '만납시다': '만나자',
        '어떠세요': '어때',
        '달라져요': '달라져',
        '예뻐요': '예뻐',
        '됩니다': '된다',
        '봅시다': '보자',
        '한대요': '한대',
        '싸워요': '싸워',
        '와요': '와',
        '인데요': '인데',
        '야': '야',
        '줄게요': '줄게',
        '기에요': '기',
        '던데요': '던데',
        '걸까요': '걸까',
        '신가요': '신가',
        '어요': '어',
        '따져요': '따져',
        '갈게요': '갈게',
        '봐': '봐',
        '나요': '나',
        '니까요': '니까',
        '마요': '마',
        '씁니다': '쓴다',
        '집니다': '진다',
        '건데요': '건데',
        '지웁시다': '지우자',
        '바랍니다': '바래',
        '는데요': '는데',
        '으니까요': '으니까',
        '셔요': '셔',
        '네여': '네',
        '달라요': '달라',
        '거려요': '거려',
        '보여요': '보여',
        '겁니다': '껄',
        '다': '다',
        '그래요': '그래',
        '한가요': '한가',
        '잖아요': '잖아',
        '한데요': '한데',
        '우세요': '우셈',
        '해야죠': '해야지',
        '세요': '셈',
        '걸려요': '걸려',
        '텐데': '텐데',
        '어딘가': '어딘가',
        '요': '',
        '흘러갑니다': '흘러간다',
        '줘요': '줘',
        '편해요': '편해',
        '거예요': '거야',
        '예요': '야',
        '습니다': '어',
        '아닌가요': '아닌가',
        '합니다': '한다',
        '사라집니다': '사라져',
        '드릴게요': '줄게',
        '다면': '다면',
        '그럴까요': '그럴까',
        '해요': '해',
        '답니다': '다',
        '주무세요': '자라',
        '마세요': '마라',
        '아픈가요': '아프냐',
        '그런가요': '그런가',
        '했잖아요': '했잖아',
        '버려요': '버려',
        '갑니다': '간다',
        '가요': '가',
        '라면요': '라면',
        '아야죠': '아야지',
        '살펴봐요': '살펴봐',
        '남겨요': '남겨',
        '내려놔요': '내려놔',
        '떨려요': '떨려',
        '랍니다': '란다',
        '돼요': '돼',
        '버텨요': '버텨',
        '만나': '만나',
        '일러요': '일러',
        '을게요': '을게',
        '갑시다': '가자',
        '나아요': '나아',
        '어려요': '어려',
        '온대요': '온대',
        '다고요': '다고',
        '할래요': '할래',
        '된대요': '된대',
        '어울려요': '어울려',
        '는군요': '는군',
        '볼까요': '볼까',
        '드릴까요': '줄까',
        '라던데요': '라던데',
        '올게요': '올게',
        '기뻐요': '기뻐',
        '아닙니다': '아냐',
        '둬요': '둬',
        '십니다': '십',
        '아파요': '아파',
        '생겨요': '생겨',
        '해줘요': '해줘',
        '로군요': '로군요',
        '시켜요': '시켜',
        '느껴져요': '느껴져',
        '가재요': '가재',
        '어 ': ' ',
        '느려요': '느려',
        '볼게요': '볼게',
        '쉬워요': '쉬워',
        '나빠요': '나빠',
        '불러줄게요': '불러줄게',
        '살쪄요': '살쪄',
        '봐야겠어요': '봐야겠어',
        '네': '네',
        '어': '어',
        '든지요': '든지',
        '드신다': '드심',
        '가져요': '가져',
        '할까요': '할까',
        '졸려요': '졸려',
        '그럴게요': '그럴게',
        '': '',
        '어린가': '어린가',
        '나와요': '나와',
        '빨라요': '빨라',
        '겠죠': '겠지',
        '졌어요': '졌어',
        '해봐요': '해봐',
        '게요': '게',
        '해드릴까요': '해줄까',
        '인걸요': '인걸',
        '했어요': '했어',
        '원해요': '원해',
        '는걸요': '는걸',
        '좋아합니다': '좋아해',
        '했으면': '했으면',
        '나갑니다': '나간다',
        '왔어요': '왔어',
        '해봅시다': '해보자',
        '물어봐요': '물어봐',
        '생겼어요': '생겼어',
        '해': '해',
        '다녀올게요': '다녀올게',
        '납시다': '나자'
    }
    return my_exword
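
# Editor's note: a minimal, hedged usage sketch of the helpers above. It runs
# only when this file is executed directly and assumes hgtk/Mecab are installed;
# the sample inputs are illustrative, not part of the original source.
if __name__ == '__main__':
    demo_dict = get_rough_dic()
    print(demo_dict['할게요'])                    # -> '할게' (polite ending mapped to banmal)
    print(replaceRight('aᴥbᴥ', 'ᴥ', 'ㅇᴥ', 1))   # -> 'aᴥbㅇᴥ' (only the rightmost 'ᴥ' changes)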
# --- inference module (original file name not shown) ---

import torch
from get_data import tokenizer1
from chatspace import ChatSpace
spacer = ChatSpace()

def inference(device, args, TEXT, LABEL, model, sa_model):
    from KoBERT.Sentiment_Analysis_BERT_main import bert_inference
    sentence = input("문장을 입력하세요 : ")
    se_list = [sentence]

    # https://github.com/SKTBrain/KoBERT
    # Classify the input sentence as positive or negative with SKT's KoBERT sentiment analysis.
    sa_label = int(bert_inference(sa_model, se_list))

    sa_token = ''
    # Adjust the encoder input according to the SA label.
    if sa_label == 0:
        sa_token = TEXT.vocab.stoi['<nega>']
    else:
        sa_token = TEXT.vocab.stoi['<posi>']

    enc_input = tokenizer1(sentence)
    enc_input_index = []

    for tok in enc_input:
        enc_input_index.append(TEXT.vocab.stoi[tok])

    # Convert the encoder input string to an index tensor, appending <pad> up to max_len.
    if args.per_soft:
        enc_input_index.append(sa_token)

    for j in range(args.max_len - len(enc_input_index)):
        enc_input_index.append(TEXT.vocab.stoi['<pad>'])

    enc_input_index = torch.LongTensor([enc_input_index])

    dec_input = torch.LongTensor([[LABEL.vocab.stoi['<sos>']]])
    # print("긍정" if sa_label == 1 else "부정")

    model.eval()
    pred = []
    for i in range(args.max_len):
        y_pred = model(enc_input_index.to(device), dec_input.to(device))
        y_pred_ids = y_pred.max(dim=-1)[1]
        if y_pred_ids[0, -1] == LABEL.vocab.stoi['<eos>']:
            y_pred_ids = y_pred_ids.squeeze(0)
            print(">", end=" ")
            for idx in range(len(y_pred_ids)):
                if LABEL.vocab.itos[y_pred_ids[idx]] == '<eos>':
                    pred_sentence = "".join(pred)
                    pred_str = spacer.space(pred_sentence)
                    print(pred_str)
                    break
                else:
                    pred.append(LABEL.vocab.itos[y_pred_ids[idx]])
            return 0

        # Greedy decoding: append the newest predicted token to the decoder input.
        dec_input = torch.cat(
            [dec_input.to(torch.device('cpu')),
             y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).to(torch.device('cpu'))], dim=-1)
    return 0
# --- get_data.py ---

import torch
from torchtext import data
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.vocab import Vectors
from konlpy.tag import Mecab
import re
from Styling import styling, make_special_token

# Tokenizer: strip punctuation/special characters, then split into Mecab morphemes.
def tokenizer1(text):
    result_text = re.sub('[-=+.,#/\:$@*\"※&%ㆍ!?』\\‘|\(\)\[\]\<\>`\'…》;]', '', text)
    return Mecab().morphs(result_text)
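
# Illustrative behaviour (editor's example; the exact morphemes depend on the
# installed Mecab dictionary):
#   tokenizer1('안녕하세요!') -> '!' is stripped by the regex, then Mecab splits,
#   e.g. ['안녕', '하', '세요']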

# Preprocesses the data and returns the loaders.
def data_preprocessing(args, device):

    # ID is unused. SA is the sentiment-analysis label (0 or 1).
    ID = data.Field(sequential=False,
                    use_vocab=False)

    TEXT = data.Field(sequential=True,
                      use_vocab=True,
                      tokenize=tokenizer1,
                      batch_first=True,
                      fix_length=args.max_len,
                      dtype=torch.int32
                      )

    LABEL = data.Field(sequential=True,
                       use_vocab=True,
                       tokenize=tokenizer1,
                       batch_first=True,
                       fix_length=args.max_len,
                       init_token='<sos>',
                       eos_token='<eos>',
                       dtype=torch.int32
                       )

    SA = data.Field(sequential=False,
                    use_vocab=False)

    train_data, test_data = TabularDataset.splits(
        path='.', train='chatbot_0325_ALLLABEL_train.txt', test='chatbot_0325_ALLLABEL_test.txt', format='tsv',
        fields=[('id', ID), ('text', TEXT), ('target_text', LABEL), ('SA', SA)], skip_header=True
    )

    vectors = Vectors(name="kr-projected.txt")

    # Build the special tokens needed for TEXT and LABEL.
    text_specials, label_specials = make_special_token(args)

    TEXT.build_vocab(train_data, vectors=vectors, max_size=15000, specials=text_specials)
    LABEL.build_vocab(train_data, vectors=vectors, max_size=15000, specials=label_specials)

    train_loader = BucketIterator(dataset=train_data, batch_size=args.batch_size, device=device, shuffle=True)
    test_loader = BucketIterator(dataset=test_data, batch_size=args.batch_size, device=device, shuffle=True)

    return TEXT, LABEL, train_loader, test_loader
# --- metrics/utility module (original file name not shown) ---

import torch

# Computes accuracy, excluding padding positions.
def acc(yhat, y):
    with torch.no_grad():
        yhat = yhat.max(dim=-1)[1]  # [0]: max value, [1]: index of max value
        acc = (yhat == y).float()[y != 1].mean()  # vocab index 1 is <pad>; exclude padding from accuracy
    return acc

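# Worked example (editor's note, assuming vocab index 1 is <pad> as above):
#   yhat.max(-1)[1] = [5, 7, 1, 1] and y = [5, 2, 1, 1]
#   the mask y != 1 keeps positions 0 and 1 -> correctness [1., 0.] -> acc = 0.5
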
# During training, prints a sample model input (Q) with the target and predicted answers.
def train_test(step, y_pred, dec_output, real_value_index, enc_input, args, TEXT, LABEL):

    if 0 <= step < 3:
        _, ix = y_pred[real_value_index].data.topk(1)
        train_Q = enc_input[0]
        print("<<Q>> :", end=" ")
        for i in train_Q:
            if TEXT.vocab.itos[i] == "<pad>":
                break
            print(TEXT.vocab.itos[i], end=" ")

        print("\n<<trg A>> :", end=" ")
        for jj, jx in enumerate(dec_output[real_value_index]):
            if LABEL.vocab.itos[jx] == "<eos>":
                break
            print(LABEL.vocab.itos[jx], end=" ")

        print("\n<<pred A>> :", end=" ")
        for jj, pred_ix in enumerate(ix):
            if jj == args.max_len:
                break
            if LABEL.vocab.itos[pred_ix] == '<eos>':
                break
            print(LABEL.vocab.itos[pred_ix], end=" ")
        print("\n")
# --- transformer model module (original file name not shown) ---

import torch
import torch.nn as nn
import math
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Transformer(nn.Module):
    def __init__(self, args, SRC_vocab, TRG_vocab):
        super(Transformer, self).__init__()
        self.d_model = args.embedding_dim
        self.n_head = args.nhead
        self.num_encoder_layers = args.nlayers
        self.num_decoder_layers = args.nlayers
        self.dim_feedforward = args.embedding_dim
        self.dropout = args.dropout

        self.SRC_vo = SRC_vocab
        self.TRG_vo = TRG_vocab

        self.pos_encoder = PositionalEncoding(self.d_model, self.dropout)

        self.src_embedding = nn.Embedding(len(self.SRC_vo.vocab), self.d_model)
        self.trg_embedding = nn.Embedding(len(self.TRG_vo.vocab), self.d_model)

        self.transformer = torch.nn.Transformer(d_model=self.d_model,
                                                nhead=self.n_head,
                                                num_encoder_layers=self.num_encoder_layers,
                                                num_decoder_layers=self.num_decoder_layers,
                                                dim_feedforward=self.dim_feedforward,
                                                dropout=self.dropout)
        self.proj_vocab_layer = nn.Linear(
            in_features=self.dim_feedforward, out_features=len(self.TRG_vo.vocab))

        # self.apply(self._initialize)

    def forward(self, en_input, de_input):
        x_en_embed = self.src_embedding(en_input.long()) * math.sqrt(self.d_model)
        x_de_embed = self.trg_embedding(de_input.long()) * math.sqrt(self.d_model)
        x_en_embed = self.pos_encoder(x_en_embed)
        x_de_embed = self.pos_encoder(x_de_embed)

        # Masking: padding masks for source/target and a causal mask for the decoder.
        src_key_padding_mask = en_input == self.SRC_vo.vocab.stoi['<pad>']
        tgt_key_padding_mask = de_input == self.TRG_vo.vocab.stoi['<pad>']
        memory_key_padding_mask = src_key_padding_mask
        tgt_mask = self.transformer.generate_square_subsequent_mask(de_input.size(1))

        # nn.Transformer expects (seq_len, batch, d_model); swap the batch and time axes.
        x_en_embed = torch.einsum('ijk->jik', x_en_embed)
        x_de_embed = torch.einsum('ijk->jik', x_de_embed)

        feature = self.transformer(src=x_en_embed,
                                   tgt=x_de_embed,
                                   src_key_padding_mask=src_key_padding_mask,
                                   tgt_key_padding_mask=tgt_key_padding_mask,
                                   memory_key_padding_mask=memory_key_padding_mask,
                                   tgt_mask=tgt_mask.to(device))

        logits = self.proj_vocab_layer(feature)
        logits = torch.einsum('ijk->jik', logits)

        return logits

    def _initialize(self, layer):
        if isinstance(layer, nn.Linear):
            nn.init.kaiming_uniform_(layer.weight)

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout, max_len=15000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

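# For reference (editor's note), the buffer built above is the standard
# sinusoidal encoding from "Attention Is All You Need":
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
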
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

class GradualWarmupScheduler(_LRScheduler):

    """ Gradually warms up (increases) the learning rate in the optimizer.
    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        multiplier: target learning rate = base lr * multiplier
        total_epoch: target learning rate is reached at total_epoch, gradually
        after_scheduler: after total_epoch, use this scheduler (e.g. ReduceLROnPlateau)
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        self.multiplier = multiplier
        if self.multiplier <= 1.:
            raise ValueError('multiplier should be greater than 1.')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        self.finished = False
        super().__init__(optimizer)

    def get_lr(self):
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of the epoch, whereas others are called at the beginning
        if self.last_epoch <= self.total_epoch:
            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
                param_group['lr'] = lr
        else:
            if epoch is None:
                self.after_scheduler.step(metrics, None)
            else:
                self.after_scheduler.step(metrics, epoch - self.total_epoch)

    def step(self, epoch=None, metrics=None):
        if type(self.after_scheduler) != ReduceLROnPlateau:
            if self.finished and self.after_scheduler:
                if epoch is None:
                    self.after_scheduler.step(None)
                else:
                    self.after_scheduler.step(epoch - self.total_epoch)
            else:
                return super(GradualWarmupScheduler, self).step(epoch)
        else:
            self.step_ReduceLROnPlateau(metrics, epoch)
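
# Editor's note: a minimal usage sketch for GradualWarmupScheduler, assuming
# Adam wrapped around a toy model and a ReduceLROnPlateau hand-off; all
# hyperparameters below are illustrative, not from the original source.
if __name__ == '__main__':
    toy_model = nn.Linear(8, 8)
    optimizer = torch.optim.Adam(toy_model.parameters(), lr=1e-4)
    plateau = ReduceLROnPlateau(optimizer, factor=0.5, patience=2)
    # Warm the LR up to 8x the base rate over 10 epochs, then defer to plateau.
    scheduler = GradualWarmupScheduler(optimizer, multiplier=8, total_epoch=10,
                                       after_scheduler=plateau)
    for epoch in range(1, 15):
        scheduler.step(epoch, metrics=1.0)  # metrics would be the validation loss
        print(epoch, optimizer.param_groups[0]['lr'])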