Showing 5 changed files with 299 additions and 0 deletions
Chatbot/Styling.py
0 → 100644
(diff collapsed)
Chatbot/generation.py
0 → 100644
import torch
from get_data import tokenizer1
from chatspace import ChatSpace

spacer = ChatSpace()

def inference(device, args, TEXT, LABEL, model, sa_model):
    from KoBERT.Sentiment_Analysis_BERT_main import bert_inference
    sentence = input("문장을 입력하세요 : ")  # prompt: "Enter a sentence:"
    se_list = [sentence]

    # https://github.com/SKTBrain/KoBERT
    # Classify the input sentence as positive or negative with the
    # KoBERT sentiment-analysis model released by SKT.
    sa_label = int(bert_inference(sa_model, se_list))

    # Pick the encoder input token that matches the SA label.
    if sa_label == 0:
        sa_token = TEXT.vocab.stoi['<nega>']
    else:
        sa_token = TEXT.vocab.stoi['<posi>']

    enc_input = tokenizer1(sentence)
    enc_input_index = []

    for tok in enc_input:
        enc_input_index.append(TEXT.vocab.stoi[tok])

    # Append the sentiment token when per_soft is enabled, pad the encoder
    # input out to max_len, and convert it to an index tensor.
    if args.per_soft:
        enc_input_index.append(sa_token)

    for j in range(args.max_len - len(enc_input_index)):
        enc_input_index.append(TEXT.vocab.stoi['<pad>'])

    enc_input_index = torch.LongTensor([enc_input_index])

    dec_input = torch.LongTensor([[LABEL.vocab.stoi['<sos>']]])
    # print("positive" if sa_label == 1 else "negative")

    # Greedy decoding: run the model, take the argmax at the last position,
    # and feed it back into the decoder until <eos> or max_len.
    model.eval()
    pred = []
    for i in range(args.max_len):
        y_pred = model(enc_input_index.to(device), dec_input.to(device))
        y_pred_ids = y_pred.max(dim=-1)[1]
        if y_pred_ids[0, -1] == LABEL.vocab.stoi['<eos>']:
            y_pred_ids = y_pred_ids.squeeze(0)
            print(">", end=" ")
            for idx in range(len(y_pred_ids)):
                if LABEL.vocab.itos[y_pred_ids[idx]] == '<eos>':
                    pred_sentence = "".join(pred)
                    # ChatSpace restores word spacing in the generated Korean.
                    pred_str = spacer.space(pred_sentence)
                    print(pred_str)
                    break
                else:
                    pred.append(LABEL.vocab.itos[y_pred_ids[idx]])
            return 0

        dec_input = torch.cat(
            [dec_input.to(torch.device('cpu')),
             y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).to(torch.device('cpu'))], dim=-1)
    return 0
\ No newline at end of file
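
The decoding loop above is plain greedy search. Below is a minimal, self-contained sketch of the same pattern with a toy model standing in for the Transformer; the vocabulary size, token ids, and the model itself are invented for illustration and are not part of the diff:

import torch

SOS, EOS = 2, 3  # invented token ids for the sketch

def toy_model(enc_input, dec_input):
    # Stand-in for the Transformer: fixed logits over a 5-token vocabulary
    # that always score our fake <eos> highest.
    logits = torch.zeros(dec_input.size(0), dec_input.size(1), 5)
    logits[:, :, EOS] = 1.0
    return logits

dec_input = torch.LongTensor([[SOS]])
for _ in range(10):                                        # max_len stand-in
    y_pred_ids = toy_model(None, dec_input).max(dim=-1)[1] # greedy argmax
    if y_pred_ids[0, -1].item() == EOS:                    # stop at <eos>
        break
    # feed the newest prediction back as the next decoder input
    dec_input = torch.cat([dec_input, y_pred_ids[:, -1:]], dim=-1)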
Chatbot/get_data.py
0 → 100644
import torch
from torchtext import data
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.vocab import Vectors
from konlpy.tag import Mecab
import re
from Styling import styling, make_special_token

# Tokenizer: strip punctuation and special characters, then split the
# sentence into morphemes with Mecab.
def tokenizer1(text):
    result_text = re.sub(r'[-=+.,#/\:$@*\"※&%ㆍ!?』\\‘|\(\)\[\]\<\>`\'…》;]', '', text)
    return Mecab().morphs(result_text)

# Preprocess the data and return the fields and loaders.
def data_preprocessing(args, device):

    # ID is unused; SA is the sentiment-analysis label (0 or 1).
    ID = data.Field(sequential=False, use_vocab=False)

    TEXT = data.Field(sequential=True,
                      use_vocab=True,
                      tokenize=tokenizer1,
                      batch_first=True,
                      fix_length=args.max_len,
                      dtype=torch.int32)

    LABEL = data.Field(sequential=True,
                       use_vocab=True,
                       tokenize=tokenizer1,
                       batch_first=True,
                       fix_length=args.max_len,
                       init_token='<sos>',
                       eos_token='<eos>',
                       dtype=torch.int32)

    SA = data.Field(sequential=False, use_vocab=False)

    train_data, test_data = TabularDataset.splits(
        path='.', train='chatbot_0325_ALLLABEL_train.txt', test='chatbot_0325_ALLLABEL_test.txt', format='tsv',
        fields=[('id', ID), ('text', TEXT), ('target_text', LABEL), ('SA', SA)], skip_header=True
    )

    vectors = Vectors(name="kr-projected.txt")

    # Build the special tokens that TEXT and LABEL need.
    text_specials, label_specials = make_special_token(args)

    TEXT.build_vocab(train_data, vectors=vectors, max_size=15000, specials=text_specials)
    LABEL.build_vocab(train_data, vectors=vectors, max_size=15000, specials=label_specials)

    train_loader = BucketIterator(dataset=train_data, batch_size=args.batch_size, device=device, shuffle=True)
    test_loader = BucketIterator(dataset=test_data, batch_size=args.batch_size, device=device, shuffle=True)
    return TEXT, LABEL, train_loader, test_loader
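
The TabularDataset.splits call implies that each row of the two TSV files carries four tab-separated columns matching the declared fields, with a header row (skip_header=True). The data files are not part of this diff, so the row below is purely illustrative:

id	text	target_text	SA
0	오늘 기분이 너무 좋아	저도 기뻐요	1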
Chatbot/metric.py
0 → 100644
import torch

# Token-level accuracy, excluding padding positions.
def acc(yhat, y):
    with torch.no_grad():
        yhat = yhat.max(dim=-1)[1]  # [0] is the max value, [1] its index
        acc = (yhat == y).float()[y != 1].mean()  # <pad> is index 1; exclude it
    return acc

# During training, print the model's input question, the target answer,
# and the predicted answer for the first few steps.
def train_test(step, y_pred, dec_output, real_value_index, enc_input, args, TEXT, LABEL):

    if 0 <= step < 3:
        _, ix = y_pred[real_value_index].data.topk(1)
        train_Q = enc_input[0]
        print("<<Q>> :", end=" ")
        for i in train_Q:
            if TEXT.vocab.itos[i] == "<pad>":
                break
            print(TEXT.vocab.itos[i], end=" ")

        print("\n<<trg A>> :", end=" ")
        for jj, jx in enumerate(dec_output[real_value_index]):
            if LABEL.vocab.itos[jx] == "<eos>":
                break
            print(LABEL.vocab.itos[jx], end=" ")

        print("\n<<pred A>> :", end=" ")
        for jj, pred_ix in enumerate(ix):
            if jj == args.max_len:
                break
            if LABEL.vocab.itos[pred_ix] == '<eos>':
                break
            print(LABEL.vocab.itos[pred_ix], end=" ")
        print("\n")
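
To make the padding exclusion in acc() concrete, here is a small hedged check; the token ids and logits are invented, and it assumes <pad> sits at vocab index 1 (torchtext's default) and that the function is importable as metric.acc:

import torch
from metric import acc  # assumed module path

# Five target positions; the last two are padding (index 1) and must not
# count. Of the three real tokens, the argmax matches twice -> acc = 2/3.
yhat = torch.tensor([[[0.1, 0.2, 0.9],    # argmax 2, target 2: correct
                      [0.9, 0.1, 0.2],    # argmax 0, target 0: correct
                      [0.1, 0.9, 0.2],    # argmax 1, target 2: wrong
                      [0.9, 0.1, 0.1],    # padding position
                      [0.9, 0.1, 0.1]]])  # padding position
y = torch.tensor([[2, 0, 2, 1, 1]])
print(acc(yhat, y))  # tensor(0.6667)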
Chatbot/model.py
0 → 100644
import torch
import torch.nn as nn
import math

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Transformer(nn.Module):
    def __init__(self, args, SRC_vocab, TRG_vocab):
        super(Transformer, self).__init__()
        self.d_model = args.embedding_dim
        self.n_head = args.nhead
        self.num_encoder_layers = args.nlayers
        self.num_decoder_layers = args.nlayers
        self.dim_feedforward = args.embedding_dim
        self.dropout = args.dropout

        self.SRC_vo = SRC_vocab
        self.TRG_vo = TRG_vocab

        self.pos_encoder = PositionalEncoding(self.d_model, self.dropout)

        self.src_embedding = nn.Embedding(len(self.SRC_vo.vocab), self.d_model)
        self.trg_embedding = nn.Embedding(len(self.TRG_vo.vocab), self.d_model)

        self.transformer = torch.nn.Transformer(d_model=self.d_model,
                                                nhead=self.n_head,
                                                num_encoder_layers=self.num_encoder_layers,
                                                num_decoder_layers=self.num_decoder_layers,
                                                dim_feedforward=self.dim_feedforward,
                                                dropout=self.dropout)
        self.proj_vocab_layer = nn.Linear(
            in_features=self.dim_feedforward, out_features=len(self.TRG_vo.vocab))

        # self.apply(self._initialize)

    def forward(self, en_input, de_input):
        x_en_embed = self.src_embedding(en_input.long()) * math.sqrt(self.d_model)
        x_de_embed = self.trg_embedding(de_input.long()) * math.sqrt(self.d_model)

        # Masking: hide <pad> positions from attention, and apply the causal
        # (square subsequent) mask to the decoder input.
        src_key_padding_mask = en_input == self.SRC_vo.vocab.stoi['<pad>']
        tgt_key_padding_mask = de_input == self.TRG_vo.vocab.stoi['<pad>']
        memory_key_padding_mask = src_key_padding_mask
        tgt_mask = self.transformer.generate_square_subsequent_mask(de_input.size(1))

        # torch.nn.Transformer expects (seq_len, batch, d_model), so transpose
        # the batch-first embeddings; the positional encoding is applied after
        # the transpose so it indexes the sequence dimension, not the batch.
        x_en_embed = torch.einsum('ijk->jik', x_en_embed)
        x_de_embed = torch.einsum('ijk->jik', x_de_embed)
        x_en_embed = self.pos_encoder(x_en_embed)
        x_de_embed = self.pos_encoder(x_de_embed)

        feature = self.transformer(src=x_en_embed,
                                   tgt=x_de_embed,
                                   src_key_padding_mask=src_key_padding_mask,
                                   tgt_key_padding_mask=tgt_key_padding_mask,
                                   memory_key_padding_mask=memory_key_padding_mask,
                                   tgt_mask=tgt_mask.to(device))

        logits = self.proj_vocab_layer(feature)
        logits = torch.einsum('ijk->jik', logits)

        return logits

    def _initialize(self, layer):
        if isinstance(layer, nn.Linear):
            nn.init.kaiming_uniform_(layer.weight)
class PositionalEncoding(nn.Module):
    # Standard sinusoidal positional encoding; expects (seq_len, batch, d_model).

    def __init__(self, d_model, dropout, max_len=15000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

class GradualWarmupScheduler(_LRScheduler):

    """ Gradually warms up (increases) the learning rate in the optimizer.
    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
    Args:
        optimizer (Optimizer): wrapped optimizer.
        multiplier: target learning rate = base lr * multiplier
        total_epoch: the target learning rate is reached gradually at total_epoch
        after_scheduler: the scheduler to use after total_epoch (e.g. ReduceLROnPlateau)
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        self.multiplier = multiplier
        if self.multiplier <= 1.:
            raise ValueError('multiplier should be greater than 1.')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        self.finished = False
        super().__init__(optimizer)

    def get_lr(self):
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
        # ReduceLROnPlateau is called at the end of an epoch, whereas the
        # other schedulers are called at the beginning.
        self.last_epoch = epoch if epoch != 0 else 1
        if self.last_epoch <= self.total_epoch:
            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
                param_group['lr'] = lr
        else:
            if epoch is None:
                self.after_scheduler.step(metrics, None)
            else:
                self.after_scheduler.step(metrics, epoch - self.total_epoch)

    def step(self, epoch=None, metrics=None):
        if type(self.after_scheduler) != ReduceLROnPlateau:
            if self.finished and self.after_scheduler:
                if epoch is None:
                    self.after_scheduler.step(None)
                else:
                    self.after_scheduler.step(epoch - self.total_epoch)
            else:
                return super(GradualWarmupScheduler, self).step(epoch)
        else:
            self.step_ReduceLROnPlateau(metrics, epoch)
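
A hedged usage sketch of the warmup-then-plateau chaining the docstring describes; the optimizer, multiplier, epoch counts, and placeholder metric are illustrative, not from the diff:

import torch
from model import GradualWarmupScheduler  # assumed module path
from torch.optim.lr_scheduler import ReduceLROnPlateau

params = [torch.nn.Parameter(torch.zeros(1))]  # stand-in model parameters
optimizer = torch.optim.Adam(params, lr=1e-4)
plateau = ReduceLROnPlateau(optimizer, factor=0.5, patience=2)
scheduler = GradualWarmupScheduler(optimizer, multiplier=8,
                                   total_epoch=5, after_scheduler=plateau)

for epoch in range(1, 20):
    val_loss = 1.0 / epoch  # placeholder validation metric
    scheduler.step(epoch, metrics=val_loss)
    print(epoch, optimizer.param_groups[0]['lr'])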