chatbot_py files

# ----------------------------------------------------------------------
# Chatbot inference
import torch
from get_data import tokenizer1
from chatspace import ChatSpace
spacer = ChatSpace()

def inference(device, args, TEXT, LABEL, model, sa_model):
    from KoBERT.Sentiment_Analysis_BERT_main import bert_inference
    sentence = input("Enter a sentence: ")
    se_list = [sentence]

    # https://github.com/SKTBrain/KoBERT
    # Use SKT's public KoBERT sentiment-analysis model to decide whether the
    # input sentence is positive or negative.
    sa_label = int(bert_inference(sa_model, se_list))

    # Choose the extra encoder-input token according to the SA label.
    if sa_label == 0:
        sa_token = TEXT.vocab.stoi['<nega>']
    else:
        sa_token = TEXT.vocab.stoi['<posi>']

    enc_input = tokenizer1(sentence)
    enc_input_index = []

    for tok in enc_input:
        enc_input_index.append(TEXT.vocab.stoi[tok])

    # Convert the encoder input to an index tensor, padding with <pad>.
    if args.per_soft:
        enc_input_index.append(sa_token)

    for j in range(args.max_len - len(enc_input_index)):
        enc_input_index.append(TEXT.vocab.stoi['<pad>'])

    enc_input_index = torch.LongTensor([enc_input_index])

    dec_input = torch.LongTensor([[LABEL.vocab.stoi['<sos>']]])
    # print("positive" if sa_label == 1 else "negative")

    model.eval()
    pred = []
    for i in range(args.max_len):
        y_pred = model(enc_input_index.to(device), dec_input.to(device))
        y_pred_ids = y_pred.max(dim=-1)[1]
        if y_pred_ids[0, -1] == LABEL.vocab.stoi['<eos>']:
            y_pred_ids = y_pred_ids.squeeze(0)
            print(">", end=" ")
            for idx in range(len(y_pred_ids)):
                if LABEL.vocab.itos[y_pred_ids[idx]] == '<eos>':
                    pred_sentence = "".join(pred)
                    pred_str = spacer.space(pred_sentence)
                    print(pred_str)
                    break
                else:
                    pred.append(LABEL.vocab.itos[y_pred_ids[idx]])
            return 0

        # Greedy decoding: append the newly generated token to the decoder
        # input and run the model again.
        dec_input = torch.cat(
            [dec_input.to(torch.device('cpu')),
             y_pred_ids[0, -1].unsqueeze(0).unsqueeze(0).to(torch.device('cpu'))], dim=-1)
    return 0
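# Hedged wiring sketch (not part of the repo): the argument names below follow
# the attributes used above (args.max_len, args.per_soft) and the other files
# in this diff; defaults and checkpoint loading are assumptions.
#
# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument('--max_len', type=int, default=40)     # assumed default
# parser.add_argument('--per_soft', action='store_true')     # append <posi>/<nega> token
# parser.add_argument('--batch_size', type=int, default=64)  # assumed default
# args = parser.parse_args()                                 # the model would also need
#                                                            # embedding_dim/nhead/nlayers/dropout
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# TEXT, LABEL, train_loader, test_loader = data_preprocessing(args, device)
# model = Transformer(args, TEXT, LABEL).to(device)          # plus a trained checkpoint
# sa_model = ...                                             # trained KoBERT SA checkpoint
# inference(device, args, TEXT, LABEL, model, sa_model)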
# ----------------------------------------------------------------------
# get_data.py (imported above via `from get_data import tokenizer1`)
import torch
from torchtext import data
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.vocab import Vectors
from konlpy.tag import Mecab
import re
from Styling import styling, make_special_token

# Tokenizer: strip punctuation/special characters, then split the text into
# morphemes with Mecab (instantiated once at module level).
mecab = Mecab()

def tokenizer1(text):
    result_text = re.sub(r'[-=+.,#/\:$@*\"※&%ㆍ!?』\\‘|\(\)\[\]\<\>`\'…》;]', '', text)
    return mecab.morphs(result_text)

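# Deterministic illustration of the cleaning step above (the morpheme output
# of Mecab itself depends on the installed dictionary, so it is not shown):
# >>> re.sub(r'[-=+.,#/\:$@*\"※&%ㆍ!?』\\‘|\(\)\[\]\<\>`\'…》;]', '', '안녕하세요! 반가워요.')
# '안녕하세요 반가워요'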
# Preprocess the data and return the loaders.
def data_preprocessing(args, device):

    # ID is not used. SA is the sentiment-analysis label (0 or 1).
    ID = data.Field(sequential=False,
                    use_vocab=False)

    TEXT = data.Field(sequential=True,
                      use_vocab=True,
                      tokenize=tokenizer1,
                      batch_first=True,
                      fix_length=args.max_len,
                      dtype=torch.int32)

    LABEL = data.Field(sequential=True,
                       use_vocab=True,
                       tokenize=tokenizer1,
                       batch_first=True,
                       fix_length=args.max_len,
                       init_token='<sos>',
                       eos_token='<eos>',
                       dtype=torch.int32)

    SA = data.Field(sequential=False,
                    use_vocab=False)

    train_data, test_data = TabularDataset.splits(
        path='.', train='chatbot_0325_ALLLABEL_train.txt', test='chatbot_0325_ALLLABEL_test.txt', format='tsv',
        fields=[('id', ID), ('text', TEXT), ('target_text', LABEL), ('SA', SA)], skip_header=True
    )

    vectors = Vectors(name="kr-projected.txt")

    # Build the special tokens that TEXT and LABEL need.
    text_specials, label_specials = make_special_token(args)

    TEXT.build_vocab(train_data, vectors=vectors, max_size=15000, specials=text_specials)
    LABEL.build_vocab(train_data, vectors=vectors, max_size=15000, specials=label_specials)

    train_loader = BucketIterator(dataset=train_data, batch_size=args.batch_size, device=device, shuffle=True)
    test_loader = BucketIterator(dataset=test_data, batch_size=args.batch_size, device=device, shuffle=True)

    return TEXT, LABEL, train_loader, test_loader
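# Hedged sketch of consuming the return values (the batch attribute names
# 'text', 'target_text', and 'SA' come from the fields list above; the loop
# itself is an assumption about the training script, which is not in this diff):
#
# TEXT, LABEL, train_loader, test_loader = data_preprocessing(args, device)
# for step, batch in enumerate(train_loader):
#     enc_input = batch.text          # (batch, fixed length) source token indices
#     dec_output = batch.target_text  # (batch, fixed length) target indices incl. <sos>/<eos>
#     sa_label = batch.SA             # 0/1 sentiment label per example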
# ----------------------------------------------------------------------
# Training metrics and sample logging
import torch

# Compute accuracy, excluding padding positions.
def acc(yhat, y):
    with torch.no_grad():
        yhat = yhat.max(dim=-1)[1]  # [0]: max value, [1]: index of the max value
        acc = (yhat == y).float()[y != 1].mean()  # drop <pad> positions (index 1) from acc
        return acc

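# Quick check of the padding mask (toy tensors; index 1 is assumed to be the
# <pad> index, matching the comment above):
# >>> _yhat = torch.tensor([[[0.1, 0.2, 0.9],
# ...                        [0.8, 0.1, 0.1],
# ...                        [0.1, 0.9, 0.2],
# ...                        [0.9, 0.1, 0.1]]])
# >>> _y = torch.tensor([[2, 0, 1, 1]])
# >>> acc(_yhat, _y)  # argmax = [2, 0, 1, 0]; the two pad positions are masked out
# tensor(1.)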
# During training, print the model's input and the model's prediction.
def train_test(step, y_pred, dec_output, real_value_index, enc_input, args, TEXT, LABEL):

    if 0 <= step < 3:
        _, ix = y_pred[real_value_index].data.topk(1)
        train_Q = enc_input[0]
        print("<<Q>> :", end=" ")
        for i in train_Q:
            if TEXT.vocab.itos[i] == "<pad>":
                break
            print(TEXT.vocab.itos[i], end=" ")

        print("\n<<trg A>> :", end=" ")
        for jj, jx in enumerate(dec_output[real_value_index]):
            if LABEL.vocab.itos[jx] == "<eos>":
                break
            print(LABEL.vocab.itos[jx], end=" ")

        print("\n<<pred A>> :", end=" ")
        for jj, pred_ix in enumerate(ix):
            if jj == args.max_len:
                break
            if LABEL.vocab.itos[pred_ix] == '<eos>':
                break
            print(LABEL.vocab.itos[pred_ix], end=" ")
        print("\n")
# ----------------------------------------------------------------------
# Transformer model, positional encoding, and warmup LR scheduler
import torch
import torch.nn as nn
import math

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class Transformer(nn.Module):
    def __init__(self, args, SRC_vocab, TRG_vocab):
        super(Transformer, self).__init__()
        self.d_model = args.embedding_dim
        self.n_head = args.nhead
        self.num_encoder_layers = args.nlayers
        self.num_decoder_layers = args.nlayers
        self.dim_feedforward = args.embedding_dim
        self.dropout = args.dropout

        self.SRC_vo = SRC_vocab
        self.TRG_vo = TRG_vocab

        self.pos_encoder = PositionalEncoding(self.d_model, self.dropout)

        self.src_embedding = nn.Embedding(len(self.SRC_vo.vocab), self.d_model)
        self.trg_embedding = nn.Embedding(len(self.TRG_vo.vocab), self.d_model)

        self.transformer = torch.nn.Transformer(d_model=self.d_model,
                                                nhead=self.n_head,
                                                num_encoder_layers=self.num_encoder_layers,
                                                num_decoder_layers=self.num_decoder_layers,
                                                dim_feedforward=self.dim_feedforward,
                                                dropout=self.dropout)
        self.proj_vocab_layer = nn.Linear(
            in_features=self.dim_feedforward, out_features=len(self.TRG_vo.vocab))

        # self.apply(self._initialize)

    def forward(self, en_input, de_input):
        x_en_embed = self.src_embedding(en_input.long()) * math.sqrt(self.d_model)
        x_de_embed = self.trg_embedding(de_input.long()) * math.sqrt(self.d_model)

        # Masking
        src_key_padding_mask = en_input == self.SRC_vo.vocab.stoi['<pad>']
        tgt_key_padding_mask = de_input == self.TRG_vo.vocab.stoi['<pad>']
        memory_key_padding_mask = src_key_padding_mask
        tgt_mask = self.transformer.generate_square_subsequent_mask(de_input.size(1))

        # (batch, seq, d_model) -> (seq, batch, d_model), as nn.Transformer expects.
        x_en_embed = torch.einsum('ijk->jik', x_en_embed)
        x_de_embed = torch.einsum('ijk->jik', x_de_embed)

        # Add positional encodings after the transpose: PositionalEncoding
        # indexes its table by the first (sequence) dimension, so applying it
        # to batch-first tensors would index positions by batch element.
        x_en_embed = self.pos_encoder(x_en_embed)
        x_de_embed = self.pos_encoder(x_de_embed)

        feature = self.transformer(src=x_en_embed,
                                   tgt=x_de_embed,
                                   src_key_padding_mask=src_key_padding_mask,
                                   tgt_key_padding_mask=tgt_key_padding_mask,
                                   memory_key_padding_mask=memory_key_padding_mask,
                                   tgt_mask=tgt_mask.to(device))

        logits = self.proj_vocab_layer(feature)
        # Back to (batch, seq, vocab).
        logits = torch.einsum('ijk->jik', logits)

        return logits

    def _initialize(self, layer):
        if isinstance(layer, nn.Linear):
            nn.init.kaiming_uniform_(layer.weight)

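# Hedged shape walkthrough for forward() (B = batch size, S = source length,
# T = target length, V = len(TRG vocab); the letters are hypothetical labels):
#   en_input (B, S) --src_embedding--> (B, S, d_model) --transpose--> (S, B, d_model)
#   de_input (B, T) --trg_embedding--> (B, T, d_model) --transpose--> (T, B, d_model)
#   nn.Transformer --> (T, B, d_model) --proj_vocab_layer--> (T, B, V) --transpose--> (B, T, V)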
class PositionalEncoding(nn.Module):
    """Standard sinusoidal positional encoding (Vaswani et al., 2017):
    sine on even dimensions, cosine on odd ones, added to a
    sequence-first input."""

    def __init__(self, d_model, dropout, max_len=15000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model); pe broadcasts over the batch dimension.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

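# Hedged smoke test for the two modules above: mock args and vocab objects
# (every name prefixed with _ is hypothetical) and one forward pass on random
# token indices to check the output shape.
if __name__ == '__main__':
    from types import SimpleNamespace

    class _MockVocab:
        """Stands in for a torchtext vocab: only len() and stoi are used."""
        def __init__(self, size):
            self.size = size
            self.stoi = {'<pad>': 1}
        def __len__(self):
            return self.size

    _args = SimpleNamespace(embedding_dim=64, nhead=4, nlayers=2, dropout=0.1)
    _field = SimpleNamespace(vocab=_MockVocab(100))
    _model = Transformer(_args, _field, _field).to(device)

    _src = torch.randint(0, 100, (8, 20)).to(device)  # (batch, src_len)
    _trg = torch.randint(0, 100, (8, 20)).to(device)  # (batch, trg_len)
    print(_model(_src, _trg).shape)  # torch.Size([8, 20, 100]) = (batch, trg_len, vocab)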
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

class GradualWarmupScheduler(_LRScheduler):
    """Gradually warm up (increase) the learning rate in the optimizer.
    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.

    Args:
        optimizer (Optimizer): wrapped optimizer.
        multiplier: target learning rate = base lr * multiplier.
        total_epoch: the target learning rate is reached gradually at total_epoch.
        after_scheduler: scheduler used after total_epoch (e.g. ReduceLROnPlateau).
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        self.multiplier = multiplier
        if self.multiplier <= 1.:
            raise ValueError('multiplier should be greater than 1.')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        self.finished = False
        super().__init__(optimizer)

    def get_lr(self):
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
        # ReduceLROnPlateau is called at the end of an epoch, whereas other
        # schedulers are called at the beginning.
        self.last_epoch = epoch if epoch != 0 else 1
        if self.last_epoch <= self.total_epoch:
            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
                param_group['lr'] = lr
        else:
            self.after_scheduler.step(metrics, epoch - self.total_epoch)

    def step(self, epoch=None, metrics=None):
        if type(self.after_scheduler) != ReduceLROnPlateau:
            if self.finished and self.after_scheduler:
                if epoch is None:
                    self.after_scheduler.step(None)
                else:
                    self.after_scheduler.step(epoch - self.total_epoch)
            else:
                return super(GradualWarmupScheduler, self).step(epoch)
        else:
            self.step_ReduceLROnPlateau(metrics, epoch)
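# Hedged usage sketch for GradualWarmupScheduler (the hyperparameter values
# are hypothetical): warm the LR up for 5 epochs, then hand off to
# ReduceLROnPlateau on the validation loss.
if __name__ == '__main__':
    _net = nn.Linear(10, 10)
    _optim = torch.optim.Adam(_net.parameters(), lr=1e-4)
    _plateau = ReduceLROnPlateau(_optim, factor=0.5, patience=2)
    _sched = GradualWarmupScheduler(_optim, multiplier=8, total_epoch=5,
                                    after_scheduler=_plateau)
    for _epoch in range(10):
        _val_loss = 1.0  # placeholder for the real validation loss
        _sched.step(epoch=_epoch, metrics=_val_loss)
        print(_epoch, _optim.param_groups[0]['lr'])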