김민수

Upload Weekly report & codes

# -*- coding: utf-8 -*-
import argparse
import os
import glob
import time
import subprocess
import torch
from torch.utils.data import DataLoader
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from kogpt2.utils import get_tokenizer
from tqdm import tqdm
from util.data_loader import ArticleDataset, ToTensor
def get_gpu_memory_map():
    """Get the current GPU memory usage.

    Returns
    -------
    usage: dict
        Keys are device ids as integers.
        Values are memory usage as integers in MB.
    """
    result = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used',
         '--format=csv,nounits,noheader'],
        encoding='utf-8')
    # Convert lines into a dictionary: {device_id: used_MB}
    gpu_memory = [int(x) for x in result.strip().split('\n')]
    gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory))
    return gpu_memory_map
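
# Usage sketch (illustrative only, not called below): pick the least-loaded GPU.
#   gpu_map = get_gpu_memory_map()            # e.g. {0: 1024, 1: 305}
#   least_loaded = min(gpu_map, key=gpu_map.get)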
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Evaluate KoGPT2 checkpoints on ArticleDataset.')
    parser.add_argument('--docker', action='store_true',
                        help="Run on docker. Sets model cache path:/code/model, dataset path:/dataset, save path:/code/save.")
    parser.add_argument('--default', action='store_true', help="Use un-tuned KoGPT2.")
    parser.add_argument('--model_topic', choices=['경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학'])
    parser.add_argument('--epoch', type=int)
    parser.add_argument('--topic', nargs='+',
                        choices=['경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학'],
                        default=['경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학'])
    parser.add_argument('device', choices=['cpu', 'cuda', 'cuda:0', 'cuda:1'])
    args = parser.parse_args()
    print(args)

    model_cache_path = '/code/model' if args.docker else 'model'
    dataset_path = '/dataset' if args.docker else '../dataset'
    save_path = '/code/save' if args.docker else 'save'

    ctx = args.device if torch.cuda.is_available() else 'cpu'
    print(ctx)
    device = torch.device(ctx)
    tokenizer_path = get_tokenizer(cachedir=model_cache_path)
    model, vocab = get_pytorch_kogpt2_model(ctx=ctx, cachedir=model_cache_path)
    tokenizer = SentencepieceTokenizer(tokenizer_path, num_best=0, alpha=0)

    num_workers = 32
    batch_size = 64
    padding_id = vocab[vocab.padding_token]
    topics = set(args.topic)
    transform = ToTensor(tokenizer, vocab, 128)

    print("Preparing dataloader...")
    dataloaders = {}
    # num_workers=0 on the first pass so the in-process dataset cache gets filled.
    dataloaders["all"] = DataLoader(ArticleDataset(dataset_path, label='test', transform=transform),
                                    batch_size=batch_size, num_workers=0)
    for topic in tqdm(topics):
        dataloaders[topic] = DataLoader(ArticleDataset(dataset_path, topics={topic}, label='test', transform=transform),
                                        batch_size=batch_size, num_workers=0)
    print("Prepared dataloader.")

    epoches = 30
    checkpoint_epoch = 0
    learning_rate = 3e-5
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    topic_all = ['경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학']
    model_topic = topic_all if args.model_topic is None else sorted(list({args.model_topic}))
    model_epoch = '*' if args.epoch is None else args.epoch
    dev = ctx if ctx in {'cpu', 'cuda'} else 'cuda:*'
    # Checkpoint file names embed the repr() of the topic set, so rebuild that
    # exact string (or a glob pattern) to locate matching saves.
    braced = "{'생활', '경제', 'IT_과학', '미용_건강', '스포츠', '사회', '연예', '문화', '정치'}" if args.model_topic is None \
        else '{' + str(model_topic)[1:-1] + '}'
    saves = glob.glob(f'{save_path}/KoGPT2_checkpoint_{dev}_{braced}_{transform.max_len}_{model_epoch}.state')
    if not args.default:
        if len(saves) > 0:
            last_save = max(saves, key=os.path.getmtime)
            checkpoint = torch.load(last_save, map_location=device)
            print(f"Loading save from {last_save}")
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            checkpoint_epoch = checkpoint['epoch']
            last_test_loss = checkpoint['loss']
        else:
            print("No save exists.")
            raise FileNotFoundError(f'{save_path}/KoGPT2_checkpoint_{ctx}_{model_topic}_{transform.max_len}_{model_epoch}.state')
    model.to(device)
    model.eval()

    cached_testset_path = f"{save_path}/test_{topic_all}_{transform.max_len}"
    if os.path.isfile(cached_testset_path + '.npy'):
        dataloaders["all"].dataset.load_from_file(cached_testset_path + '.npy')
    else:
        print("Caching testset... topic: all")
        for temp in tqdm(dataloaders["all"]):  # one pass through the data fills the cache
            pass
        dataloaders["all"].dataset.set_use_cache(True, cached_testset_path)
        print("Cached. topic: all")
    # Switch the loaders (not the datasets) to multi-worker now that the cache exists.
    dataloaders["all"].num_workers = num_workers
    for topic in tqdm(topics):
        cached_testset_path = f"{save_path}/test_{topic}_{transform.max_len}"
        if os.path.isfile(cached_testset_path + '.npy'):
            dataloaders[topic].dataset.load_from_file(cached_testset_path + '.npy')
        else:
            print(f"Caching testset... topic: {topic}")
            for temp in tqdm(dataloaders[topic]):
                pass
            dataloaders[topic].dataset.set_use_cache(True, cached_testset_path)
            print(f"Cached. topic: {topic}")
        dataloaders[topic].num_workers = num_workers

    states = []
    for topic in tqdm(dataloaders):
        try:
            test_loss_list = []
            with torch.no_grad():  # evaluation only; no gradients needed
                for data in tqdm(dataloaders[topic]):
                    data = data.to(ctx)
                    # -100 marks padding positions so the LM loss ignores them.
                    label = torch.where(data != padding_id, data, torch.ones_like(data) * -100)
                    mask = torch.where(data != padding_id, torch.ones_like(data), torch.zeros_like(data))
                    output = model(data, labels=label, attention_mask=mask)
                    loss = output[0]
                    test_loss_list.append(loss.item())
                    del label, mask, loss, output, data
            test_loss = sum(test_loss_list) / len(test_loss_list)
            print(f"data_topic: {topic}, model_topic: {model_topic} test loss: {test_loss}")
            states.append((topic, model_topic, test_loss))
        except KeyboardInterrupt:
            break

    log_path = f"{save_path}/test_{'DEFAULT' if args.default else model_topic}_{topics}_{transform.max_len}_{int(time.time())}.log"
    with open(log_path, 'w') as log:
        log.write("data_topic, model_topic, test loss,\n")
        for state in states:
            log.write(f"{state[0]}, {state[1]}, {state[2]},\n")
    print(f"Log written at: {log_path}")

import torch
from random import choice, choices, randint
import argparse
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.utils import get_tokenizer


def top_k(predict, vocab, k):
    # Return a token chosen uniformly at random among the top-k candidates.
    probs, indices = torch.topk(predict, k=k, dim=-1)
    return vocab.to_tokens(choice(indices.tolist()))


def top_p(logits, vocab, threshold=0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    indexs = sorted_indices.tolist()
    sorted_softmax_logits = torch.nn.functional.softmax(sorted_logits, dim=-1)
    cum_prob = 0
    top_p_index = 0
    # Find the last index whose cumulative probability stays within the threshold.
    for i, prob in enumerate(sorted_softmax_logits):
        if cum_prob > threshold:
            top_p_index = 0 if i == 0 else i - 1
            break
        cum_prob += prob
    rand_num = randint(0, top_p_index)  # uniform sample within the top-p set
    return vocab.to_tokens(indexs[rand_num])


def weighted_random(logits, vocab):
    # Sample a token from the full softmax distribution over the vocabulary.
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    indexs = sorted_indices.tolist()
    sorted_softmax_logits = torch.nn.functional.softmax(sorted_logits, dim=-1)
    return vocab.to_tokens(choices(indexs, weights=sorted_softmax_logits)[0])
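
# Illustrative note (toy numbers, not executed): for logits [5.0, 2.0, 1.0, 0.5],
# softmax puts roughly 0.93 of the mass on the first token, so top_p with
# threshold=0.9 keeps only that token, while weighted_random may pick any token,
# just with probability proportional to its softmax weight.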
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='KoGPT2 generation example')
    parser.add_argument('sentence', metavar='S', type=str, nargs='?',
                        default='2019년 한해를 보내며,',
                        help='korean sentence to use as input.')
    args = parser.parse_args()

    ctx = 'cuda' if torch.cuda.is_available() else 'cpu'
    tok_path = get_tokenizer(cachedir='/code/model')
    model, vocab = get_pytorch_kogpt2_model(ctx=ctx, cachedir='/code/model')
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)

    def generate(sent, sampler, max_tokens=100):
        """Extend sent token by token until </s> or max_tokens, using sampler."""
        toked = tok(sent)
        token_count = 0
        while token_count < max_tokens:
            try:
                input_ids = torch.tensor([vocab[vocab.bos_token]] + vocab[toked]).unsqueeze(0)
                pred = model(input_ids)[0]
                gen = sampler(pred.squeeze()[-1])  # sample from the last position's logits
                if gen == '</s>':
                    break
                sent += gen.replace('▁', ' ')
                toked = tok(sent)
                token_count += 1
            except KeyboardInterrupt:
                break
        return sent

    # Compare decoding strategies on the same prompt.
    samplers = [
        ('Greedy', lambda logits: vocab.to_tokens(torch.argmax(logits, dim=-1).tolist())),
        ('Top 3', lambda logits: top_k(logits, vocab, 3)),
        ('Top 5', lambda logits: top_k(logits, vocab, 5)),
        ('Top p=0.5', lambda logits: top_p(logits, vocab, 0.5)),
        ('Top p=0.7', lambda logits: top_p(logits, vocab, 0.7)),
        ('Top p=0.9', lambda logits: top_p(logits, vocab)),
        ('Weighted random', lambda logits: weighted_random(logits, vocab)),
    ]
    for name, sampler in samplers:
        print(f'{name}:', generate(args.sentence, sampler))
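
# Run sketch (script name assumed): compare all decoding strategies on one prompt.
#   python generate_example.py "2019년 한해를 보내며,"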

import argparse
import pandas as pd
import torch
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from kogpt2.utils import get_tokenizer
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.core.lightning import LightningModule
from torch.utils.data import DataLoader
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from util.data_loader import ArticleDataset, ToTensor
class KoGPT2Chat(LightningModule):
    def __init__(self, hparams, **kwargs):
        super(KoGPT2Chat, self).__init__()
        self.hparams = hparams
        self.tok_path = get_tokenizer()
        self.neg = -1e18  # large negative value used to mask logits
        self.model, self.vocab = get_pytorch_kogpt2_model()
        self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')

    @staticmethod
    def add_model_specific_args(parent_parser):
        # add model-specific args
        parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--max-len',
                            type=int,
                            default=32,
                            help='max sentence length on input (default: 32)')
        parser.add_argument('--batch-size',
                            type=int,
                            default=96,
                            help='batch size for training (default: 96)')
        parser.add_argument('--lr',
                            type=float,
                            default=5e-5,
                            help='The initial learning rate')
        parser.add_argument('--warmup_ratio',
                            type=float,
                            default=0.1,
                            help='warmup ratio')
        return parser
    def forward(self, inputs):
        # (batch, seq_len, hiddens)
        output, _ = self.model(inputs)
        return output

    def training_step(self, batch, batch_idx):
        token_ids, mask, label = batch
        out = self(token_ids)
        mask_3d = mask.unsqueeze(dim=2).repeat_interleave(repeats=out.shape[2], dim=2)
        mask_out = torch.where(mask_3d == 1, out, self.neg * torch.ones_like(out))
        loss = self.loss_function(mask_out.transpose(2, 1), label)
        loss_avg = loss.sum() / mask.sum()
        tensorboard_logs = {'train_loss': loss_avg}
        return {'loss': loss_avg, 'log': tensorboard_logs}
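
    # Note on the masking above: padding positions get all their logits replaced
    # by the constant self.neg, so the softmax there is uniform and carries no
    # gradient; the summed loss is then normalized by the count of real tokens
    # (mask.sum()) rather than by the full sequence length.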
    def configure_optimizers(self):
        # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
        param_optimizer = list(self.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.lr, correct_bias=False)
        # Warm up, then cosine-decay the learning rate.
        num_train_steps = len(self.train_dataloader()) * self.hparams.max_epochs
        num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
        lr_scheduler = {'scheduler': scheduler, 'name': 'cosine_schedule_with_warmup',
                        'monitor': 'loss', 'interval': 'step',
                        'frequency': 1}
        return [optimizer], [lr_scheduler]

    def _collate_fn(self, batch):
        data = [item[0] for item in batch]
        mask = [item[1] for item in batch]
        label = [item[2] for item in batch]
        return torch.LongTensor(data), torch.LongTensor(mask), torch.LongTensor(label)

    def train_dataloader(self):
        data = pd.read_csv('Chatbot_data/ChatbotData.csv')
        self.train_set = ArticleDataset(data, self.tok_path, self.vocab, max_len=self.hparams.max_len)
        train_dataloader = DataLoader(
            self.train_set, batch_size=self.hparams.batch_size, num_workers=2,
            shuffle=True, collate_fn=self._collate_fn)
        return train_dataloader

# Base CLI flags; the flag names are inferred from how args is used below.
parser = argparse.ArgumentParser(description='KoGPT2 chatbot-style training example')
parser.add_argument('--train', action='store_true', default=False, help='train the model')
parser.add_argument('--chat', action='store_true', default=False, help='chat with a trained checkpoint')
parser.add_argument('--model_params', type=str, default='model_chp/model_last.ckpt',
                    help='checkpoint to load for chatting')
parser = KoGPT2Chat.add_model_specific_args(parser)
parser = Trainer.add_argparse_args(parser)
args = parser.parse_args()

if __name__ == "__main__":
    if args.train:
        checkpoint_callback = ModelCheckpoint(
            filepath='model_chp/{epoch:02d}-{loss:.2f}',
            verbose=True,
            save_last=True,
            monitor='loss',
            mode='min',
            prefix='model_'
        )
        # python train_torch.py --train --gpus 1 --max_epochs 3
        model = KoGPT2Chat(args)
        model.train()
        trainer = Trainer.from_argparse_args(
            args,
            checkpoint_callback=checkpoint_callback, gradient_clip_val=1.0)
        trainer.fit(model)
    if args.chat:
        model = KoGPT2Chat.load_from_checkpoint(args.model_params)
        model.eval()

from random import choice, choices, randint
import argparse
import re
import time
import torch
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.utils import get_tokenizer

def greedy(predict):
    return torch.argmax(predict, axis=-1).tolist()


def top_k(predict, k):
    # Return an id chosen uniformly at random among the top-k candidates.
    probs, indices = torch.topk(predict, k=k, dim=-1)
    return choice(indices.tolist())


def top_p(logits, threshold=0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    indices = sorted_indices.tolist()
    sorted_softmax_logits = torch.nn.functional.softmax(sorted_logits, dim=-1)
    cum_prob = 0
    top_p_index = 0
    # Find the last index whose cumulative probability stays within the threshold.
    for i, prob in enumerate(sorted_softmax_logits):
        if cum_prob > threshold:
            top_p_index = 0 if i == 0 else i - 1
            break
        cum_prob += prob
    rand_num = randint(0, top_p_index)  # uniform sample within the top-p set
    return indices[rand_num]


def weighted_random(logits):
    indices = torch.where(logits >= 0)[0]  # ignore tokens with negative logits
    selected_logits = torch.index_select(logits, -1, indices)
    softmax_logits = torch.nn.functional.softmax(selected_logits, dim=-1)
    return choices(indices.tolist(), weights=softmax_logits)[0]


def weighted_top_k(predict, k):
    # Sample among the top-k candidates in proportion to their softmax probability.
    probs, indices = torch.topk(predict, k=k, dim=-1)
    softmax_probs = torch.nn.functional.softmax(probs, dim=-1)
    return choices(indices.tolist(), weights=softmax_probs)[0]


def weighted_top_p(logits, threshold=0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    sorted_softmax_logits = torch.nn.functional.softmax(sorted_logits, dim=-1)
    cum_prob = 0
    top_p_bound = len(sorted_softmax_logits)  # fall back to the full distribution
    # Find the smallest prefix whose cumulative probability exceeds the threshold.
    for i, prob in enumerate(sorted_softmax_logits):
        if cum_prob > threshold:
            top_p_bound = i
            break
        cum_prob += prob
    # random.choices normalizes the weights itself, so no manual division is
    # needed (the original division by the running sum could divide by zero).
    return choices(sorted_indices[:top_p_bound].tolist(),
                   weights=sorted_softmax_logits[:top_p_bound])[0]
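
# Note the contrast between the two families above: top_k/top_p draw uniformly
# from the truncated candidate set, while the weighted_* variants draw in
# proportion to softmax probability, so likely tokens stay favored after truncation.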
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='KoGPT2 generation example')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-g', '--greedy', action='store_const', const='greedy', help='Greedy sampling')
    group.add_argument('-k', '--topk', type=int, choices=range(1, 51), help='Top k sampling. 1<=K<=50', metavar='K')
    group.add_argument('-p', '--topp', type=float, help='Top p sampling. 0<P<=1.0', metavar='P')
    parser.add_argument('-w', '--weighted', action='store_true', help='Use weighted version of sampling.')
    parser.add_argument('-d', '--docker', action='store_true',
                        help="Run on docker. Sets model cache path:/code/model, dataset path:/dataset, save path:/code/save.")
    parser.add_argument('-c', '--checkpoint', type=str, help='Model checkpoint path', metavar='PATH')
    parser.add_argument('-f', '--full_sentence', action='store_true',
                        help='Treat the last S as a full sentence. (Do not continue it.)')
    parser.add_argument('-l', '--length', type=int, choices=range(1, 21),
                        help='Set length of paragraph.', metavar='LENGTH', default=15)
    parser.add_argument('sentence', metavar='S', type=str, nargs='*',
                        help='korean sentence to use as input.')
    args = parser.parse_args()
    print(args)

    model_cache_path = '/code/model' if args.docker else 'model'
    save_path = '/code/save' if args.docker else 'save'

    # Resolve the sampling strategy from the CLI flags.
    if args.greedy:
        sampling_name = "Weighted" if args.weighted else "Greedy"
        sampling = weighted_random if args.weighted else greedy
    elif args.topk is not None:
        sampling_name = f"Weighted Top k={args.topk}" if args.weighted else f"Top k={args.topk}"
        sampling = (lambda pred: weighted_top_k(pred, args.topk)) if args.weighted else (lambda pred: top_k(pred, args.topk))
    elif args.topp is not None:
        sampling_name = f"Weighted Top p={args.topp}" if args.weighted else f"Top p={args.topp}"
        sampling = (lambda pred: weighted_top_p(pred, args.topp)) if args.weighted else (lambda pred: top_p(pred, args.topp))
    else:  # default: weighted random over the full distribution
        sampling_name = "Weighted"
        sampling = weighted_random

    ctx = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(ctx)
    tok_path = get_tokenizer(cachedir=model_cache_path)
    model, vocab = get_pytorch_kogpt2_model(ctx=ctx, cachedir=model_cache_path)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    if args.checkpoint:
        checkpoint = torch.load(args.checkpoint, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        epoch = checkpoint['epoch']
    model.eval()

    # Each input sentence becomes "<s> ... </s>"; generation continues from there.
    toked = []
    for sent in args.sentence:
        toked += tok(sent) + [vocab.eos_token, vocab.bos_token]
    if not args.full_sentence:
        toked = toked[:-2]  # keep the last sentence open so the model continues it

    token_count = 0
    sent_count = 0
    started = time.time()
    while token_count < 1000:
        try:
            input_ids = torch.tensor([vocab[vocab.bos_token]] + vocab[toked]).unsqueeze(0).to(device=device)
            pred = model(input_ids)[0]
            gen_id = sampling(pred.squeeze()[-1])
            gen_token = vocab.to_tokens(gen_id)
            if gen_token == vocab.eos_token:
                sent_count += 1
                print(sent_count, token_count)
                if sent_count >= args.length:
                    break
                else:
                    toked += [vocab.eos_token, vocab.bos_token]
                    token_count += 2
            else:
                toked.append(gen_token)
                token_count += 1
        except KeyboardInterrupt:
            break
    print(f'{sampling_name}:', re.sub('</s>', '\r\n', re.sub('(▁|<s>)', ' ', ''.join(toked))))
    print("Time elapsed:", time.time() - started)
from random import choice, choices, randint
import argparse
import torch
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from kogpt2.utils import get_tokenizer
from tqdm import trange

def greedy(predict):
    return torch.argmax(predict, axis=-1).tolist()


def top_k(predict, k):
    # Return an id chosen uniformly at random among the top-k candidates.
    probs, indices = torch.topk(predict, k=k, dim=-1)
    return choice(indices.tolist())


def top_p(logits, threshold=0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    indices = sorted_indices.tolist()
    sorted_softmax_logits = torch.nn.functional.softmax(sorted_logits, dim=-1)
    cum_prob = 0
    top_p_index = 0
    # Find the last index whose cumulative probability stays within the threshold.
    for i, prob in enumerate(sorted_softmax_logits):
        if cum_prob > threshold:
            top_p_index = 0 if i == 0 else i - 1
            break
        cum_prob += prob
    rand_num = randint(0, top_p_index)  # uniform sample within the top-p set
    return indices[rand_num]


def weighted_random(logits):
    indices = torch.where(logits >= 0)[0]  # ignore tokens with negative logits
    selected_logits = torch.index_select(logits, -1, indices)
    softmax_logits = torch.nn.functional.softmax(selected_logits, dim=-1)
    return choices(indices.tolist(), weights=softmax_logits)[0]


def weighted_top_k(predict, k):
    # Sample among the top-k candidates in proportion to their softmax probability.
    probs, indices = torch.topk(predict, k=k, dim=-1)
    softmax_probs = torch.nn.functional.softmax(probs, dim=-1)
    return choices(indices.tolist(), weights=softmax_probs)[0]


def weighted_top_p(logits, threshold=0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    sorted_softmax_logits = torch.nn.functional.softmax(sorted_logits, dim=-1)
    cum_prob = 0
    top_p_bound = len(sorted_softmax_logits)  # fall back to the full distribution
    # Find the smallest prefix whose cumulative probability exceeds the threshold.
    for i, prob in enumerate(sorted_softmax_logits):
        if cum_prob > threshold:
            top_p_bound = i
            break
        cum_prob += prob
    # random.choices normalizes the weights itself, so no manual division is needed.
    return choices(sorted_indices[:top_p_bound].tolist(),
                   weights=sorted_softmax_logits[:top_p_bound])[0]
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='KoGPT2 generation length measurement')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-g', '--greedy', action='store_const', const='greedy', help='Greedy sampling')
    group.add_argument('-k', '--topk', type=int, choices=range(1, 51), help='Top k sampling. 1<=K<=50', metavar='K')
    group.add_argument('-p', '--topp', type=float, help='Top p sampling. 0<P<=1.0', metavar='P')
    parser.add_argument('-w', '--weighted', action='store_true', help='Use weighted version of sampling.')
    parser.add_argument('-d', '--docker', action='store_true',
                        help="Run on docker. Sets model cache path:/code/model, dataset path:/dataset, save path:/code/save.")
    parser.add_argument('-c', '--checkpoint', type=str, help='Model checkpoint path', metavar='PATH')
    parser.add_argument('-f', '--full_sentence', action='store_true',
                        help='Treat the last S as a full sentence. (Do not continue it.)')
    parser.add_argument('-l', '--length', type=int, choices=range(1, 21),
                        help='Set length of paragraph.', metavar='LENGTH', default=15)
    parser.add_argument('sentence', metavar='S', type=str, nargs='*',
                        help='korean sentence to use as input.')
    args = parser.parse_args()
    print(args)

    model_cache_path = '/code/model' if args.docker else 'model'
    save_path = '/code/save' if args.docker else 'save'

    if args.greedy:
        sampling_name = "Weighted" if args.weighted else "Greedy"
        sampling = weighted_random if args.weighted else greedy
    elif args.topk is not None:
        sampling_name = f"Weighted Top k={args.topk}" if args.weighted else f"Top k={args.topk}"
        sampling = (lambda pred: weighted_top_k(pred, args.topk)) if args.weighted else (lambda pred: top_k(pred, args.topk))
    elif args.topp is not None:
        sampling_name = f"Weighted Top p={args.topp}" if args.weighted else f"Top p={args.topp}"
        sampling = (lambda pred: weighted_top_p(pred, args.topp)) if args.weighted else (lambda pred: top_p(pred, args.topp))
    else:
        sampling_name = "Weighted"
        sampling = weighted_random

    ctx = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(ctx)
    tok_path = get_tokenizer(cachedir=model_cache_path)
    model, vocab = get_pytorch_kogpt2_model(ctx=ctx, cachedir=model_cache_path)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    if args.checkpoint:
        checkpoint = torch.load(args.checkpoint, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        epoch = checkpoint['epoch']
    model.eval()

    # Run 20 generations and record (run, sentence count, token count) at each </s>.
    length_list = []
    for i in trange(20):
        toked = []
        for sent in args.sentence:
            toked += tok(sent) + [vocab.eos_token, vocab.bos_token]
        if not args.full_sentence:
            toked = toked[:-2]
        token_count = 0
        sent_count = 0
        while token_count < 1000:
            try:
                input_ids = torch.tensor([vocab[vocab.bos_token]] + vocab[toked]).unsqueeze(0).to(device=device)
                pred = model(input_ids)[0]
                gen_id = sampling(pred.squeeze()[-1])
                gen_token = vocab.to_tokens(gen_id)
                if gen_token == vocab.eos_token:
                    sent_count += 1
                    length_list.append(f"{i},{sent_count},{token_count}\n")
                    if sent_count >= args.length:
                        break
                    else:
                        toked += [vocab.eos_token, vocab.bos_token]
                        token_count += 2
                else:
                    toked.append(gen_token)
                    token_count += 1
            except KeyboardInterrupt:
                break
    with open('length.log', 'a') as log:
        log.write(f'#-*- {args.checkpoint} -*-\n')
        log.writelines(length_list)
        log.write('#-*- -*-\n')
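
# Usage sketch (script name assumed): append length statistics of 20 runs to length.log.
#   python measure_length.py -w -p 0.9 -c save/KoGPT2_checkpoint.state "첫 문장입니다."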

import torch
from tqdm import tqdm
from util.data_loader import ArticleDataset, ToTensor
from torch.utils.data import DataLoader
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from kogpt2.utils import get_tokenizer

max_len = 1024
ctx = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(ctx)
tokenizer_path = get_tokenizer(cachedir='/code/model')
model, vocab = get_pytorch_kogpt2_model(ctx=ctx, cachedir='/code/model')
tokenizer = SentencepieceTokenizer(tokenizer_path, num_best=0, alpha=0)
transform = ToTensor(tokenizer, vocab, max_len=max_len)
batch_size = 64
trainset = DataLoader(ArticleDataset('/dataset', label='train', transform=transform, use_cache=False),
                      batch_size=batch_size, num_workers=32, shuffle=True)

# Count how many articles fit within each length bound (256, 512, 768).
count_dict = dict((idx, 0) for idx in range(256, max_len, 256))
for (data, original_len) in tqdm(trainset):
    original_len = original_len.to(device)  # .to() is not in-place; reassign
    for bound in count_dict:
        count_dict[bound] += torch.sum(torch.where(original_len <= bound,
                                                   torch.ones_like(original_len),
                                                   torch.zeros_like(original_len))).item()
for bound in count_dict:
    print(f"count[{bound}]: {count_dict[bound]}/{len(trainset.dataset)} ({100*count_dict[bound]/len(trainset.dataset):.1f}%)")

# -*- coding: utf-8 -*-
import argparse
import os
import glob
import time
import torch
from torch.utils.data import DataLoader
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from kogpt2.utils import get_tokenizer
from tqdm import tqdm
from util.data_loader import ArticleDataset, ToTensor
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train KoGPT2 with ArticleDataset.')
    parser.add_argument('--docker', action='store_true',
                        help="Train on docker. Sets model cache path:/code/model, dataset path:/dataset, save path:/code/save.")
    parser.add_argument('--resume', choices=['default', 'cpu', 'cuda', 'cuda:0', 'cuda:1'], nargs='?', const='default',
                        help="Load state file to device; then resume training.")
    parser.add_argument('--topic', nargs='+',
                        choices=['경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학'],
                        default=['경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학'])
    parser.add_argument('--length', type=int, default=128, choices=[2**i for i in range(11)],
                        help="token length for transform")
    parser.add_argument('--epoch', type=int, default=30, help="Train epoch")
    parser.add_argument('device', choices=['cpu', 'cuda', 'cuda:0', 'cuda:1'])
    args = parser.parse_args()
    print(args)

    model_cache_path = '/code/model' if args.docker else 'model'
    dataset_path = '/dataset' if args.docker else '../dataset'
    save_path = '/code/save' if args.docker else 'save'

    ctx = args.device if torch.cuda.is_available() else 'cpu'
    print(ctx)
    device = torch.device(ctx)
    tokenizer_path = get_tokenizer(cachedir=model_cache_path)
    model, vocab = get_pytorch_kogpt2_model(ctx=ctx, cachedir=model_cache_path)
    tokenizer = SentencepieceTokenizer(tokenizer_path, num_best=0, alpha=0)

    # Scale workers and batch size down as sequence length grows.
    num_workers = int(32 * (128 / args.length)) if args.length < 1024 else 4
    batch_size = int(64 * (128 / args.length)) if args.length < 1024 else 4
    padding_id = vocab[vocab.padding_token]
    topics = set(args.topic)
    transform = ToTensor(tokenizer, vocab, args.length)

    print("Preparing dataloader...")
    # num_workers=0 on the first pass so the in-process dataset cache gets filled.
    trainset = DataLoader(ArticleDataset(dataset_path, topics=topics, label='train', transform=transform),
                          batch_size=batch_size, num_workers=0, shuffle=True)
    validset = DataLoader(ArticleDataset(dataset_path, topics=topics, label='valid', transform=transform),
                          batch_size=batch_size, num_workers=0)
    # testset = DataLoader(ArticleDataset(dataset_path, label='test', transform=transform), batch_size=128, num_workers=4)
    print("Prepared dataloader.")

    epoches = args.epoch
    checkpoint_epoch = 0
    learning_rate = 3e-5
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize before the resume block so a loaded checkpoint can override it.
    last_valid_loss = float('infinity')
    if args.resume:
        save_ctx = ctx if args.resume == "default" else args.resume
        saves = glob.glob(f'{save_path}/KoGPT2_checkpoint_{save_ctx}_{topics}_{transform.max_len}_*.state')
        if len(saves) > 0:
            last_save = max(saves, key=os.path.getmtime)
            checkpoint = torch.load(last_save, map_location=device)
            print(f"Loading save from {last_save}")
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            checkpoint_epoch = checkpoint['epoch']
            last_valid_loss = checkpoint['loss']
            print("Loaded.")
        else:
            print("No save exists.")
    model.to(device)
    model.train()

    cached_trainset_path = f"{save_path}/train_{topics}_{transform.max_len}"
    cached_validset_path = f"{save_path}/valid_{topics}_{transform.max_len}"
    if os.path.isfile(cached_trainset_path + '.npy'):
        trainset.dataset.load_from_file(cached_trainset_path + '.npy')
    else:
        print("Caching trainset...")
        for temp in tqdm(trainset):
            pass
        trainset.dataset.set_use_cache(True, cached_trainset_path)
    if os.path.isfile(cached_validset_path + '.npy'):
        validset.dataset.load_from_file(cached_validset_path + '.npy')
    else:
        print("Caching validset...")
        for temp in tqdm(validset):
            pass
        validset.dataset.set_use_cache(True, cached_validset_path)
    print("Cached.")
    trainset.num_workers = num_workers
    validset.num_workers = num_workers

    overfit = -1
    states = []
    for epoch in tqdm(range(checkpoint_epoch + 1, epoches)):
        try:
            train_loss_list = []
            valid_loss_list = []
            for data in tqdm(trainset):
                optimizer.zero_grad()
                data = data.to(ctx)
                # -100 marks padding positions so the LM loss ignores them.
                label = torch.where(data != padding_id, data, torch.ones_like(data) * -100)
                mask = torch.where(data != padding_id, torch.ones_like(data), torch.zeros_like(data))
                output = model(data, labels=label, attention_mask=mask)
                loss = output[0]
                loss.backward()
                optimizer.step()
                train_loss_list.append(loss.item())
                del loss, output, label, mask, data
            with torch.no_grad():  # validation pass; no gradients needed
                for v_data in tqdm(validset):
                    v_data = v_data.to(ctx)
                    v_label = torch.where(v_data != padding_id, v_data, torch.ones_like(v_data) * -100)
                    v_mask = torch.where(v_data != padding_id, torch.ones_like(v_data), torch.zeros_like(v_data))
                    v_output = model(v_data, labels=v_label, attention_mask=v_mask)
                    v_loss = v_output[0]
                    valid_loss_list.append(v_loss.item())
                    del v_loss, v_output, v_mask, v_label, v_data
            valid_loss = sum(valid_loss_list) / len(valid_loss_list)
            train_loss = sum(train_loss_list) / len(train_loss_list)
            print(f"epoch: {epoch} train loss: {train_loss} valid loss: {valid_loss}")
            states.append((epoch, train_loss, valid_loss))
            if valid_loss > last_valid_loss:
                overfit = epoch  # epoch where validation loss went back up
            try:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': train_loss
                }, f"{save_path}/KoGPT2_checkpoint_{ctx}_{topics}_{transform.max_len}_{epoch}.state")
            except Exception as e:
                print(e)
            last_valid_loss = valid_loss
        except KeyboardInterrupt:
            break

    log_path = f"{save_path}/{topics}_{transform.max_len}_{int(time.time())}.log"
    with open(log_path, 'w') as log:
        log.write(f"Overfit at: {overfit}\n")
        for state in states:
            log.write(f"epoch: {state[0]} train loss: {state[1]} valid loss: {state[2]}\n")
    print(f"Log written at: {log_path}")

import os
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from torch.utils.data import Dataset

class ArticleDataset(Dataset):
    """
    Dataset for training on news articles.
    """
    def __init__(self, dataset_path: str,
                 topics: set = set(['경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학']),
                 label: str = 'train', transform=None, use_cache=False):
        """
        Initializer
        :param dataset_path: path of parquet dataset
        :param topics: if not None, only use specified topics; must be a subset of {경제, 문화, 미용_건강, 사회, 생활, 스포츠, 연예, 정치, IT_과학}
        :param label: specify type of dataset; must be one of [train, test, valid] (default is train)
        :param transform: if not None, transforms data. (paragraph:StringScalar, topic:StringScalar) => Tensor
        :param use_cache: if True, __getitem__ uses the cache. Must be used after the first epoch.
        """
        expanded_dataset_path = os.path.expanduser(dataset_path)
        tables = []
        for topic in topics:
            table = pq.read_table(f'{expanded_dataset_path}/topic={topic}/label={label}', columns=['paragraph'])
            tables.append(table.append_column('topic', pa.array([topic] * len(table))))
        self.data = pa.concat_tables(tables)
        self.transform = transform
        self.use_cache = use_cache
        self.cache = [None] * len(self.data)
        # if self.transform is not None:  # transforming everything up front is too slow
        #     self.data = [self.transform((p, t)) for p, t in zip(self.data['paragraph'], self.data['topic'])]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Check the cache first so cached items skip the transform entirely.
        if self.use_cache and self.cache[index] is not None:
            return self.cache[index]
        item = (self.data['paragraph'][index], self.data['topic'][index]) if self.transform is None \
            else self.transform((self.data['paragraph'][index], self.data['topic'][index]))
        self.cache[index] = item
        return item

    def load_from_file(self, cache_file_path: str):
        self.use_cache = True
        self.cache = torch.from_numpy(np.load(cache_file_path))

    def set_use_cache(self, use_cache: bool, cache_file_path: str = None):
        self.use_cache = use_cache
        if use_cache:
            if isinstance(self.cache, torch.Tensor):
                if cache_file_path is not None:
                    np.save(cache_file_path, self.cache.numpy())
                else:
                    print("Already fully cached.")
                return
            try:
                self.cache = torch.stack(self.cache)
                if cache_file_path is not None:
                    np.save(cache_file_path, self.cache.numpy())
            except RuntimeError:
                print("Not fully cached yet. Please run an epoch with num_workers=0 first.")
                return
        else:
            self.cache = []


class ToTensor(object):
    """
    Convert an ArticleDataset paragraph to a Tensor using the tokenizer.
    """
    def __init__(self, tokenizer, vocab, max_len=512):
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_len = max_len

    def __call__(self, sample):
        tokens = []
        paragraph = sample[0]
        topic = sample[1]
        for i, sentence in enumerate(paragraph):
            if i == 0:
                # The first sentence is prefixed with the topic token(s).
                line = [self.vocab[self.vocab.bos_token]] + self.vocab[self.tokenizer(topic.as_py()) + self.tokenizer(sentence.as_py())] + [self.vocab[self.vocab.eos_token]]
            else:
                line = [self.vocab[self.vocab.bos_token]] + self.vocab[self.tokenizer(sentence.as_py())] + [self.vocab[self.vocab.eos_token]]
            if len(tokens) + len(line) <= self.max_len:  # never emit a sentence fragment
                tokens += line
            else:
                break
        # Pad to max_len; the training script later converts padding ids to -100 labels.
        # ref: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel
        tokens += [self.vocab[self.vocab.padding_token]] * (self.max_len - len(tokens))
        return torch.tensor(tokens, dtype=torch.long)
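
# Usage sketch (paths assumed; mirrors how the training script wires things up):
#   from gluonnlp.data import SentencepieceTokenizer
#   from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
#   from kogpt2.utils import get_tokenizer
#   from torch.utils.data import DataLoader
#   tok_path = get_tokenizer(cachedir='model')
#   model, vocab = get_pytorch_kogpt2_model(cachedir='model')
#   tokenizer = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
#   ds = ArticleDataset('../dataset', topics={'경제'}, label='valid',
#                       transform=ToTensor(tokenizer, vocab, max_len=128))
#   loader = DataLoader(ds, batch_size=64, num_workers=0)  # keep workers=0 until cached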