# dataset.py
import random

import numpy as np


def save_dataset(path, pairData, pairLabels, compressed=True):
    """Save the paired samples and labels to an .npz archive."""
    if compressed:
        np.savez_compressed(path, pairData=pairData, pairLabels=pairLabels)
    else:
        np.savez(path, pairData=pairData, pairLabels=pairLabels)


def load_dataset(path):
    """Load the paired samples and labels back from an .npz archive."""
    data = np.load(path, allow_pickle=True)
    return data['pairData'], data['pairLabels']


def make_dataset_small(path):
    # Note: the dataset could not be built for the shuffled/merged/obfuscated
    # variants because the process ran out of memory.
    vecs = np.load(path, allow_pickle=True)['vecs']
    pairData = []
    pairLabels = []  # 1 = plagiarism pair, 0 = unrelated pair
    for i in range(len(vecs)):
        currentData = vecs[i]
        # Positive pair: each vector paired with itself.
        pairData.append([currentData, currentData])
        pairLabels.append([1])
        # Negative pair: pair the vector with a randomly chosen other vector.
        j = i
        while j == i:
            j = random.randint(0, len(vecs) - 1)
        pairData.append([currentData, vecs[j]])
        pairLabels.append([0])
    # One positive and one negative pair per sample keeps the classes balanced.
    return np.array(pairData), np.array(pairLabels)


def load_embedding(path):
    """Load a saved vocabulary size and embedding matrix from an .npz archive."""
    data = np.load(path, allow_pickle=True)
    return data['vocab_size'], data['embedding_matrix']
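

# A minimal usage sketch. Assumptions: 'vectors.npz' and 'pairs.npz' are
# hypothetical file names, and 'vectors.npz' holds a 'vecs' array of
# embedding vectors as expected by make_dataset_small above.
if __name__ == '__main__':
    pairData, pairLabels = make_dataset_small('vectors.npz')
    save_dataset('pairs.npz', pairData, pairLabels)
    pairData, pairLabels = load_dataset('pairs.npz')
    print(pairData.shape, pairLabels.shape)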