#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 17:52:11 2017
@author: red-sky
"""
import sys
import json
import numpy as np


def updateDict(words, dictUp):
    # Update the word-count dict "dictUp" with the given "words".
    for w in words:
        if w in dictUp:
            dictUp[w] += 1
        else:
            dictUp[w] = 1  # first occurrence counts as one
    return dictUp
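

# Quick sanity check for updateDict (illustrative only):
#   updateDict("a b a".split(" "), {})  ->  {"a": 2, "b": 1}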


def extractVocab(eventsFile, fromIndex=0, toIndex=-1):
    # From the events file, count word frequencies and build the
    # word/index mappings.
    vocab = dict()
    with open(eventsFile, "r") as file:
        list_events = file.read().strip().splitlines()
    if toIndex == -1:
        list_events = list_events[fromIndex:]
    else:
        list_events = sorted(set(list_events[fromIndex:toIndex]))
    # Skip leading lines that start with a tab (malformed events).
    index = 0
    for i, event in enumerate(list_events):
        if event and event[0] != "\t":
            index = i
            break
    list_events = list_events[index:]
    for event in list_events:
        event = event.split("\t")
        words = event[1].split(" ") + \
            event[2].split(" ") + \
            event[3].split(" ")
        vocab = updateDict(words, vocab)
    # Index 0 is reserved for rare words, mapped to "NOISEWORDS".
    support_words = ["NOISEWORDS"]
    vocab_words = support_words + \
        sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)
    IndexWords = range(len(vocab_words))
    Count = [0] + [vocab[w] for w in vocab_words[1:]]
    result = [dict(zip(vocab_words, Count)),       # word  -> count
              dict(zip(IndexWords, vocab_words)),  # index -> word
              dict(zip(vocab_words, IndexWords))]  # word  -> index
    return result, list_events
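
# Illustrative structure of the returned mapping, assuming "apple" is
# the most frequent word in the corpus:
#   result[0]: {"NOISEWORDS": 0, "apple": 123, ...}   (word  -> count)
#   result[1]: {0: "NOISEWORDS", 1: "apple", ...}     (index -> word)
#   result[2]: {"NOISEWORDS": 0, "apple": 1, ...}     (word  -> index)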


def convertEvent(eventsFile, vocabMapping, countMin=5):
    # Convert every event into lists of word indices for training.
    wordCount, _, word2index = vocabMapping
    Events = []
    with open(eventsFile, "r") as file:
        list_events = file.read().strip().splitlines()
    for event in list_events:
        event = event.split("\t")
        list_obj = [event[1].split(" "),
                    event[2].split(" "),
                    event[3].split(" ")]
        # Keep only words that appear at least countMin times; rarer
        # or unknown words are mapped to index 0 ("NOISEWORDS").
        wordsIndexed = []
        for obj in list_obj:
            objIndex = []
            for w in obj:
                if wordCount.get(w, 0) >= countMin:
                    objIndex.append(word2index[w])
                else:
                    objIndex.append(0)
            wordsIndexed.append(objIndex)
        Events.append(wordsIndexed)
    return Events
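
# Each returned event is a list of three index lists, one per phrase.
# For example, a hypothetical event with phrases "Apple" / "buys" /
# "small startup" could become [[1], [7], [0, 9]] if "small" occurs
# fewer than countMin times.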
if __name__ == "__main__":
# in
EventPath = "../../Thesis_data/Apple_query_result_body.txt"
fromIndex = 0
toIndex = -1
minCountWord = 5
# out
EventNewPath = "./Events_for_training.txt"
VocabPath = "./Vocab_in_events_for_training.json"
IndexdEventPath = "./IndexedEvents_for_training.npy"
vocabMapping, EventNew = extractVocab(EventPath, fromIndex, toIndex)
with open(VocabPath, "w") as W:
json.dump(vocabMapping, W, indent=2)
with open(EventNewPath, "w") as W:
W.write("\n".join(EventNew))
indexed_events = convertEvent(EventNewPath, vocabMapping, minCountWord)
np.save(arr=np.array(indexed_events), file=IndexdEventPath)
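
# The saved array can be loaded back later with:
#   np.load("./IndexedEvents_for_training.npy", allow_pickle=True)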