# createDataset.py (3.25 KB)
import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
import sys
# Switch standard output to UTF-8; this script contains non-ASCII (Korean)
# text. NOTE(review): presumably needed because the console's default
# encoding could not represent that text -- confirm.
sys.stdout.reconfigure(encoding='utf-8')
# Working directory at start-up. Not referenced anywhere else in the visible
# part of this file.
currentPath = os.getcwd()
def _splitAfterTimestamp(line):
    """Return ``(rest, colon)`` for one raw line of the chat log.

    ``rest`` is ``line`` with everything up to and including the 4 characters
    that follow its first ':' removed. ``colon`` is the index of the first
    ':' inside ``rest`` (-1 when there is none).
    """
    rest = line[line.find(':') + 5:]
    return rest, rest.find(':')


def getFacebookData(
        personName="이혜연",
        datasetPath=r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\final\uploads\dataset.txt'):
    """Build a ``{other person's message: my reply}`` dictionary from a chat log.

    Each line is cut 5 characters after its first ':'; in what remains, the
    text before the next ':' is the speaker and the text 2 characters after
    it is the message.
    NOTE(review): this presumably targets lines shaped like
    ``<date> H:MM, <name> : <text>`` -- confirm against the real dataset.

    Consecutive lines spoken by ``personName`` are concatenated into one
    reply. When a line by anybody else follows, the lines *before* the reply
    are walked backwards and concatenated until the speaker changes; the
    cleaned pair is then stored. A pair is only stored when that speaker
    change is found before the scan reaches line 1, and line 0 of the file is
    never scanned. A reply still pending at end of file is dropped.

    Args:
        personName: Speaker whose messages are treated as "my" replies. A
            trailing carriage return is stripped. Defaults to the name that
            used to be hard-coded.
        datasetPath: UTF-8 text file to read. Defaults to the path that used
            to be hard-coded.

    Returns:
        dict mapping the cleaned preceding message to the cleaned reply.
        Later pairs overwrite earlier ones that share the same key.

    Raises:
        OSError: if ``datasetPath`` cannot be opened.
    """
    print("function active")
    personName = personName.rstrip('\r')
    responseDictionary = dict()
    with open(datasetPath, 'r', encoding='UTF8') as fbFile:
        allLines = fbFile.readlines()

    myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    for index, line in enumerate(allLines):
        justMessage, colon = _splitAfterTimestamp(line)
        # Find messages that I sent. The speaker field is compared without
        # its last character (the one right before the ':').
        if justMessage[:colon - 1] == personName:
            if not myMessage:
                # Remember where my run of messages started (if I send
                # multiple in a row) so the backward scan begins just above.
                startMessageIndex = index - 1
            myMessage += justMessage[colon + 2:]
        elif myMessage:
            # Somebody else spoke after my run: walk backwards from the line
            # above my first message to collect what I was replying to.
            for counter in range(startMessageIndex, 0, -1):
                justMessage, colon = _splitAfterTimestamp(allLines[counter])
                # Speaker labels here are only compared with each other, so
                # the extra character kept by [:colon] does not matter.
                if not currentSpeaker:
                    # The first speaker found above my messages.
                    currentSpeaker = justMessage[:colon]
                elif (currentSpeaker != justMessage[:colon]
                        and otherPersonsMessage):
                    # A different person was speaking before that, so the
                    # collected message is complete: store the pair.
                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
                    myMessage = cleanMessage(myMessage)
                    responseDictionary[otherPersonsMessage] = myMessage
                    break
                # Prepend, because the lines are visited newest-first.
                otherPersonsMessage = (justMessage[colon + 2:]
                                       + otherPersonsMessage)
            # Start over for the next exchange, whether or not a pair was
            # stored.
            myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    return responseDictionary
def cleanMessage(message):
    """Normalise one chat message for use as a dictionary key or value.

    Newlines become spaces, non-breaking spaces are removed, the text is
    lower-cased, the punctuation characters ``. , ! ?`` are removed and runs
    of spaces are collapsed to a single space. Leading and trailing spaces
    are kept.

    Args:
        message: Raw message text.

    Returns:
        The cleaned message string.
    """
    # Remove new lines within message
    cleanedMessage = message.replace('\n', ' ')
    # Remove non-breaking spaces. In a Python 3 str an NBSP is the single
    # character "\xa0". "\xc2\xa0" is its UTF-8 byte pair and only shows up
    # in text that was mis-decoded. Both are removed before lower-casing,
    # because lower() turns "\xc2" into "\xe2" and would hide the pair.
    cleanedMessage = cleanedMessage.replace("\xc2\xa0", "").replace("\xa0", "")
    cleanedMessage = cleanedMessage.lower()
    # Remove punctuation
    cleanedMessage = re.sub('([.,!?])', '', cleanedMessage)
    # Remove multiple spaces in message
    cleanedMessage = re.sub(' +', ' ', cleanedMessage)
    return cleanedMessage
# Build the message -> reply dictionary from the chat log.
combinedDictionary = {}
combinedDictionary.update(getFacebookData())
print ('Total len of dictionary', len(combinedDictionary))

print('Saving conversation data dictionary')
# np.save is given a plain dict here, so it is stored as a pickled object
# array; reading it back requires np.load(..., allow_pickle=True).
np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\final\conversationDictionary.npy', combinedDictionary)

# Write every non-empty pair to one text file. The with-block guarantees the
# file is flushed and closed, which the bare open() it replaces did not.
with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\final\conversationData.txt', 'w', encoding='UTF8') as conversationFile:
    for key, value in combinedDictionary.items():
        if (not key.strip() or not value.strip()):
            # If there are empty strings
            continue
        # NOTE(review): key and value are written back to back with no
        # separator, so the last word of one fuses with the first word of
        # the next -- confirm that whatever reads this file expects that.
        conversationFile.write(key.strip() + value.strip())