hyeyeon-sun

kakao parsing script && Seq2Seq analysis

-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
 import numpy as np
 import sys
 from random import randint
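Note on this hunk: tensorflow.compat.v1 plus disable_v2_behavior() is the standard shim for running graph-mode TF1 code on a TF2 install. A minimal sketch of what the alias gives you (illustrative only):

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    # Under this alias the whole v1 graph-mode API is reachable directly,
    # so the later tf.compat.v1.reset_default_graph() in this commit could
    # equally stay as plain tf.reset_default_graph().
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, shape=(None,))
    with tf.Session() as sess:
        print(sess.run(tf.shape(x), feed_dict={x: [1, 2, 3]}))  # [3]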
@@ -10,11 +11,11 @@ import os
 os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 
 def createTrainingMatrices(conversationFileName, wList, maxLen):
-    conversationDictionary = np.load(conversationFileName).item()
+    conversationDictionary = np.load(conversationFileName, allow_pickle=True).item()
     numExamples = len(conversationDictionary)
     xTrain = np.zeros((numExamples, maxLen), dtype='int32')
     yTrain = np.zeros((numExamples, maxLen), dtype='int32')
-    for index,(key,value) in enumerate(conversationDictionary.iteritems()):
+    for index,(key,value) in enumerate(conversationDictionary.items()):
         # Will store integerized representation of strings here (initialized as padding)
         encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
         decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
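Side note on allow_pickle: NumPy 1.16.3 and later refuse to unpickle object arrays by default, and a dict saved with np.save is stored as a 0-d object array, so the flag is required here. A minimal round trip illustrating it (the file name is just for illustration):

    import numpy as np

    d = {'hi there': 'hello'}
    np.save('example.npy', d)            # dict is wrapped in a 0-d object array (pickled)

    loaded = np.load('example.npy', allow_pickle=True).item()   # .item() unwraps the dict
    assert loaded == d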
@@ -165,15 +166,15 @@ vocabSize = vocabSize + 2
 if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')):
     xTrain = np.load('Seq2SeqXTrain.npy')
     yTrain = np.load('Seq2SeqYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     numTrainingExamples = xTrain.shape[0]
 else:
     numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength)
     np.save('Seq2SeqXTrain.npy', xTrain)
     np.save('Seq2SeqYTrain.npy', yTrain)
-    print 'Finished creating training matrices'
+    print ('Finished creating training matrices')
 
-tf.reset_default_graph()
+tf.compat.v1.reset_default_graph()
 
 # Create the placeholders
 encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
@@ -186,7 +187,7 @@ encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)
 #encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True)
 # Architectural choice of whether or not to include ^
 
-decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
+decoderOutputs, decoderFinalState = tf.sparsemax.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
     vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)
 
 decoderPrediction = tf.argmax(decoderOutputs, 2)
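Caution on the hunk above: tf.sparsemax does not expose a legacy_seq2seq module in any TensorFlow release, and tf.contrib (where embedding_rnn_seq2seq actually lived) was removed entirely in TensorFlow 2, so this line will raise AttributeError under the compat.v1 shim. The call only resolves on a TensorFlow 1.x install; a minimal sketch of it there, with hyperparameter values assumed purely for illustration:

    # TensorFlow 1.x only: tf.contrib still exists there.
    import tensorflow as tf

    maxEncoderLength = maxDecoderLength = 15          # assumed values
    vocabSize, embeddingDim, lstmUnits = 10000, 100, 112

    encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for _ in range(maxEncoderLength)]
    decoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for _ in range(maxDecoderLength)]
    feedPrevious = tf.placeholder(tf.bool)            # True at inference: decoder eats its own output
    encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)

    decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
        encoderInputs, decoderInputs, encoderLSTM,
        vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)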
@@ -233,14 +234,14 @@ for i in range(numIterations):
     writer.add_summary(summary, i)
     if (i % 25 == 0 and i != 0):
         num = randint(0,len(encoderTestStrings) - 1)
-        print encoderTestStrings[num]
+        print (encoderTestStrings[num])
         inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength);
         feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)}
         feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)})
         feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)})
         feedDict.update({feedPrevious: True})
         ids = (sess.run(decoderPrediction, feed_dict=feedDict))
-        print idsToSentence(ids, wordList)
+        print (idsToSentence(ids, wordList))
 
     if (i % 10000 == 0 and i != 0):
         savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i)
...
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
 import numpy as np
 import re
 from collections import Counter
@@ -25,8 +26,9 @@ numIterations = 100000
 # into one huge string, and then uses a Counter to identify words
 # and the number of occurrences
 def processDataset(filename):
-    openedFile = open(filename, 'r')
+    openedFile = open(filename, 'r', encoding='UTF8')
     allLines = openedFile.readlines()
+    print(allLines)
     myStr = ""
     for line in allLines:
         myStr += line
@@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus):
     yTrain=[]
     for i in range(numTotalWords):
         if i % 100000 == 0:
-            print 'Finished %d/%d total words' % (i, numTotalWords)
+            print ('Finished %d/%d total words' % (i, numTotalWords))
         wordsAfter = allWords[i + 1:i + windowSize + 1]
         wordsBefore = allWords[max(0, i - windowSize):i]
         wordsAdded = wordsAfter + wordsBefore
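For readers skimming the hunk above: wordsBefore/wordsAfter implement the usual skip-gram context window. A self-contained sketch of the pairing logic (the windowSize value here is assumed; the real one is defined earlier in the script):

    # Skip-gram (center, context) pair construction, as in the hunk above.
    windowSize = 5                                   # assumed for illustration
    allWords = "the quick brown fox jumps over the lazy dog".split()

    pairs = []
    for i, center in enumerate(allWords):
        context = allWords[max(0, i - windowSize):i] + allWords[i + 1:i + windowSize + 1]
        pairs.extend((center, c) for c in context)

    print(pairs[:3])   # [('the', 'quick'), ('the', 'brown'), ('the', 'fox')]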
@@ -61,61 +63,22 @@ continueWord2Vec = True
 if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')):
     xTrain = np.load('Word2VecXTrain.npy')
     yTrain = np.load('Word2VecYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     with open("wordList.txt", "rb") as fp:
         wordList = pickle.load(fp)
-    print 'Finished loading word list'
+    print ('Finished loading word list')
 
 else:
-    fullCorpus, datasetDictionary = processDataset('conversationData.txt')
-    print 'Finished parsing and cleaning dataset'
+    fullCorpus, datasetDictionary = processDataset(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt')
+    print ('Finished parsing and cleaning dataset')
     wordList = list(datasetDictionary.keys())
-    createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?')
+    createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?')
     if (createOwnVectors == 'y'):
         xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus)
-        print 'Finished creating training matrices'
+        print ('Finished creating training matrices')
         np.save('Word2VecXTrain.npy', xTrain)
         np.save('Word2VecYTrain.npy', yTrain)
     else:
         continueWord2Vec = False
     with open("wordList.txt", "wb") as fp:
         pickle.dump(wordList, fp)
-
-# If you do not want to create your own word vectors and you'd just like to
-# have Tensorflow's seq2seq take care of that, then you don't need to run
-# anything below this line.
-if (continueWord2Vec == False):
-    sys.exit()
-
-numTrainingExamples = len(xTrain)
-vocabSize = len(wordList)
-
-sess = tf.Session()
-embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
-nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
-nceBiases = tf.Variable(tf.zeros([vocabSize]))
-
-inputs = tf.placeholder(tf.int32, shape=[batchSize])
-outputs = tf.placeholder(tf.int32, shape=[batchSize, 1])
-
-embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)
-
-loss = tf.reduce_mean(
-    tf.nn.nce_loss(weights=nceWeights,
-                   biases=nceBiases,
-                   labels=outputs,
-                   inputs=embed,
-                   num_sampled=numNegativeSample,
-                   num_classes=vocabSize))
-
-optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
-
-sess.run(tf.global_variables_initializer())
-for i in range(numIterations):
-    trainInputs, trainLabels = getTrainingBatch()
-    _, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels})
-    if (i % 10000 == 0):
-        print ('Current loss is:', curLoss)
-print 'Saving the word embedding matrix'
-embedMatrix = embeddingMatrix.eval(session=sess)
-np.save('embeddingMatrix.npy', embedMatrix)
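The block deleted above was the NCE-loss skip-gram trainer that produced embeddingMatrix.npy. If self-trained vectors are still wanted after this commit, one option is gensim; a hedged sketch, assuming gensim >= 4 is installed (it is not a dependency of this project):

    # Hypothetical replacement for the removed NCE trainer, using gensim >= 4.
    from gensim.models import Word2Vec

    sentences = [line.split() for line in open('conversationData.txt', encoding='utf-8')]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
    model.wv.save_word2vec_format('embeddingMatrix.txt')   # word vectors, one per line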
...
+Stack trace:
+Frame        Function     Args
+00600000010  001800617BE  (00180251890, 0018023DFD1, 00000000058, 000FFFFB770)
+00600000010  001800490FA  (00000000000, 00100000000, 00000000000, 00000000001)
+00600000010  00180049132  (00000000000, 00000000000, 00000000058, 0018031E960)
+00600000010  0018006D9C9  (0000000000A, 000FFFFC940, 001800458BF, 00000000000)
+00600000010  0018006DB92  (00000000003, 000FFFFC940, 001800458BF, 000FFFFC940)
+00600000010  0018006EA4C  (000FFFFC940, 001802405E5, 001800EAF57, 0000000000D)
+00600000010  001800596A6  (000FFFF0000, 00000000000, 00000000000, E2DE0F8BFFFFFFFF)
+00600000010  0018005A9C5  (00000000002, 0018031E270, 001800BE5F9, 00600040000)
+00600000010  0018005AE89  (001800C7664, 00000000000, 00000000000, 00000000000)
+000FFFFCCE0  0018005B149  (000FFFFCE00, 00000000000, 00000000030, 0000000002F)
+000FFFFCCE0  00180049877  (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  001800482C6  (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  00180048374  (00000000000, 00000000000, 00000000000, 00000000000)
+End of stack trace
+import pandas as pd
+import numpy as np
+import os
+import re
+from datetime import datetime
+
+def getKakaotalkbookData():
+    print("function active")
+    personName = input('Enter your full kakao name: ')
+    personName = personName.rstrip('\r')
+    responseDictionary = dict()
+    with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\fbMessages.txt', 'r') as fbFile:
+        allLines = fbFile.readlines()
+
+    myMessage, otherPersonsMessage, currentSpeaker = "","",""
+
+    for index,lines in enumerate(allLines):
+        rightBracket = lines.find(']') + 2
+        justMessage = lines[rightBracket:]
+        colon = justMessage.find(':')
+        # Find messages that I sent
+        print("*input*")
+        print(personName, len(personName))
+        print("*file*")
+        print(justMessage[:colon-1], len(justMessage[:colon-1]))
+        print("*same?*")
+        print(justMessage[:colon-1] == personName)
+        print(justMessage[:colon] == "Second User")
+        print(personName == "Second User")
+        print("------------------------------------------")
+
+        if (justMessage[:colon-1] == personName):
+            print('a')
+            if not myMessage:
+                # Want to find the first message that I send (if I send multiple
+                # in a row)
+                startMessageIndex = index - 1
+            myMessage += justMessage[colon + 2:]
+
+        elif myMessage:
+            # Now go and see what message the other person sent by looking at
+            # previous messages
+            for counter in range(startMessageIndex, 0, -1):
+                currentLine = allLines[counter]
+                rightBracket = currentLine.find(']') + 2
+                justMessage = currentLine[rightBracket:]
+                colon = justMessage.find(':')
+                if not currentSpeaker:
+                    # The first speaker not named me
+                    currentSpeaker = justMessage[:colon]
+                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
+                    # A different person started speaking, so now I know that the
+                    # first person's message is done
+                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
+                    myMessage = cleanMessage(myMessage)
+                    responseDictionary[otherPersonsMessage] = myMessage
+                    break
+                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
+            myMessage, otherPersonsMessage, currentSpeaker = "","",""
+    return responseDictionary
+
+
+def getFacebookData():
+    print("function active")
+    personName = input('Enter your full Kakaotalk name: ')
+    personName = personName.rstrip('\r')
+    responseDictionary = dict()
+    with open(r'C:\Users\dlgpd\Downloads\kakao\Talk_2020.6.21 20_35-1.txt', 'r', encoding='UTF8') as fbFile:
+        allLines = fbFile.readlines()
+
+    myMessage, otherPersonsMessage, currentSpeaker = "","",""
+
+    for index,lines in enumerate(allLines):
+        rightBracket = lines.find(':') + 5
+        justMessage = lines[rightBracket:]
+        colon = justMessage.find(':')
+        # Find messages that I sent
+        print("*input*")
+        print(type(personName))
+        print(personName, len(personName))
+        print("*file*")
+        print(justMessage[:colon-1], len(justMessage[:colon-1]))
+        print("*same?*")
+        print(justMessage[:colon-1] == personName)
+        print(justMessage[:colon] == "Second User")
+        print(personName == "Second User")
+
+        if (justMessage[:colon-1] == personName):
+            print('a')
+            if not myMessage:
+                # Want to find the first message that I send (if I send multiple
+                # in a row)
+                startMessageIndex = index - 1
+            myMessage += justMessage[colon + 2:]
+
+        elif myMessage:
+            # Now go and see what message the other person sent by looking at
+            # previous messages
+            for counter in range(startMessageIndex, 0, -1):
+                currentLine = allLines[counter]
+                rightBracket = currentLine.find(':') + 5
+                justMessage = currentLine[rightBracket:]
+                colon = justMessage.find(':')
+                if not currentSpeaker:
+                    # The first speaker not named me
+                    currentSpeaker = justMessage[:colon]
+                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
+                    # A different person started speaking, so now I know that the
+                    # first person's message is done
+                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
+                    myMessage = cleanMessage(myMessage)
+                    responseDictionary[otherPersonsMessage] = myMessage
+                    break
+                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
+            myMessage, otherPersonsMessage, currentSpeaker = "","",""
+        print("------------------------------------------")
+
+    return responseDictionary
+
+
+def cleanMessage(message):
+    # Remove new lines within message
+    cleanedMessage = message.replace('\n',' ').lower()
+    # Deal with some weird tokens
+    cleanedMessage = cleanedMessage.replace("\xc2\xa0", "")
+    # Remove punctuation
+    cleanedMessage = re.sub('([.,!?])','', cleanedMessage)
+    # Remove multiple spaces in message
+    cleanedMessage = re.sub(' +',' ', cleanedMessage)
+    return cleanedMessage
+
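A quick sanity check of cleanMessage (illustrative input and output):

    # With the cleanMessage defined above in scope:
    print(cleanMessage("Hello,  World!\nHow are you?"))
    # -> hello world how are you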
+combinedDictionary = {}
+
+combinedDictionary.update(getFacebookData())
+
+print(combinedDictionary)
+print ('Total len of dictionary', len(combinedDictionary))
+
+print('Saving conversation data dictionary')
+np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationDictionary.npy', combinedDictionary)
+
+conversationFile = open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt', 'w', encoding='UTF8')
+for key, value in combinedDictionary.items():
+    if (not key.strip() or not value.strip()):
+        # If there are empty strings
+        continue
+    print(key.strip() + value.strip())
+    conversationFile.write(key.strip() + value.strip())
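One consequence of the write loop above: prompt and response are concatenated with no delimiter and no trailing newline, which is exactly why the conversationData.txt diff at the bottom of this commit reads as one run-together line. If separated records are ever wanted, a hedged one-line tweak (this changes the file format, so downstream readers would need updating too):

    conversationFile.write(key.strip() + ' ' + value.strip() + '\n')   # hypothetical variant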
+import pickle
+
+with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\wordList.txt','rb') as f:
+    data = pickle.load(f)
+    print(data) # hello
-import pandas as pd
-import numpy as np
-import os
-import re
-from datetime import datetime
-
-
-
-
-def getWhatsAppDataCSV(personName):
-    df = pd.read_csv('whatsapp_chats.csv')
-    responseDictionary = dict()
-    receivedMessages = df[df['From'] != personName]
-    sentMessages = df[df['From'] == personName]
-    combined = pd.concat([sentMessages, receivedMessages])
-    otherPersonsMessage, myMessage = "",""
-    firstMessage = True
-    for index, row in combined.iterrows():
-        if (row['From'] != personName):
-            if myMessage and otherPersonsMessage:
-                otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                myMessage = cleanMessage(myMessage)
-                responseDictionary[otherPersonsMessage.rstrip()] = myMessage.rstrip()
-                otherPersonsMessage, myMessage = "",""
-            otherPersonsMessage = otherPersonsMessage + str(row['Content']) + " "
-        else:
-            if (firstMessage):
-                firstMessage = False
-                # Don't include if I am the person initiating the convo
-                continue
-            myMessage = myMessage + str(row['Content']) + " "
-    return responseDictionary
-
-def getWhatsAppDataTXT(personName):
-    # Putting all the file names in a list
-    allFiles = []
-    # Edit these file and directory names if you have them saved somewhere else
-    for filename in os.listdir('WhatsAppChatLogs'):
-        if filename.endswith(".txt"):
-            allFiles.append('WhatsAppChatLogs/' + filename)
-
-    responseDictionary = dict()
-    """
-    The key is the other person's message, and the value is my response
-    Going through each file, and recording everyone's messages to me, and my
-    responses
-    """
-    for currentFile in allFiles:
-        myMessage, otherPersonsMessage, currentSpeaker = "","",""
-        with open(currentFile, 'r') as openedFile:
-            allLines = openedFile.readlines()
-        for index,line in enumerate(allLines):
-            # The sender's name is separated by a ']' or '-' and a ': ' (The whitespace is important)
-            leftDelimPattern = re.compile(r'[\]\-]')
-            # A pattern to match either ']' or '-'
-            leftDelim = leftDelimPattern.search(line)
-            leftDelim = leftDelim.start() if leftDelim else -1
-            rightColon = line.find(': ')
-
-            # Find messages that I sent
-            if (line[leftDelim + 1:rightColon].strip() == personName):
-                if not myMessage:
-                    # Want to find the first message that I send (if I send
-                    # multiple in a row)
-                    startMessageIndex = index - 1
-                myMessage += line[rightColon + 1:].strip()
-
-            elif myMessage:
-                # Now go and see what message the other person sent by looking at
-                # previous messages
-                for counter in range(startMessageIndex, 0, -1):
-                    currentLine = allLines[counter]
-                    # Extracting the values of left and right delimiters
-                    leftDelim = leftDelimPattern.search(currentLine)
-                    leftDelim = leftDelim.start() if leftDelim else -1
-                    rightColon = line.find(': ')
-                    if (leftDelim < 0 or rightColon < 0):
-                        # In case the message above isn't in the right format
-                        myMessage, otherPersonsMessage, currentSpeaker = "","",""
-                        break
-                    if not currentSpeaker:
-                        # The first speaker not named me
-                        currentSpeaker = currentLine[leftDelim + 1:rightColon].strip()
-                    elif (currentSpeaker != currentLine[leftDelim + 1:rightColon].strip()):
-                        # A different person started speaking, so now I know that
-                        # the first person's message is done
-                        otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                        myMessage = cleanMessage(myMessage)
-                        responseDictionary[otherPersonsMessage] = myMessage
-                        break
-                    otherPersonsMessage = currentLine[rightColon + 1:].strip() + otherPersonsMessage
-                myMessage, otherPersonsMessage, currentSpeaker = "","",""
-    return responseDictionary
-
-def getWhatsAppData():
-    personName = raw_input('Enter your full WhatsApp name: ')
-    if os.path.isfile('whatsapp_chats.csv'):
-        return getWhatsAppDataCSV(personName)
-    else:
-        return getWhatsAppDataTXT(personName)
-
-
-def getGoogleHangoutsData():
-    personName = raw_input('Enter your full Hangouts name: ')
-    # Putting all the file names in a list
-    allFiles = []
-    # Edit these file and directory names if you have them saved somewhere else
-    for filename in os.listdir('GoogleTextForm'):
-        if filename.endswith(".txt"):
-            allFiles.append('GoogleTextForm/' + filename)
-
-    responseDictionary = dict()
-    """
-    The key is the other person's message, and the value is my response
-    Going through each file, and recording everyone's messages to me, and my
-    responses
-    """
-    for currentFile in allFiles:
-        myMessage, otherPersonsMessage, currentSpeaker = "","",""
-        with open(currentFile, 'r') as openedFile:
-            allLines = openedFile.readlines()
-        for index,lines in enumerate(allLines):
-            # The sender's name is separated by < and >
-            leftBracket = lines.find('<')
-            rightBracket = lines.find('>')
-
-            # Find messages that I sent
-            if (lines[leftBracket + 1:rightBracket] == personName):
-                if not myMessage:
-                    # Want to find the first message that I send (if I send
-                    # multiple in a row)
-                    startMessageIndex = index - 1
-                myMessage += lines[rightBracket + 1:]
-
-            elif myMessage:
-                # Now go and see what message the other person sent by looking at
-                # previous messages
-                for counter in range(startMessageIndex, 0, -1):
-                    currentLine = allLines[counter]
-                    # In case the message above isn't in the right format
-                    if (currentLine.find('<') < 0 or currentLine.find('>') < 0):
-                        myMessage, otherPersonsMessage, currentSpeaker = "","",""
-                        break
-                    if not currentSpeaker:
-                        # The first speaker not named me
-                        currentSpeaker = currentLine[currentLine.find('<') + 1:currentLine.find('>')]
-                    elif (currentSpeaker != currentLine[currentLine.find('<') + 1:currentLine.find('>')]):
-                        # A different person started speaking, so now I know that
-                        # the first person's message is done
-                        otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                        myMessage = cleanMessage(myMessage)
-                        responseDictionary[otherPersonsMessage] = myMessage
-                        break
-                    otherPersonsMessage = currentLine[currentLine.find('>') + 1:] + otherPersonsMessage
-                myMessage, otherPersonsMessage, currentSpeaker = "","",""
-    return responseDictionary
-
-def getFacebookData():
-    print("function active")
-    personName = raw_input('Enter your full Facebook name: ')
-    personName = personName.rstrip('\r')
-    responseDictionary = dict()
-    with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Facebook-Messenger-Bot\fbMessages.txt', 'r') as fbFile:
-        allLines = fbFile.readlines()
-
-    myMessage, otherPersonsMessage, currentSpeaker = "","",""
-
-    for index,lines in enumerate(allLines):
-        rightBracket = lines.find(']') + 2
-        justMessage = lines[rightBracket:]
-        colon = justMessage.find(':')
-        # Find messages that I sent
-        print("*input*")
-        print(personName, len(personName))
-        print("*file*")
-        print(justMessage[:colon-1], len(personName))
-        print("*same?*")
-        print(justMessage[:colon-1] == personName)
-        print(justMessage[:colon] == "Second User")
-        print(personName == "Second User")
-        print("------------------------------------------")
-
-        if (justMessage[:colon-1] == personName):
-            print('a')
-            if not myMessage:
-                # Want to find the first message that I send (if I send multiple
-                # in a row)
-                startMessageIndex = index - 1
-            myMessage += justMessage[colon + 2:]
-
-        elif myMessage:
-            # Now go and see what message the other person sent by looking at
-            # previous messages
-            for counter in range(startMessageIndex, 0, -1):
-                currentLine = allLines[counter]
-                rightBracket = currentLine.find(']') + 2
-                justMessage = currentLine[rightBracket:]
-                colon = justMessage.find(':')
-                if not currentSpeaker:
-                    # The first speaker not named me
-                    currentSpeaker = justMessage[:colon]
-                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
-                    # A different person started speaking, so now I know that the
-                    # first person's message is done
-                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                    myMessage = cleanMessage(myMessage)
-                    responseDictionary[otherPersonsMessage] = myMessage
-                    break
-                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
-            myMessage, otherPersonsMessage, currentSpeaker = "","",""
-    return responseDictionary
-
-def getLinkedInData():
-    personName = raw_input('Enter your full LinkedIn name: ')
-    df = pd.read_csv('Inbox.csv')
-    dateTimeConverter = lambda x: datetime.strptime(x,'%B %d, %Y, %I:%M %p')
-    responseDictionary = dict()
-    peopleContacted = df['From'].unique().tolist()
-    for person in peopleContacted:
-        receivedMessages = df[df['From'] == person]
-        sentMessages = df[df['To'] == person]
-        if (len(sentMessages) == 0 or len(receivedMessages) == 0):
-            # There was no actual conversation
-            continue
-        combined = pd.concat([sentMessages, receivedMessages])
-        combined['Date'] = combined['Date'].apply(dateTimeConverter)
-        combined = combined.sort(['Date'])
-        otherPersonsMessage, myMessage = "",""
-        firstMessage = True
-        for index, row in combined.iterrows():
-            if (row['From'] != personName):
-                if myMessage and otherPersonsMessage:
-                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                    myMessage = cleanMessage(myMessage)
-                    responseDictionary[otherPersonsMessage.rstrip()] = myMessage.rstrip()
-                    otherPersonsMessage, myMessage = "",""
-                otherPersonsMessage = otherPersonsMessage + str(row['Content']) + " "
-            else:
-                if (firstMessage):
-                    firstMessage = False
-                    # Don't include if I am the person initiating the convo
-                    continue
-                myMessage = myMessage + str(row['Content']) + " "
-    return responseDictionary
-
-def getDiscordData():
-    personName = raw_input('Enter your full Discord name: ')
-    # Putting all the file names in a list
-    allFiles = []
-    # Edit these file and directory names if you have them saved somewhere else
-    for filename in os.listdir('DiscordChatLogs'):
-        if filename.endswith(".txt"):
-            allFiles.append('DiscordChatLogs/' + filename)
-    responseDictionary = dict()
-    """
-    The key is the other person's message, and the value is my response
-    Going through each file, and recording everyone's messages to me, and my
-    responses
-    """
-    for currentFile in allFiles:
-        with open(currentFile, 'r') as openedFile:
-            allLines = openedFile.readlines()
-        data = ''.join(allLines)
-        otherPersonsMessage, myMessage = "",""
-        response_sets = re.findall(r'\[.+\] (?!' + re.escape(personName) + r').+\n(.+)\n{2}(?:\[.+\] ' + re.escape(personName) + r'\n(.+)\n{2})', data)
-        for response_set in response_sets:
-            otherPersonsMessage = response_set[0]
-            myMessage = response_set[1]
-            responseDictionary[otherPersonsMessage] = cleanMessage(myMessage)
-            otherPersonsMessage, myMessage = "",""
-    return responseDictionary
-
-def cleanMessage(message):
-    # Remove new lines within message
-    cleanedMessage = message.replace('\n',' ').lower()
-    # Deal with some weird tokens
-    cleanedMessage = cleanedMessage.replace("\xc2\xa0", "")
-    # Remove punctuation
-    cleanedMessage = re.sub('([.,!?])','', cleanedMessage)
-    # Remove multiple spaces in message
-    cleanedMessage = re.sub(' +',' ', cleanedMessage)
-    return cleanedMessage
-
-combinedDictionary = {}
-
-combinedDictionary.update(getFacebookData())
-
-print ('Total len of dictionary', len(combinedDictionary))
-
-print('Saving conversation data dictionary')
-np.save('conversationDictionary.npy', combinedDictionary)
-
-conversationFile = open('conversationData.txt', 'w')
-for key, value in combinedDictionary.items():
-    if (not key.strip() or not value.strip()):
-        # If there are empty strings
-        continue
-    conversationFile.write(key.strip() + value.strip())
-456 78
\ No newline at end of file
+67126 78this is a testyes it is the last message hello45x y zx y z
\ No newline at end of file
...
No preview for this file type