hyeyeon-sun

kakao parsing script && Seq2Seq analysis

-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
 import numpy as np
 import sys
 from random import randint
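Note on this hunk: tensorflow.compat.v1 plus disable_v2_behavior() is the standard shim for running graph-mode TF1 code on a TF2 install. A minimal sketch of what the alias gives you (illustrative only):

    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    # Under this alias the whole v1 graph-mode API is reachable directly,
    # so the later tf.compat.v1.reset_default_graph() in this commit could
    # equally stay as plain tf.reset_default_graph().
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, shape=(None,))
    with tf.Session() as sess:
        print(sess.run(tf.shape(x), feed_dict={x: [1, 2, 3]}))  # [3]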
@@ -10,11 +11,11 @@ import os
 os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 
 def createTrainingMatrices(conversationFileName, wList, maxLen):
-    conversationDictionary = np.load(conversationFileName).item()
+    conversationDictionary = np.load(conversationFileName, allow_pickle=True).item()
     numExamples = len(conversationDictionary)
     xTrain = np.zeros((numExamples, maxLen), dtype='int32')
     yTrain = np.zeros((numExamples, maxLen), dtype='int32')
-    for index,(key,value) in enumerate(conversationDictionary.iteritems()):
+    for index,(key,value) in enumerate(conversationDictionary.items()):
         # Will store integerized representation of strings here (initialized as padding)
         encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
         decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
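Side note on allow_pickle: NumPy 1.16.3 and later refuse to unpickle object arrays by default, and a dict saved with np.save is stored as a 0-d object array, so the flag is required here. A minimal round trip illustrating it (the file name is just for illustration):

    import numpy as np

    d = {'hi there': 'hello'}
    np.save('example.npy', d)            # dict is wrapped in a 0-d object array (pickled)

    loaded = np.load('example.npy', allow_pickle=True).item()   # .item() unwraps the dict
    assert loaded == d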
@@ -165,15 +166,15 @@ vocabSize = vocabSize + 2
 if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')):
     xTrain = np.load('Seq2SeqXTrain.npy')
     yTrain = np.load('Seq2SeqYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     numTrainingExamples = xTrain.shape[0]
 else:
     numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength)
     np.save('Seq2SeqXTrain.npy', xTrain)
     np.save('Seq2SeqYTrain.npy', yTrain)
-    print 'Finished creating training matrices'
+    print ('Finished creating training matrices')
 
-tf.reset_default_graph()
+tf.compat.v1.reset_default_graph()
 
 # Create the placeholders
 encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
@@ -186,7 +187,7 @@ encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)
 #encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True)
 # Architectural choice of whether or not to include ^
 
-decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
+decoderOutputs, decoderFinalState = tf.sparsemax.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
     vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)
 
 decoderPrediction = tf.argmax(decoderOutputs, 2)
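Caution on the hunk above: tf.sparsemax does not expose a legacy_seq2seq module in any TensorFlow release, and tf.contrib (where embedding_rnn_seq2seq actually lived) was removed entirely in TensorFlow 2, so this line will raise AttributeError under the compat.v1 shim. The call only resolves on a TensorFlow 1.x install; a minimal sketch of it there, with hyperparameter values assumed purely for illustration:

    # TensorFlow 1.x only: tf.contrib still exists there.
    import tensorflow as tf

    maxEncoderLength = maxDecoderLength = 15          # assumed values
    vocabSize, embeddingDim, lstmUnits = 10000, 100, 112

    encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for _ in range(maxEncoderLength)]
    decoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for _ in range(maxDecoderLength)]
    feedPrevious = tf.placeholder(tf.bool)            # True at inference: decoder eats its own output
    encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)

    decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
        encoderInputs, decoderInputs, encoderLSTM,
        vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)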
@@ -233,14 +234,14 @@ for i in range(numIterations):
     writer.add_summary(summary, i)
     if (i % 25 == 0 and i != 0):
         num = randint(0,len(encoderTestStrings) - 1)
-        print encoderTestStrings[num]
+        print (encoderTestStrings[num])
         inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength);
         feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)}
         feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)})
         feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)})
         feedDict.update({feedPrevious: True})
         ids = (sess.run(decoderPrediction, feed_dict=feedDict))
-        print idsToSentence(ids, wordList)
+        print (idsToSentence(ids, wordList))
 
     if (i % 10000 == 0 and i != 0):
         savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i)
...
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
 import numpy as np
 import re
 from collections import Counter
@@ -25,8 +26,9 @@ numIterations = 100000
 # into one huge string, and then uses a Counter to identify words
 # and the number of occurrences
 def processDataset(filename):
-    openedFile = open(filename, 'r')
+    openedFile = open(filename, 'r', encoding='UTF8')
     allLines = openedFile.readlines()
+    print(allLines)
     myStr = ""
     for line in allLines:
         myStr += line
@@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus):
     yTrain=[]
     for i in range(numTotalWords):
         if i % 100000 == 0:
-            print 'Finished %d/%d total words' % (i, numTotalWords)
+            print ('Finished %d/%d total words' % (i, numTotalWords))
         wordsAfter = allWords[i + 1:i + windowSize + 1]
         wordsBefore = allWords[max(0, i - windowSize):i]
         wordsAdded = wordsAfter + wordsBefore
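For readers skimming the hunk above: wordsBefore/wordsAfter implement the usual skip-gram context window. A self-contained sketch of the pairing logic (the windowSize value here is assumed; the real one is defined earlier in the script):

    # Skip-gram (center, context) pair construction, as in the hunk above.
    windowSize = 5                                   # assumed for illustration
    allWords = "the quick brown fox jumps over the lazy dog".split()

    pairs = []
    for i, center in enumerate(allWords):
        context = allWords[max(0, i - windowSize):i] + allWords[i + 1:i + windowSize + 1]
        pairs.extend((center, c) for c in context)

    print(pairs[:3])   # [('the', 'quick'), ('the', 'brown'), ('the', 'fox')]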
@@ -61,61 +63,22 @@ continueWord2Vec = True
 if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')):
     xTrain = np.load('Word2VecXTrain.npy')
     yTrain = np.load('Word2VecYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     with open("wordList.txt", "rb") as fp:
         wordList = pickle.load(fp)
-    print 'Finished loading word list'
+    print ('Finished loading word list')
 
 else:
-    fullCorpus, datasetDictionary = processDataset('conversationData.txt')
-    print 'Finished parsing and cleaning dataset'
+    fullCorpus, datasetDictionary = processDataset(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt')
+    print ('Finished parsing and cleaning dataset')
     wordList = list(datasetDictionary.keys())
-    createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?')
+    createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?')
     if (createOwnVectors == 'y'):
         xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus)
-        print 'Finished creating training matrices'
+        print ('Finished creating training matrices')
         np.save('Word2VecXTrain.npy', xTrain)
         np.save('Word2VecYTrain.npy', yTrain)
     else:
         continueWord2Vec = False
     with open("wordList.txt", "wb") as fp:
         pickle.dump(wordList, fp)
-
-# If you do not want to create your own word vectors and you'd just like to
-# have Tensorflow's seq2seq take care of that, then you don't need to run
-# anything below this line.
-if (continueWord2Vec == False):
-    sys.exit()
-
-numTrainingExamples = len(xTrain)
-vocabSize = len(wordList)
-
-sess = tf.Session()
-embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
-nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
-nceBiases = tf.Variable(tf.zeros([vocabSize]))
-
-inputs = tf.placeholder(tf.int32, shape=[batchSize])
-outputs = tf.placeholder(tf.int32, shape=[batchSize, 1])
-
-embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)
-
-loss = tf.reduce_mean(
-    tf.nn.nce_loss(weights=nceWeights,
-                   biases=nceBiases,
-                   labels=outputs,
-                   inputs=embed,
-                   num_sampled=numNegativeSample,
-                   num_classes=vocabSize))
-
-optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
-
-sess.run(tf.global_variables_initializer())
-for i in range(numIterations):
-    trainInputs, trainLabels = getTrainingBatch()
-    _, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels})
-    if (i % 10000 == 0):
-        print ('Current loss is:', curLoss)
-print 'Saving the word embedding matrix'
-embedMatrix = embeddingMatrix.eval(session=sess)
-np.save('embeddingMatrix.npy', embedMatrix)
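The block deleted above was the NCE-loss skip-gram trainer that produced embeddingMatrix.npy. If self-trained vectors are still wanted after this commit, one option is gensim; a hedged sketch, assuming gensim >= 4 is installed (it is not a dependency of this project):

    # Hypothetical replacement for the removed NCE trainer, using gensim >= 4.
    from gensim.models import Word2Vec

    sentences = [line.split() for line in open('conversationData.txt', encoding='utf-8')]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)
    model.wv.save_word2vec_format('embeddingMatrix.txt')   # word vectors, one per line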
...
+Stack trace:
+Frame        Function     Args
+00600000010  001800617BE  (00180251890, 0018023DFD1, 00000000058, 000FFFFB770)
+00600000010  001800490FA  (00000000000, 00100000000, 00000000000, 00000000001)
+00600000010  00180049132  (00000000000, 00000000000, 00000000058, 0018031E960)
+00600000010  0018006D9C9  (0000000000A, 000FFFFC940, 001800458BF, 00000000000)
+00600000010  0018006DB92  (00000000003, 000FFFFC940, 001800458BF, 000FFFFC940)
+00600000010  0018006EA4C  (000FFFFC940, 001802405E5, 001800EAF57, 0000000000D)
+00600000010  001800596A6  (000FFFF0000, 00000000000, 00000000000, E2DE0F8BFFFFFFFF)
+00600000010  0018005A9C5  (00000000002, 0018031E270, 001800BE5F9, 00600040000)
+00600000010  0018005AE89  (001800C7664, 00000000000, 00000000000, 00000000000)
+000FFFFCCE0  0018005B149  (000FFFFCE00, 00000000000, 00000000030, 0000000002F)
+000FFFFCCE0  00180049877  (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  001800482C6  (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  00180048374  (00000000000, 00000000000, 00000000000, 00000000000)
+End of stack trace
+import pandas as pd
+import numpy as np
+import os
+import re
+from datetime import datetime
+
+def getKakaotalkbookData():
+    print("function active")
+    personName = input('Enter your full kakao name: ')
+    personName = personName.rstrip('\r')
+    responseDictionary = dict()
+    with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\fbMessages.txt', 'r') as fbFile:
+        allLines = fbFile.readlines()
+
+    myMessage, otherPersonsMessage, currentSpeaker = "","",""
+
+    for index,lines in enumerate(allLines):
+        rightBracket = lines.find(']') + 2
+        justMessage = lines[rightBracket:]
+        colon = justMessage.find(':')
+        # Find messages that I sent
+        print("*input*")
+        print(personName, len(personName))
+        print("*file*")
+        print(justMessage[:colon-1], len(justMessage[:colon-1]))
+        print("*same?*")
+        print(justMessage[:colon-1] == personName)
+        print(justMessage[:colon] == "Second User")
+        print(personName == "Second User")
+        print("------------------------------------------")
+
+        if (justMessage[:colon-1] == personName):
+            print('a')
+            if not myMessage:
+                # Want to find the first message that I send (if I send multiple
+                # in a row)
+                startMessageIndex = index - 1
+            myMessage += justMessage[colon + 2:]
+
+        elif myMessage:
+            # Now go and see what message the other person sent by looking at
+            # previous messages
+            for counter in range(startMessageIndex, 0, -1):
+                currentLine = allLines[counter]
+                rightBracket = currentLine.find(']') + 2
+                justMessage = currentLine[rightBracket:]
+                colon = justMessage.find(':')
+                if not currentSpeaker:
+                    # The first speaker not named me
+                    currentSpeaker = justMessage[:colon]
+                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
+                    # A different person started speaking, so now I know that the
+                    # first person's message is done
+                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
+                    myMessage = cleanMessage(myMessage)
+                    responseDictionary[otherPersonsMessage] = myMessage
+                    break
+                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
+            myMessage, otherPersonsMessage, currentSpeaker = "","",""
+    return responseDictionary
+
+
+def getFacebookData():
+    print("function active")
+    personName = input('Enter your full Kakaotalk name: ')
+    personName = personName.rstrip('\r')
+    responseDictionary = dict()
+    with open(r'C:\Users\dlgpd\Downloads\kakao\Talk_2020.6.21 20_35-1.txt', 'r', encoding='UTF8') as fbFile:
+        allLines = fbFile.readlines()
+
+    myMessage, otherPersonsMessage, currentSpeaker = "","",""
+
+    for index,lines in enumerate(allLines):
+        rightBracket = lines.find(':') + 5
+        justMessage = lines[rightBracket:]
+        colon = justMessage.find(':')
+        # Find messages that I sent
+        print("*input*")
+        print(type(personName))
+        print(personName, len(personName))
+        print("*file*")
+        print(justMessage[:colon-1], len(justMessage[:colon-1]))
+        print("*same?*")
+        print(justMessage[:colon-1] == personName)
+        print(justMessage[:colon] == "Second User")
+        print(personName == "Second User")
+
+        if (justMessage[:colon-1] == personName):
+            print('a')
+            if not myMessage:
+                # Want to find the first message that I send (if I send multiple
+                # in a row)
+                startMessageIndex = index - 1
+            myMessage += justMessage[colon + 2:]
+
+        elif myMessage:
+            # Now go and see what message the other person sent by looking at
+            # previous messages
+            for counter in range(startMessageIndex, 0, -1):
+                currentLine = allLines[counter]
+                rightBracket = currentLine.find(':') + 5
+                justMessage = currentLine[rightBracket:]
+                colon = justMessage.find(':')
+                if not currentSpeaker:
+                    # The first speaker not named me
+                    currentSpeaker = justMessage[:colon]
+                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
+                    # A different person started speaking, so now I know that the
+                    # first person's message is done
+                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
+                    myMessage = cleanMessage(myMessage)
+                    responseDictionary[otherPersonsMessage] = myMessage
+                    break
+                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
+            myMessage, otherPersonsMessage, currentSpeaker = "","",""
+        print("------------------------------------------")
+
+    return responseDictionary
+
+
+def cleanMessage(message):
+    # Remove new lines within message
+    cleanedMessage = message.replace('\n',' ').lower()
+    # Deal with some weird tokens
+    cleanedMessage = cleanedMessage.replace("\xc2\xa0", "")
+    # Remove punctuation
+    cleanedMessage = re.sub('([.,!?])','', cleanedMessage)
+    # Remove multiple spaces in message
+    cleanedMessage = re.sub(' +',' ', cleanedMessage)
+    return cleanedMessage
+
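A quick sanity check of cleanMessage (illustrative input and output):

    # With the cleanMessage defined above in scope:
    print(cleanMessage("Hello,  World!\nHow are you?"))
    # -> hello world how are you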
+combinedDictionary = {}
+
+combinedDictionary.update(getFacebookData())
+
+print(combinedDictionary)
+print ('Total len of dictionary', len(combinedDictionary))
+
+print('Saving conversation data dictionary')
+np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationDictionary.npy', combinedDictionary)
+
+conversationFile = open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt', 'w', encoding='UTF8')
+for key, value in combinedDictionary.items():
+    if (not key.strip() or not value.strip()):
+        # If there are empty strings
+        continue
+    print(key.strip() + value.strip())
+    conversationFile.write(key.strip() + value.strip())
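One consequence of the write loop above: prompt and response are concatenated with no delimiter and no trailing newline, which is exactly why the conversationData.txt diff at the bottom of this commit reads as one run-together line. If separated records are ever wanted, a hedged one-line tweak (this changes the file format, so downstream readers would need updating too):

    conversationFile.write(key.strip() + ' ' + value.strip() + '\n')   # hypothetical variant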
+import pickle
+
+with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\wordList.txt','rb') as f:
+    data = pickle.load(f)
+    print(data) # hello
-import pandas as pd
-import numpy as np
-import os
-import re
-from datetime import datetime
-
-
-
-
-def getWhatsAppDataCSV(personName):
-    df = pd.read_csv('whatsapp_chats.csv')
-    responseDictionary = dict()
-    receivedMessages = df[df['From'] != personName]
-    sentMessages = df[df['From'] == personName]
-    combined = pd.concat([sentMessages, receivedMessages])
-    otherPersonsMessage, myMessage = "",""
-    firstMessage = True
-    for index, row in combined.iterrows():
-        if (row['From'] != personName):
-            if myMessage and otherPersonsMessage:
-                otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                myMessage = cleanMessage(myMessage)
-                responseDictionary[otherPersonsMessage.rstrip()] = myMessage.rstrip()
-                otherPersonsMessage, myMessage = "",""
-            otherPersonsMessage = otherPersonsMessage + str(row['Content']) + " "
-        else:
-            if (firstMessage):
-                firstMessage = False
-                # Don't include if I am the person initiating the convo
-                continue
-            myMessage = myMessage + str(row['Content']) + " "
-    return responseDictionary
-
-def getWhatsAppDataTXT(personName):
-    # Putting all the file names in a list
-    allFiles = []
-    # Edit these file and directory names if you have them saved somewhere else
-    for filename in os.listdir('WhatsAppChatLogs'):
-        if filename.endswith(".txt"):
-            allFiles.append('WhatsAppChatLogs/' + filename)
-
-    responseDictionary = dict()
-    """
-    The key is the other person's message, and the value is my response
-    Going through each file, and recording everyone's messages to me, and my
-    responses
-    """
-    for currentFile in allFiles:
-        myMessage, otherPersonsMessage, currentSpeaker = "","",""
-        with open(currentFile, 'r') as openedFile:
-            allLines = openedFile.readlines()
-        for index,line in enumerate(allLines):
-            # The sender's name is separated by a ']' or '-' and a ': ' (The whitespace is important)
-            leftDelimPattern = re.compile(r'[\]\-]')
-            # A pattern to match either ']' or '-'
-            leftDelim = leftDelimPattern.search(line)
-            leftDelim = leftDelim.start() if leftDelim else -1
-            rightColon = line.find(': ')
-
-            # Find messages that I sent
-            if (line[leftDelim + 1:rightColon].strip() == personName):
-                if not myMessage:
-                    # Want to find the first message that I send (if I send
-                    # multiple in a row)
-                    startMessageIndex = index - 1
-                myMessage += line[rightColon + 1:].strip()
-
-            elif myMessage:
-                # Now go and see what message the other person sent by looking at
-                # previous messages
-                for counter in range(startMessageIndex, 0, -1):
-                    currentLine = allLines[counter]
-                    # Extracting the values of left and right delimiters
-                    leftDelim = leftDelimPattern.search(currentLine)
-                    leftDelim = leftDelim.start() if leftDelim else -1
-                    rightColon = line.find(': ')
-                    if (leftDelim < 0 or rightColon < 0):
-                        # In case the message above isn't in the right format
-                        myMessage, otherPersonsMessage, currentSpeaker = "","",""
-                        break
-                    if not currentSpeaker:
-                        # The first speaker not named me
-                        currentSpeaker = currentLine[leftDelim + 1:rightColon].strip()
-                    elif (currentSpeaker != currentLine[leftDelim + 1:rightColon].strip()):
-                        # A different person started speaking, so now I know that
-                        # the first person's message is done
-                        otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                        myMessage = cleanMessage(myMessage)
-                        responseDictionary[otherPersonsMessage] = myMessage
-                        break
-                    otherPersonsMessage = currentLine[rightColon + 1:].strip() + otherPersonsMessage
-                myMessage, otherPersonsMessage, currentSpeaker = "","",""
-    return responseDictionary
-
-def getWhatsAppData():
-    personName = raw_input('Enter your full WhatsApp name: ')
-    if os.path.isfile('whatsapp_chats.csv'):
-        return getWhatsAppDataCSV(personName)
-    else:
-        return getWhatsAppDataTXT(personName)
-
-
-def getGoogleHangoutsData():
-    personName = raw_input('Enter your full Hangouts name: ')
-    # Putting all the file names in a list
-    allFiles = []
-    # Edit these file and directory names if you have them saved somewhere else
-    for filename in os.listdir('GoogleTextForm'):
-        if filename.endswith(".txt"):
-            allFiles.append('GoogleTextForm/' + filename)
-
-    responseDictionary = dict()
-    """
-    The key is the other person's message, and the value is my response
-    Going through each file, and recording everyone's messages to me, and my
-    responses
-    """
-    for currentFile in allFiles:
-        myMessage, otherPersonsMessage, currentSpeaker = "","",""
-        with open(currentFile, 'r') as openedFile:
-            allLines = openedFile.readlines()
-        for index,lines in enumerate(allLines):
-            # The sender's name is separated by < and >
-            leftBracket = lines.find('<')
-            rightBracket = lines.find('>')
-
-            # Find messages that I sent
-            if (lines[leftBracket + 1:rightBracket] == personName):
-                if not myMessage:
-                    # Want to find the first message that I send (if I send
-                    # multiple in a row)
-                    startMessageIndex = index - 1
-                myMessage += lines[rightBracket + 1:]
-
-            elif myMessage:
-                # Now go and see what message the other person sent by looking at
-                # previous messages
-                for counter in range(startMessageIndex, 0, -1):
-                    currentLine = allLines[counter]
-                    # In case the message above isn't in the right format
-                    if (currentLine.find('<') < 0 or currentLine.find('>') < 0):
-                        myMessage, otherPersonsMessage, currentSpeaker = "","",""
-                        break
-                    if not currentSpeaker:
-                        # The first speaker not named me
-                        currentSpeaker = currentLine[currentLine.find('<') + 1:currentLine.find('>')]
-                    elif (currentSpeaker != currentLine[currentLine.find('<') + 1:currentLine.find('>')]):
-                        # A different person started speaking, so now I know that
-                        # the first person's message is done
-                        otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                        myMessage = cleanMessage(myMessage)
-                        responseDictionary[otherPersonsMessage] = myMessage
-                        break
-                    otherPersonsMessage = currentLine[currentLine.find('>') + 1:] + otherPersonsMessage
-                myMessage, otherPersonsMessage, currentSpeaker = "","",""
-    return responseDictionary
-
-def getFacebookData():
-    print("function active")
-    personName = raw_input('Enter your full Facebook name: ')
-    personName = personName.rstrip('\r')
-    responseDictionary = dict()
-    with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Facebook-Messenger-Bot\fbMessages.txt', 'r') as fbFile:
-        allLines = fbFile.readlines()
-
-    myMessage, otherPersonsMessage, currentSpeaker = "","",""
-
-    for index,lines in enumerate(allLines):
-        rightBracket = lines.find(']') + 2
-        justMessage = lines[rightBracket:]
-        colon = justMessage.find(':')
-        # Find messages that I sent
-        print("*input*")
-        print(personName, len(personName))
-        print("*file*")
-        print(justMessage[:colon-1], len(personName))
-        print("*same?*")
-        print(justMessage[:colon-1] == personName)
-        print(justMessage[:colon] == "Second User")
-        print(personName == "Second User")
-        print("------------------------------------------")
-
-        if (justMessage[:colon-1] == personName):
-            print('a')
-            if not myMessage:
-                # Want to find the first message that I send (if I send multiple
-                # in a row)
-                startMessageIndex = index - 1
-            myMessage += justMessage[colon + 2:]
-
-        elif myMessage:
-            # Now go and see what message the other person sent by looking at
-            # previous messages
-            for counter in range(startMessageIndex, 0, -1):
-                currentLine = allLines[counter]
-                rightBracket = currentLine.find(']') + 2
-                justMessage = currentLine[rightBracket:]
-                colon = justMessage.find(':')
-                if not currentSpeaker:
-                    # The first speaker not named me
-                    currentSpeaker = justMessage[:colon]
-                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
-                    # A different person started speaking, so now I know that the
-                    # first person's message is done
-                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                    myMessage = cleanMessage(myMessage)
-                    responseDictionary[otherPersonsMessage] = myMessage
-                    break
-                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
-            myMessage, otherPersonsMessage, currentSpeaker = "","",""
-    return responseDictionary
-
-def getLinkedInData():
-    personName = raw_input('Enter your full LinkedIn name: ')
-    df = pd.read_csv('Inbox.csv')
-    dateTimeConverter = lambda x: datetime.strptime(x,'%B %d, %Y, %I:%M %p')
-    responseDictionary = dict()
-    peopleContacted = df['From'].unique().tolist()
-    for person in peopleContacted:
-        receivedMessages = df[df['From'] == person]
-        sentMessages = df[df['To'] == person]
-        if (len(sentMessages) == 0 or len(receivedMessages) == 0):
-            # There was no actual conversation
-            continue
-        combined = pd.concat([sentMessages, receivedMessages])
-        combined['Date'] = combined['Date'].apply(dateTimeConverter)
-        combined = combined.sort(['Date'])
-        otherPersonsMessage, myMessage = "",""
-        firstMessage = True
-        for index, row in combined.iterrows():
-            if (row['From'] != personName):
-                if myMessage and otherPersonsMessage:
-                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
-                    myMessage = cleanMessage(myMessage)
-                    responseDictionary[otherPersonsMessage.rstrip()] = myMessage.rstrip()
-                    otherPersonsMessage, myMessage = "",""
-                otherPersonsMessage = otherPersonsMessage + str(row['Content']) + " "
-            else:
-                if (firstMessage):
-                    firstMessage = False
-                    # Don't include if I am the person initiating the convo
-                    continue
-                myMessage = myMessage + str(row['Content']) + " "
-    return responseDictionary
-
-def getDiscordData():
-    personName = raw_input('Enter your full Discord name: ')
-    # Putting all the file names in a list
-    allFiles = []
-    # Edit these file and directory names if you have them saved somewhere else
-    for filename in os.listdir('DiscordChatLogs'):
-        if filename.endswith(".txt"):
-            allFiles.append('DiscordChatLogs/' + filename)
-    responseDictionary = dict()
-    """
-    The key is the other person's message, and the value is my response
-    Going through each file, and recording everyone's messages to me, and my
-    responses
-    """
-    for currentFile in allFiles:
-        with open(currentFile, 'r') as openedFile:
-            allLines = openedFile.readlines()
-        data = ''.join(allLines)
-        otherPersonsMessage, myMessage = "",""
-        response_sets = re.findall(r'\[.+\] (?!' + re.escape(personName) + r').+\n(.+)\n{2}(?:\[.+\] ' + re.escape(personName) + r'\n(.+)\n{2})', data)
-        for response_set in response_sets:
-            otherPersonsMessage = response_set[0]
-            myMessage = response_set[1]
-            responseDictionary[otherPersonsMessage] = cleanMessage(myMessage)
-            otherPersonsMessage, myMessage = "",""
-    return responseDictionary
-
-def cleanMessage(message):
-    # Remove new lines within message
-    cleanedMessage = message.replace('\n',' ').lower()
-    # Deal with some weird tokens
-    cleanedMessage = cleanedMessage.replace("\xc2\xa0", "")
-    # Remove punctuation
-    cleanedMessage = re.sub('([.,!?])','', cleanedMessage)
-    # Remove multiple spaces in message
-    cleanedMessage = re.sub(' +',' ', cleanedMessage)
-    return cleanedMessage
-
-combinedDictionary = {}
-
-combinedDictionary.update(getFacebookData())
-
-print ('Total len of dictionary', len(combinedDictionary))
-
-print('Saving conversation data dictionary')
-np.save('conversationDictionary.npy', combinedDictionary)
-
-conversationFile = open('conversationData.txt', 'w')
-for key, value in combinedDictionary.items():
-    if (not key.strip() or not value.strip()):
-        # If there are empty strings
-        continue
-    conversationFile.write(key.strip() + value.strip())
-456 78
\ No newline at end of file
+67126 78this is a testyes it is the last message hello45x y zx y z
\ No newline at end of file
...
No preview for this file type