hyeyeon-sun

kakao parsing script && Seq2Seq analysis

-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
 import numpy as np
 import sys
 from random import randint
@@ -10,11 +11,11 @@ import os
 os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 
 def createTrainingMatrices(conversationFileName, wList, maxLen):
-    conversationDictionary = np.load(conversationFileName).item()
+    conversationDictionary = np.load(conversationFileName, allow_pickle=True).item()
     numExamples = len(conversationDictionary)
     xTrain = np.zeros((numExamples, maxLen), dtype='int32')
     yTrain = np.zeros((numExamples, maxLen), dtype='int32')
-    for index,(key,value) in enumerate(conversationDictionary.iteritems()):
+    for index,(key,value) in enumerate(conversationDictionary.items()):
         # Will store integerized representation of strings here (initialized as padding)
         encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
         decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
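
The two changes in this hunk are Python 3 / newer-NumPy fixes: dict.iteritems() was removed in Python 3 (items() replaces it), and newer NumPy releases refuse to unpickle object arrays from np.load() unless allow_pickle=True is passed. A minimal sketch of the round trip createTrainingMatrices depends on, using a toy dictionary rather than the real conversationDictionary.npy:

    import numpy as np

    toyDict = {'how are you': 'fine thanks'}
    np.save('toyDict.npy', toyDict)                     # stored as a 0-d object array
    loaded = np.load('toyDict.npy', allow_pickle=True)  # raises ValueError without allow_pickle
    conversationDictionary = loaded.item()              # unwrap back to a plain dict
    for index, (key, value) in enumerate(conversationDictionary.items()):
        print(index, key, '->', value)
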
@@ -165,15 +166,15 @@ vocabSize = vocabSize + 2
 if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')):
     xTrain = np.load('Seq2SeqXTrain.npy')
     yTrain = np.load('Seq2SeqYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     numTrainingExamples = xTrain.shape[0]
 else:
     numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength)
     np.save('Seq2SeqXTrain.npy', xTrain)
     np.save('Seq2SeqYTrain.npy', yTrain)
-    print 'Finished creating training matrices'
+    print ('Finished creating training matrices')
 
-tf.reset_default_graph()
+tf.compat.v1.reset_default_graph()
 
 # Create the placeholders
 encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
@@ -186,7 +187,7 @@ encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)
 #encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True)
 # Architectural choice of whether or not to include ^
 
-decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
+decoderOutputs, decoderFinalState = tf.sparsemax.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
     vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)
 
 decoderPrediction = tf.argmax(decoderOutputs, 2)
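
One caution on this hunk: tf.sparsemax.legacy_seq2seq is not a TensorFlow module, so the new line will raise an AttributeError as written. The helper used on the removed side, tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq, exists only in TensorFlow 1.x; tf.contrib was dropped in 2.x and is not restored by tensorflow.compat.v1, so a true TF 2.x port would need a different seq2seq implementation. A sketch of the call as it runs on a TF 1.x install, keeping the names used in this script:

    # TensorFlow 1.x only: tf.contrib was removed in TF 2.x.
    decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
        encoderInputs,               # list of maxEncoderLength int32 placeholders
        decoderInputs,               # list of maxDecoderLength int32 placeholders
        encoderLSTM,                 # the BasicLSTMCell built above
        vocabSize,                   # num_encoder_symbols
        vocabSize,                   # num_decoder_symbols
        embeddingDim,                # embedding_size
        feed_previous=feedPrevious)  # bool placeholder: feed predictions back at test time
    decoderPrediction = tf.argmax(decoderOutputs, 2)
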
@@ -233,14 +234,14 @@ for i in range(numIterations):
     writer.add_summary(summary, i)
     if (i % 25 == 0 and i != 0):
         num = randint(0,len(encoderTestStrings) - 1)
-        print encoderTestStrings[num]
+        print (encoderTestStrings[num])
         inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength);
         feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)}
         feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)})
         feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)})
         feedDict.update({feedPrevious: True})
         ids = (sess.run(decoderPrediction, feed_dict=feedDict))
-        print idsToSentence(ids, wordList)
+        print (idsToSentence(ids, wordList))
 
     if (i % 10000 == 0 and i != 0):
         savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i)
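
The loop above checkpoints the session to models/pretrained_seq2seq.ckpt every 10,000 iterations through tf.train.Saver. A minimal sketch of restoring the most recent checkpoint into a fresh session for inference, assuming the same graph has already been rebuilt (the exact step suffix on the checkpoint file will vary):

    # Rebuild the graph exactly as above before restoring, then:
    saver = tf.train.Saver()
    sess = tf.Session()
    latest = tf.train.latest_checkpoint('models')  # e.g. models/pretrained_seq2seq.ckpt-10000
    saver.restore(sess, latest)
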
(next file: the Word2Vec preprocessing script)
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
 import numpy as np
 import re
 from collections import Counter
@@ -25,8 +26,9 @@ numIterations = 100000
 # into one huge string, and then uses a Counter to identify words
 # and the number of occurrences
 def processDataset(filename):
-    openedFile = open(filename, 'r')
+    openedFile = open(filename, 'r', encoding='UTF8')
     allLines = openedFile.readlines()
+    print(allLines)
     myStr = ""
     for line in allLines:
         myStr += line
@@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus):
     yTrain=[]
     for i in range(numTotalWords):
         if i % 100000 == 0:
-            print 'Finished %d/%d total words' % (i, numTotalWords)
+            print ('Finished %d/%d total words' % (i, numTotalWords))
         wordsAfter = allWords[i + 1:i + windowSize + 1]
         wordsBefore = allWords[max(0, i - windowSize):i]
         wordsAdded = wordsAfter + wordsBefore
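
The two slices above collect the context window around position i: up to windowSize words after it and up to windowSize words before it. A small worked example of that windowing, using windowSize = 2 purely for illustration (the script defines its own value near the top):

    allWords = ['how', 'are', 'you', 'doing', 'today']
    windowSize = 2
    i = 1                                                 # centre word: 'are'
    wordsAfter = allWords[i + 1:i + windowSize + 1]       # ['you', 'doing']
    wordsBefore = allWords[max(0, i - windowSize):i]      # ['how']
    wordsAdded = wordsAfter + wordsBefore                 # ['you', 'doing', 'how']
    print(wordsAdded)
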
@@ -61,61 +63,22 @@ continueWord2Vec = True
 if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')):
     xTrain = np.load('Word2VecXTrain.npy')
     yTrain = np.load('Word2VecYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     with open("wordList.txt", "rb") as fp:
         wordList = pickle.load(fp)
-    print 'Finished loading word list'
+    print ('Finished loading word list')
 
 else:
-    fullCorpus, datasetDictionary = processDataset('conversationData.txt')
+    fullCorpus, datasetDictionary = processDataset(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt')
-    print 'Finished parsing and cleaning dataset'
+    print ('Finished parsing and cleaning dataset')
     wordList = list(datasetDictionary.keys())
-    createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?')
+    createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?')
     if (createOwnVectors == 'y'):
         xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus)
-        print 'Finished creating training matrices'
+        print ('Finished creating training matrices')
         np.save('Word2VecXTrain.npy', xTrain)
         np.save('Word2VecYTrain.npy', yTrain)
     else:
         continueWord2Vec = False
     with open("wordList.txt", "wb") as fp:
         pickle.dump(wordList, fp)
-
-# If you do not want to create your own word vectors and you'd just like to
-# have Tensorflow's seq2seq take care of that, then you don't need to run
-# anything below this line.
-if (continueWord2Vec == False):
-    sys.exit()
-
-numTrainingExamples = len(xTrain)
-vocabSize = len(wordList)
-
-sess = tf.Session()
-embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
-nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
-nceBiases = tf.Variable(tf.zeros([vocabSize]))
-
-inputs = tf.placeholder(tf.int32, shape=[batchSize])
-outputs = tf.placeholder(tf.int32, shape=[batchSize, 1])
-
-embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)
-
-loss = tf.reduce_mean(
-    tf.nn.nce_loss(weights=nceWeights,
-        biases=nceBiases,
-        labels=outputs,
-        inputs=embed,
-        num_sampled=numNegativeSample,
-        num_classes=vocabSize))
-
-optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
-
-sess.run(tf.global_variables_initializer())
-for i in range(numIterations):
-    trainInputs, trainLabels = getTrainingBatch()
-    _, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels})
-    if (i % 10000 == 0):
-        print ('Current loss is:', curLoss)
-print 'Saving the word embedding matrix'
-embedMatrix = embeddingMatrix.eval(session=sess)
-np.save('embeddingMatrix.npy', embedMatrix)
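
The block removed above was the optional Word2Vec pre-training step: it learned an embedding matrix with tf.nn.nce_loss and saved it as embeddingMatrix.npy. For anyone who keeps that block (it relies on the TF 1.x graph-mode APIs that the compat.v1 import still provides), a minimal sketch of reading the saved matrix back and looking up one word's vector through the pickled word list, assuming both files exist from an earlier run:

    import pickle
    import numpy as np

    embedMatrix = np.load('embeddingMatrix.npy')    # shape: (vocabSize, wordVecDimensions)
    with open('wordList.txt', 'rb') as fp:
        wordList = pickle.load(fp)

    word = wordList[0]                              # any word seen in the corpus
    vector = embedMatrix[wordList.index(word)]      # its learned embedding
    print(word, vector.shape)
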
(next file: a stack-trace log added with this commit)
+Stack trace:
+Frame        Function    Args
+00600000010  001800617BE (00180251890, 0018023DFD1, 00000000058, 000FFFFB770)
+00600000010  001800490FA (00000000000, 00100000000, 00000000000, 00000000001)
+00600000010  00180049132 (00000000000, 00000000000, 00000000058, 0018031E960)
+00600000010  0018006D9C9 (0000000000A, 000FFFFC940, 001800458BF, 00000000000)
+00600000010  0018006DB92 (00000000003, 000FFFFC940, 001800458BF, 000FFFFC940)
+00600000010  0018006EA4C (000FFFFC940, 001802405E5, 001800EAF57, 0000000000D)
+00600000010  001800596A6 (000FFFF0000, 00000000000, 00000000000, E2DE0F8BFFFFFFFF)
+00600000010  0018005A9C5 (00000000002, 0018031E270, 001800BE5F9, 00600040000)
+00600000010  0018005AE89 (001800C7664, 00000000000, 00000000000, 00000000000)
+000FFFFCCE0  0018005B149 (000FFFFCE00, 00000000000, 00000000030, 0000000002F)
+000FFFFCCE0  00180049877 (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  001800482C6 (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  00180048374 (00000000000, 00000000000, 00000000000, 00000000000)
+End of stack trace
(next file: the KakaoTalk conversation parsing script, added as a new file)

+import pandas as pd
+import numpy as np
+import os
+import re
+from datetime import datetime
+
+def getKakaotalkbookData():
+    # Parses lines shaped like "[ ... ] Name: message" from the fbMessages.txt export.
+    print("function active")
+    personName = input('Enter your full kakao name: ')
+    personName = personName.rstrip('\r')
+    responseDictionary = dict()
+    with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\fbMessages.txt', 'r') as fbFile:
+        allLines = fbFile.readlines()
+
+    myMessage, otherPersonsMessage, currentSpeaker = "","",""
+
+    for index,lines in enumerate(allLines):
+        rightBracket = lines.find(']') + 2
+        justMessage = lines[rightBracket:]
+        colon = justMessage.find(':')
+        # Find messages that I sent
+        print("*input*")
+        print(personName, len(personName))
+        print("*file*")
+        print(justMessage[:colon-1], len(justMessage[:colon-1]))
+        print("*same?*")
+        print(justMessage[:colon-1] == personName)
+        print(justMessage[:colon] == "Second User")
+        print(personName == "Second User")
+        print("------------------------------------------")
+
+        if (justMessage[:colon-1] == personName):
+            print('a')
+            if not myMessage:
+                # Want to find the first message that I send (if I send multiple
+                # in a row)
+                startMessageIndex = index - 1
+            myMessage += justMessage[colon + 2:]
+
+        elif myMessage:
+            # Now go and see what message the other person sent by looking at
+            # previous messages
+            for counter in range(startMessageIndex, 0, -1):
+                currentLine = allLines[counter]
+                rightBracket = currentLine.find(']') + 2
+                justMessage = currentLine[rightBracket:]
+                colon = justMessage.find(':')
+                if not currentSpeaker:
+                    # The first speaker not named me
+                    currentSpeaker = justMessage[:colon]
+                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
+                    # A different person started speaking, so now I know that the
+                    # first person's message is done
+                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
+                    myMessage = cleanMessage(myMessage)
+                    responseDictionary[otherPersonsMessage] = myMessage
+                    break
+                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
+            myMessage, otherPersonsMessage, currentSpeaker = "","",""
+    return responseDictionary
+
+
+def getFacebookData():
+    # Despite its name, this parses the KakaoTalk export opened below; the
+    # find(':') + 5 offset skips past the timestamp so justMessage starts at
+    # "Name : message".
+    print("function active")
+    personName = input('Enter your full Kakaotalk name: ')
+    personName = personName.rstrip('\r')
+    responseDictionary = dict()
+    with open(r'C:\Users\dlgpd\Downloads\kakao\Talk_2020.6.21 20_35-1.txt', 'r', encoding='UTF8') as fbFile:
+        allLines = fbFile.readlines()
+
+    myMessage, otherPersonsMessage, currentSpeaker = "","",""
+
+    for index,lines in enumerate(allLines):
+        rightBracket = lines.find(':') + 5
+        justMessage = lines[rightBracket:]
+        colon = justMessage.find(':')
+        # Find messages that I sent
+        print("*input*")
+        print(type(personName))
+        print(personName, len(personName))
+        print("*file*")
+        print(justMessage[:colon-1], len(justMessage[:colon-1]))
+        print("*same?*")
+        print(justMessage[:colon-1] == personName)
+        print(justMessage[:colon] == "Second User")
+        print(personName == "Second User")
+
+        if (justMessage[:colon-1] == personName):
+            print('a')
+            if not myMessage:
+                # Want to find the first message that I send (if I send multiple
+                # in a row)
+                startMessageIndex = index - 1
+            myMessage += justMessage[colon + 2:]
+
+        elif myMessage:
+            # Now go and see what message the other person sent by looking at
+            # previous messages
+            for counter in range(startMessageIndex, 0, -1):
+                currentLine = allLines[counter]
+                rightBracket = currentLine.find(':') + 5
+                justMessage = currentLine[rightBracket:]
+                colon = justMessage.find(':')
+                if not currentSpeaker:
+                    # The first speaker not named me
+                    currentSpeaker = justMessage[:colon]
+                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
+                    # A different person started speaking, so now I know that the
+                    # first person's message is done
+                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
+                    myMessage = cleanMessage(myMessage)
+                    responseDictionary[otherPersonsMessage] = myMessage
+                    break
+                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
+            myMessage, otherPersonsMessage, currentSpeaker = "","",""
+        print("------------------------------------------")
+
+    return responseDictionary
+
+
+
+def cleanMessage(message):
+    # Remove new lines within message
+    cleanedMessage = message.replace('\n',' ').lower()
+    # Deal with some weird tokens (non-breaking spaces)
+    cleanedMessage = cleanedMessage.replace('\xa0', '')
+    # Remove punctuation
+    cleanedMessage = re.sub('([.,!?])','', cleanedMessage)
+    # Remove multiple spaces in message
+    cleanedMessage = re.sub(' +',' ', cleanedMessage)
+    return cleanedMessage
+
+combinedDictionary = {}
+
+combinedDictionary.update(getFacebookData())
+
+print(combinedDictionary)
+print ('Total len of dictionary', len(combinedDictionary))
+
+print('Saving conversation data dictionary')
+np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationDictionary.npy', combinedDictionary)
+
+conversationFile = open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt', 'w', encoding='UTF8')
+for key, value in combinedDictionary.items():
+    if (not key.strip() or not value.strip()):
+        # If there are empty strings
+        continue
+    print(key.strip() + value.strip())
+    conversationFile.write(key.strip() + value.strip())
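
The parser saves its question-to-answer dictionary as conversationDictionary.npy, which is exactly the file the Seq2Seq script's createTrainingMatrices() reads back. A minimal sketch of inspecting the saved file, shown here with the bare file name rather than the absolute path used above:

    import numpy as np

    # allow_pickle is required because np.save stored a Python dict in a 0-d object array
    conversationDictionary = np.load('conversationDictionary.npy', allow_pickle=True).item()
    print('Number of question/answer pairs:', len(conversationDictionary))
    for question, answer in list(conversationDictionary.items())[:3]:
        print(repr(question), '->', repr(answer))
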
(next file: a short script that prints the pickled wordList.txt)

+import pickle
+
+with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\wordList.txt','rb') as f:
+    data = pickle.load(f)
+    print(data)  # hello
(one file's diff is collapsed and not shown here)
(next file: a one-line text data file)

-456 78
\ No newline at end of file
+67126 78this is a testyes it is the last message hello45x y zx y z
\ No newline at end of file
(a binary file was also changed; no text preview is available)