hyeyeon-sun

Kakao parsing script && Seq2Seq analysis

-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
import numpy as np
import sys
from random import randint
@@ -10,11 +11,11 @@ import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
def createTrainingMatrices(conversationFileName, wList, maxLen):
-    conversationDictionary = np.load(conversationFileName).item()
+    conversationDictionary = np.load(conversationFileName, allow_pickle=True).item()
    numExamples = len(conversationDictionary)
    xTrain = np.zeros((numExamples, maxLen), dtype='int32')
    yTrain = np.zeros((numExamples, maxLen), dtype='int32')
-    for index,(key,value) in enumerate(conversationDictionary.iteritems()):
+    for index,(key,value) in enumerate(conversationDictionary.items()):
        # Will store integerized representation of strings here (initialized as padding)
        encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
        decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
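Context for the allow_pickle change above: starting with NumPy 1.16.3, np.load() defaults to allow_pickle=False for security, so loading a dictionary that np.save() serialized via pickle raises a ValueError unless the flag is passed. A minimal sketch of the round trip with toy data:

import numpy as np

# np.save pickles non-array objects such as dicts by default
np.save('toyDict.npy', {'hi': 'hello'})

# On NumPy >= 1.16.3 this load fails without allow_pickle=True
toyDict = np.load('toyDict.npy', allow_pickle=True).item()
print(toyDict['hi'])  # hello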
@@ -165,15 +166,15 @@ vocabSize = vocabSize + 2
if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')):
    xTrain = np.load('Seq2SeqXTrain.npy')
    yTrain = np.load('Seq2SeqYTrain.npy')
-    print 'Finished loading training matrices'
+    print('Finished loading training matrices')
    numTrainingExamples = xTrain.shape[0]
else:
    numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength)
    np.save('Seq2SeqXTrain.npy', xTrain)
    np.save('Seq2SeqYTrain.npy', yTrain)
-    print 'Finished creating training matrices'
+    print('Finished creating training matrices')

-tf.reset_default_graph()
+tf.compat.v1.reset_default_graph()
# Create the placeholders
encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
@@ -186,7 +187,7 @@ encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)
#encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True)
# Architectural choice of whether or not to include ^
# NOTE: embedding_rnn_seq2seq exists only in tf.contrib.legacy_seq2seq (TensorFlow 1.x);
# tf.sparsemax has no legacy_seq2seq submodule, so that path raises AttributeError.
decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
                                                                                    vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)
decoderPrediction = tf.argmax(decoderOutputs, 2)
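The feed_previous flag is what lets one graph serve both phases: with feed_previous=False the decoder consumes the ground-truth decoderInputs (training), and with True it feeds each step's own embedded prediction back in (inference), which is why the evaluation block later sets feedPrevious to True. A toy-sized sketch of the call, runnable on TensorFlow 1.x only (the dimensions are hypothetical):

import tensorflow as tf  # TF 1.x; tf.contrib was removed in 2.x

maxLen, vocab, embedDim, units = 5, 100, 32, 48
enc = [tf.placeholder(tf.int32, shape=(None,)) for _ in range(maxLen)]
dec = [tf.placeholder(tf.int32, shape=(None,)) for _ in range(maxLen)]
cell = tf.nn.rnn_cell.BasicLSTMCell(units, state_is_tuple=True)
outs, state = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
    enc, dec, cell, vocab, vocab, embedDim, feed_previous=True)
# outs: list of maxLen tensors, each of shape (batch, vocab), holding logits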
@@ -233,14 +234,14 @@ for i in range(numIterations):
        writer.add_summary(summary, i)
    if (i % 25 == 0 and i != 0):
        num = randint(0, len(encoderTestStrings) - 1)
-        print encoderTestStrings[num]
+        print(encoderTestStrings[num])
        inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength)
        feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)}
        feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)})
        feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)})
        feedDict.update({feedPrevious: True})
        ids = sess.run(decoderPrediction, feed_dict=feedDict)
-        print idsToSentence(ids, wordList)
+        print(idsToSentence(ids, wordList))
    if (i % 10000 == 0 and i != 0):
        savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i)
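idsToSentence is defined earlier in this file and not shown in the hunk; a plausible reconstruction of what it must do, given that decoderPrediction yields one word id per timestep (the body below is an assumption; only the name and token conventions come from the surrounding code):

def idsToSentence(ids, wList):
    # ids has shape (maxDecoderLength, batch); read the first batch element
    words = []
    for row in ids:
        word = wList[row[0]]
        if word == '<EOS>':   # assumed end-of-sentence token
            break
        if word != '<pad>':
            words.append(word)
    return ' '.join(words)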
......
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
import numpy as np
import re
from collections import Counter
@@ -25,8 +26,9 @@ numIterations = 100000
# into one huge string, and then uses a Counter to identify words
# and the number of occurrences
def processDataset(filename):
-    openedFile = open(filename, 'r')
+    openedFile = open(filename, 'r', encoding='UTF8')
    allLines = openedFile.readlines()
+    print(allLines)
    myStr = ""
    for line in allLines:
        myStr += line
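The hunk ends before processDataset returns; per the comment above it, the remainder presumably splits the accumulated string into words and counts occurrences with a Counter. A hedged sketch of that step (the exact tokenization is an assumption):

from collections import Counter
import re

def wordsAndCounts(myStr):
    # Strip basic punctuation, lowercase, split on whitespace (assumed cleaning)
    corpus = re.sub('([.,!?])', '', myStr.lower()).split()
    return corpus, Counter(corpus)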
@@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus):
    yTrain = []
    for i in range(numTotalWords):
        if i % 100000 == 0:
-            print 'Finished %d/%d total words' % (i, numTotalWords)
+            print('Finished %d/%d total words' % (i, numTotalWords))
        wordsAfter = allWords[i + 1:i + windowSize + 1]
        wordsBefore = allWords[max(0, i - windowSize):i]
        wordsAdded = wordsAfter + wordsBefore
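The slicing above generates skip-gram style (center, context) pairs; for a toy corpus with windowSize = 2, the center word at i = 2 is paired with the two words on either side:

allWords = ['a', 'b', 'c', 'd', 'e']
windowSize, i = 2, 2                              # center word 'c'
wordsAfter = allWords[i + 1:i + windowSize + 1]   # ['d', 'e']
wordsBefore = allWords[max(0, i - windowSize):i]  # ['a', 'b']
print(wordsAfter + wordsBefore)                   # ['d', 'e', 'a', 'b']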
@@ -61,61 +63,22 @@ continueWord2Vec = True
if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')):
    xTrain = np.load('Word2VecXTrain.npy')
    yTrain = np.load('Word2VecYTrain.npy')
-    print 'Finished loading training matrices'
+    print('Finished loading training matrices')
    with open("wordList.txt", "rb") as fp:
        wordList = pickle.load(fp)
-    print 'Finished loading word list'
+    print('Finished loading word list')
else:
-    fullCorpus, datasetDictionary = processDataset('conversationData.txt')
-    print 'Finished parsing and cleaning dataset'
+    fullCorpus, datasetDictionary = processDataset(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt')
+    print('Finished parsing and cleaning dataset')
    wordList = list(datasetDictionary.keys())
-    createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?')
+    createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?')
    if (createOwnVectors == 'y'):
        xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus)
-        print 'Finished creating training matrices'
+        print('Finished creating training matrices')
        np.save('Word2VecXTrain.npy', xTrain)
        np.save('Word2VecYTrain.npy', yTrain)
    else:
        continueWord2Vec = False
    with open("wordList.txt", "wb") as fp:
        pickle.dump(wordList, fp)

# If you do not want to create your own word vectors and you'd just like to
# have Tensorflow's seq2seq take care of that, then you don't need to run
# anything below this line.
if (continueWord2Vec == False):
    sys.exit()
numTrainingExamples = len(xTrain)
vocabSize = len(wordList)

sess = tf.Session()
embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
nceBiases = tf.Variable(tf.zeros([vocabSize]))

inputs = tf.placeholder(tf.int32, shape=[batchSize])
outputs = tf.placeholder(tf.int32, shape=[batchSize, 1])
embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)

loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nceWeights,
                   biases=nceBiases,
                   labels=outputs,
                   inputs=embed,
                   num_sampled=numNegativeSample,
                   num_classes=vocabSize))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

sess.run(tf.global_variables_initializer())
for i in range(numIterations):
    trainInputs, trainLabels = getTrainingBatch()
    _, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels})
    if (i % 10000 == 0):
        print('Current loss is:', curLoss)

print('Saving the word embedding matrix')
embedMatrix = embeddingMatrix.eval(session=sess)
np.save('embeddingMatrix.npy', embedMatrix)
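Once embeddingMatrix.npy is written, a quick sanity check is a cosine-similarity lookup against the saved word list (a usage sketch, not part of this PR):

import numpy as np
import pickle

embed = np.load('embeddingMatrix.npy')
with open('wordList.txt', 'rb') as fp:
    wordList = pickle.load(fp)

def nearest(word, k=5):
    # Rank all rows by cosine similarity to the query word's vector
    v = embed[wordList.index(word)]
    sims = embed @ v / (np.linalg.norm(embed, axis=1) * np.linalg.norm(v) + 1e-8)
    return [wordList[j] for j in np.argsort(-sims)[1:k + 1]]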
......
import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
def getKakaotalkbookData():
    print("function active")
    personName = input('Enter your full kakao name: ')
    personName = personName.rstrip('\r')
    responseDictionary = dict()
    with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\fbMessages.txt', 'r') as fbFile:
        allLines = fbFile.readlines()
    myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    for index, lines in enumerate(allLines):
        rightBracket = lines.find(']') + 2
        justMessage = lines[rightBracket:]
        colon = justMessage.find(':')
        # Find messages that I sent
        print("*input*")
        print(personName, len(personName))
        print("*file*")
        print(justMessage[:colon-1], len(justMessage[:colon-1]))
        print("*same?*")
        print(justMessage[:colon-1] == personName)
        print(justMessage[:colon] == "Second User")
        print(personName == "Second User")
        print("------------------------------------------")
        if (justMessage[:colon-1] == personName):
            print('a')
            if not myMessage:
                # Want to find the first message that I send (if I send multiple
                # in a row)
                startMessageIndex = index - 1
            myMessage += justMessage[colon + 2:]
        elif myMessage:
            # Now go and see what message the other person sent by looking at
            # previous messages
            for counter in range(startMessageIndex, 0, -1):
                currentLine = allLines[counter]
                rightBracket = currentLine.find(']') + 2
                justMessage = currentLine[rightBracket:]
                colon = justMessage.find(':')
                if not currentSpeaker:
                    # The first speaker not named me
                    currentSpeaker = justMessage[:colon]
                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
                    # A different person started speaking, so now I know that the
                    # first person's message is done
                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
                    myMessage = cleanMessage(myMessage)
                    responseDictionary[otherPersonsMessage] = myMessage
                    break
                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
            myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    return responseDictionary
def getFacebookData():
    print("function active")
    personName = input('Enter your full Kakaotalk name: ')
    personName = personName.rstrip('\r')
    responseDictionary = dict()
    with open(r'C:\Users\dlgpd\Downloads\kakao\Talk_2020.6.21 20_35-1.txt', 'r', encoding='UTF8') as fbFile:
        allLines = fbFile.readlines()
    myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    for index, lines in enumerate(allLines):
        rightBracket = lines.find(':') + 5
        justMessage = lines[rightBracket:]
        colon = justMessage.find(':')
        # Find messages that I sent
        print("*input*")
        print(type(personName))
        print(personName, len(personName))
        print("*file*")
        print(justMessage[:colon-1], len(justMessage[:colon-1]))
        print("*same?*")
        print(justMessage[:colon-1] == personName)
        print(justMessage[:colon] == "Second User")
        print(personName == "Second User")
        if (justMessage[:colon-1] == personName):
            print('a')
            if not myMessage:
                # Want to find the first message that I send (if I send multiple
                # in a row)
                startMessageIndex = index - 1
            myMessage += justMessage[colon + 2:]
        elif myMessage:
            # Now go and see what message the other person sent by looking at
            # previous messages
            for counter in range(startMessageIndex, 0, -1):
                currentLine = allLines[counter]
                rightBracket = currentLine.find(':') + 5
                justMessage = currentLine[rightBracket:]
                colon = justMessage.find(':')
                if not currentSpeaker:
                    # The first speaker not named me
                    currentSpeaker = justMessage[:colon]
                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
                    # A different person started speaking, so now I know that the
                    # first person's message is done
                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
                    myMessage = cleanMessage(myMessage)
                    responseDictionary[otherPersonsMessage] = myMessage
                    break
                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
            myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
        print("------------------------------------------")
    return responseDictionary
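The index arithmetic is easiest to see on a concrete line. Assuming the KakaoTalk export lines look like the sample below (the exact format is an assumption), lines.find(':') + 5 steps over the 'HH:MM, ' timestamp tail, and the next colon splits speaker from message:

line = '2020. 6. 21. 20:35, Second User : hello there\n'  # assumed export shape
rightBracket = line.find(':') + 5      # first ':' is in the timestamp; +5 skips ':35, '
justMessage = line[rightBracket:]      # 'Second User : hello there\n'
colon = justMessage.find(':')
print(justMessage[:colon - 1])         # 'Second User' -- compared against personName
print(justMessage[colon + 2:])         # 'hello there\n' -- the message body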
def cleanMessage(message):
    # Remove new lines within message
    cleanedMessage = message.replace('\n', ' ').lower()
    # Deal with some weird tokens
    cleanedMessage = cleanedMessage.replace("\xc2\xa0", "")
    # Remove punctuation
    cleanedMessage = re.sub('([.,!?])', '', cleanedMessage)
    # Remove multiple spaces in message
    cleanedMessage = re.sub(' +', ' ', cleanedMessage)
    return cleanedMessage
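For reference, the transformation cleanMessage applies:

print(cleanMessage('Hello,  World!\nHow are you?'))  # hello world how are you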
combinedDictionary = {}
combinedDictionary.update(getFacebookData())
print(combinedDictionary)
print('Total len of dictionary', len(combinedDictionary))
print('Saving conversation data dictionary')
np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationDictionary.npy', combinedDictionary)

conversationFile = open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt', 'w', encoding='UTF8')
for key, value in combinedDictionary.items():
    if (not key.strip() or not value.strip()):
        # If there are empty strings
        continue
    print(key.strip() + value.strip())
    conversationFile.write(key.strip() + value.strip())
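Note that key.strip() + value.strip() concatenates prompt and response with no separator, which is why the generated conversationData.txt (diffed further below) reads as run-together text:

key, value = 'this is a test', 'yes it is'
print(key.strip() + value.strip())  # this is a testyes it is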
import pickle

with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\wordList.txt', 'rb') as f:
    data = pickle.load(f)
print(data)  # hello
456 78
\ No newline at end of file
67126 78this is a testyes it is the last message hello45x y zx y z
\ No newline at end of file
......