
KakaoTalk parsing script and Seq2Seq analysis

import tensorflow as tf
import tensorflow.compat.v1 as tf
import numpy as np
import sys
from random import randint
def createTrainingMatrices(conversationFileName, wList, maxLen):
conversationDictionary = np.load(conversationFileName).item()
conversationDictionary = np.load(conversationFileName, allow_pickle=True).item()
numExamples = len(conversationDictionary)
xTrain = np.zeros((numExamples, maxLen), dtype='int32')
yTrain = np.zeros((numExamples, maxLen), dtype='int32')
for index,(key,value) in enumerate(conversationDictionary.iteritems()):
for index,(key,value) in enumerate(conversationDictionary.items()):
# Will store integerized representation of strings here (initialized as padding)
encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')):
xTrain = np.load('Seq2SeqXTrain.npy')
yTrain = np.load('Seq2SeqYTrain.npy')
print 'Finished loading training matrices'
print ('Finished loading training matrices')
numTrainingExamples = xTrain.shape[0]
numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength)
np.save('Seq2SeqXTrain.npy', xTrain)
np.save('Seq2SeqYTrain.npy', yTrain)
print 'Finished creating training matrices'
print ('Finished creating training matrices')
# Create the placeholders
encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
#encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True)
# Architectural choice of whether or not to include ^
decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
decoderOutputs, decoderFinalState = tf.sparsemax.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)
decoderPrediction = tf.argmax(decoderOutputs, 2)
writer.add_summary(summary, i)
if (i % 25 == 0 and i != 0):
num = randint(0,len(encoderTestStrings) - 1)
print encoderTestStrings[num]
print (encoderTestStrings[num])
inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength);
feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)}
feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)})
feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)})
feedDict.update({feedPrevious: True})
ids = (sess.run(decoderPrediction, feed_dict=feedDict))
print idsToSentence(ids, wordList)
print (idsToSentence(ids, wordList))
if (i % 10000 == 0 and i != 0):
savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i)
import tensorflow as tf
import tensorflow.compat.v1 as tf
import numpy as np
import re
from collections import Counter
# into one huge string, and then uses a Counter to identify words
# and the number of occurrences
openedFile = open(filename, 'r')
openedFile = open(filename, 'r', encoding='UTF8')
allLines = openedFile.readlines()
myStr = ""
for line in allLines:
myStr += line
......@@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus):
for i in range(numTotalWords):
if i % 100000 == 0:
print 'Finished %d/%d total words' % (i, numTotalWords)
print ('Finished %d/%d total words' % (i, numTotalWords))
wordsAfter = allWords[i + 1:i + windowSize + 1]
wordsBefore = allWords[max(0, i - windowSize):i]
wordsAdded = wordsAfter + wordsBefore
......@@ -61,61 +63,22 @@ continueWord2Vec = True
if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')):
xTrain = np.load('Word2VecXTrain.npy')
yTrain = np.load('Word2VecYTrain.npy')
print 'Finished loading training matrices'
print ('Finished loading training matrices')
with open("wordList.txt", "rb") as fp:
wordList = pickle.load(fp)
print 'Finished loading word list'
print ('Finished loading word list')
fullCorpus, datasetDictionary = processDataset('conversationData.txt')
print 'Finished parsing and cleaning dataset'
fullCorpus, datasetDictionary = processDataset(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt')
print ('Finished parsing and cleaning dataset')
wordList = list(datasetDictionary.keys())
createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?')
createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?')
if (createOwnVectors == 'y'):
xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus)
print 'Finished creating training matrices'
print ('Finished creating training matrices')
np.save('Word2VecXTrain.npy', xTrain)
np.save('Word2VecYTrain.npy', yTrain)
continueWord2Vec = False
with open("wordList.txt", "wb") as fp:
pickle.dump(wordList, fp)
# If you do not want to create your own word vectors and you'd just like to
# have Tensorflow's seq2seq take care of that, then you don't need to run
# anything below this line.
if (continueWord2Vec == False):
numTrainingExamples = len(xTrain)
vocabSize = len(wordList)
sess = tf.Session()
embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
nceBiases = tf.Variable(tf.zeros([vocabSize]))
inputs = tf.placeholder(tf.int32, shape=[batchSize])
outputs = tf.placeholder(tf.int32, shape=[batchSize, 1])
embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)
loss = tf.reduce_mean(
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
for i in range(numIterations):
trainInputs, trainLabels = getTrainingBatch()
_, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels})
if (i % 10000 == 0):
print ('Current loss is:', curLoss)
print 'Saving the word embedding matrix'
embedMatrix = embeddingMatrix.eval(session=sess)
np.save('embeddingMatrix.npy', embedMatrix)
import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
def getKakaotalkbookData(filePath=r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\fbMessages.txt',
                         personName=None):
    """Build a dictionary mapping another person's message to my reply.

    Every line of the chat log is reduced to ``speaker: text`` by dropping
    everything up to and including the character after the first ``]``.
    Consecutive lines written by ``personName`` are concatenated into one
    reply; when the next line from somebody else is reached, the log is
    walked backwards from just before that reply to collect the message
    the reply was answering.

    Args:
        filePath: Path of the chat log to read. Defaults to the location
            the original script had hard-coded.
        personName: The speaker name that identifies "my" messages. When
            ``None`` the name is asked for interactively.

    Returns:
        dict: cleaned other-person message -> cleaned reply. A later pair
        with the same key overwrites an earlier one.
    """
    print("function active")
    if personName is None:
        # ``raw_input`` only exists in Python 2; the rest of this file is
        # already written for Python 3.
        personName = input('Enter your full kakao name: ')
    personName = personName.rstrip('\r')
    responseDictionary = dict()
    # Explicit encoding so the result does not depend on the OS locale.
    # NOTE(review): assumes the log is UTF-8 like the other files this
    # script opens -- confirm.
    with open(filePath, 'r', encoding='UTF8') as fbFile:
        allLines = fbFile.readlines()
    myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    for index, lines in enumerate(allLines):
        # Lines without ']' or ':' are not filtered out; find() returns -1
        # for them and the slices below simply shift accordingly.
        rightBracket = lines.find(']') + 2
        justMessage = lines[rightBracket:]
        colon = justMessage.find(':')
        # Find messages that I sent.
        # NOTE(review): ``colon - 1`` drops the character just before the
        # colon, which presumably expects a "name : text" layout -- confirm
        # against the actual log format.
        if justMessage[:colon - 1] == personName:
            if not myMessage:
                # Remember where my run of messages started so the
                # backward scan can begin on the line just before it.
                startMessageIndex = index - 1
            myMessage += justMessage[colon + 2:]
        elif myMessage:
            # Now go and see what message the other person sent by looking
            # at previous messages (line 0 is never visited).
            for counter in range(startMessageIndex, 0, -1):
                currentLine = allLines[counter]
                rightBracket = currentLine.find(']') + 2
                justMessage = currentLine[rightBracket:]
                colon = justMessage.find(':')
                if not currentSpeaker:
                    # The first speaker not named me
                    currentSpeaker = justMessage[:colon]
                elif currentSpeaker != justMessage[:colon] and otherPersonsMessage:
                    # A different person started speaking, so the first
                    # person's message is complete: store the pair and
                    # stop scanning, otherwise the key keeps growing with
                    # unrelated earlier lines.
                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
                    myMessage = cleanMessage(myMessage)
                    responseDictionary[otherPersonsMessage] = myMessage
                    break
                # Scanning backwards, so prepend to keep original order.
                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
            myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    return responseDictionary
def getFacebookData(filePath=r'C:\Users\dlgpd\Downloads\kakao\Talk_2020.6.21 20_35-1.txt',
                    personName=None):
    """Build a dictionary mapping another person's message to my reply.

    Every line of the chat export is reduced to ``speaker : text`` by
    dropping everything up to and including the four characters after the
    first ``:`` (presumably the timestamp prefix of a KakaoTalk export --
    TODO confirm against a sample file). Consecutive lines written by
    ``personName`` are concatenated into one reply; when the next line
    from somebody else is reached, the export is walked backwards from
    just before that reply to collect the message it was answering.

    Args:
        filePath: Path of the UTF-8 chat export to read. Defaults to the
            location the original script had hard-coded.
        personName: The speaker name that identifies "my" messages. When
            ``None`` the name is asked for interactively.

    Returns:
        dict: cleaned other-person message -> cleaned reply. A later pair
        with the same key overwrites an earlier one.
    """
    print("function active")
    if personName is None:
        personName = input('Enter your full Kakaotalk name: ')
    personName = personName.rstrip('\r')
    responseDictionary = dict()
    with open(filePath, 'r', encoding='UTF8') as fbFile:
        allLines = fbFile.readlines()
    myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    for index, lines in enumerate(allLines):
        # Lines without ':' are not filtered out; find() returns -1 for
        # them and the slices below simply shift accordingly.
        rightBracket = lines.find(':') + 5
        justMessage = lines[rightBracket:]
        colon = justMessage.find(':')
        # Find messages that I sent. ``colon - 1`` drops the character
        # just before the colon (the space in "name : text").
        if justMessage[:colon - 1] == personName:
            if not myMessage:
                # Remember where my run of messages started so the
                # backward scan can begin on the line just before it.
                startMessageIndex = index - 1
            myMessage += justMessage[colon + 2:]
        elif myMessage:
            # Now go and see what message the other person sent by looking
            # at previous messages (line 0 is never visited).
            for counter in range(startMessageIndex, 0, -1):
                currentLine = allLines[counter]
                rightBracket = currentLine.find(':') + 5
                justMessage = currentLine[rightBracket:]
                colon = justMessage.find(':')
                if not currentSpeaker:
                    # The first speaker not named me
                    currentSpeaker = justMessage[:colon]
                elif currentSpeaker != justMessage[:colon] and otherPersonsMessage:
                    # A different person started speaking, so the first
                    # person's message is complete: store the pair and
                    # stop scanning, otherwise the key keeps growing with
                    # unrelated earlier lines.
                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
                    myMessage = cleanMessage(myMessage)
                    responseDictionary[otherPersonsMessage] = myMessage
                    break
                # Scanning backwards, so prepend to keep original order.
                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
            myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    return responseDictionary
def cleanMessage(message):
    """Normalise one chat message for use as a dictionary key or value.

    Steps, in order: drop no-break spaces, turn newlines into spaces,
    lower-case, strip the punctuation characters ``. , ! ?`` and collapse
    runs of spaces into a single space.

    Args:
        message: Raw message text (may span several lines).

    Returns:
        str: The cleaned message.
    """
    # Deal with some weird tokens. "\xc2\xa0" is how a no-break space looks
    # when its UTF-8 bytes are read one character per byte; in properly
    # decoded Python 3 text it is the single character "\xa0". Both are
    # removed *before* lower-casing because lower() turns "\xc2" into
    # "\xe2", after which the two-character form could never match.
    cleanedMessage = message.replace("\xc2\xa0", "").replace("\xa0", "")
    # Remove new lines within message
    cleanedMessage = cleanedMessage.replace('\n', ' ').lower()
    # Remove punctuation
    cleanedMessage = re.sub('([.,!?])', '', cleanedMessage)
    # Remove multiple spaces in message
    cleanedMessage = re.sub(' +', ' ', cleanedMessage)
    return cleanedMessage
combinedDictionary = {}
print ('Total len of dictionary', len(combinedDictionary))
print('Saving conversation data dictionary')
np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationDictionary.npy', combinedDictionary)
conversationFile = open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt', 'w', encoding='UTF8')
for key, value in combinedDictionary.items():
if (not key.strip() or not value.strip()):
# If there are empty strings
print(key.strip() + value.strip())
conversationFile.write(key.strip() + value.strip())
import pickle
with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\wordList.txt','rb') as f:
data = pickle.load(f)
print(data) # hello
