hyeyeon-sun

Kakao parsing script && Seq2Seq analysis

-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
import numpy as np
import sys
from random import randint
@@ -10,11 +11,11 @@ import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
def createTrainingMatrices(conversationFileName, wList, maxLen):
-    conversationDictionary = np.load(conversationFileName).item()
+    conversationDictionary = np.load(conversationFileName, allow_pickle=True).item()
    numExamples = len(conversationDictionary)
    xTrain = np.zeros((numExamples, maxLen), dtype='int32')
    yTrain = np.zeros((numExamples, maxLen), dtype='int32')
-    for index,(key,value) in enumerate(conversationDictionary.iteritems()):
+    for index,(key,value) in enumerate(conversationDictionary.items()):
        # Will store integerized representation of strings here (initialized as padding)
        encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
        decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
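Context for the allow_pickle change above: starting with NumPy 1.16.3, np.load() defaults to allow_pickle=False for security, so loading a dictionary that np.save() serialized via pickle raises a ValueError unless the flag is passed. A minimal sketch of the round trip with toy data:

import numpy as np

# np.save pickles non-array objects such as dicts by default
np.save('toyDict.npy', {'hi': 'hello'})

# On NumPy >= 1.16.3 this load fails without allow_pickle=True
toyDict = np.load('toyDict.npy', allow_pickle=True).item()
print(toyDict['hi'])  # hello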
@@ -165,15 +166,15 @@ vocabSize = vocabSize + 2
if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')):
    xTrain = np.load('Seq2SeqXTrain.npy')
    yTrain = np.load('Seq2SeqYTrain.npy')
-    print 'Finished loading training matrices'
+    print('Finished loading training matrices')
    numTrainingExamples = xTrain.shape[0]
else:
    numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength)
    np.save('Seq2SeqXTrain.npy', xTrain)
    np.save('Seq2SeqYTrain.npy', yTrain)
-    print 'Finished creating training matrices'
+    print('Finished creating training matrices')

-tf.reset_default_graph()
+tf.compat.v1.reset_default_graph()
# Create the placeholders
encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
@@ -186,7 +187,7 @@ encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)
#encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True)
# Architectural choice of whether or not to include ^
# NOTE: embedding_rnn_seq2seq exists only in tf.contrib.legacy_seq2seq (TensorFlow 1.x);
# tf.sparsemax has no legacy_seq2seq submodule, so that path raises AttributeError.
decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
                                                                                    vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)
decoderPrediction = tf.argmax(decoderOutputs, 2)
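The feed_previous flag is what lets one graph serve both phases: with feed_previous=False the decoder consumes the ground-truth decoderInputs (training), and with True it feeds each step's own embedded prediction back in (inference), which is why the evaluation block later sets feedPrevious to True. A toy-sized sketch of the call, runnable on TensorFlow 1.x only (the dimensions are hypothetical):

import tensorflow as tf  # TF 1.x; tf.contrib was removed in 2.x

maxLen, vocab, embedDim, units = 5, 100, 32, 48
enc = [tf.placeholder(tf.int32, shape=(None,)) for _ in range(maxLen)]
dec = [tf.placeholder(tf.int32, shape=(None,)) for _ in range(maxLen)]
cell = tf.nn.rnn_cell.BasicLSTMCell(units, state_is_tuple=True)
outs, state = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
    enc, dec, cell, vocab, vocab, embedDim, feed_previous=True)
# outs: list of maxLen tensors, each of shape (batch, vocab), holding logits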
@@ -233,14 +234,14 @@ for i in range(numIterations):
        writer.add_summary(summary, i)
    if (i % 25 == 0 and i != 0):
        num = randint(0, len(encoderTestStrings) - 1)
-        print encoderTestStrings[num]
+        print(encoderTestStrings[num])
        inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength)
        feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)}
        feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)})
        feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)})
        feedDict.update({feedPrevious: True})
        ids = sess.run(decoderPrediction, feed_dict=feedDict)
-        print idsToSentence(ids, wordList)
+        print(idsToSentence(ids, wordList))
    if (i % 10000 == 0 and i != 0):
        savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i)
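idsToSentence is defined earlier in this file and not shown in the hunk; a plausible reconstruction of what it must do, given that decoderPrediction yields one word id per timestep (the body below is an assumption; only the name and token conventions come from the surrounding code):

def idsToSentence(ids, wList):
    # ids has shape (maxDecoderLength, batch); read the first batch element
    words = []
    for row in ids:
        word = wList[row[0]]
        if word == '<EOS>':   # assumed end-of-sentence token
            break
        if word != '<pad>':
            words.append(word)
    return ' '.join(words)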
......
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
import numpy as np
import re
from collections import Counter
@@ -25,8 +26,9 @@ numIterations = 100000
# into one huge string, and then uses a Counter to identify words
# and the number of occurrences
def processDataset(filename):
-    openedFile = open(filename, 'r')
+    openedFile = open(filename, 'r', encoding='UTF8')
    allLines = openedFile.readlines()
+    print(allLines)
    myStr = ""
    for line in allLines:
        myStr += line
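The hunk ends before processDataset returns; per the comment above it, the remainder presumably splits the accumulated string into words and counts occurrences with a Counter. A hedged sketch of that step (the exact tokenization is an assumption):

from collections import Counter
import re

def wordsAndCounts(myStr):
    # Strip basic punctuation, lowercase, split on whitespace (assumed cleaning)
    corpus = re.sub('([.,!?])', '', myStr.lower()).split()
    return corpus, Counter(corpus)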
@@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus):
    yTrain = []
    for i in range(numTotalWords):
        if i % 100000 == 0:
-            print 'Finished %d/%d total words' % (i, numTotalWords)
+            print('Finished %d/%d total words' % (i, numTotalWords))
        wordsAfter = allWords[i + 1:i + windowSize + 1]
        wordsBefore = allWords[max(0, i - windowSize):i]
        wordsAdded = wordsAfter + wordsBefore
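The slicing above generates skip-gram style (center, context) pairs; for a toy corpus with windowSize = 2, the center word at i = 2 is paired with the two words on either side:

allWords = ['a', 'b', 'c', 'd', 'e']
windowSize, i = 2, 2                              # center word 'c'
wordsAfter = allWords[i + 1:i + windowSize + 1]   # ['d', 'e']
wordsBefore = allWords[max(0, i - windowSize):i]  # ['a', 'b']
print(wordsAfter + wordsBefore)                   # ['d', 'e', 'a', 'b']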
@@ -61,61 +63,22 @@ continueWord2Vec = True
if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')):
    xTrain = np.load('Word2VecXTrain.npy')
    yTrain = np.load('Word2VecYTrain.npy')
-    print 'Finished loading training matrices'
+    print('Finished loading training matrices')
    with open("wordList.txt", "rb") as fp:
        wordList = pickle.load(fp)
-    print 'Finished loading word list'
+    print('Finished loading word list')
else:
-    fullCorpus, datasetDictionary = processDataset('conversationData.txt')
-    print 'Finished parsing and cleaning dataset'
+    fullCorpus, datasetDictionary = processDataset(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt')
+    print('Finished parsing and cleaning dataset')
    wordList = list(datasetDictionary.keys())
-    createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?')
+    createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?')
    if (createOwnVectors == 'y'):
        xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus)
-        print 'Finished creating training matrices'
+        print('Finished creating training matrices')
        np.save('Word2VecXTrain.npy', xTrain)
        np.save('Word2VecYTrain.npy', yTrain)
    else:
        continueWord2Vec = False
    with open("wordList.txt", "wb") as fp:
        pickle.dump(wordList, fp)

# If you do not want to create your own word vectors and you'd just like to
# have Tensorflow's seq2seq take care of that, then you don't need to run
# anything below this line.
if (continueWord2Vec == False):
    sys.exit()
numTrainingExamples = len(xTrain)
vocabSize = len(wordList)

sess = tf.Session()
embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
nceBiases = tf.Variable(tf.zeros([vocabSize]))

inputs = tf.placeholder(tf.int32, shape=[batchSize])
outputs = tf.placeholder(tf.int32, shape=[batchSize, 1])
embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)

loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nceWeights,
                   biases=nceBiases,
                   labels=outputs,
                   inputs=embed,
                   num_sampled=numNegativeSample,
                   num_classes=vocabSize))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

sess.run(tf.global_variables_initializer())
for i in range(numIterations):
    trainInputs, trainLabels = getTrainingBatch()
    _, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels})
    if (i % 10000 == 0):
        print('Current loss is:', curLoss)

print('Saving the word embedding matrix')
embedMatrix = embeddingMatrix.eval(session=sess)
np.save('embeddingMatrix.npy', embedMatrix)
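Once embeddingMatrix.npy is written, a quick sanity check is a cosine-similarity lookup against the saved word list (a usage sketch, not part of this PR):

import numpy as np
import pickle

embed = np.load('embeddingMatrix.npy')
with open('wordList.txt', 'rb') as fp:
    wordList = pickle.load(fp)

def nearest(word, k=5):
    # Rank all rows by cosine similarity to the query word's vector
    v = embed[wordList.index(word)]
    sims = embed @ v / (np.linalg.norm(embed, axis=1) * np.linalg.norm(v) + 1e-8)
    return [wordList[j] for j in np.argsort(-sims)[1:k + 1]]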
......
import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
def getKakaotalkbookData():
    print("function active")
    personName = input('Enter your full kakao name: ')
    personName = personName.rstrip('\r')
    responseDictionary = dict()
    with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\fbMessages.txt', 'r') as fbFile:
        allLines = fbFile.readlines()
    myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    for index, lines in enumerate(allLines):
        rightBracket = lines.find(']') + 2
        justMessage = lines[rightBracket:]
        colon = justMessage.find(':')
        # Find messages that I sent
        print("*input*")
        print(personName, len(personName))
        print("*file*")
        print(justMessage[:colon-1], len(justMessage[:colon-1]))
        print("*same?*")
        print(justMessage[:colon-1] == personName)
        print(justMessage[:colon] == "Second User")
        print(personName == "Second User")
        print("------------------------------------------")
        if (justMessage[:colon-1] == personName):
            print('a')
            if not myMessage:
                # Want to find the first message that I send (if I send multiple
                # in a row)
                startMessageIndex = index - 1
            myMessage += justMessage[colon + 2:]
        elif myMessage:
            # Now go and see what message the other person sent by looking at
            # previous messages
            for counter in range(startMessageIndex, 0, -1):
                currentLine = allLines[counter]
                rightBracket = currentLine.find(']') + 2
                justMessage = currentLine[rightBracket:]
                colon = justMessage.find(':')
                if not currentSpeaker:
                    # The first speaker not named me
                    currentSpeaker = justMessage[:colon]
                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
                    # A different person started speaking, so now I know that the
                    # first person's message is done
                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
                    myMessage = cleanMessage(myMessage)
                    responseDictionary[otherPersonsMessage] = myMessage
                    break
                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
            myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    return responseDictionary
def getFacebookData():
    print("function active")
    personName = input('Enter your full Kakaotalk name: ')
    personName = personName.rstrip('\r')
    responseDictionary = dict()
    with open(r'C:\Users\dlgpd\Downloads\kakao\Talk_2020.6.21 20_35-1.txt', 'r', encoding='UTF8') as fbFile:
        allLines = fbFile.readlines()
    myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
    for index, lines in enumerate(allLines):
        rightBracket = lines.find(':') + 5
        justMessage = lines[rightBracket:]
        colon = justMessage.find(':')
        # Find messages that I sent
        print("*input*")
        print(type(personName))
        print(personName, len(personName))
        print("*file*")
        print(justMessage[:colon-1], len(justMessage[:colon-1]))
        print("*same?*")
        print(justMessage[:colon-1] == personName)
        print(justMessage[:colon] == "Second User")
        print(personName == "Second User")
        if (justMessage[:colon-1] == personName):
            print('a')
            if not myMessage:
                # Want to find the first message that I send (if I send multiple
                # in a row)
                startMessageIndex = index - 1
            myMessage += justMessage[colon + 2:]
        elif myMessage:
            # Now go and see what message the other person sent by looking at
            # previous messages
            for counter in range(startMessageIndex, 0, -1):
                currentLine = allLines[counter]
                rightBracket = currentLine.find(':') + 5
                justMessage = currentLine[rightBracket:]
                colon = justMessage.find(':')
                if not currentSpeaker:
                    # The first speaker not named me
                    currentSpeaker = justMessage[:colon]
                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
                    # A different person started speaking, so now I know that the
                    # first person's message is done
                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
                    myMessage = cleanMessage(myMessage)
                    responseDictionary[otherPersonsMessage] = myMessage
                    break
                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
            myMessage, otherPersonsMessage, currentSpeaker = "", "", ""
        print("------------------------------------------")
    return responseDictionary
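The index arithmetic is easiest to see on a concrete line. Assuming the KakaoTalk export lines look like the sample below (the exact format is an assumption), lines.find(':') + 5 steps over the 'HH:MM, ' timestamp tail, and the next colon splits speaker from message:

line = '2020. 6. 21. 20:35, Second User : hello there\n'  # assumed export shape
rightBracket = line.find(':') + 5      # first ':' is in the timestamp; +5 skips ':35, '
justMessage = line[rightBracket:]      # 'Second User : hello there\n'
colon = justMessage.find(':')
print(justMessage[:colon - 1])         # 'Second User' -- compared against personName
print(justMessage[colon + 2:])         # 'hello there\n' -- the message body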
def cleanMessage(message):
    # Remove new lines within message
    cleanedMessage = message.replace('\n', ' ').lower()
    # Deal with some weird tokens
    cleanedMessage = cleanedMessage.replace("\xc2\xa0", "")
    # Remove punctuation
    cleanedMessage = re.sub('([.,!?])', '', cleanedMessage)
    # Remove multiple spaces in message
    cleanedMessage = re.sub(' +', ' ', cleanedMessage)
    return cleanedMessage
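For reference, the transformation cleanMessage applies:

print(cleanMessage('Hello,  World!\nHow are you?'))  # hello world how are you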
combinedDictionary = {}
combinedDictionary.update(getFacebookData())
print(combinedDictionary)
print('Total len of dictionary', len(combinedDictionary))
print('Saving conversation data dictionary')
np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationDictionary.npy', combinedDictionary)

conversationFile = open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt', 'w', encoding='UTF8')
for key, value in combinedDictionary.items():
    if (not key.strip() or not value.strip()):
        # If there are empty strings
        continue
    print(key.strip() + value.strip())
    conversationFile.write(key.strip() + value.strip())
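Note that key.strip() + value.strip() concatenates prompt and response with no separator, which is why the generated conversationData.txt (diffed further below) reads as run-together text:

key, value = 'this is a test', 'yes it is'
print(key.strip() + value.strip())  # this is a testyes it is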
import pickle

with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\wordList.txt', 'rb') as f:
    data = pickle.load(f)
print(data)  # hello
456 78
\ No newline at end of file
67126 78this is a testyes it is the last message hello45x y zx y z
\ No newline at end of file
......