hyeyeon-sun

kakao parsing script && Seq2Seq analysis

-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
 import numpy as np
 import sys
 from random import randint
@@ -10,11 +11,11 @@ import os
 os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 
 def createTrainingMatrices(conversationFileName, wList, maxLen):
-    conversationDictionary = np.load(conversationFileName).item()
+    conversationDictionary = np.load(conversationFileName, allow_pickle=True).item()
     numExamples = len(conversationDictionary)
     xTrain = np.zeros((numExamples, maxLen), dtype='int32')
     yTrain = np.zeros((numExamples, maxLen), dtype='int32')
-    for index,(key,value) in enumerate(conversationDictionary.iteritems()):
+    for index,(key,value) in enumerate(conversationDictionary.items()):
         # Will store integerized representation of strings here (initialized as padding)
         encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
         decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
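
The two changes in this hunk are Python 3 / newer-NumPy fixes: dict.iteritems() was removed in Python 3 (items() replaces it), and newer NumPy releases refuse to unpickle object arrays from np.load() unless allow_pickle=True is passed. A minimal sketch of the round trip createTrainingMatrices depends on, using a toy dictionary rather than the real conversationDictionary.npy:

    import numpy as np

    toyDict = {'how are you': 'fine thanks'}
    np.save('toyDict.npy', toyDict)                     # stored as a 0-d object array
    loaded = np.load('toyDict.npy', allow_pickle=True)  # raises ValueError without allow_pickle
    conversationDictionary = loaded.item()              # unwrap back to a plain dict
    for index, (key, value) in enumerate(conversationDictionary.items()):
        print(index, key, '->', value)
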
@@ -165,15 +166,15 @@ vocabSize = vocabSize + 2
 if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')):
     xTrain = np.load('Seq2SeqXTrain.npy')
     yTrain = np.load('Seq2SeqYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     numTrainingExamples = xTrain.shape[0]
 else:
     numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength)
     np.save('Seq2SeqXTrain.npy', xTrain)
     np.save('Seq2SeqYTrain.npy', yTrain)
-    print 'Finished creating training matrices'
+    print ('Finished creating training matrices')
 
-tf.reset_default_graph()
+tf.compat.v1.reset_default_graph()
 
 # Create the placeholders
 encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
@@ -186,7 +187,7 @@ encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)
 #encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True)
 # Architectural choice of whether or not to include ^
 
-decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
+decoderOutputs, decoderFinalState = tf.sparsemax.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,
     vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious)
 
 decoderPrediction = tf.argmax(decoderOutputs, 2)
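
One caution on this hunk: tf.sparsemax.legacy_seq2seq is not a TensorFlow module, so the new line will raise an AttributeError as written. The helper used on the removed side, tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq, exists only in TensorFlow 1.x; tf.contrib was dropped in 2.x and is not restored by tensorflow.compat.v1, so a true TF 2.x port would need a different seq2seq implementation. A sketch of the call as it runs on a TF 1.x install, keeping the names used in this script:

    # TensorFlow 1.x only: tf.contrib was removed in TF 2.x.
    decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(
        encoderInputs,               # list of maxEncoderLength int32 placeholders
        decoderInputs,               # list of maxDecoderLength int32 placeholders
        encoderLSTM,                 # the BasicLSTMCell built above
        vocabSize,                   # num_encoder_symbols
        vocabSize,                   # num_decoder_symbols
        embeddingDim,                # embedding_size
        feed_previous=feedPrevious)  # bool placeholder: feed predictions back at test time
    decoderPrediction = tf.argmax(decoderOutputs, 2)
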
@@ -233,14 +234,14 @@ for i in range(numIterations):
     writer.add_summary(summary, i)
     if (i % 25 == 0 and i != 0):
         num = randint(0,len(encoderTestStrings) - 1)
-        print encoderTestStrings[num]
+        print (encoderTestStrings[num])
         inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength);
         feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)}
         feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)})
         feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)})
         feedDict.update({feedPrevious: True})
         ids = (sess.run(decoderPrediction, feed_dict=feedDict))
-        print idsToSentence(ids, wordList)
+        print (idsToSentence(ids, wordList))
 
     if (i % 10000 == 0 and i != 0):
         savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i)
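
The loop above checkpoints the session to models/pretrained_seq2seq.ckpt every 10,000 iterations through tf.train.Saver. A minimal sketch of restoring the most recent checkpoint into a fresh session for inference, assuming the same graph has already been rebuilt (the exact step suffix on the checkpoint file will vary):

    # Rebuild the graph exactly as above before restoring, then:
    saver = tf.train.Saver()
    sess = tf.Session()
    latest = tf.train.latest_checkpoint('models')  # e.g. models/pretrained_seq2seq.ckpt-10000
    saver.restore(sess, latest)
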
(next file: the Word2Vec preprocessing script)
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
 import numpy as np
 import re
 from collections import Counter
@@ -25,8 +26,9 @@ numIterations = 100000
 # into one huge string, and then uses a Counter to identify words
 # and the number of occurrences
 def processDataset(filename):
-    openedFile = open(filename, 'r')
+    openedFile = open(filename, 'r', encoding='UTF8')
     allLines = openedFile.readlines()
+    print(allLines)
     myStr = ""
     for line in allLines:
         myStr += line
@@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus):
     yTrain=[]
     for i in range(numTotalWords):
         if i % 100000 == 0:
-            print 'Finished %d/%d total words' % (i, numTotalWords)
+            print ('Finished %d/%d total words' % (i, numTotalWords))
         wordsAfter = allWords[i + 1:i + windowSize + 1]
         wordsBefore = allWords[max(0, i - windowSize):i]
         wordsAdded = wordsAfter + wordsBefore
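
The two slices above collect the context window around position i: up to windowSize words after it and up to windowSize words before it. A small worked example of that windowing, using windowSize = 2 purely for illustration (the script defines its own value near the top):

    allWords = ['how', 'are', 'you', 'doing', 'today']
    windowSize = 2
    i = 1                                                 # centre word: 'are'
    wordsAfter = allWords[i + 1:i + windowSize + 1]       # ['you', 'doing']
    wordsBefore = allWords[max(0, i - windowSize):i]      # ['how']
    wordsAdded = wordsAfter + wordsBefore                 # ['you', 'doing', 'how']
    print(wordsAdded)
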
@@ -61,61 +63,22 @@ continueWord2Vec = True
 if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')):
     xTrain = np.load('Word2VecXTrain.npy')
     yTrain = np.load('Word2VecYTrain.npy')
-    print 'Finished loading training matrices'
+    print ('Finished loading training matrices')
     with open("wordList.txt", "rb") as fp:
         wordList = pickle.load(fp)
-    print 'Finished loading word list'
+    print ('Finished loading word list')
 
 else:
-    fullCorpus, datasetDictionary = processDataset('conversationData.txt')
+    fullCorpus, datasetDictionary = processDataset(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt')
-    print 'Finished parsing and cleaning dataset'
+    print ('Finished parsing and cleaning dataset')
     wordList = list(datasetDictionary.keys())
-    createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?')
+    createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?')
     if (createOwnVectors == 'y'):
         xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus)
-        print 'Finished creating training matrices'
+        print ('Finished creating training matrices')
         np.save('Word2VecXTrain.npy', xTrain)
         np.save('Word2VecYTrain.npy', yTrain)
     else:
         continueWord2Vec = False
     with open("wordList.txt", "wb") as fp:
         pickle.dump(wordList, fp)
-
-# If you do not want to create your own word vectors and you'd just like to
-# have Tensorflow's seq2seq take care of that, then you don't need to run
-# anything below this line.
-if (continueWord2Vec == False):
-    sys.exit()
-
-numTrainingExamples = len(xTrain)
-vocabSize = len(wordList)
-
-sess = tf.Session()
-embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0))
-nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions)))
-nceBiases = tf.Variable(tf.zeros([vocabSize]))
-
-inputs = tf.placeholder(tf.int32, shape=[batchSize])
-outputs = tf.placeholder(tf.int32, shape=[batchSize, 1])
-
-embed = tf.nn.embedding_lookup(embeddingMatrix, inputs)
-
-loss = tf.reduce_mean(
-    tf.nn.nce_loss(weights=nceWeights,
-        biases=nceBiases,
-        labels=outputs,
-        inputs=embed,
-        num_sampled=numNegativeSample,
-        num_classes=vocabSize))
-
-optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
-
-sess.run(tf.global_variables_initializer())
-for i in range(numIterations):
-    trainInputs, trainLabels = getTrainingBatch()
-    _, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels})
-    if (i % 10000 == 0):
-        print ('Current loss is:', curLoss)
-print 'Saving the word embedding matrix'
-embedMatrix = embeddingMatrix.eval(session=sess)
-np.save('embeddingMatrix.npy', embedMatrix)
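
The block removed above was the optional Word2Vec pre-training step: it learned an embedding matrix with tf.nn.nce_loss and saved it as embeddingMatrix.npy. For anyone who keeps that block (it relies on the TF 1.x graph-mode APIs that the compat.v1 import still provides), a minimal sketch of reading the saved matrix back and looking up one word's vector through the pickled word list, assuming both files exist from an earlier run:

    import pickle
    import numpy as np

    embedMatrix = np.load('embeddingMatrix.npy')    # shape: (vocabSize, wordVecDimensions)
    with open('wordList.txt', 'rb') as fp:
        wordList = pickle.load(fp)

    word = wordList[0]                              # any word seen in the corpus
    vector = embedMatrix[wordList.index(word)]      # its learned embedding
    print(word, vector.shape)
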
(next file: a stack-trace log added with this commit)
+Stack trace:
+Frame        Function    Args
+00600000010  001800617BE (00180251890, 0018023DFD1, 00000000058, 000FFFFB770)
+00600000010  001800490FA (00000000000, 00100000000, 00000000000, 00000000001)
+00600000010  00180049132 (00000000000, 00000000000, 00000000058, 0018031E960)
+00600000010  0018006D9C9 (0000000000A, 000FFFFC940, 001800458BF, 00000000000)
+00600000010  0018006DB92 (00000000003, 000FFFFC940, 001800458BF, 000FFFFC940)
+00600000010  0018006EA4C (000FFFFC940, 001802405E5, 001800EAF57, 0000000000D)
+00600000010  001800596A6 (000FFFF0000, 00000000000, 00000000000, E2DE0F8BFFFFFFFF)
+00600000010  0018005A9C5 (00000000002, 0018031E270, 001800BE5F9, 00600040000)
+00600000010  0018005AE89 (001800C7664, 00000000000, 00000000000, 00000000000)
+000FFFFCCE0  0018005B149 (000FFFFCE00, 00000000000, 00000000030, 0000000002F)
+000FFFFCCE0  00180049877 (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  001800482C6 (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  00180048374 (00000000000, 00000000000, 00000000000, 00000000000)
+End of stack trace
(next file: the KakaoTalk conversation parsing script, added as a new file)

+import pandas as pd
+import numpy as np
+import os
+import re
+from datetime import datetime
+
+def getKakaotalkbookData():
+    # Parses lines shaped like "[ ... ] Name: message" from the fbMessages.txt export.
+    print("function active")
+    personName = input('Enter your full kakao name: ')
+    personName = personName.rstrip('\r')
+    responseDictionary = dict()
+    with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\fbMessages.txt', 'r') as fbFile:
+        allLines = fbFile.readlines()
+
+    myMessage, otherPersonsMessage, currentSpeaker = "","",""
+
+    for index,lines in enumerate(allLines):
+        rightBracket = lines.find(']') + 2
+        justMessage = lines[rightBracket:]
+        colon = justMessage.find(':')
+        # Find messages that I sent
+        print("*input*")
+        print(personName, len(personName))
+        print("*file*")
+        print(justMessage[:colon-1], len(justMessage[:colon-1]))
+        print("*same?*")
+        print(justMessage[:colon-1] == personName)
+        print(justMessage[:colon] == "Second User")
+        print(personName == "Second User")
+        print("------------------------------------------")
+
+        if (justMessage[:colon-1] == personName):
+            print('a')
+            if not myMessage:
+                # Want to find the first message that I send (if I send multiple
+                # in a row)
+                startMessageIndex = index - 1
+            myMessage += justMessage[colon + 2:]
+
+        elif myMessage:
+            # Now go and see what message the other person sent by looking at
+            # previous messages
+            for counter in range(startMessageIndex, 0, -1):
+                currentLine = allLines[counter]
+                rightBracket = currentLine.find(']') + 2
+                justMessage = currentLine[rightBracket:]
+                colon = justMessage.find(':')
+                if not currentSpeaker:
+                    # The first speaker not named me
+                    currentSpeaker = justMessage[:colon]
+                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
+                    # A different person started speaking, so now I know that the
+                    # first person's message is done
+                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
+                    myMessage = cleanMessage(myMessage)
+                    responseDictionary[otherPersonsMessage] = myMessage
+                    break
+                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
+            myMessage, otherPersonsMessage, currentSpeaker = "","",""
+    return responseDictionary
+
+
+def getFacebookData():
+    # Despite its name, this parses the KakaoTalk export opened below; the
+    # find(':') + 5 offset skips past the timestamp so justMessage starts at
+    # "Name : message".
+    print("function active")
+    personName = input('Enter your full Kakaotalk name: ')
+    personName = personName.rstrip('\r')
+    responseDictionary = dict()
+    with open(r'C:\Users\dlgpd\Downloads\kakao\Talk_2020.6.21 20_35-1.txt', 'r', encoding='UTF8') as fbFile:
+        allLines = fbFile.readlines()
+
+    myMessage, otherPersonsMessage, currentSpeaker = "","",""
+
+    for index,lines in enumerate(allLines):
+        rightBracket = lines.find(':') + 5
+        justMessage = lines[rightBracket:]
+        colon = justMessage.find(':')
+        # Find messages that I sent
+        print("*input*")
+        print(type(personName))
+        print(personName, len(personName))
+        print("*file*")
+        print(justMessage[:colon-1], len(justMessage[:colon-1]))
+        print("*same?*")
+        print(justMessage[:colon-1] == personName)
+        print(justMessage[:colon] == "Second User")
+        print(personName == "Second User")
+
+        if (justMessage[:colon-1] == personName):
+            print('a')
+            if not myMessage:
+                # Want to find the first message that I send (if I send multiple
+                # in a row)
+                startMessageIndex = index - 1
+            myMessage += justMessage[colon + 2:]
+
+        elif myMessage:
+            # Now go and see what message the other person sent by looking at
+            # previous messages
+            for counter in range(startMessageIndex, 0, -1):
+                currentLine = allLines[counter]
+                rightBracket = currentLine.find(':') + 5
+                justMessage = currentLine[rightBracket:]
+                colon = justMessage.find(':')
+                if not currentSpeaker:
+                    # The first speaker not named me
+                    currentSpeaker = justMessage[:colon]
+                elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage):
+                    # A different person started speaking, so now I know that the
+                    # first person's message is done
+                    otherPersonsMessage = cleanMessage(otherPersonsMessage)
+                    myMessage = cleanMessage(myMessage)
+                    responseDictionary[otherPersonsMessage] = myMessage
+                    break
+                otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage
+            myMessage, otherPersonsMessage, currentSpeaker = "","",""
+        print("------------------------------------------")
+
+    return responseDictionary
+
+
+
+def cleanMessage(message):
+    # Remove new lines within message
+    cleanedMessage = message.replace('\n',' ').lower()
+    # Deal with some weird tokens (non-breaking spaces)
+    cleanedMessage = cleanedMessage.replace('\xa0', '')
+    # Remove punctuation
+    cleanedMessage = re.sub('([.,!?])','', cleanedMessage)
+    # Remove multiple spaces in message
+    cleanedMessage = re.sub(' +',' ', cleanedMessage)
+    return cleanedMessage
+
+combinedDictionary = {}
+
+combinedDictionary.update(getFacebookData())
+
+print(combinedDictionary)
+print ('Total len of dictionary', len(combinedDictionary))
+
+print('Saving conversation data dictionary')
+np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationDictionary.npy', combinedDictionary)
+
+conversationFile = open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt', 'w', encoding='UTF8')
+for key, value in combinedDictionary.items():
+    if (not key.strip() or not value.strip()):
+        # If there are empty strings
+        continue
+    print(key.strip() + value.strip())
+    conversationFile.write(key.strip() + value.strip())
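
The parser saves its question-to-answer dictionary as conversationDictionary.npy, which is exactly the file the Seq2Seq script's createTrainingMatrices() reads back. A minimal sketch of inspecting the saved file, shown here with the bare file name rather than the absolute path used above:

    import numpy as np

    # allow_pickle is required because np.save stored a Python dict in a 0-d object array
    conversationDictionary = np.load('conversationDictionary.npy', allow_pickle=True).item()
    print('Number of question/answer pairs:', len(conversationDictionary))
    for question, answer in list(conversationDictionary.items())[:3]:
        print(repr(question), '->', repr(answer))
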
(next file: a short script that prints the pickled wordList.txt)

+import pickle
+
+with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\wordList.txt','rb') as f:
+    data = pickle.load(f)
+    print(data)  # hello
(one file's diff is collapsed and not shown here)
(next file: a one-line text data file)

-456 78
\ No newline at end of file
+67126 78this is a testyes it is the last message hello45x y zx y z
\ No newline at end of file
(a binary file was also changed; no text preview is available)