Showing 19 changed files with 195 additions and 58 deletions
1 | -import tensorflow as tf | 1 | +import tensorflow.compat.v1 as tf |
2 | +tf.disable_v2_behavior() | ||
2 | import numpy as np | 3 | import numpy as np |
3 | import sys | 4 | import sys |
4 | from random import randint | 5 | from random import randint |
... | @@ -10,11 +11,11 @@ import os | ... | @@ -10,11 +11,11 @@ import os |
10 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' | 11 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' |
11 | 12 | ||
12 | def createTrainingMatrices(conversationFileName, wList, maxLen): | 13 | def createTrainingMatrices(conversationFileName, wList, maxLen): |
13 | - conversationDictionary = np.load(conversationFileName).item() | 14 | + conversationDictionary = np.load(conversationFileName, allow_pickle=True).item() |
14 | numExamples = len(conversationDictionary) | 15 | numExamples = len(conversationDictionary) |
15 | xTrain = np.zeros((numExamples, maxLen), dtype='int32') | 16 | xTrain = np.zeros((numExamples, maxLen), dtype='int32') |
16 | yTrain = np.zeros((numExamples, maxLen), dtype='int32') | 17 | yTrain = np.zeros((numExamples, maxLen), dtype='int32') |
17 | - for index,(key,value) in enumerate(conversationDictionary.iteritems()): | 18 | + for index,(key,value) in enumerate(conversationDictionary.items()): |
18 | # Will store integerized representation of strings here (initialized as padding) | 19 | # Will store integerized representation of strings here (initialized as padding) |
19 | encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32') | 20 | encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32') |
20 | decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32') | 21 | decoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32') |
... | @@ -165,15 +166,15 @@ vocabSize = vocabSize + 2 | ... | @@ -165,15 +166,15 @@ vocabSize = vocabSize + 2 |
165 | if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')): | 166 | if (os.path.isfile('Seq2SeqXTrain.npy') and os.path.isfile('Seq2SeqYTrain.npy')): |
166 | xTrain = np.load('Seq2SeqXTrain.npy') | 167 | xTrain = np.load('Seq2SeqXTrain.npy') |
167 | yTrain = np.load('Seq2SeqYTrain.npy') | 168 | yTrain = np.load('Seq2SeqYTrain.npy') |
168 | - print 'Finished loading training matrices' | 169 | + print ('Finished loading training matrices') |
169 | numTrainingExamples = xTrain.shape[0] | 170 | numTrainingExamples = xTrain.shape[0] |
170 | else: | 171 | else: |
171 | numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength) | 172 | numTrainingExamples, xTrain, yTrain = createTrainingMatrices('conversationDictionary.npy', wordList, maxEncoderLength) |
172 | np.save('Seq2SeqXTrain.npy', xTrain) | 173 | np.save('Seq2SeqXTrain.npy', xTrain) |
173 | np.save('Seq2SeqYTrain.npy', yTrain) | 174 | np.save('Seq2SeqYTrain.npy', yTrain) |
174 | - print 'Finished creating training matrices' | 175 | + print ('Finished creating training matrices') |
175 | 176 | ||
176 | -tf.reset_default_graph() | 177 | +tf.compat.v1.reset_default_graph() |
177 | 178 | ||
178 | # Create the placeholders | 179 | # Create the placeholders |
179 | encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)] | 180 | encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)] |
... | @@ -186,7 +187,7 @@ encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True) | ... | @@ -186,7 +187,7 @@ encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True) |
186 | #encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True) | 187 | #encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True) |
187 | # Architectural choice of whether or not to include ^ | 188 | # Architectural choice of whether or not to include ^ |
188 | 189 | ||
189 | -decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM, | 190 | +decoderOutputs, decoderFinalState = tf.sparsemax.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM, |
190 | vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious) | 191 | vocabSize, vocabSize, embeddingDim, feed_previous=feedPrevious) |
191 | 192 | ||
192 | decoderPrediction = tf.argmax(decoderOutputs, 2) | 193 | decoderPrediction = tf.argmax(decoderOutputs, 2) |
... | @@ -233,14 +234,14 @@ for i in range(numIterations): | ... | @@ -233,14 +234,14 @@ for i in range(numIterations): |
233 | writer.add_summary(summary, i) | 234 | writer.add_summary(summary, i) |
234 | if (i % 25 == 0 and i != 0): | 235 | if (i % 25 == 0 and i != 0): |
235 | num = randint(0,len(encoderTestStrings) - 1) | 236 | num = randint(0,len(encoderTestStrings) - 1) |
236 | - print encoderTestStrings[num] | 237 | + print (encoderTestStrings[num]) |
237 | inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength); | 238 | inputVector = getTestInput(encoderTestStrings[num], wordList, maxEncoderLength); |
238 | feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)} | 239 | feedDict = {encoderInputs[t]: inputVector[t] for t in range(maxEncoderLength)} |
239 | feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)}) | 240 | feedDict.update({decoderLabels[t]: zeroVector for t in range(maxDecoderLength)}) |
240 | feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)}) | 241 | feedDict.update({decoderInputs[t]: zeroVector for t in range(maxDecoderLength)}) |
241 | feedDict.update({feedPrevious: True}) | 242 | feedDict.update({feedPrevious: True}) |
242 | ids = (sess.run(decoderPrediction, feed_dict=feedDict)) | 243 | ids = (sess.run(decoderPrediction, feed_dict=feedDict)) |
243 | - print idsToSentence(ids, wordList) | 244 | + print (idsToSentence(ids, wordList)) |
244 | 245 | ||
245 | if (i % 10000 == 0 and i != 0): | 246 | if (i % 10000 == 0 and i != 0): |
246 | savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i) | 247 | savePath = saver.save(sess, "models/pretrained_seq2seq.ckpt", global_step=i) |
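The edits above follow TensorFlow's documented 1.x-to-2.x compatibility path: the script imports the tensorflow.compat.v1 namespace and disables 2.x behavior so graph-mode constructs such as tf.placeholder and tf.Session keep working under a TensorFlow 2.x install. A minimal sketch of that shim, assuming TensorFlow 2.x is installed (the placeholder and variable names are illustrative, not taken from the script):

    # Minimal sketch of the compat.v1 shim used in the diff above (assumes TF 2.x is installed).
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()  # switch off eager execution and other 2.x defaults

    x = tf.placeholder(tf.int32, shape=(None,))  # graph-mode placeholder still works under the shim
    doubled = x * 2

    with tf.Session() as sess:
        print(sess.run(doubled, feed_dict={x: [1, 2, 3]}))  # prints [2 4 6]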
1 | -import tensorflow as tf | 1 | +import tensorflow.compat.v1 as tf |
2 | +tf.disable_v2_behavior() | ||
2 | import numpy as np | 3 | import numpy as np |
3 | import re | 4 | import re |
4 | from collections import Counter | 5 | from collections import Counter |
... | @@ -25,8 +26,9 @@ numIterations = 100000 | ... | @@ -25,8 +26,9 @@ numIterations = 100000 |
25 | # into one huge string, and then uses a Counter to identify words | 26 | # into one huge string, and then uses a Counter to identify words |
26 | # and the number of occurrences | 27 | # and the number of occurrences |
27 | def processDataset(filename): | 28 | def processDataset(filename): |
28 | - openedFile = open(filename, 'r') | 29 | + openedFile = open(filename, 'r', encoding='UTF8') |
29 | allLines = openedFile.readlines() | 30 | allLines = openedFile.readlines() |
31 | + print(allLines) | ||
30 | myStr = "" | 32 | myStr = "" |
31 | for line in allLines: | 33 | for line in allLines: |
32 | myStr += line | 34 | myStr += line |
... | @@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus): | ... | @@ -41,7 +43,7 @@ def createTrainingMatrices(dictionary, corpus): |
41 | yTrain=[] | 43 | yTrain=[] |
42 | for i in range(numTotalWords): | 44 | for i in range(numTotalWords): |
43 | if i % 100000 == 0: | 45 | if i % 100000 == 0: |
44 | - print 'Finished %d/%d total words' % (i, numTotalWords) | 46 | + print ('Finished %d/%d total words' % (i, numTotalWords)) |
45 | wordsAfter = allWords[i + 1:i + windowSize + 1] | 47 | wordsAfter = allWords[i + 1:i + windowSize + 1] |
46 | wordsBefore = allWords[max(0, i - windowSize):i] | 48 | wordsBefore = allWords[max(0, i - windowSize):i] |
47 | wordsAdded = wordsAfter + wordsBefore | 49 | wordsAdded = wordsAfter + wordsBefore |
... | @@ -61,61 +63,22 @@ continueWord2Vec = True | ... | @@ -61,61 +63,22 @@ continueWord2Vec = True |
61 | if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')): | 63 | if (os.path.isfile('Word2VecXTrain.npy') and os.path.isfile('Word2VecYTrain.npy') and os.path.isfile('wordList.txt')): |
62 | xTrain = np.load('Word2VecXTrain.npy') | 64 | xTrain = np.load('Word2VecXTrain.npy') |
63 | yTrain = np.load('Word2VecYTrain.npy') | 65 | yTrain = np.load('Word2VecYTrain.npy') |
64 | - print 'Finished loading training matrices' | 66 | + print ('Finished loading training matrices') |
65 | with open("wordList.txt", "rb") as fp: | 67 | with open("wordList.txt", "rb") as fp: |
66 | wordList = pickle.load(fp) | 68 | wordList = pickle.load(fp) |
67 | - print 'Finished loading word list' | 69 | + print ('Finished loading word list') |
68 | 70 | ||
69 | else: | 71 | else: |
70 | - fullCorpus, datasetDictionary = processDataset('conversationData.txt') | 72 | + fullCorpus, datasetDictionary = processDataset(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt') |
71 | - print 'Finished parsing and cleaning dataset' | 73 | + print ('Finished parsing and cleaning dataset') |
72 | wordList = list(datasetDictionary.keys()) | 74 | wordList = list(datasetDictionary.keys()) |
73 | - createOwnVectors = raw_input('Do you want to create your own vectors through Word2Vec (y/n)?') | 75 | + createOwnVectors = input('Do you want to create your own vectors through Word2Vec (y/n)?') |
74 | if (createOwnVectors == 'y'): | 76 | if (createOwnVectors == 'y'): |
75 | xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus) | 77 | xTrain, yTrain = createTrainingMatrices(datasetDictionary, fullCorpus) |
76 | - print 'Finished creating training matrices' | 78 | + print ('Finished creating training matrices') |
77 | np.save('Word2VecXTrain.npy', xTrain) | 79 | np.save('Word2VecXTrain.npy', xTrain) |
78 | np.save('Word2VecYTrain.npy', yTrain) | 80 | np.save('Word2VecYTrain.npy', yTrain) |
79 | else: | 81 | else: |
80 | continueWord2Vec = False | 82 | continueWord2Vec = False |
81 | with open("wordList.txt", "wb") as fp: | 83 | with open("wordList.txt", "wb") as fp: |
82 | pickle.dump(wordList, fp) | 84 | pickle.dump(wordList, fp) |
83 | - | ||
84 | -# If you do not want to create your own word vectors and you'd just like to | ||
85 | -# have Tensorflow's seq2seq take care of that, then you don't need to run | ||
86 | -# anything below this line. | ||
87 | -if (continueWord2Vec == False): | ||
88 | - sys.exit() | ||
89 | - | ||
90 | -numTrainingExamples = len(xTrain) | ||
91 | -vocabSize = len(wordList) | ||
92 | - | ||
93 | -sess = tf.Session() | ||
94 | -embeddingMatrix = tf.Variable(tf.random_uniform([vocabSize, wordVecDimensions], -1.0, 1.0)) | ||
95 | -nceWeights = tf.Variable(tf.truncated_normal([vocabSize, wordVecDimensions], stddev=1.0 / math.sqrt(wordVecDimensions))) | ||
96 | -nceBiases = tf.Variable(tf.zeros([vocabSize])) | ||
97 | - | ||
98 | -inputs = tf.placeholder(tf.int32, shape=[batchSize]) | ||
99 | -outputs = tf.placeholder(tf.int32, shape=[batchSize, 1]) | ||
100 | - | ||
101 | -embed = tf.nn.embedding_lookup(embeddingMatrix, inputs) | ||
102 | - | ||
103 | -loss = tf.reduce_mean( | ||
104 | - tf.nn.nce_loss(weights=nceWeights, | ||
105 | - biases=nceBiases, | ||
106 | - labels=outputs, | ||
107 | - inputs=embed, | ||
108 | - num_sampled=numNegativeSample, | ||
109 | - num_classes=vocabSize)) | ||
110 | - | ||
111 | -optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss) | ||
112 | - | ||
113 | -sess.run(tf.global_variables_initializer()) | ||
114 | -for i in range(numIterations): | ||
115 | - trainInputs, trainLabels = getTrainingBatch() | ||
116 | - _, curLoss = sess.run([optimizer, loss], feed_dict={inputs: trainInputs, outputs: trainLabels}) | ||
117 | - if (i % 10000 == 0): | ||
118 | - print ('Current loss is:', curLoss) | ||
119 | -print 'Saving the word embedding matrix' | ||
120 | -embedMatrix = embeddingMatrix.eval(session=sess) | ||
121 | -np.save('embeddingMatrix.npy', embedMatrix) | ||
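The word2vec reader above now opens the conversation file with an explicit UTF-8 encoding, which is what lets the non-ASCII KakaoTalk export be read under Python 3. A small sketch of the same idea, assuming a UTF-8 text file exists (the file name sample.txt is hypothetical):

    # Sketch: read a UTF-8 file and count word occurrences with a Counter,
    # mirroring what processDataset does on the full corpus. The file name is hypothetical.
    import re
    from collections import Counter

    with open('sample.txt', 'r', encoding='UTF8') as f:
        text = f.read().lower()

    words = re.sub('([.,!?])', '', text).split()
    word_counts = Counter(words)
    print(word_counts.most_common(5))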
chat_system/bash.exe.stackdump
0 → 100644
1 | +Stack trace: | ||
2 | +Frame Function Args | ||
3 | +00600000010 001800617BE (00180251890, 0018023DFD1, 00000000058, 000FFFFB770) | ||
4 | +00600000010 001800490FA (00000000000, 00100000000, 00000000000, 00000000001) | ||
5 | +00600000010 00180049132 (00000000000, 00000000000, 00000000058, 0018031E960) | ||
6 | +00600000010 0018006D9C9 (0000000000A, 000FFFFC940, 001800458BF, 00000000000) | ||
7 | +00600000010 0018006DB92 (00000000003, 000FFFFC940, 001800458BF, 000FFFFC940) | ||
8 | +00600000010 0018006EA4C (000FFFFC940, 001802405E5, 001800EAF57, 0000000000D) | ||
9 | +00600000010 001800596A6 (000FFFF0000, 00000000000, 00000000000, E2DE0F8BFFFFFFFF) | ||
10 | +00600000010 0018005A9C5 (00000000002, 0018031E270, 001800BE5F9, 00600040000) | ||
11 | +00600000010 0018005AE89 (001800C7664, 00000000000, 00000000000, 00000000000) | ||
12 | +000FFFFCCE0 0018005B149 (000FFFFCE00, 00000000000, 00000000030, 0000000002F) | ||
13 | +000FFFFCCE0 00180049877 (00000000000, 00000000000, 00000000000, 00000000000) | ||
14 | +000FFFFFFF0 001800482C6 (00000000000, 00000000000, 00000000000, 00000000000) | ||
15 | +000FFFFFFF0 00180048374 (00000000000, 00000000000, 00000000000, 00000000000) | ||
16 | +End of stack trace |
chat_system/createDataset.py
0 → 100644
1 | +import pandas as pd | ||
2 | +import numpy as np | ||
3 | +import os | ||
4 | +import re | ||
5 | +from datetime import datetime | ||
6 | + | ||
7 | +def getKakaotalkbookData(): | ||
8 | + print("function active") | ||
9 | + personName = raw_input('Enter your full kakao name: ') | ||
10 | + personName = personName.rstrip('\r') | ||
11 | + responseDictionary = dict() | ||
12 | + with open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\fbMessages.txt', 'r') as fbFile: | ||
13 | + allLines = fbFile.readlines() | ||
14 | + | ||
15 | + myMessage, otherPersonsMessage, currentSpeaker = "","","" | ||
16 | + | ||
17 | + for index,lines in enumerate(allLines): | ||
18 | + rightBracket = lines.find(']') + 2 | ||
19 | + justMessage = lines[rightBracket:] | ||
20 | + colon = justMessage.find(':') | ||
21 | + # Find messages that I sent | ||
22 | + print("*input*") | ||
23 | + print(personName, len(personName)) | ||
24 | + print("*file*") | ||
25 | + print(justMessage[:colon-1], len(personName)) | ||
26 | + print("*same?*") | ||
27 | + print(justMessage[:colon-1] == personName) | ||
28 | + print(justMessage[:colon] == "Second User") | ||
29 | + print(personName == "Second User") | ||
30 | + print("------------------------------------------") | ||
31 | + | ||
32 | + if (justMessage[:colon-1] == personName): | ||
33 | + print('a') | ||
34 | + if not myMessage: | ||
35 | + # Want to find the first message that I send (if I send multiple | ||
36 | + # in a row) | ||
37 | + startMessageIndex = index - 1 | ||
38 | + myMessage += justMessage[colon + 2:] | ||
39 | + | ||
40 | + elif myMessage: | ||
41 | + # Now go and see what message the other person sent by looking at | ||
42 | + # previous messages | ||
43 | + for counter in range(startMessageIndex, 0, -1): | ||
44 | + currentLine = allLines[counter] | ||
45 | + rightBracket = currentLine.find(']') + 2 | ||
46 | + justMessage = currentLine[rightBracket:] | ||
47 | + colon = justMessage.find(':') | ||
48 | + if not currentSpeaker: | ||
49 | + # The first speaker not named me | ||
50 | + currentSpeaker = justMessage[:colon] | ||
51 | + elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage): | ||
52 | + # A different person started speaking, so now I know that the | ||
53 | + # first person's message is done | ||
54 | + otherPersonsMessage = cleanMessage(otherPersonsMessage) | ||
55 | + myMessage = cleanMessage(myMessage) | ||
56 | + responseDictionary[otherPersonsMessage] = myMessage | ||
57 | + break | ||
58 | + otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage | ||
59 | + myMessage, otherPersonsMessage, currentSpeaker = "","","" | ||
60 | + return responseDictionary | ||
61 | + | ||
62 | + | ||
63 | +def getFacebookData(): | ||
64 | + print("function active") | ||
65 | + personName = input('Enter your full Kakaotalk name: ') | ||
66 | + personName = personName.rstrip('\r') | ||
67 | + responseDictionary = dict() | ||
68 | + with open(r'C:\Users\dlgpd\Downloads\kakao\Talk_2020.6.21 20_35-1.txt', 'r', encoding='UTF8') as fbFile: | ||
69 | + allLines = fbFile.readlines() | ||
70 | + | ||
71 | + myMessage, otherPersonsMessage, currentSpeaker = "","","" | ||
72 | + | ||
73 | + for index,lines in enumerate(allLines): | ||
74 | + rightBracket = lines.find(':') + 5 | ||
75 | + justMessage = lines[rightBracket:] | ||
76 | + colon = justMessage.find(':') | ||
77 | + # Find messages that I sent | ||
78 | + print("*input*") | ||
79 | + | ||
80 | + print(type(personName)) | ||
81 | + print(personName, len(personName)) | ||
82 | + print("*file*") | ||
83 | + print(justMessage[:colon-1], len(personName)) | ||
84 | + print("*same?*") | ||
85 | + print(justMessage[:colon-1] == personName) | ||
86 | + print(justMessage[:colon] == "Second User") | ||
87 | + print(personName == "Second User") | ||
88 | + | ||
89 | + | ||
90 | + | ||
91 | + if (justMessage[:colon-1] == personName): | ||
92 | + print('a') | ||
93 | + if not myMessage: | ||
94 | + # Want to find the first message that I send (if I send multiple | ||
95 | + # in a row) | ||
96 | + startMessageIndex = index - 1 | ||
97 | + myMessage += justMessage[colon + 2:] | ||
98 | + | ||
99 | + elif myMessage: | ||
100 | + # Now go and see what message the other person sent by looking at | ||
101 | + # previous messages | ||
102 | + for counter in range(startMessageIndex, 0, -1): | ||
103 | + currentLine = allLines[counter] | ||
104 | + rightBracket = currentLine.find(':') + 5 | ||
105 | + justMessage = currentLine[rightBracket:] | ||
106 | + colon = justMessage.find(':') | ||
107 | + if not currentSpeaker: | ||
108 | + # The first speaker not named me | ||
109 | + currentSpeaker = justMessage[:colon] | ||
110 | + elif (currentSpeaker != justMessage[:colon] and otherPersonsMessage): | ||
111 | + # A different person started speaking, so now I know that the | ||
112 | + # first person's message is done | ||
113 | + otherPersonsMessage = cleanMessage(otherPersonsMessage) | ||
114 | + myMessage = cleanMessage(myMessage) | ||
115 | + responseDictionary[otherPersonsMessage] = myMessage | ||
116 | + break | ||
117 | + otherPersonsMessage = justMessage[colon + 2:] + otherPersonsMessage | ||
118 | + myMessage, otherPersonsMessage, currentSpeaker = "","","" | ||
119 | + print("------------------------------------------") | ||
120 | + | ||
121 | + return responseDictionary | ||
122 | + | ||
123 | + | ||
124 | + | ||
125 | +def cleanMessage(message): | ||
126 | + # Remove new lines within message | ||
127 | + cleanedMessage = message.replace('\n',' ').lower() | ||
128 | + # Deal with some weird tokens | ||
129 | + cleanedMessage = cleanedMessage.replace("\xc2\xa0", "") | ||
130 | + # Remove punctuation | ||
131 | + cleanedMessage = re.sub('([.,!?])','', cleanedMessage) | ||
132 | + # Remove multiple spaces in message | ||
133 | + cleanedMessage = re.sub(' +',' ', cleanedMessage) | ||
134 | + return cleanedMessage | ||
135 | + | ||
136 | +combinedDictionary = {} | ||
137 | + | ||
138 | +combinedDictionary.update(getFacebookData()) | ||
139 | + | ||
140 | +print(combinedDictionary) | ||
141 | +print ('Total len of dictionary', len(combinedDictionary)) | ||
142 | + | ||
143 | +print('Saving conversation data dictionary') | ||
144 | +np.save(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationDictionary.npy', combinedDictionary) | ||
145 | + | ||
146 | +conversationFile = open(r'C:\Users\dlgpd\Desktop\20-1\oss\term-project\Learn_for_yourself\chat_system\conversationData.txt', 'w', encoding='UTF8') | ||
147 | +for key, value in combinedDictionary.items(): | ||
148 | + if (not key.strip() or not value.strip()): | ||
149 | + # If there are empty strings | ||
150 | + continue | ||
151 | + print(key.strip() + value.strip()) | ||
152 | + conversationFile.write(key.strip() + value.strip()) |
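createDataset.py saves the response dictionary as a pickled object inside a .npy file, and the Seq2Seq script in this change reads it back with allow_pickle=True followed by .item(). A minimal round-trip sketch of that save/load pair (the path and dictionary contents here are illustrative):

    # Round-trip sketch: np.save pickles a plain dict into a .npy file, and
    # np.load(..., allow_pickle=True).item() recovers it. Contents are illustrative.
    import numpy as np

    response_dictionary = {'how are you': 'im good', 'what time is it': 'no idea'}
    np.save('conversationDictionary.npy', response_dictionary)

    loaded = np.load('conversationDictionary.npy', allow_pickle=True).item()
    print(loaded['how are you'])  # prints: im good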
chat_system/test.py
0 → 100644
make_your_own_chatbot/dict.npy
0 → 100644