이현규

Create model generator

@@ -58,12 +58,12 @@ def format_prediction(video_ids, predictions, top_k, whitelisted_cls_mask=None):
                     "\n").encode("utf8")
-def inference_pb(filename):
+def inference_pb(file_path, model_path):
   with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
     # 200527 Esot3riA
     # 0. Import SequenceExample type target from pb.
-    target_video = pbutil.convert_pb(filename)
+    target_video = pbutil.convert_pb(file_path)
     # 1. Load video features from pb.
     video_id_batch_val = np.array([b'video'])
@@ -83,18 +83,15 @@ def inference_pb(filename):
     # 200527 Esot3riA End
     # Restore checkpoint and meta-graph file
-    checkpoint_file = '/Users/esot3ria/PycharmProjects/yt8m/models/frame' \
-                      '/sample_model/inference_model/segment_inference_model'
-    if not gfile.Exists(checkpoint_file + ".meta"):
-      raise IOError("Cannot find %s. Did you run eval.py?" % checkpoint_file)
-    meta_graph_location = checkpoint_file + ".meta"
+    if not gfile.Exists(model_path + ".meta"):
+      raise IOError("Cannot find %s. Did you run eval.py?" % model_path)
+    meta_graph_location = model_path + ".meta"
     logging.info("loading meta-graph: " + meta_graph_location)
     with tf.device("/cpu:0"):
-      saver = tf.train.import_meta_graph(meta_graph_location,
-                                         clear_devices=True)
-    logging.info("restoring variables from " + checkpoint_file)
-    saver.restore(sess, checkpoint_file)
+      saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True)
+    logging.info("restoring variables from " + model_path)
+    saver.restore(sess, model_path)
     input_tensor = tf.get_collection("input_batch_raw")[0]
     num_frames_tensor = tf.get_collection("num_frames")[0]
     predictions_tensor = tf.get_collection("predictions")[0]
@@ -150,10 +147,18 @@ def inference_pb(filename):
     logging.info("profit :D")
+    # result = format_prediction(video_id_batch_val, predictions_val, 10, whitelisted_cls_mask)
+    # Return values:
+    # 1. A list of tags (5) plus each tag's similarity score (dict format).
+    # 2. Links to related videos => find related videos in the model, take a
+    #    user-input threshold (20%~80%), and output the top 5 related videos
+    #    with their relevance scores.
 if __name__ == '__main__':
   logging.set_verbosity(tf.logging.INFO)
-  filename = 'features.pb'
-  inference_pb(filename)
+  file_path = 'features.pb'
+  model_path = '/Users/esot3ria/PycharmProjects/yt8m/models/frame' \
+               '/sample_model/inference_model/segment_inference_model'
+  inference_pb(file_path, model_path)
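For context, the output format described in the comments above could be sketched as follows. This is a minimal illustration rather than committed code; every name and value in it (tags, all_related, threshold, the placeholder URLs) is hypothetical.

# Hypothetical sketch of the output format described in the diff above.
tags = {'game': 0.92, 'minecraft': 0.87, 'strategy-video-game': 0.54,
        'action-game': 0.41, 'mojang': 0.33}           # 5 tags + similarity scores
all_related = [('https://youtu.be/VIDEO_ID_1', 0.78),  # (link, relevance) pairs
               ('https://youtu.be/VIDEO_ID_2', 0.44)]  # found by the model
threshold = 0.4  # user input, expected in the 20%~80% range
related_videos = [(url, score) for url, score in all_related
                  if score >= threshold][:5]           # top 5 related videos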
import nltk
import gensim
import pandas as pd
# Download NLTK stopwords and load the vocabulary CSV.
nltk.download('stopwords')
vocab = pd.read_csv('vocabulary.csv')
# Lowercase the corpus and replace non-alphabetic characters with spaces.
vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z]', ' ', regex=True)
vocab['Name'] = vocab['Name'].str.lower()
# Strip parenthesized qualifiers from names: "apple (fruit)" -> "apple".
for i in range(len(vocab['Name'])):
    name = vocab['Name'][i]
    if isinstance(name, str) and name.find(" (") != -1:
        vocab.loc[i, 'Name'] = name[:name.find(" (")]
# Join multi-word names so they survive tokenization (mobile phone -> mobile-phone).
for name in vocab['Name']:
    if isinstance(name, str) and name.find(" ") != -1:
        combined_name = name.replace(" ", "-")
        for i in range(len(vocab['WikiDescription'])):
            if isinstance(vocab['WikiDescription'][i], str):
                vocab.loc[i, 'WikiDescription'] = vocab['WikiDescription'][i].replace(name, combined_name)
# Remove stopwords from the corpus.
stop_re = '\\b' + '\\b|\\b'.join(nltk.corpus.stopwords.words('english')) + '\\b'
vocab['WikiDescription'] = vocab['WikiDescription'].str.replace(stop_re, '', regex=True)
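# For reference, the pattern built above starts like
# r'\bi\b|\bme\b|\bmy\b|\bmyself\b|...' (one alternative per NLTK stopword).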
vocab['WikiDescription'] = vocab['WikiDescription'].str.split()
# Tokenize: keep the split descriptions, dropping NaN rows.
tokenlist = [x for x in vocab['WikiDescription'] if str(x) != 'nan']
# Detect frequent bigram phrases (note: vocab_phrased is computed here, but the
# Word2Vec model below is trained on the raw tokenlist).
phrases = gensim.models.phrases.Phrases(tokenlist)
phraser = gensim.models.phrases.Phraser(phrases)
vocab_phrased = phraser[tokenlist]
# Train and save word2vec vectors for the tags.
w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, workers=2, min_count=1)
w2v.save('tags_word2vec.model')
word_vectors = w2v.wv
vocabs = word_vectors.vocab.keys()  # gensim 3.x API; in gensim 4.x this is word_vectors.key_to_index
word_vectors_list = [word_vectors[v] for v in vocabs]
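As a quick sanity check of the saved vectors, the model can be reloaded and queried for nearest tags. A minimal sketch, assuming gensim 3.x and that 'guitar' survived preprocessing into the training corpus:

# Reload the saved model and inspect the nearest neighbors of a sample tag.
model = gensim.models.word2vec.Word2Vec.load('tags_word2vec.model')
print(model.wv.most_similar('guitar', topn=5))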