이현규

Add gigaword and twitter tag vector models

...@@ -10,15 +10,17 @@ import src.video_util as videoutil ...@@ -10,15 +10,17 @@ import src.video_util as videoutil
10 import json 10 import json
11 import urllib3 11 import urllib3
12 12
13 +# Erase logs
13 logging.disable(logging.WARNING) 14 logging.disable(logging.WARNING)
14 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 15 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
15 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 16 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
16 17
17 -# Old model 18 +# Define model paths.
18 -MODEL_PATH = "./model/inference_model/segment_inference_model" 19 +MODEL_PATH = "./new_model/inference_model/segment_inference_model"
19 -TAG_VECTOR_MODEL_PATH = "./model/tag_vectors.model" 20 +# TAG_VECTOR_MODEL_PATH = "./new_model/twitter100_tag_vectors.gz"
20 -VIDEO_VECTOR_MODEL_PATH = "./model/video_vectors.model" 21 +TAG_VECTOR_MODEL_PATH = "glove-wiki-gigaword-100"
21 -VIDEO_TAGS_PATH = "./statics/kaggle_solution_40k.csv" 22 +VIDEO_VECTOR_MODEL_PATH = "./new_model/gigaword100_video_vectors.model"
23 +VIDEO_TAGS_PATH = "./statics/new_kaggle_solution_40k.csv"
22 24
23 # Define static file paths. 25 # Define static file paths.
24 SEGMENT_LABEL_PATH = "./statics/segment_label_ids.csv" 26 SEGMENT_LABEL_PATH = "./statics/segment_label_ids.csv"
...@@ -26,11 +28,12 @@ VOCAB_PATH = "./statics/vocabulary.csv" ...@@ -26,11 +28,12 @@ VOCAB_PATH = "./statics/vocabulary.csv"
26 28
27 # Define parameters. 29 # Define parameters.
28 TAG_TOP_K = 5 30 TAG_TOP_K = 5
29 -VIDEO_TOP_K = 10 31 +VIDEO_TOP_K = 5
30 32
31 # Target featuremap. 33 # Target featuremap.
32 FEATUREMAP_PATH = "./featuremaps/toy-3-features.pb" 34 FEATUREMAP_PATH = "./featuremaps/toy-3-features.pb"
33 35
36 +
34 def get_segments(batch_video_mtx, batch_num_frames, segment_size): 37 def get_segments(batch_video_mtx, batch_num_frames, segment_size):
35 """Get segment-level inputs from frame-level features.""" 38 """Get segment-level inputs from frame-level features."""
36 video_batch_size = batch_video_mtx.shape[0] 39 video_batch_size = batch_video_mtx.shape[0]
...@@ -232,18 +235,5 @@ def inference_pb(file_path, threshold): ...@@ -232,18 +235,5 @@ def inference_pb(file_path, threshold):
232 235
233 236
234 if __name__ == '__main__': 237 if __name__ == '__main__':
235 - # result = inference_pb(FEATUREMAP_PATH, 5) 238 + result = inference_pb(FEATUREMAP_PATH, VIDEO_TOP_K)
236 - # print("=============== Old Model ===============")
237 - # print(result["tag_result"])
238 - # print(json.dumps(result["video_result"], sort_keys=True, indent=2))
239 -
240 - # New model
241 - MODEL_PATH = "./new_model/inference_model/segment_inference_model"
242 - # TAG_VECTOR_MODEL_PATH = "./new_model/googlenews_tag_vectors.bin"
243 - # VIDEO_VECTOR_MODEL_PATH = "./new_model/googlenews_video_vectors.model"
244 - TAG_VECTOR_MODEL_PATH = "./new_model/tag_vectors.model"
245 - VIDEO_VECTOR_MODEL_PATH = "./new_model/video_vectors.model"
246 - VIDEO_TAGS_PATH = "./statics/new_kaggle_solution_40k.csv"
247 - result = inference_pb(FEATUREMAP_PATH, 5)
248 - # print("=============== New Model ===============")
249 print(json.dumps(result, sort_keys=True, indent=2)) 239 print(json.dumps(result, sort_keys=True, indent=2))
......
This file is too large to display.
1 +from gensim.models.word2vec import Word2Vec
2 +import gensim.downloader as api
3 +
4 +corpus = api.load('text8') # download the corpus and return it opened as an iterable
5 +model = Word2Vec(corpus) # train a model from the corpus
6 +print(model.most_similar("car"))
7 +
8 +model.save('twitter25-tag_vectors.model')
This diff could not be displayed because it is too large.
1 from gensim.models import Word2Vec, KeyedVectors 1 from gensim.models import Word2Vec, KeyedVectors
2 +import gensim.downloader as api
2 import numpy as np 3 import numpy as np
3 4
5 +
4 def recommend_videos(tags, tag_model_path, video_model_path, top_k): 6 def recommend_videos(tags, tag_model_path, video_model_path, top_k):
5 - tag_vectors = Word2Vec.load(tag_model_path).wv 7 + # tag_vectors = Word2Vec.load(tag_model_path).wv
6 # tag_vectors = KeyedVectors.load_word2vec_format(tag_model_path, binary=True) 8 # tag_vectors = KeyedVectors.load_word2vec_format(tag_model_path, binary=True)
9 + tag_vectors = api.load(tag_model_path)
7 video_vectors = Word2Vec().wv.load(video_model_path) 10 video_vectors = Word2Vec().wv.load(video_model_path)
8 error_tags = [] 11 error_tags = []
9 12
......
1 import pandas as pd 1 import pandas as pd
2 import numpy as np 2 import numpy as np
3 from gensim.models import Word2Vec, KeyedVectors 3 from gensim.models import Word2Vec, KeyedVectors
4 +import gensim.downloader as api
4 5
5 -BATCH_SIZE = 1000
6 6
7 +BATCH_SIZE = 1000
8 +VECTOR_DIMENSION = 100
7 9
8 if __name__ == '__main__': 10 if __name__ == '__main__':
9 - # tag_vectors = KeyedVectors.load_word2vec_format("new_model/GoogleNews-vectors-negative300.bin", binary=True) 11 + # tag_vectors = KeyedVectors.load_word2vec_format("new_model/twitter25_tag_vectors.model")
10 - tag_vectors = Word2Vec.load("new_model/tag_vectors.model").wv 12 + # tag_vectors = Word2Vec.load("new_model/text8_tag_vectors.model").wv
11 - video_vectors = Word2Vec().wv # Empty model 13 + tag_vectors = api.load("glove-wiki-gigaword-100") # download the model and return as object ready for use
14 + video_vectors = Word2Vec(size=VECTOR_DIMENSION).wv # Empty model
12 15
13 # Load video recommendation tags. 16 # Load video recommendation tags.
14 video_tags = pd.read_csv('statics/new_kaggle_solution_40k.csv') 17 video_tags = pd.read_csv('statics/new_kaggle_solution_40k.csv')
...@@ -20,7 +23,7 @@ if __name__ == '__main__': ...@@ -20,7 +23,7 @@ if __name__ == '__main__':
20 23
21 for i, row in video_tags.iterrows(): 24 for i, row in video_tags.iterrows():
22 video_id = row[0] 25 video_id = row[0]
23 - video_vector = np.zeros(300) 26 + video_vector = np.zeros(VECTOR_DIMENSION)
24 for segment_index in range(1, 6): 27 for segment_index in range(1, 6):
25 tag, weight = row[segment_index].split(":") 28 tag, weight = row[segment_index].split(":")
26 if tag in tag_vectors.vocab: 29 if tag in tag_vectors.vocab:
......