이현규

Add gigaword and twitter tag vector models

......@@ -10,15 +10,17 @@ import src.video_util as videoutil
import json
import urllib3
# Erase logs
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Old model
MODEL_PATH = "./model/inference_model/segment_inference_model"
TAG_VECTOR_MODEL_PATH = "./model/tag_vectors.model"
VIDEO_VECTOR_MODEL_PATH = "./model/video_vectors.model"
VIDEO_TAGS_PATH = "./statics/kaggle_solution_40k.csv"
# Define model paths.
MODEL_PATH = "./new_model/inference_model/segment_inference_model"
# TAG_VECTOR_MODEL_PATH = "./new_model/twitter100_tag_vectors.gz"
TAG_VECTOR_MODEL_PATH = "glove-wiki-gigaword-100"
VIDEO_VECTOR_MODEL_PATH = "./new_model/gigaword100_video_vectors.model"
VIDEO_TAGS_PATH = "./statics/new_kaggle_solution_40k.csv"
# Define static file paths.
SEGMENT_LABEL_PATH = "./statics/segment_label_ids.csv"
......@@ -26,11 +28,12 @@ VOCAB_PATH = "./statics/vocabulary.csv"
# Define parameters.
TAG_TOP_K = 5
VIDEO_TOP_K = 10
VIDEO_TOP_K = 5
# Target featuremap.
FEATUREMAP_PATH = "./featuremaps/toy-3-features.pb"
def get_segments(batch_video_mtx, batch_num_frames, segment_size):
"""Get segment-level inputs from frame-level features."""
video_batch_size = batch_video_mtx.shape[0]
......@@ -232,18 +235,5 @@ def inference_pb(file_path, threshold):
if __name__ == '__main__':
# result = inference_pb(FEATUREMAP_PATH, 5)
# print("=============== Old Model ===============")
# print(result["tag_result"])
# print(json.dumps(result["video_result"], sort_keys=True, indent=2))
# New model
MODEL_PATH = "./new_model/inference_model/segment_inference_model"
# TAG_VECTOR_MODEL_PATH = "./new_model/googlenews_tag_vectors.bin"
# VIDEO_VECTOR_MODEL_PATH = "./new_model/googlenews_video_vectors.model"
TAG_VECTOR_MODEL_PATH = "./new_model/tag_vectors.model"
VIDEO_VECTOR_MODEL_PATH = "./new_model/video_vectors.model"
VIDEO_TAGS_PATH = "./statics/new_kaggle_solution_40k.csv"
result = inference_pb(FEATUREMAP_PATH, 5)
# print("=============== New Model ===============")
result = inference_pb(FEATUREMAP_PATH, VIDEO_TOP_K)
print(json.dumps(result, sort_keys=True, indent=2))
......
This file is too large to display.
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

if __name__ == '__main__':
    # Download the text8 corpus and return it opened as an iterable of
    # tokenized sentences (gensim caches it locally after the first run).
    corpus = api.load('text8')
    # Train a Word2Vec model on the corpus with default hyperparameters.
    model = Word2Vec(corpus)
    # Sanity check: print the nearest neighbours of a common word.
    # Use model.wv.most_similar — model.most_similar is deprecated.
    print(model.wv.most_similar("car"))
    # Save under a name that reflects the actual training corpus.
    # (Was 'twitter25-tag_vectors.model', which misleadingly suggested
    # the glove-twitter-25 corpus; the rest of the project refers to the
    # text8-trained model as 'text8_tag_vectors.model'.)
    model.save('text8_tag_vectors.model')
This diff could not be displayed because it is too large.
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
import numpy as np
def recommend_videos(tags, tag_model_path, video_model_path, top_k):
tag_vectors = Word2Vec.load(tag_model_path).wv
# tag_vectors = Word2Vec.load(tag_model_path).wv
# tag_vectors = KeyedVectors.load_word2vec_format(tag_model_path, binary=True)
tag_vectors = api.load(tag_model_path)
video_vectors = Word2Vec().wv.load(video_model_path)
error_tags = []
......
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
BATCH_SIZE = 1000
BATCH_SIZE = 1000
VECTOR_DIMENSION = 100
if __name__ == '__main__':
# tag_vectors = KeyedVectors.load_word2vec_format("new_model/GoogleNews-vectors-negative300.bin", binary=True)
tag_vectors = Word2Vec.load("new_model/tag_vectors.model").wv
video_vectors = Word2Vec().wv # Empty model
# tag_vectors = KeyedVectors.load_word2vec_format("new_model/twitter25_tag_vectors.model")
# tag_vectors = Word2Vec.load("new_model/text8_tag_vectors.model").wv
tag_vectors = api.load("glove-wiki-gigaword-100") # download the model and return as object ready for use
video_vectors = Word2Vec(size=VECTOR_DIMENSION).wv # Empty model
# Load video recommendation tags.
video_tags = pd.read_csv('statics/new_kaggle_solution_40k.csv')
......@@ -20,7 +23,7 @@ if __name__ == '__main__':
for i, row in video_tags.iterrows():
video_id = row[0]
video_vector = np.zeros(300)
video_vector = np.zeros(VECTOR_DIMENSION)
for segment_index in range(1, 6):
tag, weight = row[segment_index].split(":")
if tag in tag_vectors.vocab:
......