Showing
20 changed files
with
1531 additions
and
0 deletions
esot3ria/featuremaps/features.pb
0 → 100644
No preview for this file type
esot3ria/inference_pb.py
0 → 100644
1 | +import numpy as np | ||
2 | +import tensorflow as tf | ||
3 | +from tensorflow import logging | ||
4 | +from tensorflow import gfile | ||
5 | +import operator | ||
6 | +import esot3ria.pb_util as pbutil | ||
7 | +import esot3ria.video_recommender as recommender | ||
8 | +import esot3ria.video_util as videoutil | ||
9 | + | ||
# Define model paths.
# Checkpoint prefix; inference_pb appends ".meta" to it to locate the meta-graph.
MODEL_PATH = "./model/inference_model/segment_inference_model"
TAG_VECTOR_MODEL_PATH = "./tag_vectors.model"
VIDEO_VECTOR_MODEL_PATH = "./video_vectors.model"

# Define static file paths.
# Whitelisted class ids, one per line; inference_pb skips non-integer lines.
SEGMENT_LABEL_PATH = "./statics/segment_label_ids.csv"
VIDEO_TAGS_PATH = "./statics/kaggle_solution_40k.csv"
# Comma-separated file; inference_pb maps column 0 to column 3 of every row
# whose first column is not "Index".
VOCAB_PATH = "./statics/vocabulary.csv"

# Define parameters.
# Number of top classes kept per segment and number of tags reported.
TAG_TOP_K = 5
# NOTE(review): inference_pb takes its video count from its `threshold`
# argument, so this module-level value does not appear to be read there.
VIDEO_TOP_K = 10
23 | + | ||
24 | + | ||
def get_segments(batch_video_mtx, batch_num_frames, segment_size):
    """Cut frame-level features into fixed-length segment-level inputs.

    Args:
      batch_video_mtx: array indexed as (video, frame, feature).
      batch_num_frames: 1-D integer array, number of valid frames per video.
      segment_size: number of frames in one segment.

    Returns:
      A dict with
        "video_batch": array of shape (segments, segment_size, feature),
        "num_frames_batch": valid frame count of each segment (at most
          segment_size),
        "video_segment_ids": one (video index, segment start frame) row per
          segment.
    """
    n_videos = batch_video_mtx.shape[0]
    frames_per_video = batch_video_mtx.shape[1]
    n_features = batch_video_mtx.shape[-1]

    # Round every frame count up to a whole number of segments, then keep the
    # frames that fall below that rounded count.
    rounded_frame_counts = (
        (batch_num_frames + segment_size - 1) // segment_size) * segment_size
    frame_positions = np.arange(0, frames_per_video)
    keep_frame = (rounded_frame_counts[:, np.newaxis] - frame_positions) > 0

    # Segment bags.
    flat_frames = batch_video_mtx.reshape((-1, n_features))
    segment_frames = flat_frames[keep_frame.reshape(-1)].reshape(
        (-1, segment_size, n_features))

    # Segment num frames: frames remaining from each segment start, capped at
    # segment_size; segments starting past the end of the video are dropped.
    segment_starts = np.arange(0, frames_per_video, segment_size)
    frames_left = (
        batch_num_frames[:, np.newaxis] - segment_starts).reshape((-1))
    has_frames = frames_left > 0
    segment_num_frames = np.minimum(frames_left[has_frames], segment_size)

    # (video index, segment start) pairs, laid out video by video.
    segments_per_video = (frames_per_video + segment_size - 1) // segment_size
    video_column = np.repeat(np.arange(0, n_videos), segments_per_video)
    start_column = np.tile(segment_starts, n_videos)
    video_segment_ids = np.column_stack(
        [video_column, start_column])[has_frames]

    return {
        "video_batch": segment_frames,
        "num_frames_batch": segment_num_frames,
        "video_segment_ids": video_segment_ids
    }
60 | + | ||
61 | + | ||
def format_predictions(video_ids, predictions, top_k, whitelisted_cls_mask=None):
    """Yield one encoded "id,label score label score ...\\n" line per video.

    Args:
      video_ids: sequence of string ids, one per row of `predictions`.
      predictions: 2-D array of per-class scores, one row per id.
      top_k: number of highest-scoring classes reported per row.
      whitelisted_cls_mask: optional per-class multiplier; when given, each
        row is multiplied by it before the top classes are chosen, so classes
        with a zero entry score zero.

    Yields:
      UTF-8 encoded bytes: the id, a comma, then space-separated
      "label score" pairs in descending score order, ending with a newline.
    """
    for video_index, video_id in enumerate(video_ids):
        video_prediction = predictions[video_index]
        if whitelisted_cls_mask is not None:
            # Whitelist classes. Multiply out of place: an in-place `*=` would
            # write through the row view and overwrite the caller's array.
            video_prediction = video_prediction * whitelisted_cls_mask
        top_indices = np.argpartition(video_prediction, -top_k)[-top_k:]
        line = sorted(
            ((class_index, video_prediction[class_index])
             for class_index in top_indices),
            key=lambda p: -p[1])
        yield (video_id + "," +
               " ".join("%i %g" % (label, score) for (label, score) in line) +
               "\n").encode("utf8")
76 | + | ||
77 | + | ||
def normalize_tag(tag):
    """Lower-case a tag, cut it at the first " (" and hyphenate its spaces.

    Anything that is not a `str` is handed back unchanged.
    """
    if not isinstance(tag, str):
        return tag
    # NOTE(review): str.replace takes '[^a-zA-Z]' as a literal substring, not
    # a regular expression, so after lower() it cannot match anything. Left
    # as-is because the resulting tags are presumably looked up in an
    # externally built tag model -- confirm before turning it into a regex.
    lowered = tag.lower().replace('[^a-zA-Z]', ' ')
    head, _, _ = lowered.partition(" (")
    return head.replace(" ", "-")
87 | + | ||
88 | + | ||
def inference_pb(file_path, threshold):
    """Tag the video stored in a feature file and recommend related videos.

    Args:
      file_path: path of the feature file handed to pbutil.convert_pb.
      threshold: number of videos to recommend; coerced with int().

    Returns:
      A dict with
        "tag_result": list of (normalized tag, share) tuples for the
          TAG_TOP_K best classes, where share is the class score divided by
          the sum of the listed scores, formatted with three decimals;
        "video_result": one videoutil.getVideoInfo result per id returned by
          recommender.recommend_videos.

    Raises:
      IOError: if the ".meta" file next to MODEL_PATH does not exist.
      ValueError: if the model's input segment length differs from the
        segment length produced here.
    """
    max_frames = 300      # Frames beyond this count are ignored.
    feature_dim = 1152    # Length of one concatenated rgb + audio frame.
    segment_size = 5
    # Kept in a local so the module-level VIDEO_TOP_K is not shadowed.
    video_top_k = int(threshold)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # 0. Import SequenceExample type target from pb.
        target_video = pbutil.convert_pb(file_path)

        # 1. Load video features from pb.
        video_id_batch_val = np.array([b'video'])
        feature_list = target_video.feature_lists.feature_list
        rgb_frames = feature_list['rgb'].feature
        # Restrict frame size to max_frames.
        n_frames = min(len(rgb_frames), max_frames)
        video_batch_val = np.zeros((max_frames, feature_dim))
        for i in range(n_frames):
            # Reinterpret the raw bytes as little-endian float32 directly in
            # NumPy; building a TF decode op and running it for every frame
            # grows the graph and costs one session run per call.
            video_batch_rgb = np.frombuffer(
                rgb_frames[i].bytes_list.value[0], dtype="<f4")
            video_batch_audio = np.frombuffer(
                feature_list['audio'].feature[i].bytes_list.value[0],
                dtype="<f4")
            video_batch_val[i] = np.concatenate(
                [video_batch_rgb, video_batch_audio], axis=0)
        video_batch_val = np.array([video_batch_val])
        num_frames_batch_val = np.array([n_frames])

        # Restore checkpoint and meta-graph file.
        meta_graph_location = MODEL_PATH + ".meta"
        if not gfile.Exists(meta_graph_location):
            raise IOError("Cannot find %s. Did you run eval.py?" % MODEL_PATH)
        logging.info("loading meta-graph: " + meta_graph_location)

        with tf.device("/cpu:0"):
            saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True)
        logging.info("restoring variables from " + MODEL_PATH)
        saver.restore(sess, MODEL_PATH)
        input_tensor = tf.get_collection("input_batch_raw")[0]
        num_frames_tensor = tf.get_collection("num_frames")[0]
        predictions_tensor = tf.get_collection("predictions")[0]

        # Workaround for num_epochs issue.
        def set_up_init_ops(variables):
            init_op_list = []
            # Iterate over a copy: matching variables are removed from the
            # collection itself while looping.
            for variable in list(variables):
                if "train_input" in variable.name:
                    init_op_list.append(tf.assign(variable, 1))
                    variables.remove(variable)
            init_op_list.append(tf.variables_initializer(variables))
            return init_op_list

        sess.run(
            set_up_init_ops(tf.get_collection_ref(tf.GraphKeys.LOCAL_VARIABLES)))

        # 1.0 for every class id listed in SEGMENT_LABEL_PATH, 0.0 elsewhere.
        whitelisted_cls_mask = np.zeros((predictions_tensor.get_shape()[-1],),
                                        dtype=np.float32)
        with tf.io.gfile.GFile(SEGMENT_LABEL_PATH) as fobj:
            for line in fobj:
                try:
                    cls_id = int(line)
                    whitelisted_cls_mask[cls_id] = 1.
                except ValueError:
                    # Simply skip the non-integer line.
                    continue

        # 2. Make segment features.
        results = get_segments(video_batch_val, num_frames_batch_val, segment_size)
        video_segment_ids = results["video_segment_ids"]
        video_id_batch_val = video_id_batch_val[video_segment_ids[:, 0]]
        video_id_batch_val = np.array([
            "%s:%d" % (x.decode("utf8"), y)
            for x, y in zip(video_id_batch_val, video_segment_ids[:, 1])
        ])
        video_batch_val = results["video_batch"]
        num_frames_batch_val = results["num_frames_batch"]
        if input_tensor.get_shape()[1] != video_batch_val.shape[1]:
            raise ValueError("max_frames mismatch. Please re-run the eval.py "
                             "with correct segment_labels settings.")

        predictions_val, = sess.run([predictions_tensor],
                                    feed_dict={
                                        input_tensor: video_batch_val,
                                        num_frames_tensor: num_frames_batch_val
                                    })

        # 3. Make vocabularies: column 0 -> column 3 of every non-header row.
        voca_dict = {}
        with open(VOCAB_PATH, 'r') as vocabs:
            for line in vocabs:
                vocab_dict_item = line.split(",")
                if vocab_dict_item[0] != "Index":
                    voca_dict[vocab_dict_item[0]] = vocab_dict_item[3]

        # 4. Make combined scores: sum each class's score over all segments.
        combined_scores = {}
        for line in format_predictions(video_id_batch_val, predictions_val,
                                       TAG_TOP_K, whitelisted_cls_mask):
            _, preds = line.decode("utf8").split(",")
            preds = preds.split(" ")
            # preds alternates "class id", "score".
            for cls_text, score_text in zip(preds[0::2], preds[1::2]):
                cls_id = int(cls_text)
                combined_scores[cls_id] = (
                    combined_scores.get(cls_id, 0.0) + float(score_text))

        combined_scores = sorted(combined_scores.items(),
                                 key=operator.itemgetter(1), reverse=True)
        # Normalise over exactly the entries that are reported, instead of a
        # hard-coded five, so the shares follow TAG_TOP_K.
        top_scores = combined_scores[:TAG_TOP_K]
        denominator = float(sum(score for _, score in top_scores))

        tag_result = []
        for cls_id, score in top_scores:
            segment_tag = str(voca_dict[str(cls_id)])
            normalized_tag = normalize_tag(segment_tag)
            # All-zero scores would otherwise divide by zero.
            share = score / denominator if denominator else 0.0
            tag_result.append((normalized_tag, format(share, ".3f")))

        # 5. Create recommend videos info, Combine results.
        recommend_video_ids = recommender.recommend_videos(
            tag_result, TAG_VECTOR_MODEL_PATH, VIDEO_VECTOR_MODEL_PATH,
            video_top_k)
        video_result = [videoutil.getVideoInfo(ids, VIDEO_TAGS_PATH, TAG_TOP_K)
                        for ids in recommend_video_ids]

        inference_result = {
            "tag_result": tag_result,
            "video_result": video_result
        }

    # 6. The session is closed by the `with` block above.
    return inference_result
220 | + | ||
221 | + | ||
if __name__ == '__main__':
    # Manual run against the bundled sample feature file, asking for five
    # recommended videos.
    sample_features = "./featuremaps/features.pb"
    print(inference_pb(sample_features, 5))
No preview for this file type
esot3ria/model/inference_model/checkpoint
0 → 100644
This file is too large to display.
No preview for this file type
No preview for this file type
esot3ria/model/model_flags.json
0 → 100644
1 | +{"model": "FrameLevelLogisticModel", "feature_sizes": "1024,128", "feature_names": "rgb,audio", "frame_features": true, "label_loss": "CrossEntropyLoss"} | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
esot3ria/pb_util.py
0 → 100644
1 | +import tensorflow as tf | ||
2 | +import numpy | ||
3 | + | ||
4 | + | ||
def _make_bytes(int_array):
    """Pack `int_array` into the interpreter's native byte-string type.

    Where `bytes` is `str` (Python 2) the items are mapped through `chr` and
    joined; otherwise the argument is passed straight to `bytes()`.
    """
    running_python2 = bytes == str
    return ''.join(map(chr, int_array)) if running_python2 else bytes(int_array)
10 | + | ||
11 | + | ||
def quantize(features, min_quantized_value=-2.0, max_quantized_value=2.0):
    """Quantizes float32 `features` into string.

    Values are clipped to [min_quantized_value, max_quantized_value], mapped
    linearly onto 0..255, rounded to integers and packed with `_make_bytes`.

    Args:
      features: 1-D float32 array.
      min_quantized_value: value mapped to 0.
      max_quantized_value: value mapped to 255.
    """
    assert features.dtype == 'float32'
    assert len(features.shape) == 1  # 1-D array
    span = max_quantized_value - min_quantized_value
    clipped = numpy.clip(features, min_quantized_value, max_quantized_value)
    scaled = (clipped - min_quantized_value) * (255.0 / span)
    return _make_bytes([int(round(level)) for level in scaled])
22 | + | ||
23 | + | ||
# for parse feature.pb

# Context features passed to tf.io.parse_single_sequence_example: one
# fixed-length scalar per key.
contexts = {
    'AUDIO/feature/dimensions': tf.io.FixedLenFeature([], tf.int64),
    'AUDIO/feature/rate': tf.io.FixedLenFeature([], tf.float32),
    'RGB/feature/dimensions': tf.io.FixedLenFeature([], tf.int64),
    'RGB/feature/rate': tf.io.FixedLenFeature([], tf.float32),
    'clip/data_path': tf.io.FixedLenFeature([], tf.string),
    'clip/end/timestamp': tf.io.FixedLenFeature([], tf.int64),
    'clip/start/timestamp': tf.io.FixedLenFeature([], tf.int64)
}

# Sequence features passed to the same call: variable-length float and
# timestamp lists for the audio and rgb streams.
features = {
    'AUDIO/feature/floats': tf.io.VarLenFeature(dtype=tf.float32),
    'AUDIO/feature/timestamp': tf.io.VarLenFeature(tf.int64),
    'RGB/feature/floats': tf.io.VarLenFeature(dtype=tf.float32),
    'RGB/feature/timestamp': tf.io.VarLenFeature(tf.int64)

}
43 | + | ||
44 | + | ||
def parse_exmp(serial_exmp):
    """Parse a serialized SequenceExample into per-frame byte strings.

    Args:
      serial_exmp: serialized SequenceExample matching the module-level
        `contexts` and `features` specs.

    Returns:
      (byte_audio, byte_rgb): two lists holding one `_make_bytes` result per
      complete chunk of 128 audio values and 1024 rgb values respectively.
      A trailing incomplete chunk is dropped.
    """
    def as_byte_chunks(values, width):
        # One byte string per complete run of `width` consecutive values.
        whole_chunks = len(values) // width
        return [_make_bytes(values[width * k: width * (k + 1)])
                for k in range(whole_chunks)]

    # The parsed context features are not used.
    _, sequence_parsed = tf.io.parse_single_sequence_example(
        serialized=serial_exmp,
        context_features=contexts,
        sequence_features=features)

    # Evaluate the parsed tensors once and take that single result.
    sequence_parsed = tf.contrib.learn.run_n(sequence_parsed)[0]

    audio = sequence_parsed['AUDIO/feature/floats'].values
    rgb = sequence_parsed['RGB/feature/floats'].values

    # NOTE(review): the values are packed without going through quantize();
    # presumably they stay raw float32 data -- confirm against the consumer.
    byte_audio = as_byte_chunks(audio, 128)
    byte_rgb = as_byte_chunks(rgb, 1024)

    return byte_audio, byte_rgb
77 | + | ||
78 | + | ||
def make_exmp(id, audio, rgb):
    """Build and serialize a SequenceExample from per-frame byte strings.

    Args:
      id: string stored, UTF-8 encoded, under the 'id' context key.
      audio: iterable of byte strings, one per entry of the 'audio' list.
      rgb: iterable of byte strings, one per entry of the 'rgb' list.

    Returns:
      The serialized SequenceExample.
    """
    def as_bytes_feature(raw):
        # Wrap one byte string in a single-valued bytes Feature.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

    audio_features = [as_bytes_feature(embedding) for embedding in audio]
    rgb_features = [as_bytes_feature(embedding) for embedding in rgb]

    # for construct yt8m data
    context = tf.train.Features(
        feature={'id': as_bytes_feature(id.encode('utf-8'))})
    frame_lists = tf.train.FeatureLists(
        feature_list={
            'audio': tf.train.FeatureList(feature=audio_features),
            'rgb': tf.train.FeatureList(feature=rgb_features)
        })
    seq_exmp = tf.train.SequenceExample(
        context=context, feature_lists=frame_lists)
    return seq_exmp.SerializeToString()
112 | + | ||
113 | + | ||
def convert_pb(filename):
    """Read a feature file and return it as a decoded SequenceExample.

    The file content is parsed with `parse_exmp`, rebuilt by `make_exmp` under
    the id 'video', and the rebuilt bytes are decoded again.

    Args:
      filename: path of the serialized feature file.

    Returns:
      A tf.train.SequenceExample message.
    """
    # Context manager so the handle is closed even if read() raises.
    with open(filename, 'rb') as pb_file:
        sequence_example = pb_file.read()

    audio, rgb = parse_exmp(sequence_example)
    tmp_example = make_exmp('video', audio, rgb)

    return tf.train.SequenceExample.FromString(tmp_example)
esot3ria/readpb.py
0 → 100644
1 | +import tensorflow as tf | ||
2 | +import numpy as np | ||
3 | + | ||
# TFRecord file whose first record is inspected.
frame_lvl_record = "test0000.tfrecord"

# One entry per inspected video; each entry is a list of per-frame float32
# arrays.
feat_rgb = []
feat_audio = []


def _decode_uint8_frame(feature):
    # Reinterpret the frame's raw bytes as uint8 and widen to float32, without
    # creating a TensorFlow session or graph ops for every frame.
    return np.frombuffer(
        feature.bytes_list.value[0], dtype=np.uint8).astype(np.float32)


for example in tf.python_io.tf_record_iterator(frame_lvl_record):
    tf_seq_example = tf.train.SequenceExample.FromString(example)
    feature_list = tf_seq_example.feature_lists.feature_list
    # The frame count is taken from the audio list and used for both streams.
    n_frames = len(feature_list['audio'].feature)
    # iterate through frames
    rgb_frame = [_decode_uint8_frame(feature_list['rgb'].feature[i])
                 for i in range(n_frames)]
    audio_frame = [_decode_uint8_frame(feature_list['audio'].feature[i])
                   for i in range(n_frames)]

    feat_audio.append(audio_frame)
    feat_rgb.append(rgb_frame)
    # Only the first record is needed.
    break

print('The first video has %d frames' %len(feat_rgb[0]))
... | \ No newline at end of file | ... | \ No newline at end of file |
esot3ria/statics/kaggle_solution_40k.csv
0 → 100644
This diff could not be displayed because it is too large.
esot3ria/statics/segment_label_ids.csv
0 → 100644
1 | +Index | ||
2 | +3 | ||
3 | +7 | ||
4 | +8 | ||
5 | +11 | ||
6 | +12 | ||
7 | +17 | ||
8 | +18 | ||
9 | +19 | ||
10 | +21 | ||
11 | +22 | ||
12 | +23 | ||
13 | +28 | ||
14 | +31 | ||
15 | +30 | ||
16 | +32 | ||
17 | +33 | ||
18 | +34 | ||
19 | +41 | ||
20 | +43 | ||
21 | +45 | ||
22 | +46 | ||
23 | +48 | ||
24 | +53 | ||
25 | +54 | ||
26 | +52 | ||
27 | +55 | ||
28 | +58 | ||
29 | +59 | ||
30 | +60 | ||
31 | +61 | ||
32 | +65 | ||
33 | +68 | ||
34 | +73 | ||
35 | +71 | ||
36 | +74 | ||
37 | +75 | ||
38 | +76 | ||
39 | +77 | ||
40 | +80 | ||
41 | +83 | ||
42 | +90 | ||
43 | +88 | ||
44 | +89 | ||
45 | +92 | ||
46 | +95 | ||
47 | +100 | ||
48 | +101 | ||
49 | +99 | ||
50 | +104 | ||
51 | +105 | ||
52 | +109 | ||
53 | +113 | ||
54 | +112 | ||
55 | +115 | ||
56 | +116 | ||
57 | +118 | ||
58 | +120 | ||
59 | +121 | ||
60 | +123 | ||
61 | +125 | ||
62 | +127 | ||
63 | +131 | ||
64 | +128 | ||
65 | +129 | ||
66 | +130 | ||
67 | +137 | ||
68 | +141 | ||
69 | +143 | ||
70 | +145 | ||
71 | +148 | ||
72 | +152 | ||
73 | +151 | ||
74 | +156 | ||
75 | +155 | ||
76 | +158 | ||
77 | +160 | ||
78 | +164 | ||
79 | +163 | ||
80 | +169 | ||
81 | +170 | ||
82 | +172 | ||
83 | +171 | ||
84 | +173 | ||
85 | +174 | ||
86 | +175 | ||
87 | +176 | ||
88 | +178 | ||
89 | +182 | ||
90 | +184 | ||
91 | +186 | ||
92 | +188 | ||
93 | +187 | ||
94 | +192 | ||
95 | +191 | ||
96 | +190 | ||
97 | +194 | ||
98 | +197 | ||
99 | +196 | ||
100 | +198 | ||
101 | +201 | ||
102 | +202 | ||
103 | +200 | ||
104 | +199 | ||
105 | +205 | ||
106 | +204 | ||
107 | +209 | ||
108 | +207 | ||
109 | +206 | ||
110 | +210 | ||
111 | +213 | ||
112 | +214 | ||
113 | +220 | ||
114 | +218 | ||
115 | +217 | ||
116 | +226 | ||
117 | +227 | ||
118 | +231 | ||
119 | +232 | ||
120 | +229 | ||
121 | +233 | ||
122 | +235 | ||
123 | +237 | ||
124 | +244 | ||
125 | +240 | ||
126 | +249 | ||
127 | +246 | ||
128 | +248 | ||
129 | +239 | ||
130 | +250 | ||
131 | +245 | ||
132 | +255 | ||
133 | +253 | ||
134 | +256 | ||
135 | +261 | ||
136 | +259 | ||
137 | +263 | ||
138 | +262 | ||
139 | +266 | ||
140 | +267 | ||
141 | +268 | ||
142 | +269 | ||
143 | +271 | ||
144 | +276 | ||
145 | +273 | ||
146 | +277 | ||
147 | +274 | ||
148 | +278 | ||
149 | +279 | ||
150 | +280 | ||
151 | +288 | ||
152 | +291 | ||
153 | +295 | ||
154 | +294 | ||
155 | +293 | ||
156 | +297 | ||
157 | +296 | ||
158 | +300 | ||
159 | +299 | ||
160 | +303 | ||
161 | +302 | ||
162 | +304 | ||
163 | +305 | ||
164 | +313 | ||
165 | +307 | ||
166 | +311 | ||
167 | +310 | ||
168 | +312 | ||
169 | +316 | ||
170 | +318 | ||
171 | +321 | ||
172 | +322 | ||
173 | +331 | ||
174 | +333 | ||
175 | +329 | ||
176 | +330 | ||
177 | +334 | ||
178 | +343 | ||
179 | +349 | ||
180 | +340 | ||
181 | +344 | ||
182 | +348 | ||
183 | +358 | ||
184 | +347 | ||
185 | +359 | ||
186 | +355 | ||
187 | +361 | ||
188 | +360 | ||
189 | +364 | ||
190 | +365 | ||
191 | +368 | ||
192 | +369 | ||
193 | +366 | ||
194 | +370 | ||
195 | +374 | ||
196 | +380 | ||
197 | +373 | ||
198 | +385 | ||
199 | +384 | ||
200 | +388 | ||
201 | +389 | ||
202 | +382 | ||
203 | +393 | ||
204 | +381 | ||
205 | +390 | ||
206 | +394 | ||
207 | +399 | ||
208 | +397 | ||
209 | +396 | ||
210 | +402 | ||
211 | +400 | ||
212 | +398 | ||
213 | +401 | ||
214 | +405 | ||
215 | +406 | ||
216 | +410 | ||
217 | +408 | ||
218 | +416 | ||
219 | +415 | ||
220 | +419 | ||
221 | +422 | ||
222 | +414 | ||
223 | +421 | ||
224 | +424 | ||
225 | +429 | ||
226 | +418 | ||
227 | +427 | ||
228 | +434 | ||
229 | +428 | ||
230 | +435 | ||
231 | +430 | ||
232 | +441 | ||
233 | +439 | ||
234 | +437 | ||
235 | +443 | ||
236 | +440 | ||
237 | +442 | ||
238 | +445 | ||
239 | +446 | ||
240 | +448 | ||
241 | +454 | ||
242 | +444 | ||
243 | +453 | ||
244 | +455 | ||
245 | +451 | ||
246 | +452 | ||
247 | +458 | ||
248 | +460 | ||
249 | +465 | ||
250 | +457 | ||
251 | +463 | ||
252 | +462 | ||
253 | +461 | ||
254 | +464 | ||
255 | +469 | ||
256 | +468 | ||
257 | +472 | ||
258 | +473 | ||
259 | +471 | ||
260 | +475 | ||
261 | +474 | ||
262 | +477 | ||
263 | +485 | ||
264 | +491 | ||
265 | +488 | ||
266 | +482 | ||
267 | +490 | ||
268 | +496 | ||
269 | +494 | ||
270 | +483 | ||
271 | +495 | ||
272 | +493 | ||
273 | +507 | ||
274 | +501 | ||
275 | +499 | ||
276 | +503 | ||
277 | +498 | ||
278 | +514 | ||
279 | +504 | ||
280 | +502 | ||
281 | +506 | ||
282 | +508 | ||
283 | +511 | ||
284 | +527 | ||
285 | +526 | ||
286 | +532 | ||
287 | +513 | ||
288 | +519 | ||
289 | +525 | ||
290 | +518 | ||
291 | +528 | ||
292 | +522 | ||
293 | +523 | ||
294 | +535 | ||
295 | +539 | ||
296 | +540 | ||
297 | +533 | ||
298 | +521 | ||
299 | +541 | ||
300 | +547 | ||
301 | +550 | ||
302 | +544 | ||
303 | +549 | ||
304 | +551 | ||
305 | +554 | ||
306 | +543 | ||
307 | +548 | ||
308 | +557 | ||
309 | +560 | ||
310 | +552 | ||
311 | +559 | ||
312 | +563 | ||
313 | +565 | ||
314 | +567 | ||
315 | +555 | ||
316 | +576 | ||
317 | +568 | ||
318 | +564 | ||
319 | +573 | ||
320 | +581 | ||
321 | +580 | ||
322 | +572 | ||
323 | +571 | ||
324 | +584 | ||
325 | +590 | ||
326 | +585 | ||
327 | +587 | ||
328 | +588 | ||
329 | +592 | ||
330 | +598 | ||
331 | +597 | ||
332 | +599 | ||
333 | +603 | ||
334 | +600 | ||
335 | +604 | ||
336 | +605 | ||
337 | +614 | ||
338 | +602 | ||
339 | +610 | ||
340 | +608 | ||
341 | +611 | ||
342 | +612 | ||
343 | +613 | ||
344 | +617 | ||
345 | +620 | ||
346 | +607 | ||
347 | +624 | ||
348 | +627 | ||
349 | +625 | ||
350 | +631 | ||
351 | +629 | ||
352 | +638 | ||
353 | +632 | ||
354 | +634 | ||
355 | +644 | ||
356 | +641 | ||
357 | +642 | ||
358 | +646 | ||
359 | +652 | ||
360 | +647 | ||
361 | +637 | ||
362 | +661 | ||
363 | +635 | ||
364 | +658 | ||
365 | +648 | ||
366 | +663 | ||
367 | +668 | ||
368 | +664 | ||
369 | +656 | ||
370 | +666 | ||
371 | +671 | ||
372 | +683 | ||
373 | +675 | ||
374 | +669 | ||
375 | +676 | ||
376 | +667 | ||
377 | +691 | ||
378 | +685 | ||
379 | +673 | ||
380 | +688 | ||
381 | +702 | ||
382 | +684 | ||
383 | +679 | ||
384 | +694 | ||
385 | +686 | ||
386 | +689 | ||
387 | +680 | ||
388 | +693 | ||
389 | +703 | ||
390 | +697 | ||
391 | +698 | ||
392 | +692 | ||
393 | +705 | ||
394 | +706 | ||
395 | +712 | ||
396 | +711 | ||
397 | +709 | ||
398 | +710 | ||
399 | +726 | ||
400 | +713 | ||
401 | +721 | ||
402 | +720 | ||
403 | +715 | ||
404 | +717 | ||
405 | +730 | ||
406 | +728 | ||
407 | +723 | ||
408 | +716 | ||
409 | +722 | ||
410 | +718 | ||
411 | +732 | ||
412 | +724 | ||
413 | +736 | ||
414 | +725 | ||
415 | +742 | ||
416 | +727 | ||
417 | +735 | ||
418 | +740 | ||
419 | +748 | ||
420 | +738 | ||
421 | +746 | ||
422 | +751 | ||
423 | +749 | ||
424 | +752 | ||
425 | +754 | ||
426 | +760 | ||
427 | +763 | ||
428 | +756 | ||
429 | +758 | ||
430 | +766 | ||
431 | +764 | ||
432 | +757 | ||
433 | +780 | ||
434 | +767 | ||
435 | +769 | ||
436 | +771 | ||
437 | +786 | ||
438 | +785 | ||
439 | +781 | ||
440 | +787 | ||
441 | +778 | ||
442 | +783 | ||
443 | +792 | ||
444 | +791 | ||
445 | +795 | ||
446 | +788 | ||
447 | +805 | ||
448 | +802 | ||
449 | +801 | ||
450 | +793 | ||
451 | +796 | ||
452 | +804 | ||
453 | +803 | ||
454 | +797 | ||
455 | +814 | ||
456 | +813 | ||
457 | +789 | ||
458 | +808 | ||
459 | +818 | ||
460 | +816 | ||
461 | +817 | ||
462 | +811 | ||
463 | +820 | ||
464 | +826 | ||
465 | +829 | ||
466 | +824 | ||
467 | +821 | ||
468 | +825 | ||
469 | +822 | ||
470 | +835 | ||
471 | +833 | ||
472 | +843 | ||
473 | +823 | ||
474 | +827 | ||
475 | +830 | ||
476 | +832 | ||
477 | +837 | ||
478 | +852 | ||
479 | +844 | ||
480 | +841 | ||
481 | +812 | ||
482 | +847 | ||
483 | +862 | ||
484 | +869 | ||
485 | +860 | ||
486 | +838 | ||
487 | +870 | ||
488 | +846 | ||
489 | +858 | ||
490 | +854 | ||
491 | +880 | ||
492 | +876 | ||
493 | +857 | ||
494 | +859 | ||
495 | +877 | ||
496 | +871 | ||
497 | +855 | ||
498 | +875 | ||
499 | +861 | ||
500 | +867 | ||
501 | +892 | ||
502 | +898 | ||
503 | +888 | ||
504 | +884 | ||
505 | +887 | ||
506 | +891 | ||
507 | +906 | ||
508 | +900 | ||
509 | +878 | ||
510 | +885 | ||
511 | +883 | ||
512 | +901 | ||
513 | +903 | ||
514 | +907 | ||
515 | +930 | ||
516 | +897 | ||
517 | +914 | ||
518 | +917 | ||
519 | +910 | ||
520 | +905 | ||
521 | +909 | ||
522 | +933 | ||
523 | +932 | ||
524 | +922 | ||
525 | +913 | ||
526 | +923 | ||
527 | +931 | ||
528 | +911 | ||
529 | +937 | ||
530 | +918 | ||
531 | +955 | ||
532 | +915 | ||
533 | +944 | ||
534 | +952 | ||
535 | +945 | ||
536 | +948 | ||
537 | +946 | ||
538 | +970 | ||
539 | +974 | ||
540 | +958 | ||
541 | +925 | ||
542 | +979 | ||
543 | +942 | ||
544 | +965 | ||
545 | +975 | ||
546 | +950 | ||
547 | +982 | ||
548 | +940 | ||
549 | +973 | ||
550 | +962 | ||
551 | +972 | ||
552 | +957 | ||
553 | +984 | ||
554 | +983 | ||
555 | +964 | ||
556 | +1007 | ||
557 | +971 | ||
558 | +981 | ||
559 | +954 | ||
560 | +993 | ||
561 | +991 | ||
562 | +996 | ||
563 | +1005 | ||
564 | +1015 | ||
565 | +1009 | ||
566 | +995 | ||
567 | +986 | ||
568 | +1000 | ||
569 | +985 | ||
570 | +980 | ||
571 | +1016 | ||
572 | +1011 | ||
573 | +999 | ||
574 | +1002 | ||
575 | +994 | ||
576 | +1013 | ||
577 | +1010 | ||
578 | +992 | ||
579 | +1008 | ||
580 | +1036 | ||
581 | +1025 | ||
582 | +1012 | ||
583 | +990 | ||
584 | +1037 | ||
585 | +1040 | ||
586 | +1031 | ||
587 | +1019 | ||
588 | +1052 | ||
589 | +1001 | ||
590 | +1055 | ||
591 | +1032 | ||
592 | +1069 | ||
593 | +1058 | ||
594 | +1014 | ||
595 | +1023 | ||
596 | +1030 | ||
597 | +1061 | ||
598 | +1035 | ||
599 | +1034 | ||
600 | +1053 | ||
601 | +1045 | ||
602 | +1046 | ||
603 | +1067 | ||
604 | +1060 | ||
605 | +1049 | ||
606 | +1056 | ||
607 | +1074 | ||
608 | +1066 | ||
609 | +1044 | ||
610 | +1038 | ||
611 | +1073 | ||
612 | +1077 | ||
613 | +1068 | ||
614 | +1057 | ||
615 | +1072 | ||
616 | +1104 | ||
617 | +1083 | ||
618 | +1089 | ||
619 | +1087 | ||
620 | +1099 | ||
621 | +1076 | ||
622 | +1086 | ||
623 | +1098 | ||
624 | +1094 | ||
625 | +1095 | ||
626 | +1096 | ||
627 | +1101 | ||
628 | +1107 | ||
629 | +1105 | ||
630 | +1117 | ||
631 | +1093 | ||
632 | +1106 | ||
633 | +1122 | ||
634 | +1119 | ||
635 | +1103 | ||
636 | +1128 | ||
637 | +1120 | ||
638 | +1126 | ||
639 | +1102 | ||
640 | +1115 | ||
641 | +1124 | ||
642 | +1123 | ||
643 | +1131 | ||
644 | +1136 | ||
645 | +1144 | ||
646 | +1121 | ||
647 | +1137 | ||
648 | +1132 | ||
649 | +1133 | ||
650 | +1157 | ||
651 | +1134 | ||
652 | +1143 | ||
653 | +1159 | ||
654 | +1164 | ||
655 | +1155 | ||
656 | +1142 | ||
657 | +1150 | ||
658 | +1148 | ||
659 | +1161 | ||
660 | +1165 | ||
661 | +1147 | ||
662 | +1162 | ||
663 | +1152 | ||
664 | +1174 | ||
665 | +1160 | ||
666 | +1166 | ||
667 | +1190 | ||
668 | +1175 | ||
669 | +1167 | ||
670 | +1156 | ||
671 | +1180 | ||
672 | +1171 | ||
673 | +1179 | ||
674 | +1172 | ||
675 | +1186 | ||
676 | +1188 | ||
677 | +1201 | ||
678 | +1177 | ||
679 | +1208 | ||
680 | +1183 | ||
681 | +1189 | ||
682 | +1192 | ||
683 | +1209 | ||
684 | +1214 | ||
685 | +1197 | ||
686 | +1168 | ||
687 | +1202 | ||
688 | +1205 | ||
689 | +1203 | ||
690 | +1199 | ||
691 | +1219 | ||
692 | +1217 | ||
693 | +1187 | ||
694 | +1206 | ||
695 | +1210 | ||
696 | +1241 | ||
697 | +1221 | ||
698 | +1218 | ||
699 | +1223 | ||
700 | +1236 | ||
701 | +1212 | ||
702 | +1237 | ||
703 | +1195 | ||
704 | +1216 | ||
705 | +1247 | ||
706 | +1234 | ||
707 | +1240 | ||
708 | +1257 | ||
709 | +1224 | ||
710 | +1243 | ||
711 | +1259 | ||
712 | +1242 | ||
713 | +1282 | ||
714 | +1222 | ||
715 | +1254 | ||
716 | +1227 | ||
717 | +1235 | ||
718 | +1269 | ||
719 | +1258 | ||
720 | +1290 | ||
721 | +1275 | ||
722 | +1262 | ||
723 | +1252 | ||
724 | +1248 | ||
725 | +1272 | ||
726 | +1246 | ||
727 | +1225 | ||
728 | +1245 | ||
729 | +1277 | ||
730 | +1298 | ||
731 | +1288 | ||
732 | +1271 | ||
733 | +1265 | ||
734 | +1286 | ||
735 | +1260 | ||
736 | +1266 | ||
737 | +1296 | ||
738 | +1280 | ||
739 | +1285 | ||
740 | +1293 | ||
741 | +1276 | ||
742 | +1287 | ||
743 | +1289 | ||
744 | +1261 | ||
745 | +1264 | ||
746 | +1295 | ||
747 | +1291 | ||
748 | +1283 | ||
749 | +1311 | ||
750 | +1303 | ||
751 | +1330 | ||
752 | +1315 | ||
753 | +1300 | ||
754 | +1333 | ||
755 | +1307 | ||
756 | +1325 | ||
757 | +1334 | ||
758 | +1316 | ||
759 | +1314 | ||
760 | +1317 | ||
761 | +1310 | ||
762 | +1329 | ||
763 | +1324 | ||
764 | +1339 | ||
765 | +1346 | ||
766 | +1342 | ||
767 | +1352 | ||
768 | +1321 | ||
769 | +1376 | ||
770 | +1366 | ||
771 | +1308 | ||
772 | +1345 | ||
773 | +1348 | ||
774 | +1386 | ||
775 | +1383 | ||
776 | +1372 | ||
777 | +1367 | ||
778 | +1400 | ||
779 | +1382 | ||
780 | +1375 | ||
781 | +1392 | ||
782 | +1380 | ||
783 | +1371 | ||
784 | +1393 | ||
785 | +1389 | ||
786 | +1353 | ||
787 | +1387 | ||
788 | +1374 | ||
789 | +1379 | ||
790 | +1381 | ||
791 | +1359 | ||
792 | +1360 | ||
793 | +1396 | ||
794 | +1399 | ||
795 | +1365 | ||
796 | +1424 | ||
797 | +1373 | ||
798 | +1411 | ||
799 | +1401 | ||
800 | +1397 | ||
801 | +1395 | ||
802 | +1412 | ||
803 | +1394 | ||
804 | +1368 | ||
805 | +1423 | ||
806 | +1391 | ||
807 | +1435 | ||
808 | +1409 | ||
809 | +1443 | ||
810 | +1402 | ||
811 | +1425 | ||
812 | +1415 | ||
813 | +1421 | ||
814 | +1426 | ||
815 | +1433 | ||
816 | +1420 | ||
817 | +1452 | ||
818 | +1436 | ||
819 | +1430 | ||
820 | +1408 | ||
821 | +1458 | ||
822 | +1429 | ||
823 | +1453 | ||
824 | +1454 | ||
825 | +1447 | ||
826 | +1472 | ||
827 | +1486 | ||
828 | +1468 | ||
829 | +1461 | ||
830 | +1467 | ||
831 | +1484 | ||
832 | +1457 | ||
833 | +1444 | ||
834 | +1450 | ||
835 | +1451 | ||
836 | +1459 | ||
837 | +1462 | ||
838 | +1449 | ||
839 | +1476 | ||
840 | +1470 | ||
841 | +1471 | ||
842 | +1498 | ||
843 | +1488 | ||
844 | +1442 | ||
845 | +1480 | ||
846 | +1456 | ||
847 | +1466 | ||
848 | +1505 | ||
849 | +1517 | ||
850 | +1464 | ||
851 | +1503 | ||
852 | +1490 | ||
853 | +1519 | ||
854 | +1481 | ||
855 | +1493 | ||
856 | +1463 | ||
857 | +1532 | ||
858 | +1487 | ||
859 | +1501 | ||
860 | +1500 | ||
861 | +1495 | ||
862 | +1509 | ||
863 | +1535 | ||
864 | +1506 | ||
865 | +1521 | ||
866 | +1580 | ||
867 | +1540 | ||
868 | +1502 | ||
869 | +1520 | ||
870 | +1496 | ||
871 | +1569 | ||
872 | +1515 | ||
873 | +1489 | ||
874 | +1507 | ||
875 | +1527 | ||
876 | +1545 | ||
877 | +1560 | ||
878 | +1510 | ||
879 | +1514 | ||
880 | +1526 | ||
881 | +1594 | ||
882 | +1511 | ||
883 | +1572 | ||
884 | +1548 | ||
885 | +1584 | ||
886 | +1556 | ||
887 | +1588 | ||
888 | +1628 | ||
889 | +1555 | ||
890 | +1568 | ||
891 | +1550 | ||
892 | +1622 | ||
893 | +1563 | ||
894 | +1603 | ||
895 | +1616 | ||
896 | +1576 | ||
897 | +1549 | ||
898 | +1537 | ||
899 | +1593 | ||
900 | +1618 | ||
901 | +1645 | ||
902 | +1624 | ||
903 | +1617 | ||
904 | +1634 | ||
905 | +1595 | ||
906 | +1597 | ||
907 | +1590 | ||
908 | +1632 | ||
909 | +1575 | ||
910 | +1559 | ||
911 | +1625 | ||
912 | +1615 | ||
913 | +1591 | ||
914 | +1630 | ||
915 | +1608 | ||
916 | +1621 | ||
917 | +1589 | ||
918 | +1646 | ||
919 | +1643 | ||
920 | +1652 | ||
921 | +1627 | ||
922 | +1611 | ||
923 | +1626 | ||
924 | +1613 | ||
925 | +1639 | ||
926 | +1655 | ||
927 | +1620 | ||
928 | +1602 | ||
929 | +1651 | ||
930 | +1653 | ||
931 | +1669 | ||
932 | +1638 | ||
933 | +1696 | ||
934 | +1649 | ||
935 | +1675 | ||
936 | +1660 | ||
937 | +1683 | ||
938 | +1666 | ||
939 | +1671 | ||
940 | +1703 | ||
941 | +1716 | ||
942 | +1637 | ||
943 | +1672 | ||
944 | +1676 | ||
945 | +1692 | ||
946 | +1711 | ||
947 | +1680 | ||
948 | +1641 | ||
949 | +1688 | ||
950 | +1708 | ||
951 | +1704 | ||
952 | +1690 | ||
953 | +1674 | ||
954 | +1718 | ||
955 | +1699 | ||
956 | +1723 | ||
957 | +1756 | ||
958 | +1700 | ||
959 | +1662 | ||
960 | +1715 | ||
961 | +1657 | ||
962 | +1733 | ||
963 | +1728 | ||
964 | +1670 | ||
965 | +1712 | ||
966 | +1685 | ||
967 | +1724 | ||
968 | +1735 | ||
969 | +1714 | ||
970 | +1730 | ||
971 | +1747 | ||
972 | +1656 | ||
973 | +1737 | ||
974 | +1705 | ||
975 | +1693 | ||
976 | +1713 | ||
977 | +1689 | ||
978 | +1753 | ||
979 | +1739 | ||
980 | +1721 | ||
981 | +1725 | ||
982 | +1749 | ||
983 | +1732 | ||
984 | +1743 | ||
985 | +1731 | ||
986 | +1767 | ||
987 | +1738 | ||
988 | +1831 | ||
989 | +1771 | ||
990 | +1726 | ||
991 | +1746 | ||
992 | +1776 | ||
993 | +1775 | ||
994 | +1799 | ||
995 | +1774 | ||
996 | +1780 | ||
997 | +1781 | ||
998 | +1769 | ||
999 | +1805 | ||
1000 | +1788 | ||
1001 | +1801 |
esot3ria/statics/vocabulary.csv
0 → 100644
This diff could not be displayed because it is too large.
esot3ria/tag_vector_generator.py
0 → 100644
1 | +import nltk | ||
2 | +import gensim | ||
3 | +import pandas as pd | ||
4 | + | ||
# Load files.
nltk.download('stopwords')
# NOTE(review): this change also ships statics/vocabulary.csv; confirm that
# '../vocabulary.csv' is really the intended input location.
vocab = pd.read_csv('../vocabulary.csv')


def _strip_parenthetical(name):
    """Return `name` cut at the first " (", leaving non-string cells untouched."""
    if isinstance(name, str):
        return name.partition(" (")[0]
    return name


# Lower-case the corpus and replace every non-alphanumeric character with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip this clean-up.
vocab['WikiDescription'] = (
    vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z0-9]', ' ', regex=True)
)

# Drop the " (...)" qualifier from names, then lower-case them. Whole-column
# assignment replaces the per-cell chained assignment (vocab['Name'][i] = ...),
# which pandas does not guarantee to write back into the frame.
vocab['Name'] = vocab['Name'].map(_strip_parenthetical).str.lower()

# Combine separated names (mobile phone -> mobile-phone).
multiword_names = [
    (name, name.replace(" ", "-"))
    for name in vocab['Name']
    if isinstance(name, str) and " " in name
]


def _hyphenate_names(text):
    """Replace each multi-word name found in `text` with its hyphenated form."""
    if not isinstance(text, str):
        return text
    # Replacements run in vocabulary order, the same order the per-name loop used.
    for name, combined_name in multiword_names:
        text = text.replace(name, combined_name)
    return text


vocab['WikiDescription'] = vocab['WikiDescription'].map(_hyphenate_names)


# Remove stopwords from corpus.
stop_words = nltk.corpus.stopwords.words('english')
stop_re = r'\b' + r'\b|\b'.join(stop_words) + r'\b'
vocab['WikiDescription'] = vocab['WikiDescription'].str.replace(stop_re, '', regex=True)
vocab['WikiDescription'] = vocab['WikiDescription'].str.split()

# Tokenize corpus. After str.split() a cell is either a token list or a missing
# value, so keep only the lists.
tokenlist = [tokens for tokens in vocab['WikiDescription'] if isinstance(tokens, list)]
phrases = gensim.models.phrases.Phrases(tokenlist)
phraser = gensim.models.phrases.Phraser(phrases)
# NOTE(review): vocab_phrased is built but never used; the model below is
# trained on the un-phrased tokenlist. Confirm which corpus is intended before
# changing it, because that would change the saved vectors.
vocab_phrased = phraser[tokenlist]

# Vectorize tags. min_count=1 keeps every token, including ones seen only once.
w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, min_count=1)
w2v.save('tag_vectors.model')

# Example: reading the raw vectors back out of the trained model.
# word_vectors = w2v.wv
# vocabs = word_vectors.vocab.keys()
# word_vectors_list = [word_vectors[v] for v in vocabs]
esot3ria/tag_vectors.model
0 → 100644
This file is too large to display.
esot3ria/video_recommender.py
0 → 100644
1 | +from gensim.models import Word2Vec | ||
2 | +import numpy as np | ||
3 | + | ||
def recommend_videos(tags, tag_model_path, video_model_path, top_k):
    """Return the ids of the videos most similar to a weighted set of tags.

    Args:
        tags: Iterable of ``(tag, weight)`` pairs; ``weight`` is converted
            with ``float()``.
        tag_model_path: Path of the saved Word2Vec model holding tag vectors.
        video_model_path: Path of the saved keyed vectors holding one vector
            per video id.
        top_k: Maximum number of video ids to return.

    Returns:
        A list of video ids ordered as returned by ``similar_by_vector``, or
        an empty list when none of the tags is present in the tag model.
    """
    # Local import keeps the file's top-level imports untouched. Loading via
    # the class avoids building a throwaway Word2Vec() just to reach `load`.
    from gensim.models import KeyedVectors

    tag_vectors = Word2Vec.load(tag_model_path).wv
    video_vectors = KeyedVectors.load(video_model_path)

    # Size the accumulator from the model instead of assuming 100 dimensions.
    video_vector = np.zeros(tag_vectors.vector_size)
    matched_any = False
    for tag, weight in tags:
        # `in tag_vectors` is a keyed lookup; scanning the index_to_key list
        # was a linear search for every tag. Unknown tags are skipped.
        if tag in tag_vectors:
            video_vector = video_vector + (tag_vectors[tag] * float(weight))
            matched_any = True

    if not matched_any:
        # An all-zero query vector has no direction, so any "nearest" videos
        # returned for it would be arbitrary.
        return []

    neighbours = video_vectors.similar_by_vector(video_vector, topn=top_k)
    return [video_id for video_id, _ in neighbours]
esot3ria/video_util.py
0 → 100644
1 | +import requests | ||
2 | +import pandas as pd | ||
3 | + | ||
# Endpoint queried with a dataset video id; the response text contains the
# YouTube id that getURL() appends to youtube_url.
base_URL = 'https://data.yt8m.org/2/j/i/'
youtube_url = 'https://www.youtube.com/watch?v='

# Seconds to wait for the lookup service before giving up on a request.
REQUEST_TIMEOUT = 10


def getURL(vid_id):
    """Resolve a dataset video id to a YouTube watch URL.

    Args:
        vid_id: Dataset video id string.

    Returns:
        The watch URL, or ``None`` when the lookup request fails or does not
        answer with HTTP 200.
    """
    URL = base_URL + vid_id[:-2] + '/' + vid_id + '.js'
    try:
        # NOTE(review): verify=False disables TLS certificate checking. It is
        # kept as-is so existing environments keep working, but it should be
        # removed if the endpoint's certificate validates.
        # Without a timeout a stalled connection would block forever.
        response = requests.get(URL, verify=False, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Treat transport failures like a non-200 answer: no URL available.
        return None

    if response.status_code != 200:
        return None
    # Drops a fixed 10-character prefix and 3-character suffix around the id;
    # presumably the payload looks like i("<vid_id>","<youtube_id>"); with a
    # 4-character vid_id -- TODO confirm against the service.
    return youtube_url + response.text[10:-3]
13 | + | ||
14 | + | ||
def getVideoInfo(vid_id, video_tags_path, top_k):
    """Collect the watch URL and the top tags recorded for one video.

    Args:
        vid_id: Dataset video id, matched against the ``vid_id`` column.
        video_tags_path: Path of the CSV with a ``vid_id`` column and
            ``segment1`` .. ``segmentN`` columns holding ``"tag:weight"``.
        top_k: Number of ``segment`` columns to read, starting at ``segment1``.

    Returns:
        A dict with ``"video_url"`` (as returned by ``getURL``) and
        ``"video_tags"`` (list of tag names). The tag list is empty when the
        video id is not present in the CSV.
    """
    video_url = getURL(vid_id)

    entire_video_tags = pd.read_csv(video_tags_path)
    video_tags_info = entire_video_tags.loc[entire_video_tags["vid_id"] == vid_id]

    video_tags = []
    # An unknown id used to fail with an opaque IndexError from `.values[0]`;
    # report it as "no tags" instead.
    if not video_tags_info.empty:
        first_match = video_tags_info.iloc[0]
        for i in range(1, top_k + 1):
            video_tag_tuple = first_match["segment" + str(i)]  # ex: "mobile-phone:0.361"
            # Missing cells are read as NaN (a float) and cannot be split.
            if isinstance(video_tag_tuple, str):
                video_tags.append(video_tag_tuple.split(":")[0])

    return {
        "video_url": video_url,
        "video_tags": video_tags
    }
esot3ria/video_vector_generator.py
0 → 100644
1 | +import pandas as pd | ||
2 | +import numpy as np | ||
3 | +from gensim.models import Word2Vec | ||
4 | + | ||
# Number of video vectors collected before they are added to the model in bulk.
BATCH_SIZE = 1000


def vectorization_video():
    """Print a fixed placeholder vector string; nothing is computed here."""
    placeholder = '[0.1 0.2]'
    print(placeholder)
10 | + | ||
11 | + | ||
def main():
    """Build one vector per video from its weighted tags and save them.

    Reads ``tag_vectors.model`` and ``statics/kaggle_solution_40k.csv`` from
    the working directory and writes ``video_vectors.model``. Each video
    vector is the sum of ``tag_vector * weight`` over the video's first five
    ``"tag:weight"`` columns; tags missing from the tag model are skipped and
    listed at the end.
    """
    # Imported locally so the module's existing top-level imports stay as they are.
    from gensim.models import KeyedVectors

    tag_vectors = Word2Vec.load("tag_vectors.model").wv
    # Take the dimensionality from the tag model rather than assuming 100.
    vector_size = tag_vectors.vector_size
    video_vectors = KeyedVectors(vector_size=vector_size)  # Empty model

    # Load video recommendation tags.
    video_tags = pd.read_csv('statics/kaggle_solution_40k.csv')

    # Define batch variables.
    batch_video_ids = []
    batch_video_vectors = []
    # dict used as an insertion-ordered set: O(1) membership, stable print order.
    error_tags = {}

    # Plain tuples allow positional access; integer indexing into a
    # label-indexed row Series is deprecated in pandas.
    rows = video_tags.itertuples(index=False, name=None)
    for count, row in enumerate(rows, start=1):
        video_id = row[0]
        video_vector = np.zeros(vector_size)
        for segment_index in range(1, 6):
            # rsplit keeps a tag that itself contains ':' in one piece.
            tag, weight = row[segment_index].rsplit(":", 1)
            # `in tag_vectors` works on gensim 4, where `.vocab` was removed.
            if tag in tag_vectors:
                video_vector = video_vector + (tag_vectors[tag] * float(weight))
            else:
                # Pass if tag is unknown
                error_tags[tag] = None

        batch_video_ids.append(video_id)
        batch_video_vectors.append(video_vector)
        # Add video vectors.
        if count % BATCH_SIZE == 0:
            video_vectors.add_vectors(batch_video_ids, batch_video_vectors)
            batch_video_ids = []
            batch_video_vectors = []
            print("Video vectors created: ", count)

    # Add rest of video vectors (nothing is left when the row count is an
    # exact multiple of BATCH_SIZE).
    if batch_video_ids:
        video_vectors.add_vectors(batch_video_ids, batch_video_vectors)
    print("error tags: ")
    print(list(error_tags))

    video_vectors.save("video_vectors.model")

    # Usage
    # video_vectors = Word2Vec().wv.load("video_vectors.model")
    # video_vectors.most_similar("XwFj", topn=5)


if __name__ == '__main__':
    main()
54 | + # video_vectors.most_similar("XwFj", topn=5) |
esot3ria/video_vectors.model
0 → 100644
This file is too large to display.
-
Please register or login to post a comment