이현규

Create tag vectors and video vectors

This diff could not be displayed because it is too large.
...@@ -7,12 +7,12 @@ nltk.download('stopwords') ...@@ -7,12 +7,12 @@ nltk.download('stopwords')
7 vocab = pd.read_csv('../vocabulary.csv') 7 vocab = pd.read_csv('../vocabulary.csv')
8 8
9 # Lower corpus and Remove () from name. 9 # Lower corpus and Remove () from name.
10 -vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z]', ' ') 10 +vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z0-9]', ' ')
11 -vocab['Name'] = vocab['Name'].str.lower()
12 for i in range(vocab['Name'].__len__()): 11 for i in range(vocab['Name'].__len__()):
13 name = vocab['Name'][i] 12 name = vocab['Name'][i]
14 if isinstance(name, str) and name.find(" (") != -1: 13 if isinstance(name, str) and name.find(" (") != -1:
15 vocab['Name'][i] = name[:name.find(" (")] 14 vocab['Name'][i] = name[:name.find(" (")]
15 +vocab['Name'] = vocab['Name'].str.lower()
16 16
17 # Combine separated names.(mobile phone -> mobile-phone) 17 # Combine separated names.(mobile phone -> mobile-phone)
18 for name in vocab['Name']: 18 for name in vocab['Name']:
...@@ -35,8 +35,8 @@ phraser = gensim.models.phrases.Phraser(phrases) ...@@ -35,8 +35,8 @@ phraser = gensim.models.phrases.Phraser(phrases)
35 vocab_phrased = phraser[tokenlist] 35 vocab_phrased = phraser[tokenlist]
36 36
37 # Vectorize tags. 37 # Vectorize tags.
38 -w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, workers=2, min_count=1) 38 +w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, min_count=1)
39 -w2v.save('tags_word2vec.model') 39 +w2v.save('tag_vectors.model')
40 40
41 # word_vectors = w2v.wv 41 # word_vectors = w2v.wv
42 # vocabs = word_vectors.vocab.keys() 42 # vocabs = word_vectors.vocab.keys()
......
This file is too large to display.
This file is too large to display.
1 -vid_id,seg1,seg2,seg3,seg4,seg5
2 -Ndaa,Sports car:0.202,Shower:0.200,Racing:0.200,Greeting card:0.200,Car:0.199
3 -Dvaa,Tractor:0.363,Motorsport:0.323,Dance:0.145,Flour:0.092,Cappuccino:0.076
4 -gEaa,Cooking:0.246,Food:0.243,Dish (food):0.224,Vegetable:0.167,:0.120
5 -Pwaa,Dance:0.633,Wing Chun:0.095,Pencil:0.095,Eye shadow:0.095,Rubber band:0.083
6 -jgaa,Concert:0.332,Motorsport:0.209,Motorcycling:0.194,Motorcycle:0.159,Bicycle:0.106
7 -1Yaa,Concert:0.249,Dance:0.191,Tuna:0.188,Airplane:0.187,Association football:0.185
8 -yVaa,Weight training:0.372,Sport utility vehicle:0.241,Barbell:0.147,Luxury yacht:0.123,Icing (food):0.117
9 -BCaa,Mobile phone:0.397,Smartphone:0.395,Dance:0.090,Samsung Galaxy:0.073,Alpine skiing:0.046
10 -38aa,Food:0.269,Gold:0.211,Raven (comics):0.208,Car:0.171,Marching band:0.141
11 -AFaa,Car:0.386,Sports car:0.276,Motorsport:0.202,Volkswagen:0.078,Food:0.058
12 -Ajaa,Concert:0.355,Soldier:0.289,Cello:0.146,Drum kit:0.114,Arena:0.096
13 -2Faa,Orchestra:0.424,Disc jockey:0.288,Inflatable boat:0.115,Vegetarian cuisine:0.096,Concert:0.077
14 -ujaa,Mobile phone:0.273,Smartphone:0.215,IPhone 5S:0.199,Acoustic guitar:0.170,Door:0.143
15 -e2aa,Food:0.319,Cooking:0.313,Dish (food):0.285,Pikachu:0.048,Headset (audio):0.036
16 -UTaa,Pet:0.376,Wig:0.172,Mobile phone:0.170,Easter egg:0.156,Food:0.126
17 -12aa,Railroad car:0.342,Train:0.300,Muffler:0.142,Car:0.115,BMW 3 Series:0.101
18 -Duaa,Jaguar Cars:0.379,MacBook Air:0.189,Ferrari F430:0.168,Coupon:0.137,Hang gliding:0.126
19 -cpab,Car:0.408,Sports car:0.254,Motorsport:0.139,Sedan (automobile):0.139,Racing:0.060
20 -4rab,Food:0.310,Cooking:0.286,Dish (food):0.265,Meat:0.100,Bee:0.040
21 -Vtab,Choir:0.228,Handball:0.201,Hot air balloon:0.200,Fishing:0.199,Sedan (automobile):0.172
22 -gkab,Pet:0.374,Mercedes-Benz C-Class:0.285,Cat:0.162,Belle (Disney):0.111,Electric car:0.068
23 -RJab,Beer:0.317,Electric car:0.268,Acoustic guitar:0.169,Eye shadow:0.162,Vending machine:0.084
24 -utab,Concert:0.303,Booster pack:0.279,Fishing:0.159,Culinary art:0.138,Hair coloring:0.121
25 -Aeab,Samurai:0.278,Fishing:0.240,Association football:0.167,Chevrolet Corvette:0.167,Slam dunk:0.148
26 -t4ab,Association football:0.520,Barbell:0.166,Teacher:0.105,Biceps curl:0.105,Parachute:0.104
27 -53ab,Food:0.315,Cooking:0.269,Dish (food):0.257,Concealer:0.113,Bowling ball:0.046
28 -kaab,Necktie:0.257,Primary school:0.209,Turbine:0.187,Guitar amplifier:0.184,Dance:0.163
29 -Kdab,Cooking:0.306,Food:0.217,Train:0.175,Acoustic guitar:0.166,Tram:0.137
30 -Smab,Association football:0.292,Airbus A320 family:0.210,Racing:0.167,Vampire:0.165,Robot:0.165
31 -rAab,Association football:0.559,Pool (cue sports):0.170,Full moon:0.111,Fishing bait:0.091,Eye liner:0.070
32 -U3ab,Bride:0.414,Mobile phone:0.267,Smartphone:0.133,Mercedes-Benz C-Class:0.106,Loudspeaker:0.080
33 -mBab,Food:0.281,Cooking:0.261,Dish (food):0.260,:0.144,Vegetable:0.054
34 -18ab,Cooking:0.243,Dish (food):0.241,Food:0.239,Vegetable:0.166,:0.112
35 -NKab,Apartment:0.309,Piano:0.201,Association football:0.179,Table (furniture):0.176,Television set:0.134
1 import pandas as pd 1 import pandas as pd
2 +import numpy as np
2 from gensim.models import Word2Vec 3 from gensim.models import Word2Vec
3 4
5 +BATCH_SIZE = 1000
6 +
4 7
def vectorization_video():
    """Print a sample 2-d vector string.

    NOTE(review): this is a stub left by the author — the actual video
    vectorization is performed in the ``__main__`` block below.
    """
    sample = '[0.1 0.2]'
    print(sample)
if __name__ == '__main__':
    # Load the pre-trained tag embeddings (gensim Word2Vec -> KeyedVectors).
    tag_vectors = Word2Vec.load("tag_vectors.model").wv
    video_vectors = Word2Vec().wv  # Empty KeyedVectors store for video embeddings.

    # Each row: video_id followed by five "tag:weight" segment columns.
    video_tags = pd.read_csv('kaggle_solution_40k.csv')

    # Batch buffers: KeyedVectors.add is called once per BATCH_SIZE rows
    # instead of per row, which is much faster.
    batch_video_ids = []
    batch_video_vectors = []
    error_tags = []  # tags not present in the tag-vector vocabulary

    for i, row in video_tags.iterrows():
        video_id = row[0]
        # Video vector = weighted sum of its segment-tag vectors.
        # NOTE(review): 100 assumes the tag model's default vector_size — confirm.
        video_vector = np.zeros(100)
        for segment_index in range(1, 6):
            # rsplit on the LAST colon so tag names that themselves
            # contain ':' do not raise "too many values to unpack".
            tag, weight = row[segment_index].rsplit(":", 1)
            if tag in tag_vectors.vocab:
                video_vector = video_vector + (tag_vectors[tag] * float(weight))
            elif tag not in error_tags:
                # Unknown tag: skip its contribution, remember it for reporting.
                error_tags.append(tag)

        batch_video_ids.append(video_id)
        batch_video_vectors.append(video_vector)
        # Flush a full batch into the KeyedVectors store.
        if (i + 1) % BATCH_SIZE == 0:
            video_vectors.add(batch_video_ids, batch_video_vectors)
            batch_video_ids = []
            batch_video_vectors = []
            print("Video vectors created: ", i + 1)

    # Flush the final partial batch. Guarded so we never call add() with
    # empty lists when the row count is an exact multiple of BATCH_SIZE.
    if batch_video_ids:
        video_vectors.add(batch_video_ids, batch_video_vectors)

    print("error tags: ")
    print(error_tags)

    video_vectors.save("video_vectors.model")

    # Usage
    # video_vectors = Word2Vec().wv.load("video_vectors.model")
    # video_vectors.most_similar("XwFj", topn=5)
......
This file is too large to display.
...@@ -377,7 +377,6 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size, ...@@ -377,7 +377,6 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size,
377 demoninator = float(temp[0][1] + temp[1][1] + temp[2][1] + temp[3][1] + temp[4][1]) 377 demoninator = float(temp[0][1] + temp[1][1] + temp[2][1] + temp[3][1] + temp[4][1])
378 #for item in temp: 378 #for item in temp:
379 for itemIndex in range(0, top_k): 379 for itemIndex in range(0, top_k):
380 - # 20.05.31 Esot3riA
381 # Normalize tag name 380 # Normalize tag name
382 segment_tag = str(voca_dict[str(temp[itemIndex][0])]) 381 segment_tag = str(voca_dict[str(temp[itemIndex][0])])
383 normalized_tag = normalize_tag(segment_tag) 382 normalized_tag = normalize_tag(segment_tag)
......