Showing
8 changed files
with
45 additions
and
45 deletions
This diff could not be displayed because it is too large.
... | @@ -7,12 +7,12 @@ nltk.download('stopwords') | ... | @@ -7,12 +7,12 @@ nltk.download('stopwords') |
7 | vocab = pd.read_csv('../vocabulary.csv') | 7 | vocab = pd.read_csv('../vocabulary.csv') |
8 | 8 | ||
9 | # Lower corpus and Remove () from name. | 9 | # Lower corpus and Remove () from name. |
10 | -vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z]', ' ') | 10 | +vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z0-9]', ' ') |
11 | -vocab['Name'] = vocab['Name'].str.lower() | ||
12 | for i in range(vocab['Name'].__len__()): | 11 | for i in range(vocab['Name'].__len__()): |
13 | name = vocab['Name'][i] | 12 | name = vocab['Name'][i] |
14 | if isinstance(name, str) and name.find(" (") != -1: | 13 | if isinstance(name, str) and name.find(" (") != -1: |
15 | vocab['Name'][i] = name[:name.find(" (")] | 14 | vocab['Name'][i] = name[:name.find(" (")] |
15 | +vocab['Name'] = vocab['Name'].str.lower() | ||
16 | 16 | ||
17 | # Combine separated names.(mobile phone -> mobile-phone) | 17 | # Combine separated names.(mobile phone -> mobile-phone) |
18 | for name in vocab['Name']: | 18 | for name in vocab['Name']: |
... | @@ -35,9 +35,9 @@ phraser = gensim.models.phrases.Phraser(phrases) | ... | @@ -35,9 +35,9 @@ phraser = gensim.models.phrases.Phraser(phrases) |
35 | vocab_phrased = phraser[tokenlist] | 35 | vocab_phrased = phraser[tokenlist] |
36 | 36 | ||
37 | # Vectorize tags. | 37 | # Vectorize tags. |
38 | -w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, workers=2, min_count=1) | 38 | +w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, min_count=1) |
39 | -w2v.save('tags_word2vec.model') | 39 | +w2v.save('tag_vectors.model') |
40 | 40 | ||
41 | # word_vectors = w2v.wv | 41 | # word_vectors = w2v.wv |
42 | # vocabs = word_vectors.vocab.keys() | 42 | # vocabs = word_vectors.vocab.keys() |
43 | -# word_vectors_list = [word_vectors[v] for v in vocabs] | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
43 | +# word_vectors_list = [word_vectors[v] for v in vocabs] | ... | ... |
web/backend/yt8m/esot3ria/tag_vectors.model
0 → 100644
This file is too large to display.
This file is too large to display.
1 | -vid_id,seg1,seg2,seg3,seg4,seg5 | ||
2 | -Ndaa,Sports car:0.202,Shower:0.200,Racing:0.200,Greeting card:0.200,Car:0.199 | ||
3 | -Dvaa,Tractor:0.363,Motorsport:0.323,Dance:0.145,Flour:0.092,Cappuccino:0.076 | ||
4 | -gEaa,Cooking:0.246,Food:0.243,Dish (food):0.224,Vegetable:0.167,:0.120 | ||
5 | -Pwaa,Dance:0.633,Wing Chun:0.095,Pencil:0.095,Eye shadow:0.095,Rubber band:0.083 | ||
6 | -jgaa,Concert:0.332,Motorsport:0.209,Motorcycling:0.194,Motorcycle:0.159,Bicycle:0.106 | ||
7 | -1Yaa,Concert:0.249,Dance:0.191,Tuna:0.188,Airplane:0.187,Association football:0.185 | ||
8 | -yVaa,Weight training:0.372,Sport utility vehicle:0.241,Barbell:0.147,Luxury yacht:0.123,Icing (food):0.117 | ||
9 | -BCaa,Mobile phone:0.397,Smartphone:0.395,Dance:0.090,Samsung Galaxy:0.073,Alpine skiing:0.046 | ||
10 | -38aa,Food:0.269,Gold:0.211,Raven (comics):0.208,Car:0.171,Marching band:0.141 | ||
11 | -AFaa,Car:0.386,Sports car:0.276,Motorsport:0.202,Volkswagen:0.078,Food:0.058 | ||
12 | -Ajaa,Concert:0.355,Soldier:0.289,Cello:0.146,Drum kit:0.114,Arena:0.096 | ||
13 | -2Faa,Orchestra:0.424,Disc jockey:0.288,Inflatable boat:0.115,Vegetarian cuisine:0.096,Concert:0.077 | ||
14 | -ujaa,Mobile phone:0.273,Smartphone:0.215,IPhone 5S:0.199,Acoustic guitar:0.170,Door:0.143 | ||
15 | -e2aa,Food:0.319,Cooking:0.313,Dish (food):0.285,Pikachu:0.048,Headset (audio):0.036 | ||
16 | -UTaa,Pet:0.376,Wig:0.172,Mobile phone:0.170,Easter egg:0.156,Food:0.126 | ||
17 | -12aa,Railroad car:0.342,Train:0.300,Muffler:0.142,Car:0.115,BMW 3 Series:0.101 | ||
18 | -Duaa,Jaguar Cars:0.379,MacBook Air:0.189,Ferrari F430:0.168,Coupon:0.137,Hang gliding:0.126 | ||
19 | -cpab,Car:0.408,Sports car:0.254,Motorsport:0.139,Sedan (automobile):0.139,Racing:0.060 | ||
20 | -4rab,Food:0.310,Cooking:0.286,Dish (food):0.265,Meat:0.100,Bee:0.040 | ||
21 | -Vtab,Choir:0.228,Handball:0.201,Hot air balloon:0.200,Fishing:0.199,Sedan (automobile):0.172 | ||
22 | -gkab,Pet:0.374,Mercedes-Benz C-Class:0.285,Cat:0.162,Belle (Disney):0.111,Electric car:0.068 | ||
23 | -RJab,Beer:0.317,Electric car:0.268,Acoustic guitar:0.169,Eye shadow:0.162,Vending machine:0.084 | ||
24 | -utab,Concert:0.303,Booster pack:0.279,Fishing:0.159,Culinary art:0.138,Hair coloring:0.121 | ||
25 | -Aeab,Samurai:0.278,Fishing:0.240,Association football:0.167,Chevrolet Corvette:0.167,Slam dunk:0.148 | ||
26 | -t4ab,Association football:0.520,Barbell:0.166,Teacher:0.105,Biceps curl:0.105,Parachute:0.104 | ||
27 | -53ab,Food:0.315,Cooking:0.269,Dish (food):0.257,Concealer:0.113,Bowling ball:0.046 | ||
28 | -kaab,Necktie:0.257,Primary school:0.209,Turbine:0.187,Guitar amplifier:0.184,Dance:0.163 | ||
29 | -Kdab,Cooking:0.306,Food:0.217,Train:0.175,Acoustic guitar:0.166,Tram:0.137 | ||
30 | -Smab,Association football:0.292,Airbus A320 family:0.210,Racing:0.167,Vampire:0.165,Robot:0.165 | ||
31 | -rAab,Association football:0.559,Pool (cue sports):0.170,Full moon:0.111,Fishing bait:0.091,Eye liner:0.070 | ||
32 | -U3ab,Bride:0.414,Mobile phone:0.267,Smartphone:0.133,Mercedes-Benz C-Class:0.106,Loudspeaker:0.080 | ||
33 | -mBab,Food:0.281,Cooking:0.261,Dish (food):0.260,:0.144,Vegetable:0.054 | ||
34 | -18ab,Cooking:0.243,Dish (food):0.241,Food:0.239,Vegetable:0.166,:0.112 | ||
35 | -NKab,Apartment:0.309,Piano:0.201,Association football:0.179,Table (furniture):0.176,Television set:0.134 |
1 | import pandas as pd | 1 | import pandas as pd |
2 | +import numpy as np | ||
2 | from gensim.models import Word2Vec | 3 | from gensim.models import Word2Vec |
3 | 4 | ||
5 | +BATCH_SIZE = 1000 | ||
6 | + | ||
4 | 7 | ||
5 | def vectorization_video(): | 8 | def vectorization_video(): |
6 | print('[0.1 0.2]') | 9 | print('[0.1 0.2]') |
7 | 10 | ||
8 | 11 | ||
9 | if __name__ == '__main__': | 12 | if __name__ == '__main__': |
10 | - tag_vectors = Word2Vec.load("esot3ria/tags_word2vec.model").wv | 13 | + tag_vectors = Word2Vec.load("tag_vectors.model").wv |
11 | video_vectors = Word2Vec().wv # Empty model | 14 | video_vectors = Word2Vec().wv # Empty model |
12 | 15 | ||
13 | # Load video recommendation tags. | 16 | # Load video recommendation tags. |
14 | - video_tags = pd.read_csv('esot3ria/video_recommendation_tags.csv') | 17 | + video_tags = pd.read_csv('kaggle_solution_40k.csv') |
18 | + | ||
19 | + # Define batch variables. | ||
20 | + batch_video_ids = [] | ||
21 | + batch_video_vectors = [] | ||
22 | + error_tags = [] | ||
23 | + | ||
15 | for i, row in video_tags.iterrows(): | 24 | for i, row in video_tags.iterrows(): |
16 | video_id = row[0] | 25 | video_id = row[0] |
26 | + video_vector = np.zeros(100) | ||
27 | + for segment_index in range(1, 6): | ||
28 | + tag, weight = row[segment_index].split(":") | ||
29 | + if tag in tag_vectors.vocab: | ||
30 | + video_vector = video_vector + (tag_vectors[tag] * float(weight)) | ||
31 | + else: | ||
32 | + # Pass if tag is unknown | ||
33 | + if tag not in error_tags: | ||
34 | + error_tags.append(tag) | ||
35 | + | ||
36 | + batch_video_ids.append(video_id) | ||
37 | + batch_video_vectors.append(video_vector) | ||
38 | + # Add video vectors. | ||
39 | + if (i+1) % BATCH_SIZE == 0: | ||
40 | + video_vectors.add(batch_video_ids, batch_video_vectors) | ||
41 | + batch_video_ids = [] | ||
42 | + batch_video_vectors = [] | ||
43 | + print("Video vectors created: ", i+1) | ||
44 | + | ||
45 | + # Add rest of video vectors. | ||
46 | + video_vectors.add(batch_video_ids, batch_video_vectors) | ||
47 | + print("error tags: ") | ||
48 | + print(error_tags) | ||
17 | 49 | ||
50 | + video_vectors.save("video_vectors.model") | ||
18 | 51 | ||
52 | + # Usage | ||
53 | + # video_vectors = Word2Vec().wv.load("video_vectors.model") | ||
54 | + # video_vectors.most_similar("XwFj", topn=5) | ... | ... |
This file is too large to display.
... | @@ -347,8 +347,8 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size, | ... | @@ -347,8 +347,8 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size, |
347 | #======================================= | 347 | #======================================= |
348 | segment_id = str(segment_id.split(":")[0]) | 348 | segment_id = str(segment_id.split(":")[0]) |
349 | if segment_id not in segment_id_list: | 349 | if segment_id not in segment_id_list: |
350 | - segment_id_list.append(str(segment_id)) | 350 | + segment_id_list.append(str(segment_id)) |
351 | - segment_classes.append("") | 351 | + segment_classes.append("") |
352 | 352 | ||
353 | index = segment_id_list.index(segment_id) | 353 | index = segment_id_list.index(segment_id) |
354 | 354 | ||
... | @@ -377,7 +377,6 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size, | ... | @@ -377,7 +377,6 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size, |
377 | demoninator = float(temp[0][1] + temp[1][1] + temp[2][1] + temp[3][1] + temp[4][1]) | 377 | demoninator = float(temp[0][1] + temp[1][1] + temp[2][1] + temp[3][1] + temp[4][1]) |
378 | #for item in temp: | 378 | #for item in temp: |
379 | for itemIndex in range(0, top_k): | 379 | for itemIndex in range(0, top_k): |
380 | - # 20.05.31 Esot3riA | ||
381 | # Normalize tag name | 380 | # Normalize tag name |
382 | segment_tag = str(voca_dict[str(temp[itemIndex][0])]) | 381 | segment_tag = str(voca_dict[str(temp[itemIndex][0])]) |
383 | normalized_tag = normalize_tag(segment_tag) | 382 | normalized_tag = normalize_tag(segment_tag) | ... | ... |
-
Please register or login to post a comment