Yoonjunhyeon
This diff could not be displayed because it is too large.
...@@ -4,15 +4,15 @@ import pandas as pd ...@@ -4,15 +4,15 @@ import pandas as pd
4 4
5 # Load files. 5 # Load files.
6 nltk.download('stopwords') 6 nltk.download('stopwords')
7 -vocab = pd.read_csv('vocabulary.csv') 7 +vocab = pd.read_csv('../vocabulary.csv')
8 8
9 # Lower corpus and Remove () from name. 9 # Lower corpus and Remove () from name.
10 -vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z]', ' ') 10 +vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z0-9]', ' ')
11 -vocab['Name'] = vocab['Name'].str.lower()
12 for i in range(vocab['Name'].__len__()): 11 for i in range(vocab['Name'].__len__()):
13 name = vocab['Name'][i] 12 name = vocab['Name'][i]
14 if isinstance(name, str) and name.find(" (") != -1: 13 if isinstance(name, str) and name.find(" (") != -1:
15 vocab['Name'][i] = name[:name.find(" (")] 14 vocab['Name'][i] = name[:name.find(" (")]
15 +vocab['Name'] = vocab['Name'].str.lower()
16 16
17 # Combine separated names.(mobile phone -> mobile-phone) 17 # Combine separated names.(mobile phone -> mobile-phone)
18 for name in vocab['Name']: 18 for name in vocab['Name']:
...@@ -35,9 +35,9 @@ phraser = gensim.models.phrases.Phraser(phrases) ...@@ -35,9 +35,9 @@ phraser = gensim.models.phrases.Phraser(phrases)
35 vocab_phrased = phraser[tokenlist] 35 vocab_phrased = phraser[tokenlist]
36 36
37 # Vectorize tags. 37 # Vectorize tags.
38 -w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, workers=2, min_count=1) 38 +w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, min_count=1)
39 -w2v.save('tags_word2vec.model') 39 +w2v.save('tag_vectors.model')
40 40
41 -word_vectors = w2v.wv
42 -vocabs = word_vectors.vocab.keys()
43 -word_vectors_list = [word_vectors[v] for v in vocabs]
...\ No newline at end of file ...\ No newline at end of file
41 +# word_vectors = w2v.wv
42 +# vocabs = word_vectors.vocab.keys()
43 +# word_vectors_list = [word_vectors[v] for v in vocabs]
......
This file is too large to display.
This file is too large to display.
1 -vid_id,seg1,seg2,seg3,seg4,seg5
2 -Ndaa,Sports car:0.202,Shower:0.200,Racing:0.200,Greeting card:0.200,Car:0.199
3 -Dvaa,Tractor:0.363,Motorsport:0.323,Dance:0.145,Flour:0.092,Cappuccino:0.076
4 -gEaa,Cooking:0.246,Food:0.243,Dish (food):0.224,Vegetable:0.167,:0.120
5 -Pwaa,Dance:0.633,Wing Chun:0.095,Pencil:0.095,Eye shadow:0.095,Rubber band:0.083
6 -jgaa,Concert:0.332,Motorsport:0.209,Motorcycling:0.194,Motorcycle:0.159,Bicycle:0.106
7 -1Yaa,Concert:0.249,Dance:0.191,Tuna:0.188,Airplane:0.187,Association football:0.185
8 -yVaa,Weight training:0.372,Sport utility vehicle:0.241,Barbell:0.147,Luxury yacht:0.123,Icing (food):0.117
9 -BCaa,Mobile phone:0.397,Smartphone:0.395,Dance:0.090,Samsung Galaxy:0.073,Alpine skiing:0.046
10 -38aa,Food:0.269,Gold:0.211,Raven (comics):0.208,Car:0.171,Marching band:0.141
11 -AFaa,Car:0.386,Sports car:0.276,Motorsport:0.202,Volkswagen:0.078,Food:0.058
12 -Ajaa,Concert:0.355,Soldier:0.289,Cello:0.146,Drum kit:0.114,Arena:0.096
13 -2Faa,Orchestra:0.424,Disc jockey:0.288,Inflatable boat:0.115,Vegetarian cuisine:0.096,Concert:0.077
14 -ujaa,Mobile phone:0.273,Smartphone:0.215,IPhone 5S:0.199,Acoustic guitar:0.170,Door:0.143
15 -e2aa,Food:0.319,Cooking:0.313,Dish (food):0.285,Pikachu:0.048,Headset (audio):0.036
16 -UTaa,Pet:0.376,Wig:0.172,Mobile phone:0.170,Easter egg:0.156,Food:0.126
17 -12aa,Railroad car:0.342,Train:0.300,Muffler:0.142,Car:0.115,BMW 3 Series:0.101
18 -Duaa,Jaguar Cars:0.379,MacBook Air:0.189,Ferrari F430:0.168,Coupon:0.137,Hang gliding:0.126
19 -cpab,Car:0.408,Sports car:0.254,Motorsport:0.139,Sedan (automobile):0.139,Racing:0.060
20 -4rab,Food:0.310,Cooking:0.286,Dish (food):0.265,Meat:0.100,Bee:0.040
21 -Vtab,Choir:0.228,Handball:0.201,Hot air balloon:0.200,Fishing:0.199,Sedan (automobile):0.172
22 -gkab,Pet:0.374,Mercedes-Benz C-Class:0.285,Cat:0.162,Belle (Disney):0.111,Electric car:0.068
23 -RJab,Beer:0.317,Electric car:0.268,Acoustic guitar:0.169,Eye shadow:0.162,Vending machine:0.084
24 -utab,Concert:0.303,Booster pack:0.279,Fishing:0.159,Culinary art:0.138,Hair coloring:0.121
25 -Aeab,Samurai:0.278,Fishing:0.240,Association football:0.167,Chevrolet Corvette:0.167,Slam dunk:0.148
26 -t4ab,Association football:0.520,Barbell:0.166,Teacher:0.105,Biceps curl:0.105,Parachute:0.104
27 -53ab,Food:0.315,Cooking:0.269,Dish (food):0.257,Concealer:0.113,Bowling ball:0.046
28 -kaab,Necktie:0.257,Primary school:0.209,Turbine:0.187,Guitar amplifier:0.184,Dance:0.163
29 -Kdab,Cooking:0.306,Food:0.217,Train:0.175,Acoustic guitar:0.166,Tram:0.137
30 -Smab,Association football:0.292,Airbus A320 family:0.210,Racing:0.167,Vampire:0.165,Robot:0.165
31 -rAab,Association football:0.559,Pool (cue sports):0.170,Full moon:0.111,Fishing bait:0.091,Eye liner:0.070
32 -U3ab,Bride:0.414,Mobile phone:0.267,Smartphone:0.133,Mercedes-Benz C-Class:0.106,Loudspeaker:0.080
33 -mBab,Food:0.281,Cooking:0.261,Dish (food):0.260,:0.144,Vegetable:0.054
34 -18ab,Cooking:0.243,Dish (food):0.241,Food:0.239,Vegetable:0.166,:0.112
35 -NKab,Apartment:0.309,Piano:0.201,Association football:0.179,Table (furniture):0.176,Television set:0.134
1 import pandas as pd 1 import pandas as pd
2 +import numpy as np
2 from gensim.models import Word2Vec 3 from gensim.models import Word2Vec
3 4
5 +BATCH_SIZE = 1000
6 +
4 7
5 def vectorization_video(): 8 def vectorization_video():
6 print('[0.1 0.2]') 9 print('[0.1 0.2]')
7 10
8 11
9 if __name__ == '__main__': 12 if __name__ == '__main__':
10 - tag_vectors = Word2Vec.load("esot3ria/tags_word2vec.model").wv 13 + tag_vectors = Word2Vec.load("tag_vectors.model").wv
11 video_vectors = Word2Vec().wv # Empty model 14 video_vectors = Word2Vec().wv # Empty model
12 15
13 # Load video recommendation tags. 16 # Load video recommendation tags.
14 - video_tags = pd.read_csv('esot3ria/video_recommendation_tags.csv') 17 + video_tags = pd.read_csv('kaggle_solution_40k.csv')
18 +
19 + # Define batch variables.
20 + batch_video_ids = []
21 + batch_video_vectors = []
22 + error_tags = []
23 +
15 for i, row in video_tags.iterrows(): 24 for i, row in video_tags.iterrows():
16 video_id = row[0] 25 video_id = row[0]
26 + video_vector = np.zeros(100)
27 + for segment_index in range(1, 6):
28 + tag, weight = row[segment_index].split(":")
29 + if tag in tag_vectors.vocab:
30 + video_vector = video_vector + (tag_vectors[tag] * float(weight))
31 + else:
32 + # Pass if tag is unknown
33 + if tag not in error_tags:
34 + error_tags.append(tag)
35 +
36 + batch_video_ids.append(video_id)
37 + batch_video_vectors.append(video_vector)
38 + # Add video vectors.
39 + if (i+1) % BATCH_SIZE == 0:
40 + video_vectors.add(batch_video_ids, batch_video_vectors)
41 + batch_video_ids = []
42 + batch_video_vectors = []
43 + print("Video vectors created: ", i+1)
44 +
45 + # Add rest of video vectors.
46 + video_vectors.add(batch_video_ids, batch_video_vectors)
47 + print("error tags: ")
48 + print(error_tags)
17 49
50 + video_vectors.save("video_vectors.model")
18 51
52 + # Usage
53 + # video_vectors = Word2Vec().wv.load("video_vectors.model")
54 + # video_vectors.most_similar("XwFj", topn=5)
......
This file is too large to display.
This diff could not be displayed because it is too large.
...@@ -347,8 +347,8 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size, ...@@ -347,8 +347,8 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size,
347 #======================================= 347 #=======================================
348 segment_id = str(segment_id.split(":")[0]) 348 segment_id = str(segment_id.split(":")[0])
349 if segment_id not in segment_id_list: 349 if segment_id not in segment_id_list:
350 - segment_id_list.append(str(segment_id)) 350 + segment_id_list.append(str(segment_id))
351 - segment_classes.append("") 351 + segment_classes.append("")
352 352
353 index = segment_id_list.index(segment_id) 353 index = segment_id_list.index(segment_id)
354 354
...@@ -377,7 +377,6 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size, ...@@ -377,7 +377,6 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size,
377 demoninator = float(temp[0][1] + temp[1][1] + temp[2][1] + temp[3][1] + temp[4][1]) 377 demoninator = float(temp[0][1] + temp[1][1] + temp[2][1] + temp[3][1] + temp[4][1])
378 #for item in temp: 378 #for item in temp:
379 for itemIndex in range(0, top_k): 379 for itemIndex in range(0, top_k):
380 - # 20.05.31 Esot3riA
381 # Normalize tag name 380 # Normalize tag name
382 segment_tag = str(voca_dict[str(temp[itemIndex][0])]) 381 segment_tag = str(voca_dict[str(temp[itemIndex][0])])
383 normalized_tag = normalize_tag(segment_tag) 382 normalized_tag = normalize_tag(segment_tag)
......
...@@ -140,7 +140,7 @@ ...@@ -140,7 +140,7 @@
140 267,5888,/m/09xzd,Chevrolet,https://en.wikipedia.org/wiki/Chevrolet,Autos & Vehicles,,,"Chevrolet, colloquially referred to as Chevy and formally the Chevrolet Division of General Motors Company, is an American automobile division of the American manufacturer General Motors. Louis Chevrolet and ousted General Motors founder William C. Durant started the company on November 3, 1911 as the Chevrolet Motor Car Company. Durant used the Chevrolet Motor Car Company to acquire a controlling stake in General Motors with a reverse merger occurring on May 2, 1918 and propelled himself back to the GM presidency. After Durant's second ousting in 1919, Alfred Sloan, with his maxim ""a car for every purse and purpose,"" would pick the Chevrolet brand to become the volume leader in the General Motors family, selling mainstream vehicles to compete with Henry Ford's Model T in 1919 and overtaking Ford as the best-selling car in the United States by 1929. Chevrolet-branded vehicles are sold in most automotive markets worldwide, with the notable exception of Oceania, where GM is represented by its Australian subsidiary, Holden." 140 267,5888,/m/09xzd,Chevrolet,https://en.wikipedia.org/wiki/Chevrolet,Autos & Vehicles,,,"Chevrolet, colloquially referred to as Chevy and formally the Chevrolet Division of General Motors Company, is an American automobile division of the American manufacturer General Motors. Louis Chevrolet and ousted General Motors founder William C. Durant started the company on November 3, 1911 as the Chevrolet Motor Car Company. Durant used the Chevrolet Motor Car Company to acquire a controlling stake in General Motors with a reverse merger occurring on May 2, 1918 and propelled himself back to the GM presidency. After Durant's second ousting in 1919, Alfred Sloan, with his maxim ""a car for every purse and purpose,"" would pick the Chevrolet brand to become the volume leader in the General Motors family, selling mainstream vehicles to compete with Henry Ford's Model T in 1919 and overtaking Ford as the best-selling car in the United States by 1929. Chevrolet-branded vehicles are sold in most automotive markets worldwide, with the notable exception of Oceania, where GM is represented by its Australian subsidiary, Holden."
141 268,5844,/m/01d5g,Batman,https://en.wikipedia.org/wiki/Batman,Arts & Entertainment,,,"Batman is a fictional superhero appearing in American comic books published by DC Comics. The character was created by artist Bob Kane and writer Bill Finger, and first appeared in Detective Comics #27. Originally named the ""Bat-Man"", the character is also referred to by such epithets as the Caped Crusader, the Dark Knight, and the World's Greatest Detective. Batman's secret identity is Bruce Wayne, an American billionaire, playboy, philanthropist, and owner of Wayne Enterprises. After witnessing the murder of his parents Thomas Wayne and Martha Wayne as a child, he swore vengeance against criminals, an oath tempered by a sense of justice. Wayne trains himself physically and intellectually and crafts a bat-inspired persona to fight crime. Batman operates in the fictional Gotham City, with assistance from various supporting characters, including his butler Alfred, police commissioner Gordon, and vigilante allies such as Robin. Unlike most superheroes, Batman does not possess any superpowers; rather, he relies on his genius intellect, physical prowess, martial arts abilities, detective skills, science and technology, vast wealth, intimidation, and indomitable will." 141 268,5844,/m/01d5g,Batman,https://en.wikipedia.org/wiki/Batman,Arts & Entertainment,,,"Batman is a fictional superhero appearing in American comic books published by DC Comics. The character was created by artist Bob Kane and writer Bill Finger, and first appeared in Detective Comics #27. Originally named the ""Bat-Man"", the character is also referred to by such epithets as the Caped Crusader, the Dark Knight, and the World's Greatest Detective. Batman's secret identity is Bruce Wayne, an American billionaire, playboy, philanthropist, and owner of Wayne Enterprises. After witnessing the murder of his parents Thomas Wayne and Martha Wayne as a child, he swore vengeance against criminals, an oath tempered by a sense of justice. Wayne trains himself physically and intellectually and crafts a bat-inspired persona to fight crime. Batman operates in the fictional Gotham City, with assistance from various supporting characters, including his butler Alfred, police commissioner Gordon, and vigilante allies such as Robin. Unlike most superheroes, Batman does not possess any superpowers; rather, he relies on his genius intellect, physical prowess, martial arts abilities, detective skills, science and technology, vast wealth, intimidation, and indomitable will."
142 269,5819,/m/0cfpc,Loudspeaker,https://en.wikipedia.org/wiki/Loudspeaker,Computers & Electronics,,,"A loudspeaker is an electroacoustic transducer; which converts an electrical audio signal into a corresponding sound. The most widely used type of speaker in the 2010s is the dynamic speaker, invented in 1925 by Edward W. Kellogg and Chester W. Rice. The dynamic speaker operates on the same basic principle as a dynamic microphone, but in reverse, to produce sound from an electrical signal. When an alternating current electrical audio signal is applied to its voice coil, a coil of wire suspended in a circular gap between the poles of a permanent magnet, the coil is forced to move rapidly back and forth due to Faraday's law of induction, which causes a diaphragm attached to the coil to move back and forth, pushing on the air to create sound waves. Besides this most common method, there are several alternative technologies that can be used to convert an electrical signal into sound. The sound source must be amplified or strengthened with an audio power amplifier before the signal is sent to the speaker. Speakers are typically housed in a speaker enclosure or speaker cabinet which is often a rectangular or square box made of wood or sometimes plastic." 142 269,5819,/m/0cfpc,Loudspeaker,https://en.wikipedia.org/wiki/Loudspeaker,Computers & Electronics,,,"A loudspeaker is an electroacoustic transducer; which converts an electrical audio signal into a corresponding sound. The most widely used type of speaker in the 2010s is the dynamic speaker, invented in 1925 by Edward W. Kellogg and Chester W. Rice. The dynamic speaker operates on the same basic principle as a dynamic microphone, but in reverse, to produce sound from an electrical signal. When an alternating current electrical audio signal is applied to its voice coil, a coil of wire suspended in a circular gap between the poles of a permanent magnet, the coil is forced to move rapidly back and forth due to Faraday's law of induction, which causes a diaphragm attached to the coil to move back and forth, pushing on the air to create sound waves. Besides this most common method, there are several alternative technologies that can be used to convert an electrical signal into sound. The sound source must be amplified or strengthened with an audio power amplifier before the signal is sent to the speaker. Speakers are typically housed in a speaker enclosure or speaker cabinet which is often a rectangular or square box made of wood or sometimes plastic."
143 -271,5752,/m/0lwkh,Nike Inc,"https://en.wikipedia.org/wiki/Nike,_Inc.",Shopping,Sports,,"Nike, Inc. is an American multinational corporation that is engaged in the design, development, manufacturing and worldwide marketing and sales of footwear, apparel, equipment, accessories and services. The company is headquartered near Beaverton, Oregon, in the Portland metropolitan area. It is one of the world's largest suppliers of athletic shoes and apparel and a major manufacturer of sports equipment, with revenue in excess of US$24.1 billion in its fiscal year 2012. As of 2012, it employed more than 44,000 people worldwide. In 2014 the brand alone was valued at $19 billion, making it the most valuable brand among sports businesses. The company was founded on January 25, 1964, as Blue Ribbon Sports, by Bill Bowerman and Phil Knight, and officially became Nike, Inc. on May 30, 1971. The company takes its name from Nike, the Greek goddess of victory. Nike markets its products under its own brand, as well as Nike Golf, Nike Pro, Nike+, Air Jordan, Nike Blazers, Air Force 1, Nike Dunk, Air Max, Foamposite, Nike Skateboarding, and subsidiaries including Brand Jordan, Hurley International and Converse." 143 +271,5752,/m/0lwkh,Nike Inc,"https://en.wikipedia.org/wiki/Nike,_Inc.",Shopping,Sports,,"Nike Inc is an American multinational corporation that is engaged in the design, development, manufacturing and worldwide marketing and sales of footwear, apparel, equipment, accessories and services. The company is headquartered near Beaverton, Oregon, in the Portland metropolitan area. It is one of the world's largest suppliers of athletic shoes and apparel and a major manufacturer of sports equipment, with revenue in excess of US$24.1 billion in its fiscal year 2012. As of 2012, it employed more than 44,000 people worldwide. In 2014 the brand alone was valued at $19 billion, making it the most valuable brand among sports businesses. The company was founded on January 25, 1964, as Blue Ribbon Sports, by Bill Bowerman and Phil Knight, and officially became Nike, Inc. on May 30, 1971. The company takes its name from Nike, the Greek goddess of victory. Nike markets its products under its own brand, as well as Nike Golf, Nike Pro, Nike+, Air Jordan, Nike Blazers, Air Force 1, Nike Dunk, Air Max, Foamposite, Nike Skateboarding, and subsidiaries including Brand Jordan, Hurley International and Converse."
144 276,5735,/m/0bl0l,Garden,https://en.wikipedia.org/wiki/Garden,Home & Garden,,,"A garden is a planned space, usually outdoors, set aside for the display, cultivation, and enjoyment of plants and other forms of nature. The garden can incorporate both natural and man-made materials. The most common form today is known as a residential garden, but the term garden has traditionally been a more general one. Zoos, which display wild animals in simulated natural habitats, were formerly called zoological gardens. Western gardens are almost universally based on plants, with garden often signifying a shortened form of botanical garden. Some traditional types of eastern gardens, such as Zen gardens, use plants sparsely or not at all. Xeriscape gardens use local native plants that do not require irrigation or extensive use of other resources while still providing the benefits of a garden environment. Gardens may exhibit structural enhancements, sometimes called follies, including water features such as fountains, ponds, waterfalls or creeks, dry creek beds, statuary, arbors, trellises and more. Some gardens are for ornamental purposes only, while some gardens also produce food crops, sometimes in separate areas, or sometimes intermixed with the ornamental plants." 144 276,5735,/m/0bl0l,Garden,https://en.wikipedia.org/wiki/Garden,Home & Garden,,,"A garden is a planned space, usually outdoors, set aside for the display, cultivation, and enjoyment of plants and other forms of nature. The garden can incorporate both natural and man-made materials. The most common form today is known as a residential garden, but the term garden has traditionally been a more general one. Zoos, which display wild animals in simulated natural habitats, were formerly called zoological gardens. Western gardens are almost universally based on plants, with garden often signifying a shortened form of botanical garden. Some traditional types of eastern gardens, such as Zen gardens, use plants sparsely or not at all. Xeriscape gardens use local native plants that do not require irrigation or extensive use of other resources while still providing the benefits of a garden environment. Gardens may exhibit structural enhancements, sometimes called follies, including water features such as fountains, ponds, waterfalls or creeks, dry creek beds, statuary, arbors, trellises and more. Some gardens are for ornamental purposes only, while some gardens also produce food crops, sometimes in separate areas, or sometimes intermixed with the ornamental plants."
145 273,5696,/m/071p9,Ski,https://en.wikipedia.org/wiki/Ski,Sports,,,"A ski is a narrow strip of semi-rigid material worn underfoot to glide over snow. Substantially longer than wide and characteristically employed in pairs, skis are attached to ski boots with ski bindings, with either a free, lockable, or partially secured heel. For climbing slopes, ski skins can be attached at the base of the ski. Originally intended as an aid to travel over snow, they are now mainly used recreationally in the sport of skiing." 145 273,5696,/m/071p9,Ski,https://en.wikipedia.org/wiki/Ski,Sports,,,"A ski is a narrow strip of semi-rigid material worn underfoot to glide over snow. Substantially longer than wide and characteristically employed in pairs, skis are attached to ski boots with ski bindings, with either a free, lockable, or partially secured heel. For climbing slopes, ski skins can be attached at the base of the ski. Originally intended as an aid to travel over snow, they are now mainly used recreationally in the sport of skiing."
146 277,5696,/m/05y4t,Paint,https://en.wikipedia.org/wiki/Paint,Arts & Entertainment,Business & Industrial,Home & Garden,"Paint is any liquid, liquefiable, or mastic composition that, after application to a substrate in a thin layer, converts to a solid film. It is most commonly used to protect, color, or provide texture to objects. Paint can be made or purchased in many colors—and in many different types, such as watercolor, synthetic, etc. Paint is typically stored, sold, and applied as a liquid, but most types dry into a solid." 146 277,5696,/m/05y4t,Paint,https://en.wikipedia.org/wiki/Paint,Arts & Entertainment,Business & Industrial,Home & Garden,"Paint is any liquid, liquefiable, or mastic composition that, after application to a substrate in a thin layer, converts to a solid film. It is most commonly used to protect, color, or provide texture to objects. Paint can be made or purchased in many colors—and in many different types, such as watercolor, synthetic, etc. Paint is typically stored, sold, and applied as a liquid, but most types dry into a solid."
......
No preview for this file type