Yoonjunhyeon
This diff could not be displayed because it is too large.
......@@ -4,15 +4,15 @@ import pandas as pd
# Load files.
nltk.download('stopwords')
vocab = pd.read_csv('vocabulary.csv')
vocab = pd.read_csv('../vocabulary.csv')
# Lower corpus and Remove () from name.
vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z]', ' ')
vocab['Name'] = vocab['Name'].str.lower()
vocab['WikiDescription'] = vocab['WikiDescription'].str.lower().str.replace('[^a-zA-Z0-9]', ' ')
for i in range(vocab['Name'].__len__()):
name = vocab['Name'][i]
if isinstance(name, str) and name.find(" (") != -1:
vocab['Name'][i] = name[:name.find(" (")]
vocab['Name'] = vocab['Name'].str.lower()
# Combine separated names.(mobile phone -> mobile-phone)
for name in vocab['Name']:
......@@ -35,9 +35,9 @@ phraser = gensim.models.phrases.Phraser(phrases)
vocab_phrased = phraser[tokenlist]
# Vectorize tags.
w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, workers=2, min_count=1)
w2v.save('tags_word2vec.model')
w2v = gensim.models.word2vec.Word2Vec(sentences=tokenlist, min_count=1)
w2v.save('tag_vectors.model')
word_vectors = w2v.wv
vocabs = word_vectors.vocab.keys()
word_vectors_list = [word_vectors[v] for v in vocabs]
\ No newline at end of file
# word_vectors = w2v.wv
# vocabs = word_vectors.vocab.keys()
# word_vectors_list = [word_vectors[v] for v in vocabs]
......
This file is too large to display.
This file is too large to display.
vid_id,seg1,seg2,seg3,seg4,seg5
Ndaa,Sports car:0.202,Shower:0.200,Racing:0.200,Greeting card:0.200,Car:0.199
Dvaa,Tractor:0.363,Motorsport:0.323,Dance:0.145,Flour:0.092,Cappuccino:0.076
gEaa,Cooking:0.246,Food:0.243,Dish (food):0.224,Vegetable:0.167,:0.120
Pwaa,Dance:0.633,Wing Chun:0.095,Pencil:0.095,Eye shadow:0.095,Rubber band:0.083
jgaa,Concert:0.332,Motorsport:0.209,Motorcycling:0.194,Motorcycle:0.159,Bicycle:0.106
1Yaa,Concert:0.249,Dance:0.191,Tuna:0.188,Airplane:0.187,Association football:0.185
yVaa,Weight training:0.372,Sport utility vehicle:0.241,Barbell:0.147,Luxury yacht:0.123,Icing (food):0.117
BCaa,Mobile phone:0.397,Smartphone:0.395,Dance:0.090,Samsung Galaxy:0.073,Alpine skiing:0.046
38aa,Food:0.269,Gold:0.211,Raven (comics):0.208,Car:0.171,Marching band:0.141
AFaa,Car:0.386,Sports car:0.276,Motorsport:0.202,Volkswagen:0.078,Food:0.058
Ajaa,Concert:0.355,Soldier:0.289,Cello:0.146,Drum kit:0.114,Arena:0.096
2Faa,Orchestra:0.424,Disc jockey:0.288,Inflatable boat:0.115,Vegetarian cuisine:0.096,Concert:0.077
ujaa,Mobile phone:0.273,Smartphone:0.215,IPhone 5S:0.199,Acoustic guitar:0.170,Door:0.143
e2aa,Food:0.319,Cooking:0.313,Dish (food):0.285,Pikachu:0.048,Headset (audio):0.036
UTaa,Pet:0.376,Wig:0.172,Mobile phone:0.170,Easter egg:0.156,Food:0.126
12aa,Railroad car:0.342,Train:0.300,Muffler:0.142,Car:0.115,BMW 3 Series:0.101
Duaa,Jaguar Cars:0.379,MacBook Air:0.189,Ferrari F430:0.168,Coupon:0.137,Hang gliding:0.126
cpab,Car:0.408,Sports car:0.254,Motorsport:0.139,Sedan (automobile):0.139,Racing:0.060
4rab,Food:0.310,Cooking:0.286,Dish (food):0.265,Meat:0.100,Bee:0.040
Vtab,Choir:0.228,Handball:0.201,Hot air balloon:0.200,Fishing:0.199,Sedan (automobile):0.172
gkab,Pet:0.374,Mercedes-Benz C-Class:0.285,Cat:0.162,Belle (Disney):0.111,Electric car:0.068
RJab,Beer:0.317,Electric car:0.268,Acoustic guitar:0.169,Eye shadow:0.162,Vending machine:0.084
utab,Concert:0.303,Booster pack:0.279,Fishing:0.159,Culinary art:0.138,Hair coloring:0.121
Aeab,Samurai:0.278,Fishing:0.240,Association football:0.167,Chevrolet Corvette:0.167,Slam dunk:0.148
t4ab,Association football:0.520,Barbell:0.166,Teacher:0.105,Biceps curl:0.105,Parachute:0.104
53ab,Food:0.315,Cooking:0.269,Dish (food):0.257,Concealer:0.113,Bowling ball:0.046
kaab,Necktie:0.257,Primary school:0.209,Turbine:0.187,Guitar amplifier:0.184,Dance:0.163
Kdab,Cooking:0.306,Food:0.217,Train:0.175,Acoustic guitar:0.166,Tram:0.137
Smab,Association football:0.292,Airbus A320 family:0.210,Racing:0.167,Vampire:0.165,Robot:0.165
rAab,Association football:0.559,Pool (cue sports):0.170,Full moon:0.111,Fishing bait:0.091,Eye liner:0.070
U3ab,Bride:0.414,Mobile phone:0.267,Smartphone:0.133,Mercedes-Benz C-Class:0.106,Loudspeaker:0.080
mBab,Food:0.281,Cooking:0.261,Dish (food):0.260,:0.144,Vegetable:0.054
18ab,Cooking:0.243,Dish (food):0.241,Food:0.239,Vegetable:0.166,:0.112
NKab,Apartment:0.309,Piano:0.201,Association football:0.179,Table (furniture):0.176,Television set:0.134
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
# Number of video vectors buffered in memory before they are added to the
# video KeyedVectors in a single call.
BATCH_SIZE = 1000
def vectorization_video():
    """Print the literal text ``[0.1 0.2]`` to standard output.

    Takes no arguments and returns ``None``. It is not called anywhere in
    the visible part of this script.
    """
    sample_output = '[0.1 0.2]'
    print(sample_output)
if __name__ == '__main__':
    # Build one vector per video as the weighted sum of its tag vectors and
    # store the result in a KeyedVectors keyed by video id.
    #
    # Only the load that actually took effect is kept for each input: the
    # earlier loads from "esot3ria/..." were overwritten on the very next
    # line and would only have added a second way to fail on a missing file.
    tag_vectors = Word2Vec.load("tag_vectors.model").wv
    video_vectors = Word2Vec().wv  # Empty model

    # Load video recommendation tags.
    # Each row is read positionally: column 0 is the video id, columns 1-5
    # each hold one "tag:weight" string.
    video_tags = pd.read_csv('kaggle_solution_40k.csv')

    # Define batch variables.
    batch_video_ids = []
    batch_video_vectors = []
    error_tags = []  # unknown tags, in first-seen order, reported at the end

    # Plain tuples give true positional access, and the explicit counter keeps
    # batching independent of whatever index labels the DataFrame carries.
    rows = video_tags.itertuples(index=False, name=None)
    for count, row in enumerate(rows, start=1):
        video_id = row[0]
        # NOTE(review): 100 has to equal the dimensionality of tag_vectors
        # (and of the empty Word2Vec() above) for the sum and the later add
        # to work - presumably gensim's default size; confirm.
        video_vector = np.zeros(100)
        for segment_index in range(1, 6):
            # Split on the LAST colon only, so a tag name that itself
            # contains ":" still unpacks into exactly (tag, weight).
            tag, weight = row[segment_index].rsplit(":", 1)
            if tag in tag_vectors.vocab:
                video_vector = video_vector + (tag_vectors[tag] * float(weight))
            elif tag not in error_tags:
                # Unknown tag: contributes nothing to the video vector.
                error_tags.append(tag)

        batch_video_ids.append(video_id)
        batch_video_vectors.append(video_vector)

        # Add video vectors.
        if count % BATCH_SIZE == 0:
            video_vectors.add(batch_video_ids, batch_video_vectors)
            batch_video_ids = []
            batch_video_vectors = []
            print("Video vectors created: ", count)

    # Add rest of video vectors. Skipped when the row count is an exact
    # multiple of BATCH_SIZE, because the last batch was already flushed
    # and there is nothing left to add.
    if batch_video_ids:
        video_vectors.add(batch_video_ids, batch_video_vectors)

    print("error tags: ")
    print(error_tags)

    video_vectors.save("video_vectors.model")
# Usage
# video_vectors = Word2Vec().wv.load("video_vectors.model")
# video_vectors.most_similar("XwFj", topn=5)
......
This file is too large to display.
This diff could not be displayed because it is too large.
......@@ -347,8 +347,8 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size,
#=======================================
segment_id = str(segment_id.split(":")[0])
if segment_id not in segment_id_list:
segment_id_list.append(str(segment_id))
segment_classes.append("")
segment_id_list.append(str(segment_id))
segment_classes.append("")
index = segment_id_list.index(segment_id)
......@@ -377,7 +377,6 @@ def inference(reader, train_dir, data_pattern, out_file_location, batch_size,
demoninator = float(temp[0][1] + temp[1][1] + temp[2][1] + temp[3][1] + temp[4][1])
#for item in temp:
for itemIndex in range(0, top_k):
# 20.05.31 Esot3riA
# Normalize tag name
segment_tag = str(voca_dict[str(temp[itemIndex][0])])
normalized_tag = normalize_tag(segment_tag)
......
......@@ -140,7 +140,7 @@
267,5888,/m/09xzd,Chevrolet,https://en.wikipedia.org/wiki/Chevrolet,Autos & Vehicles,,,"Chevrolet, colloquially referred to as Chevy and formally the Chevrolet Division of General Motors Company, is an American automobile division of the American manufacturer General Motors. Louis Chevrolet and ousted General Motors founder William C. Durant started the company on November 3, 1911 as the Chevrolet Motor Car Company. Durant used the Chevrolet Motor Car Company to acquire a controlling stake in General Motors with a reverse merger occurring on May 2, 1918 and propelled himself back to the GM presidency. After Durant's second ousting in 1919, Alfred Sloan, with his maxim ""a car for every purse and purpose,"" would pick the Chevrolet brand to become the volume leader in the General Motors family, selling mainstream vehicles to compete with Henry Ford's Model T in 1919 and overtaking Ford as the best-selling car in the United States by 1929. Chevrolet-branded vehicles are sold in most automotive markets worldwide, with the notable exception of Oceania, where GM is represented by its Australian subsidiary, Holden."
268,5844,/m/01d5g,Batman,https://en.wikipedia.org/wiki/Batman,Arts & Entertainment,,,"Batman is a fictional superhero appearing in American comic books published by DC Comics. The character was created by artist Bob Kane and writer Bill Finger, and first appeared in Detective Comics #27. Originally named the ""Bat-Man"", the character is also referred to by such epithets as the Caped Crusader, the Dark Knight, and the World's Greatest Detective. Batman's secret identity is Bruce Wayne, an American billionaire, playboy, philanthropist, and owner of Wayne Enterprises. After witnessing the murder of his parents Thomas Wayne and Martha Wayne as a child, he swore vengeance against criminals, an oath tempered by a sense of justice. Wayne trains himself physically and intellectually and crafts a bat-inspired persona to fight crime. Batman operates in the fictional Gotham City, with assistance from various supporting characters, including his butler Alfred, police commissioner Gordon, and vigilante allies such as Robin. Unlike most superheroes, Batman does not possess any superpowers; rather, he relies on his genius intellect, physical prowess, martial arts abilities, detective skills, science and technology, vast wealth, intimidation, and indomitable will."
269,5819,/m/0cfpc,Loudspeaker,https://en.wikipedia.org/wiki/Loudspeaker,Computers & Electronics,,,"A loudspeaker is an electroacoustic transducer; which converts an electrical audio signal into a corresponding sound. The most widely used type of speaker in the 2010s is the dynamic speaker, invented in 1925 by Edward W. Kellogg and Chester W. Rice. The dynamic speaker operates on the same basic principle as a dynamic microphone, but in reverse, to produce sound from an electrical signal. When an alternating current electrical audio signal is applied to its voice coil, a coil of wire suspended in a circular gap between the poles of a permanent magnet, the coil is forced to move rapidly back and forth due to Faraday's law of induction, which causes a diaphragm attached to the coil to move back and forth, pushing on the air to create sound waves. Besides this most common method, there are several alternative technologies that can be used to convert an electrical signal into sound. The sound source must be amplified or strengthened with an audio power amplifier before the signal is sent to the speaker. Speakers are typically housed in a speaker enclosure or speaker cabinet which is often a rectangular or square box made of wood or sometimes plastic."
271,5752,/m/0lwkh,Nike Inc,"https://en.wikipedia.org/wiki/Nike,_Inc.",Shopping,Sports,,"Nike, Inc. is an American multinational corporation that is engaged in the design, development, manufacturing and worldwide marketing and sales of footwear, apparel, equipment, accessories and services. The company is headquartered near Beaverton, Oregon, in the Portland metropolitan area. It is one of the world's largest suppliers of athletic shoes and apparel and a major manufacturer of sports equipment, with revenue in excess of US$24.1 billion in its fiscal year 2012. As of 2012, it employed more than 44,000 people worldwide. In 2014 the brand alone was valued at $19 billion, making it the most valuable brand among sports businesses. The company was founded on January 25, 1964, as Blue Ribbon Sports, by Bill Bowerman and Phil Knight, and officially became Nike, Inc. on May 30, 1971. The company takes its name from Nike, the Greek goddess of victory. Nike markets its products under its own brand, as well as Nike Golf, Nike Pro, Nike+, Air Jordan, Nike Blazers, Air Force 1, Nike Dunk, Air Max, Foamposite, Nike Skateboarding, and subsidiaries including Brand Jordan, Hurley International and Converse."
271,5752,/m/0lwkh,Nike Inc,"https://en.wikipedia.org/wiki/Nike,_Inc.",Shopping,Sports,,"Nike Inc is an American multinational corporation that is engaged in the design, development, manufacturing and worldwide marketing and sales of footwear, apparel, equipment, accessories and services. The company is headquartered near Beaverton, Oregon, in the Portland metropolitan area. It is one of the world's largest suppliers of athletic shoes and apparel and a major manufacturer of sports equipment, with revenue in excess of US$24.1 billion in its fiscal year 2012. As of 2012, it employed more than 44,000 people worldwide. In 2014 the brand alone was valued at $19 billion, making it the most valuable brand among sports businesses. The company was founded on January 25, 1964, as Blue Ribbon Sports, by Bill Bowerman and Phil Knight, and officially became Nike, Inc. on May 30, 1971. The company takes its name from Nike, the Greek goddess of victory. Nike markets its products under its own brand, as well as Nike Golf, Nike Pro, Nike+, Air Jordan, Nike Blazers, Air Force 1, Nike Dunk, Air Max, Foamposite, Nike Skateboarding, and subsidiaries including Brand Jordan, Hurley International and Converse."
276,5735,/m/0bl0l,Garden,https://en.wikipedia.org/wiki/Garden,Home & Garden,,,"A garden is a planned space, usually outdoors, set aside for the display, cultivation, and enjoyment of plants and other forms of nature. The garden can incorporate both natural and man-made materials. The most common form today is known as a residential garden, but the term garden has traditionally been a more general one. Zoos, which display wild animals in simulated natural habitats, were formerly called zoological gardens. Western gardens are almost universally based on plants, with garden often signifying a shortened form of botanical garden. Some traditional types of eastern gardens, such as Zen gardens, use plants sparsely or not at all. Xeriscape gardens use local native plants that do not require irrigation or extensive use of other resources while still providing the benefits of a garden environment. Gardens may exhibit structural enhancements, sometimes called follies, including water features such as fountains, ponds, waterfalls or creeks, dry creek beds, statuary, arbors, trellises and more. Some gardens are for ornamental purposes only, while some gardens also produce food crops, sometimes in separate areas, or sometimes intermixed with the ornamental plants."
273,5696,/m/071p9,Ski,https://en.wikipedia.org/wiki/Ski,Sports,,,"A ski is a narrow strip of semi-rigid material worn underfoot to glide over snow. Substantially longer than wide and characteristically employed in pairs, skis are attached to ski boots with ski bindings, with either a free, lockable, or partially secured heel. For climbing slopes, ski skins can be attached at the base of the ski. Originally intended as an aid to travel over snow, they are now mainly used recreationally in the sport of skiing."
277,5696,/m/05y4t,Paint,https://en.wikipedia.org/wiki/Paint,Arts & Entertainment,Business & Industrial,Home & Garden,"Paint is any liquid, liquefiable, or mastic composition that, after application to a substrate in a thin layer, converts to a solid film. It is most commonly used to protect, color, or provide texture to objects. Paint can be made or purchased in many colors—and in many different types, such as watercolor, synthetic, etc. Paint is typically stored, sold, and applied as a liquid, but most types dry into a solid."
......
No preview for this file type