Showing 15 changed files with 89 additions and 28 deletions
... | @@ -11,7 +11,7 @@ def softmax(inputA): | ... | @@ -11,7 +11,7 @@ def softmax(inputA): |
11 | normalized_arr = [] | 11 | normalized_arr = [] |
12 | for x in inputA: | 12 | for x in inputA: |
13 | normalized_arr.append(float(x)) | 13 | normalized_arr.append(float(x)) |
14 | - #normalized_arr = normalize(normalized_arr) | 14 | + normalized_arr = normalize(normalized_arr) |
15 | 15 | ||
16 | for i in range(0, len(normalized_arr)): | 16 | for i in range(0, len(normalized_arr)): |
17 | 17 | ||
... | @@ -36,5 +36,5 @@ def normalize(arrs): | ... | @@ -36,5 +36,5 @@ def normalize(arrs): |
36 | minimum = min(normalized_arr) | 36 | minimum = min(normalized_arr) |
37 | denom = float(maximum) - float(minimum) | 37 | denom = float(maximum) - float(minimum) |
38 | for i in range(0,len(normalized_arr)): | 38 | for i in range(0,len(normalized_arr)): |
39 | - normalized_arr[i] = (normalized_arr[i] - minimum)/ denom | 39 | + normalized_arr[i] = ((normalized_arr[i] - minimum)/ denom) * 2 - 1 |
40 | return normalized_arr | 40 | return normalized_arr |
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
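This hunk re-enables min-max normalization of the softmax input and rescales the result from [0, 1] to [-1, 1]. A minimal standalone sketch of that rescaling, with a hypothetical function name since only this fragment of the module is shown:

def normalize_signed(values):
    # Min-max rescale to [-1, 1], matching line 39 in the hunk above.
    values = [float(v) for v in values]
    minimum, maximum = min(values), max(values)
    denom = maximum - minimum
    if denom == 0:
        # Guard the diff does not have: an all-equal input would divide by zero.
        return [0.0 for _ in values]
    return [((v - minimum) / denom) * 2 - 1 for v in values]

# Example: normalize_signed([1, 2, 3]) -> [-1.0, 0.0, 1.0]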
... | @@ -10,11 +10,11 @@ import video_util as videoutil | ... | @@ -10,11 +10,11 @@ import video_util as videoutil |
10 | 10 | ||
11 | # Define file paths. | 11 | # Define file paths. |
12 | MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/model/inference_model/segment_inference_model" | 12 | MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/model/inference_model/segment_inference_model" |
13 | -VOCAB_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/prevs/vocabulary.csv" | 13 | +VOCAB_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/vocabulary.csv" |
14 | VIDEO_TAGS_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/segment_tags.csv" | 14 | VIDEO_TAGS_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/segment_tags.csv" |
15 | TAG_VECTOR_MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/tag_vectors.model" | 15 | TAG_VECTOR_MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/tag_vectors.model" |
16 | VIDEO_VECTOR_MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/video_vectors.model" | 16 | VIDEO_VECTOR_MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/video_vectors.model" |
17 | -SEGMENT_LABEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m//prevs/segment_label_ids.csv" | 17 | +SEGMENT_LABEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/segment_label_ids.csv" |
18 | 18 | ||
19 | # Define parameters. | 19 | # Define parameters. |
20 | TAG_TOP_K = 5 | 20 | TAG_TOP_K = 5 | ... | ... |
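Both edited constants now point at the yt8m directory itself rather than the old prevs/ subdirectory (the doubled slash in SEGMENT_LABEL_PATH is also gone). As a small optional sanity check one might run before inference, the paths below are copied from the diff and will differ on other machines:

import os

CSV_PATHS = [
    "/mnt/e/khuhub/2015104192/web/backend/yt8m/vocabulary.csv",
    "/mnt/e/khuhub/2015104192/web/backend/yt8m/segment_label_ids.csv",
    "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/segment_tags.csv",
]

for path in CSV_PATHS:
    # Flag any file the inference script would fail to open.
    print("OK" if os.path.isfile(path) else "MISSING", path)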
web/backend/yt8m/esot3ria/segment_tags4.csv
0 → 100644
This diff could not be displayed because it is too large.
... | @@ -2,7 +2,7 @@ import nltk | ... | @@ -2,7 +2,7 @@ import nltk |
2 | import gensim | 2 | import gensim |
3 | import pandas as pd | 3 | import pandas as pd |
4 | from gensim.models import Word2Vec | 4 | from gensim.models import Word2Vec |
5 | - | 5 | +import sys |
6 | def normalize_tag(tag): | 6 | def normalize_tag(tag): |
7 | if isinstance(tag, str): | 7 | if isinstance(tag, str): |
8 | new_tag = tag.lower().replace('[^a-zA-Z]', ' ') | 8 | new_tag = tag.lower().replace('[^a-zA-Z]', ' ') |
... | @@ -13,7 +13,7 @@ def normalize_tag(tag): | ... | @@ -13,7 +13,7 @@ def normalize_tag(tag): |
13 | return new_tag | 13 | return new_tag |
14 | else: | 14 | else: |
15 | return tag | 15 | return tag |
16 | - | 16 | +''' |
17 | # Load files. | 17 | # Load files. |
18 | nltk.download('stopwords') | 18 | nltk.download('stopwords') |
19 | vocab = pd.read_csv('E:/khuhub/2015104192/web/backend/yt8m/esot3ria/vocabulary.csv',encoding='utf-8') | 19 | vocab = pd.read_csv('E:/khuhub/2015104192/web/backend/yt8m/esot3ria/vocabulary.csv',encoding='utf-8') |
... | @@ -49,11 +49,14 @@ vocab_phrased = phraser[tokenlist] | ... | @@ -49,11 +49,14 @@ vocab_phrased = phraser[tokenlist] |
49 | 49 | ||
50 | # Vectorize tags. | 50 | # Vectorize tags. |
51 | w2v = gensim.models.word2vec.Word2Vec(sentences=vocab_phrased, min_count=1) | 51 | w2v = gensim.models.word2vec.Word2Vec(sentences=vocab_phrased, min_count=1) |
52 | -w2v.save('E:/khuhub/2015104192/web/backend/yt8m/esot3ria/tag_vectors3.model') | 52 | +w2v.save('E:/khuhub/2015104192/web/backend/yt8m/esot3ria/tag_vectors.model') |
53 | - | 53 | +''' |
54 | - | 54 | + |
55 | -tag_vectors = Word2Vec.load("./tag_vectors3.model").wv | 55 | +tag_vectors = Word2Vec.load("./tag_vectors.model").wv |
56 | -print(tag_vectors['concert']) | 56 | +print(tag_vectors.similarity('koi','koi')) |
57 | +all_sims = tag_vectors.most_similar('koi', topn=sys.maxsize) | ||
58 | +last_10 = list(reversed(all_sims[-10:])) | ||
59 | +print(last_10) | ||
57 | # word_vectors = w2v.wv | 60 | # word_vectors = w2v.wv |
58 | # vocabs = word_vectors.vocab.keys() | 61 | # vocabs = word_vectors.vocab.keys() |
59 | # word_vectors_list = [word_vectors[v] for v in vocabs] | 62 | # word_vectors_list = [word_vectors[v] for v in vocabs] |
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
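With training commented out (the block is now wrapped in a triple-quoted string), this script only loads the saved tag vectors and prints the ten tags least similar to 'koi'. A standalone sketch of that inspection, assuming a gensim 3.x model saved at ./tag_vectors.model as in the diff:

import sys
from gensim.models import Word2Vec

# Load the previously trained tag vectors (same relative path as the diff).
tag_vectors = Word2Vec.load("./tag_vectors.model").wv

probe = "koi"  # probe tag taken from the diff; any in-vocabulary tag works
print(tag_vectors.similarity(probe, probe))  # self-similarity, expected to be ~1.0

# most_similar with a huge topn returns the whole vocabulary ranked by
# similarity, so the tail of the list holds the least similar tags.
all_sims = tag_vectors.most_similar(probe, topn=sys.maxsize)
print(list(reversed(all_sims[-10:])))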
... | @@ -22,15 +22,16 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): | ... | @@ -22,15 +22,16 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): |
22 | maxSimilarity = -1 | 22 | maxSimilarity = -1 |
23 | 23 | ||
24 | kernel = [np.zeros(100) for i in range(0,5)] | 24 | kernel = [np.zeros(100) for i in range(0,5)] |
25 | - | 25 | + tagKernel = [] |
26 | #First, build the single video vector of the input video to compare against | 26 | #First, build the single video vector of the input video to compare against |
27 | video_vector = np.zeros(100) | 27 | video_vector = np.zeros(100) |
28 | tag_preds =[] | 28 | tag_preds =[] |
29 | - | 29 | + videoTagList = [] |
30 | for (tag, weight) in tags: | 30 | for (tag, weight) in tags: |
31 | tag_preds.append(weight) | 31 | tag_preds.append(weight) |
32 | + videoTagList.append(tag) | ||
32 | #print("tag preds = ",tag_preds) | 33 | #print("tag preds = ",tag_preds) |
33 | - tag_preds = ac.softmax(tag_preds) | 34 | + #tag_preds = ac.softmax(tag_preds) |
34 | for (tag, weight),pred in zip(tags,tag_preds): | 35 | for (tag, weight),pred in zip(tags,tag_preds): |
35 | print(tag,pred) | 36 | print(tag,pred) |
36 | if tag in tag_vectors.vocab: | 37 | if tag in tag_vectors.vocab: |
... | @@ -47,7 +48,7 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): | ... | @@ -47,7 +48,7 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): |
47 | for segment in segments: | 48 | for segment in segments: |
48 | segment_vector = np.zeros(100) | 49 | segment_vector = np.zeros(100) |
49 | segTags = [segment[i] for i in range(0,len(segment),2)] | 50 | segTags = [segment[i] for i in range(0,len(segment),2)] |
50 | - segProbs = ac.softmax([float(segment[i]) for i in range(1,len(segment),2)]) | 51 | + segProbs = [float(segment[i]) for i in range(1,len(segment),2)]#ac.softmax([float(segment[i]) for i in range(1,len(segment),2)]) |
51 | 52 | ||
52 | #print(segProbs) | 53 | #print(segProbs) |
53 | for tag, weight in zip(segTags,segProbs): | 54 | for tag, weight in zip(segTags,segProbs): |
... | @@ -80,11 +81,11 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): | ... | @@ -80,11 +81,11 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): |
80 | segment = segments[maxSimilarSegment - math.floor(len(kernel)/2) + k] | 81 | segment = segments[maxSimilarSegment - math.floor(len(kernel)/2) + k] |
81 | segment_vector = np.zeros(100) | 82 | segment_vector = np.zeros(100) |
82 | segTags = [segment[i] for i in range(0,len(segment),2)] | 83 | segTags = [segment[i] for i in range(0,len(segment),2)] |
83 | - | 84 | + tagKernel.append(segTags) |
84 | segProbs = ac.softmax([float(segment[i]) for i in range(1,len(segment),2)]) | 85 | segProbs = ac.softmax([float(segment[i]) for i in range(1,len(segment),2)]) |
85 | print(segTags) | 86 | print(segTags) |
86 | print(segProbs) | 87 | print(segProbs) |
87 | - normalize(segProbs) | 88 | + #normalize(segProbs) |
88 | for (tag, weight) in zip(segTags,segProbs): | 89 | for (tag, weight) in zip(segTags,segProbs): |
89 | if tag in tag_vectors.vocab: | 90 | if tag in tag_vectors.vocab: |
90 | #float(weight) | 91 | #float(weight) |
... | @@ -96,11 +97,22 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): | ... | @@ -96,11 +97,22 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): |
96 | error_tags.append(tag) | 97 | error_tags.append(tag) |
97 | 98 | ||
98 | kernel[k] = segment_vector | 99 | kernel[k] = segment_vector |
99 | - | 100 | + ''' |
101 | + if(k < int(len(kernel)/2)): | ||
102 | + kernel[k] = kernel[k] * k | ||
103 | + elif(k > int(len(kernel)/2)): | ||
104 | + kernel[k] = kernel[k] * (len(kernel) - k) | ||
105 | + else: | ||
106 | + kernel[k] = kernel[k] * (len(kernel)/2 + 1) | ||
107 | + ''' | ||
108 | + print("TAG kernel") | ||
109 | + #tagKernel = tagKernel[1:5] | ||
110 | + print(tagKernel) | ||
100 | #Pick out the similar vectors here | 111 | #Pick out the similar vectors here |
101 | #Since a video cannot be fetched by video id yet, return the video id, its tags, and the probabilities instead | 112 | #Since a video cannot be fetched by video id yet, return the video id, its tags, and the probabilities instead |
102 | video_tags = pd.read_csv('/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/segment_tags.csv', encoding='utf8',error_bad_lines=False) | 113 | video_tags = pd.read_csv('/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/segment_tags.csv', encoding='utf8',error_bad_lines=False) |
103 | videoVectorList = [] | 114 | videoVectorList = [] |
115 | + segmentTagList = [] | ||
104 | prevVideoId = "" | 116 | prevVideoId = "" |
105 | minimunVideoIds = [["",-1.0] for i in range(0,top_k)] | 117 | minimunVideoIds = [["",-1.0] for i in range(0,top_k)] |
106 | 118 | ||
... | @@ -117,7 +129,10 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): | ... | @@ -117,7 +129,10 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): |
117 | #print("====") | 129 | #print("====") |
118 | #for a in kernel: | 130 | #for a in kernel: |
119 | # print(len(kernel),norm(a)) | 131 | # print(len(kernel),norm(a)) |
120 | - maxima, idx = convolution(videoVectorList,kernel,prevVideoId) | 132 | + convmaxima, convidx = convolution(videoVectorList,kernel,prevVideoId) |
133 | + maxima, idx = differenceMax(segmentTagList,tagKernel,tag_vectors,videoTagList) | ||
134 | + #maxima = maxima + convmaxima | ||
135 | + | ||
121 | #print(video_id,maxima) | 136 | #print(video_id,maxima) |
122 | localMinima = 100 | 137 | localMinima = 100 |
123 | localMinimaIndex = -1 | 138 | localMinimaIndex = -1 |
... | @@ -127,16 +142,21 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): | ... | @@ -127,16 +142,21 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, top_k): |
127 | localMinimaIndex = seg | 142 | localMinimaIndex = seg |
128 | #print(maxima) | 143 | #print(maxima) |
129 | if localMinima < maxima: | 144 | if localMinima < maxima: |
130 | - print(prevVideoId[0:4] + "_" + str(idx),maxima) | 145 | + print(prevVideoId[0:4] + "_" + str(idx),localMinimaIndex,maxima,convmaxima) |
131 | minimunVideoIds[localMinimaIndex] = [prevVideoId[0:4] + "_" + str(idx),maxima] | 146 | minimunVideoIds[localMinimaIndex] = [prevVideoId[0:4] + "_" + str(idx),maxima] |
132 | 147 | ||
133 | 148 | ||
134 | videoVectorList.clear() | 149 | videoVectorList.clear() |
150 | + segmentTagList.clear() | ||
135 | prevVideoId = video_id | 151 | prevVideoId = video_id |
136 | 152 | ||
137 | if video_id == "finished": | 153 | if video_id == "finished": |
138 | break | 154 | break |
139 | videoVectorList.append(video_vectors[video_id]) | 155 | videoVectorList.append(video_vectors[video_id]) |
156 | + tagList = [] | ||
157 | + for i in range(1,top_k+1): | ||
158 | + tagList.append(row[i].split(":")[0]) | ||
159 | + segmentTagList.append(tagList) | ||
140 | 160 | ||
141 | similar_ids = [] | 161 | similar_ids = [] |
142 | for i in range(0,top_k): | 162 | for i in range(0,top_k): |
... | @@ -156,14 +176,13 @@ def cos_sim(A, B): | ... | @@ -156,14 +176,13 @@ def cos_sim(A, B): |
156 | else: | 176 | else: |
157 | return dot(A, B)/(norm(A)*norm(B)) | 177 | return dot(A, B)/(norm(A)*norm(B)) |
158 | 178 | ||
159 | -def shiftKernel(kernel, newValue): | 179 | +def sub_vec_size(A,B): |
160 | - for i in range(0, len(kernel) - 1): | 180 | + dir = A-B |
161 | - kernel[i] = kernel[i+1] | 181 | + return norm(dir) |
162 | - kernel[len(kernel) - 1] = newValue | ||
163 | 182 | ||
164 | def convolution(arrs, _kernel,vidId): | 183 | def convolution(arrs, _kernel,vidId): |
165 | s = len(_kernel) | 184 | s = len(_kernel) |
166 | - | 185 | + l = len(arrs) |
167 | result = [] | 186 | result = [] |
168 | 187 | ||
169 | midpos = math.floor(s/2) | 188 | midpos = math.floor(s/2) |
... | @@ -171,14 +190,50 @@ def convolution(arrs, _kernel,vidId): | ... | @@ -171,14 +190,50 @@ def convolution(arrs, _kernel,vidId): |
171 | arrs.insert(0,np.zeros(100)) | 190 | arrs.insert(0,np.zeros(100)) |
172 | arrs.append(np.zeros(100)) | 191 | arrs.append(np.zeros(100)) |
173 | 192 | ||
193 | + total = 0 | ||
174 | for j in range(midpos,len(arrs) - midpos): | 194 | for j in range(midpos,len(arrs) - midpos): |
175 | convResult = 0 | 195 | convResult = 0 |
176 | for i in range(0, s): | 196 | for i in range(0, s): |
177 | - convResult = convResult + cos_sim(arrs[j - midpos + i],_kernel[i]) | 197 | + if(i == int(len(_kernel)/2)): |
198 | + convResult = convResult - sub_vec_size(arrs[j - midpos + i],_kernel[i]) + dot(arrs[j - midpos + i],_kernel[i]) | ||
178 | result.append(convResult) | 199 | result.append(convResult) |
200 | + total = total + convResult | ||
179 | maxVal = max(result) | 201 | maxVal = max(result) |
180 | index = result.index(maxVal) | 202 | index = result.index(maxVal) |
181 | - return maxVal,index | 203 | + |
204 | + return total/l,index | ||
205 | + | ||
206 | +def differenceMax(arrs, _kernel, w2v, videoTaglist): | ||
207 | + s = len(_kernel) | ||
208 | + | ||
209 | + result = [] | ||
210 | + | ||
211 | + midpos = math.floor(s/2) | ||
212 | + for i in range(0,midpos): | ||
213 | + arrs.insert(0,arrs[0]) | ||
214 | + arrs.append(arrs[len(arrs)-1]) | ||
215 | + | ||
216 | + prevIndex = 0 | ||
217 | + prevMax = -100 | ||
218 | + for j in range(midpos,len(arrs) - midpos): | ||
219 | + convResult = 0 | ||
220 | + processed_vocabNum = 1 | ||
221 | + for i in range(0, s): | ||
222 | + #if i == midpos: | ||
223 | + if(_kernel[i][0] not in arrs[j - midpos + i][0:2]):# and ((videoTaglist[0] not in arrs[j - midpos + i][0:2])) and ((videoTaglist[1] not in arrs[j - midpos + i][0:5])): | ||
224 | + continue | ||
225 | + for ind in range(0,5): | ||
226 | + if(arrs[j - midpos + i][ind] in w2v.vocab) and (_kernel[i][ind] in w2v.vocab): | ||
227 | + convResult = convResult + (w2v.similarity(arrs[j - midpos + i][ind],_kernel[i][ind])) | ||
228 | + processed_vocabNum = processed_vocabNum + 1 | ||
229 | + #convResult = convResult / processed_vocabNum | ||
230 | + if prevMax < convResult: | ||
231 | + prevMax = convResult | ||
232 | + prevIndex = j - midpos | ||
233 | + result.append(convResult) | ||
234 | + #maxVal = max(result) | ||
235 | + #index = result.index(maxVal) | ||
236 | + return prevMax,prevIndex | ||
182 | 237 | ||
183 | def normalize(arrs): | 238 | def normalize(arrs): |
184 | maximum = max(arrs) | 239 | maximum = max(arrs) | ... | ... |
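Two scoring changes dominate this file: convolution now averages a score that mixes a vector-difference penalty with a dot product at the kernel's center position, and the new differenceMax slides the kernel's per-segment tag lists over a candidate video's tag lists and sums word2vec similarities of aligned tags. A simplified sketch of that second, tag-based window search, using hypothetical names and a plain gensim KeyedVectors interface rather than the repository's exact code:

import math

def best_tag_window(segment_tags, kernel_tags, keyed_vectors):
    # segment_tags : per-segment tag lists of a candidate video
    # kernel_tags  : per-segment tag lists around the query's best segment
    # keyed_vectors: gensim 3.x KeyedVectors used for tag-to-tag similarity
    half = math.floor(len(kernel_tags) / 2)
    # Repeat the boundary segments so every position can be centered,
    # mirroring the padding added in the diff above.
    padded = [segment_tags[0]] * half + segment_tags + [segment_tags[-1]] * half

    best_score, best_index = float("-inf"), 0
    for center in range(half, len(padded) - half):
        score = 0.0
        for offset, kernel_segment in enumerate(kernel_tags):
            candidate_segment = padded[center - half + offset]
            # Skip positions whose top tags do not overlap the kernel's top tag,
            # as the `not in arrs[...][0:2]` test does in the diff.
            if kernel_segment[0] not in candidate_segment[:2]:
                continue
            for a, b in zip(candidate_segment, kernel_segment):
                if a in keyed_vectors.vocab and b in keyed_vectors.vocab:
                    score += keyed_vectors.similarity(a, b)
        if score > best_score:
            best_score, best_index = score, center - half
    return best_score, best_index

The zip over aligned tag positions stands in for the fixed range(0, 5) loop in the diff, and the returned index is relative to the unpadded segment list, matching how the caller builds the recommended id.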
... | @@ -29,7 +29,10 @@ def getVideoInfo(vid_id, video_tags_path, top_k): | ... | @@ -29,7 +29,10 @@ def getVideoInfo(vid_id, video_tags_path, top_k): |
29 | video_tag_tuple = video_tags_info["segment" + str(i)].values[0]# ex: "mobile-phone:0.361" | 29 | video_tag_tuple = video_tags_info["segment" + str(i)].values[0]# ex: "mobile-phone:0.361" |
30 | video_tags.append(video_tag_tuple.split(":")[0]) | 30 | video_tags.append(video_tag_tuple.split(":")[0]) |
31 | if video_url == "": | 31 | if video_url == "": |
32 | - video_url = video_url + ' ' + video_tags | 32 | + for x in video_tags: |
33 | + video_url = video_url + ' ' + x | ||
34 | + | ||
35 | + video_url = video_url + '\nThe similar point is : ' + str(float(vid_id[5:]) * 5) | ||
33 | 36 | ||
34 | return { | 37 | return { |
35 | "video_url": video_url, | 38 | "video_url": video_url, | ... | ... |
web/backend/yt8m/segment_label_ids - 복사본.csv
0 → 100644
This diff is collapsed. Click to expand it.