윤영빈

final

...@@ -31,10 +31,13 @@ def normalize(arrs): ...@@ -31,10 +31,13 @@ def normalize(arrs):
31 normalized_arr = [] 31 normalized_arr = []
32 for x in arrs: 32 for x in arrs:
33 normalized_arr.append(float(x)) 33 normalized_arr.append(float(x))
34 - 34 +
35 - maximum = max(normalized_arr) 35 + if len(normalized_arr) > 0:
36 - minimum = min(normalized_arr) 36 + maximum = max(normalized_arr)
37 - denom = float(maximum) - float(minimum) 37 + minimum = min(normalized_arr)
38 - for i in range(0,len(normalized_arr)): 38 + denom = float(maximum) - float(minimum)
39 - normalized_arr[i] = ((normalized_arr[i] - minimum)/ denom) * 2 - 1 39 + if denom == 0:
40 + denom = 1
41 + for i in range(0,len(normalized_arr)):
42 + normalized_arr[i] = ((normalized_arr[i] - minimum)/ denom) * 2 - 1
40 return normalized_arr 43 return normalized_arr
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -230,7 +230,7 @@ def inference_pb(file_path, threshold): ...@@ -230,7 +230,7 @@ def inference_pb(file_path, threshold):
230 230
231 # 5. Create recommend videos info, Combine results. 231 # 5. Create recommend videos info, Combine results.
232 recommend_video_ids = recommender.recommend_videos(tag_result, inputVideoTagResults, TAG_VECTOR_MODEL_PATH, 232 recommend_video_ids = recommender.recommend_videos(tag_result, inputVideoTagResults, TAG_VECTOR_MODEL_PATH,
233 - VIDEO_VECTOR_MODEL_PATH, VIDEO_ID_MODEL_PATH, VIDEO_TOP_K) 233 + VIDEO_VECTOR_MODEL_PATH, VIDEO_ID_MODEL_PATH, VIDEO_TOP_K,True)
234 video_result = [videoutil.getVideoInfo(ids, VIDEO_TAGS_PATH, TAG_TOP_K,False) for ids in recommend_video_ids] 234 video_result = [videoutil.getVideoInfo(ids, VIDEO_TAGS_PATH, TAG_TOP_K,False) for ids in recommend_video_ids]
235 235
236 inference_result = { 236 inference_result = {
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
...@@ -6,7 +6,7 @@ import pandas as pd ...@@ -6,7 +6,7 @@ import pandas as pd
6 import math 6 import math
7 import activation as ac 7 import activation as ac
8 8
9 -def recommend_videos(tags, segments, tag_model_path, video_model_path, video_id_model, top_k): 9 +def recommend_videos(tags, segments, tag_model_path, video_model_path, video_id_model, top_k, isPerSegment = True):
10 # 이 함수에서 모든걸 다 함 10 # 이 함수에서 모든걸 다 함
11 # tags는 label val 로 묶인 문자열 리스트임 11 # tags는 label val 로 묶인 문자열 리스트임
12 # tags의 길이는 segment의 길이 12 # tags의 길이는 segment의 길이
...@@ -21,9 +21,10 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, video_id_ ...@@ -21,9 +21,10 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, video_id_
21 error_tags = [] 21 error_tags = []
22 maxSimilarSegment = 0 22 maxSimilarSegment = 0
23 maxSimilarity = -1 23 maxSimilarity = -1
24 - print('prev len',len(segments)) 24 +
25 - kernel = [np.zeros(100) for i in range(0,9)] 25 + kernel = [np.zeros(100) for i in range(0,5)]
26 tagKernel = [] 26 tagKernel = []
27 + similar_ids = []
27 #우선은 비교를 뜰 입력 영상의 단일 비디오벡터를 구함 28 #우선은 비교를 뜰 입력 영상의 단일 비디오벡터를 구함
28 video_vector = np.zeros(100) 29 video_vector = np.zeros(100)
29 tag_preds =[] 30 tag_preds =[]
...@@ -31,133 +32,132 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, video_id_ ...@@ -31,133 +32,132 @@ def recommend_videos(tags, segments, tag_model_path, video_model_path, video_id_
31 for (tag, weight) in tags: 32 for (tag, weight) in tags:
32 tag_preds.append(weight) 33 tag_preds.append(weight)
33 videoTagList.append(tag) 34 videoTagList.append(tag)
35 + ac.softmax(tag_preds)
34 for (tag, weight),pred in zip(tags,tag_preds): 36 for (tag, weight),pred in zip(tags,tag_preds):
35 print(tag,pred) 37 print(tag,pred)
36 if tag in tag_vectors.vocab: 38 if tag in tag_vectors.vocab:
37 video_vector = video_vector + (tag_vectors[tag] * float(pred)) 39 video_vector = video_vector + (tag_vectors[tag] * float(pred))
38 else: 40 else:
39 - print("unknown",tag) 41 + #print("unknown",tag)
40 # Pass if tag is unknown 42 # Pass if tag is unknown
41 if tag not in error_tags: 43 if tag not in error_tags:
42 error_tags.append(tag) 44 error_tags.append(tag)
43 45
44 - #각 세그먼트마다 비교를 떠서 인덱스를 저장 46 + if(isPerSegment == True):
45 - midpos = math.floor(len(kernel)/2) 47 + #각 세그먼트마다 비교를 떠서 인덱스를 저장
46 - for i in range(0,midpos): 48 + midpos = math.floor(len(kernel)/2)
47 - segments.insert(0,segments[0]) 49 + for i in range(0,midpos):
48 - segments.append(segments[len(segments)-1]) 50 + segments.insert(0,segments[0])
49 - 51 + segments.append(segments[len(segments)-1])
50 - currentIndex = midpos 52 +
51 - for si in range(midpos,len(segments) - midpos - 1): 53 + currentIndex = midpos
52 - similarity = 0 54 + for si in range(midpos,len(segments) - midpos - 1):
53 - for segi in range(-1,2): 55 + similarity = 0
54 - segment = segments[si + segi] 56 + for segi in range(-1,2):
57 + segment = segments[si + segi]
58 + segment_vector = np.zeros(100)
59 + segTags = [segment[i] for i in range(0,len(segment),2)]
60 + segProbs = [float(segment[i]) for i in range(1,len(segment),2)]#ac.softmax([float(segment[i]) for i in range(1,len(segment),2)])
61 +
62 + for tag, weight in zip(segTags,segProbs):
63 + if tag in tag_vectors.vocab:
64 + segment_vector = segment_vector + (tag_vectors[tag] * float(weight))
65 + else:
66 + # Pass if tag is unknown
67 + if tag not in error_tags:
68 + error_tags.append(tag)
69 +
70 + #비디오 벡터와 세그먼트 벡터 비교
71 + #similarity = similarity + cos_sim(video_vector, segment_vector) #cos_sim(video_vector, segment_vector)#
72 +
73 + for currentSegmentTag, videoVectorTag,videoVectorTagPred in zip(segTags,videoTagList,tag_preds):
74 + if(currentSegmentTag in tag_vectors.vocab) and (videoVectorTag in tag_vectors.vocab):
75 + prob = float(videoVectorTagPred)
76 + if videoVectorTag not in segTags:
77 + prob = 0
78 + similarity = similarity + (tag_vectors.similarity(currentSegmentTag,videoVectorTag) * prob)
79 +
80 +
81 + if similarity >= maxSimilarity:
82 + maxSimilarSegment = currentIndex
83 + maxSimilarity = similarity
84 + if maxSimilarSegment < int(len(kernel)/2):
85 + maxSimilarSegment = int(len(kernel)/2)
86 + elif maxSimilarSegment == len(segments) - int(len(kernel)/2):
87 + maxSimilarSegment = len(segments) - int(len(kernel)/2) - 1
88 + #세그먼트 인덱스 증가
89 + currentIndex = currentIndex + 1
90 + #######################################print('maxSimilarSegment',maxSimilarSegment,'len',len(segments))
91 + #커널 생성
92 + for k in range (0,len(kernel)):
93 + segment = segments[maxSimilarSegment - math.floor(len(kernel)/2) + k]
55 segment_vector = np.zeros(100) 94 segment_vector = np.zeros(100)
56 segTags = [segment[i] for i in range(0,len(segment),2)] 95 segTags = [segment[i] for i in range(0,len(segment),2)]
57 - segProbs = [float(segment[i]) for i in range(1,len(segment),2)]#ac.softmax([float(segment[i]) for i in range(1,len(segment),2)]) 96 + tagKernel.append(segTags)
58 - 97 + segProbs = ac.softmax([float(segment[i]) for i in range(1,len(segment),2)])
59 - for tag, weight in zip(segTags,segProbs): 98 + #print(segTags)
99 + #print(segProbs)
100 + for (tag, weight) in zip(segTags,segProbs):
60 if tag in tag_vectors.vocab: 101 if tag in tag_vectors.vocab:
102 + #float(weight)
61 segment_vector = segment_vector + (tag_vectors[tag] * float(weight)) 103 segment_vector = segment_vector + (tag_vectors[tag] * float(weight))
62 else: 104 else:
105 + #print("unknown",tag)
63 # Pass if tag is unknown 106 # Pass if tag is unknown
64 if tag not in error_tags: 107 if tag not in error_tags:
65 error_tags.append(tag) 108 error_tags.append(tag)
66 - 109 +
67 - #비디오 벡터와 세그먼트 벡터 비교 110 + kernel[k] = segment_vector
68 - #similarity = similarity + cos_sim(video_vector, segment_vector) #cos_sim(video_vector, segment_vector)#
69 -
70 - for currentSegmentTag, videoVectorTag,videoVectorTagPred in zip(segTags,videoTagList,tag_preds):
71 - if(currentSegmentTag in tag_vectors.vocab) and (videoVectorTag in tag_vectors.vocab):
72 - prob = float(videoVectorTagPred)
73 - if videoVectorTag not in segTags:
74 - prob = 0
75 - similarity = similarity + (tag_vectors.similarity(currentSegmentTag,videoVectorTag) * prob)
76 -
77 -
78 - if similarity >= maxSimilarity:
79 - maxSimilarSegment = currentIndex
80 - maxSimilarity = similarity
81 - if maxSimilarSegment < int(len(kernel)/2):
82 - maxSimilarSegment = int(len(kernel)/2)
83 - elif maxSimilarSegment == len(segments) - int(len(kernel)/2):
84 - maxSimilarSegment = len(segments) - int(len(kernel)/2) - 1
85 - #세그먼트 인덱스 증가
86 - currentIndex = currentIndex + 1
87 - print('maxSimilarSegment',maxSimilarSegment,'len',len(segments))
88 - #커널 생성
89 - for k in range (0,len(kernel)):
90 - segment = segments[maxSimilarSegment - math.floor(len(kernel)/2) + k]
91 - segment_vector = np.zeros(100)
92 - segTags = [segment[i] for i in range(0,len(segment),2)]
93 - tagKernel.append(segTags)
94 - segProbs = ac.softmax([float(segment[i]) for i in range(1,len(segment),2)])
95 - print(segTags)
96 - print(segProbs)
97 - #normalize(segProbs)
98 - for (tag, weight) in zip(segTags,segProbs):
99 - if tag in tag_vectors.vocab:
100 - #float(weight)
101 - segment_vector = segment_vector + (tag_vectors[tag] * float(weight))
102 - else:
103 - print("unknown",tag)
104 - # Pass if tag is unknown
105 - if tag not in error_tags:
106 - error_tags.append(tag)
107 -
108 - kernel[k] = segment_vector
109 -
110 - #여기에서 유사한 벡터들을 뽑아냄
111 - #현재는 비디오id로 영상을 얻을 수 없으므로 반환값으로 비디오 아이디와 태그들, 확률 사용
112 - video_tags = pd.read_csv('/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/segment_tags.csv', encoding='utf8',error_bad_lines=False)
113 - videoVectorList = []
114 - segmentTagList = []
115 - prevVideoId = ""
116 - minimunVideoIds = [["",-1.0] for i in range(0,top_k)]
117 111
118 - for i, row in video_tags.iterrows(): 112 + #여기에서 유사한 벡터들을 뽑아냄
119 - video_id = row[0] 113 + #현재는 비디오id로 영상을 얻을 수 없으므로 반환값으로 비디오 아이디와 태그들, 확률 사용
120 - if video_id == "vid_id": 114 + video_tags = pd.read_csv('/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/segment_tags.csv', encoding='utf8',error_bad_lines=False)
121 - continue 115 + videoVectorList = []
122 - if prevVideoId == "": 116 + segmentTagList = []
123 - prevVideoId = video_id 117 + prevVideoId = ""
124 - 118 + minimunVideoIds = [["",-1.0] for i in range(0,top_k)]
125 - if video_id[0:4] != prevVideoId[0:4]: 119 +
126 - #여기서 모다진걸로 컨볼루션 연산 120 + for i, row in video_tags.iterrows():
127 - #convmaxima, convidx = convolution(videoVectorList,kernel,prevVideoId) 121 + video_id = row[0]
128 - maxima, idx = differenceMax(segmentTagList,tagKernel,tag_vectors,videoTagList) 122 + if video_id == "vid_id":
129 - #maxima = maxima + convmaxima 123 + continue
124 + if prevVideoId == "":
125 + prevVideoId = video_id
130 126
131 - localMinima = 100 127 + if video_id[0:4] != prevVideoId[0:4]:
132 - localMinimaIndex = -1 128 + #여기서 모다진걸로 컨볼루션 연산
133 - for seg in range(0,top_k): 129 + #convmaxima, convidx = convolution(videoVectorList,kernel,prevVideoId)
134 - if float(minimunVideoIds[seg][1]) < localMinima: 130 + maxima, idx = differenceMax(segmentTagList,tagKernel,tag_vectors,videoTagList)
135 - localMinima = float(minimunVideoIds[seg][1]) 131 + #maxima = maxima + convmaxima
136 - localMinimaIndex = seg 132 +
137 - #print(maxima) 133 + localMinima = 100
138 - if localMinima < maxima: 134 + localMinimaIndex = -1
139 - print(prevVideoId[0:4] + "_" + str(idx),localMinimaIndex,maxima) 135 + for seg in range(0,top_k):
140 - minimunVideoIds[localMinimaIndex] = [prevVideoId[0:4] + "_" + str(idx),maxima] 136 + if float(minimunVideoIds[seg][1]) < localMinima:
137 + localMinima = float(minimunVideoIds[seg][1])
138 + localMinimaIndex = seg
139 + #print(maxima)
140 + if localMinima < maxima:
141 + #print(prevVideoId[0:4] + "_" + str(idx),localMinimaIndex,maxima)
142 + minimunVideoIds[localMinimaIndex] = [prevVideoId[0:4] + "_" + str(idx),maxima]
141 143
144 +
145 + videoVectorList.clear()
146 + segmentTagList.clear()
147 + prevVideoId = video_id
142 148
143 - videoVectorList.clear() 149 + if video_id == "finished":
144 - segmentTagList.clear() 150 + break
145 - prevVideoId = video_id 151 + videoVectorList.append(video_vectors[video_id])
152 + tagList = []
153 + for i in range(1,top_k+1):
154 + tagList.append([row[i].split(":")[0],row[i].split(":")[1]])
155 + segmentTagList.append(tagList)
146 156
147 - if video_id == "finished": 157 + for i in range(0,top_k):
148 - break 158 + similar_ids.append(minimunVideoIds[i][0])
149 - videoVectorList.append(video_vectors[video_id]) 159 + else:
150 - tagList = [] 160 + similar_ids = [x[0] for x in video_ids.similar_by_vector(video_vector, top_k)]
151 - for i in range(1,top_k+1):
152 - tagList.append([row[i].split(":")[0],row[i].split(":")[1]])
153 - segmentTagList.append(tagList)
154 -
155 - #similar_ids = []
156 - #for i in range(0,top_k):
157 - # similar_ids.append(minimunVideoIds[i][0])
158 -
159 - similar_ids = [x[0] for x in video_ids.similar_by_vector(video_vector, top_k)]
160 - print('results =' ,similar_ids)
161 return similar_ids 161 return similar_ids
162 162
163 163
...@@ -236,3 +236,139 @@ def normalize(arrs): ...@@ -236,3 +236,139 @@ def normalize(arrs):
236 denom = maximum - minimum 236 denom = maximum - minimum
237 for i in range(0,len(arrs)): 237 for i in range(0,len(arrs)):
238 arrs[i] = (arrs[i] - minimum)/ denom 238 arrs[i] = (arrs[i] - minimum)/ denom
239 +
240 +def test(tag_model_path, video_model_path, video_id_model, video_tags_path, segment_tags_path,test_segment_tags,top_k):
241 + tag_vectors = Word2Vec.load(tag_model_path).wv
242 +
243 + video_tags = pd.read_csv(test_segment_tags, encoding='utf8',error_bad_lines=False)
244 + segmentTagList = []
245 + prevVideoId = ""
246 +
247 + entire_video_tags = pd.read_csv(video_tags_path,encoding='utf8')
248 + entire_segment_tags = pd.read_csv(segment_tags_path,encoding='utf8')
249 +
250 + testResult = {}
251 + totalIdNum = 0
252 +
253 + for i, row in video_tags.iterrows():
254 + video_id = row[0]
255 + if video_id == "vid_id":
256 + continue
257 + if prevVideoId == "":
258 + prevVideoId = video_id
259 +
260 + if video_id[0:4] != prevVideoId[0:4]:
261 + count = {}
262 + cap1 = 0
263 + cap2 = 0
264 + totalSegmentTagProbList = []
265 + for segTag in segmentTagList:
266 + segmentTagProbList = []
267 + for i in range(0,len(segTag)):
268 + try: count[segTag[i][0]] += float(segTag[i][1])
269 + except: count[segTag[i][0]] = float(segTag[i][1])
270 + segmentTagProbList.append(segTag[i][0])
271 + segmentTagProbList.append(segTag[i][1])
272 + totalSegmentTagProbList.append(segmentTagProbList)
273 + sorted(count.items(), key=lambda x: x[1], reverse=True)
274 +
275 + tagnames = list(count.keys())[0:5]
276 + tagprobs = list(count.values())[0:5]
277 + tags = zip(tagnames,tagprobs)
278 +
279 + result = recommend_videos(tags, totalSegmentTagProbList, tag_model_path, video_model_path, video_id_model, top_k,False)
280 + score_avg = 0
281 + print("input tags :",tagnames)
282 + for ids in result:
283 + score = 0
284 + video_tags_info = entire_video_tags.loc[entire_video_tags["vid_id"] == ids]
285 + tagList = []
286 + for i in range(1, top_k + 1):
287 + video_tag_tuple = video_tags_info["segment" + str(i)].values[0]# ex: "mobile-phone:0.361"
288 + tag = video_tag_tuple.split(":")[0]
289 + tagList.append(tag)
290 + if tag in tag_vectors.vocab:
291 + for vidTag,pr in zip(tagnames,tagprobs):
292 + #if vidTag in tag_vectors.vocab:
293 + # score = score + (tag_vectors.similarity(tag,vidTag) * float(pr))
294 + if tag == vidTag:
295 + score += 1
296 + score_avg = score_avg + score
297 + #print("result for id",ids,"is", str(score)," / tags ",tagList)
298 +
299 + print("CAP - 1)score average = ",score_avg/5)
300 + cap1 = score_avg/5
301 +
302 +
303 + result = recommend_videos(tags, totalSegmentTagProbList, tag_model_path, video_model_path, video_id_model, top_k,True)
304 + score_avg = 0
305 + for ids in result:
306 + score = 0
307 + video_tags_info = entire_video_tags.loc[entire_video_tags["vid_id"] == ids[0:4]]#entire_segment_tags.loc[entire_segment_tags["vid_id"] == ids]
308 + tagList = []
309 + for i in range(1, top_k + 1):
310 + video_tag_tuple = video_tags_info["segment" + str(i)].values[0]# ex: "mobile-phone:0.361"
311 + tag = video_tag_tuple.split(":")[0]
312 + tagList.append(tag)
313 + #for vidTag in tagnames:
314 + # if tag == vidTag:
315 + # score += 1
316 + if tag in tag_vectors.vocab:
317 + for vidTag,pr in zip(tagnames,tagprobs):
318 + #if vidTag in tag_vectors.vocab:
319 + # score = score + (tag_vectors.similarity(tag,vidTag) * float(pr))
320 + if tag == vidTag:
321 + score += 1
322 + score_avg = score_avg + score
323 + #print("result for id",ids,"is", str(score)," / tags ",tagList)
324 +
325 + print("CAP - 2)score average = ",score_avg/5)
326 + cap2 = score_avg/5
327 +
328 +
329 + totalIdNum += 1
330 + if cap1 > cap2:
331 + try: testResult['cap1'] += 1
332 + except: testResult['cap1'] = 1
333 + elif cap1 < cap2:
334 + try: testResult['cap2'] += 1
335 + except: testResult['cap2'] = 1
336 + else:
337 + try:
338 + testResult['cap2'] += 0.5
339 + testResult['cap1'] += 0.5
340 + except:
341 + testResult['cap2'] = 0.5
342 + testResult['cap1'] = 0.5
343 +
344 +
345 + print(totalIdNum, testResult)
346 + segmentTagList.clear()
347 + prevVideoId = video_id
348 +
349 + if video_id == "finished":
350 + break
351 +
352 + tagList = []
353 + for i in range(1,top_k+1):
354 + tagList.append([row[i].split(":")[0],row[i].split(":")[1]])
355 + segmentTagList.append(tagList)
356 + #===========
357 +
358 +
359 +VIDEO_TAGS_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/segment_tags.csv"
360 +VIDEO_IDS_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/videoIds.csv"
361 +TAG_VECTOR_MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/tag_vectors.model"
362 +VIDEO_VECTOR_MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/video_vectors.model"
363 +VIDEO_VECTOR_MODEL2_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/video_vectors2.model"
364 +VIDEO_ID_MODEL_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/videoId_vectors.model"
365 +
366 +TEST_TAGS_PATH = "/mnt/e/khuhub/2015104192/web/backend/yt8m/esot3ria/test_segement_tags.csv"
367 +
368 +test(TAG_VECTOR_MODEL_PATH,
369 + VIDEO_VECTOR_MODEL_PATH,
370 + VIDEO_ID_MODEL_PATH,
371 + VIDEO_IDS_PATH,
372 + VIDEO_TAGS_PATH,
373 + TEST_TAGS_PATH,
374 + 5)
...\ No newline at end of file ...\ No newline at end of file
......
1 +결과
2 +커널크기/비디오 수/캡2/캡1
3 +1/260/29/31 -> 129/131
4 +3/260/32/30 -> 131/129
5 +5/260/36/24 -> 136/124
6 +7/260/35/25 -> 135/125
...\ No newline at end of file ...\ No newline at end of file