Showing 3 changed files with 101 additions and 109 deletions
@@ -2,9 +2,6 @@ import os
 import csv
 import re
 
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
 from konlpy.tag import Okt
 from konlpy.tag import Komoran
 
@@ -13,40 +10,30 @@ from textrank import KeywordSummarizer
 okt = Okt()
 def Okt_tokenizer(sent):
     words = okt.nouns(sent)
-    # words = okt.pos(sent, join=True, stem=True)
-    # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
     return words
 
 komoran = Komoran()
 def komoran_tokenizer(sent):
-    # words = []
-    # for sentence in sent:
-    #     words += komoran.pos(sentence, join=True)
-    #     print("check : ", komoran.pos(sentence, join=True))
-    # words = [komoran.pos(sentence, join=True) for sentence in sent]
     words = komoran.pos(sent, join=True)
     words = [w for w in words if('/NNG' in w or '/NNP' in w)]
     return words
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
-posts = []
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
-    reader = csv.reader(db)
-    for data in reader:
-        data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
-        data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
-        posts.append(data[0] + data[1])
-
-# tfidf_vectorizer = TfidfVectorizer()
-# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
-# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
-keyword_extractor = KeywordSummarizer(
-    # tokenize=Okt_tokenizer,
-    tokenize=komoran_tokenizer,
-    window = -1,
-    verbose= False
-)
-
-keywords = keyword_extractor.summarize(posts, topk=30)
-print(keywords)
\ No newline at end of file
+def GetKeywords():
+    posts = []
+    with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
+        reader = csv.reader(db)
+        for data in reader:
+            data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
+            data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
+            posts.append(data[0] + data[1])
+
+    keyword_extractor = KeywordSummarizer(
+        tokenize=komoran_tokenizer,
+        window = -1,
+        verbose= False
+    )
+
+    keywords = keyword_extractor.summarize(posts, topk=30)
+    return keywords
\ No newline at end of file
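
The change above removes the unused scikit-learn imports and the dead TF-IDF code, and wraps the CSV loading and TextRank extraction in a GetKeywords() function, so the keyword step can be imported and called on demand instead of running at import time. A minimal usage sketch, assuming the file is importable as keywords.py (a hypothetical module name; the diff does not show the file name):

    # Hypothetical caller; the module name 'keywords' is an assumption.
    from keywords import GetKeywords

    # KeywordSummarizer.summarize() in the textrank package returns
    # (word, rank) tuples ordered by TextRank score, so the caller can
    # format or persist them as needed.
    for word, rank in GetKeywords():
        print(word, rank)
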
@@ -10,6 +10,8 @@ from selenium.common.exceptions import NoSuchElementException
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 
+BASE_DIR = os.path.dirname(os.path.realpath(__file__))
+
 def sleeptime():
     time.sleep(random.randint(1, 3))
 
@@ -18,83 +20,87 @@ def Click(xpath):
     driver.execute_script("arguments[0].click();", element)
     sleeptime()
 
-login_info = {
-    'userID' : 'id',
-    'userpw' : '********'
-
-}
-
-options = webdriver.ChromeOptions()
-# options.add_argument('headless')
-options.add_argument('no-sandbox')
-options.add_argument('window-size=1920x1080')
-options.add_argument('disable-gpu')
-options.add_argument('disable-dev-shm-usage')
-options.add_argument('lang=ko_KR')
-options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
-
-driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
-
-driver.get('about:blank')
-driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
-sleeptime()
-driver.get('https://everytime.kr/login')
-
-sleeptime()
-driver.find_element_by_name('userid').send_keys(login_info['userID'])
-driver.find_element_by_name('password').send_keys(login_info['userpw'])
-driver.find_element_by_class_name('submit').click()
-sleeptime()
-
-# 국제캠 자게 (international-campus free board)
-sleeptime()
-yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
-print(yesterday)
-swt = True
-page = 0
-
-post_df = pd.DataFrame(columns=['title', 'content'])
-while swt:
-    if page < 1:
-        Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
-        page += 1
-    else:
-        if page == 1:
-            Click('//*[@id="container"]/div[2]/div[2]/a')
-            page += 1
-        elif page == 2:
-            Click('//*[@id="container"]/div[2]/div[2]/a[2]')
+def GetData():
+    login_info = {
+        'userID' : 'qdw0313',
+        'userpw' : 'fejUfrQxHWwtcGcP0'
+    }
+
+    options = webdriver.ChromeOptions()
+    options.add_argument('headless')
+    options.add_argument('no-sandbox')
+    options.add_argument('window-size=1920x1080')
+    options.add_argument('disable-gpu')
+    options.add_argument('disable-dev-shm-usage')
+    options.add_argument('lang=ko_KR')
+    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
+
+    # driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
+    driver = webdriver.Chrome(BASE_DIR + '/chromedriver.exe', options=options)
+
+    driver.get('about:blank')
+    driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
+    sleeptime()
+    driver.get('https://everytime.kr/login')
+
+    sleeptime()
+    driver.find_element_by_name('userid').send_keys(login_info['userID'])
+    driver.find_element_by_name('password').send_keys(login_info['userpw'])
+    driver.find_element_by_class_name('submit').click()
+    sleeptime()
+
+    # 국제캠 자게 (international-campus free board)
+    sleeptime()
+    yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
+    print(yesterday)
+    swt = True
+    page = 0
+
+    post_df = pd.DataFrame(columns=['title', 'content'])
+    while swt:
+        if page < 1:
+            Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
             page += 1
         else:
-            Click('//*[@id="container"]/div[2]/div[2]/a[3]')
-
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-
-    TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
-    DateList = soup.select('#container > div.wrap.articles > article > a > time')
-
-    for post in zip(TitleList, DateList):
-        title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
-        try:
-            Click("//h2[contains(text(), '{}')]".format(title))
-        except NoSuchElementException:
-            continue
-        content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
-        driver.back()
-        sleeptime()
-
-        if not (post_df['title'] == title).any():
-            # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
-            content = re.sub(pattern='[^\w\s]', repl='', string=content)
-            post_df = post_df.append(pd.DataFrame([[title, content]],
-                                                  columns=['title', 'content']))
-            # print("{0}. {1} : {2}".format(idx, title, content))
-        print(post[1].text)
-        if post[1].text < yesterday:
-            break
-
-post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
-print("CVS file saved")
-post_df.to_json('data.json', orient='records', encoding='utf-8-sig')
-print("JSON file saved")
\ No newline at end of file
+            if page == 1:
+                Click('//*[@id="container"]/div[2]/div[2]/a')
+                page += 1
+            elif page == 2:
+                Click('//*[@id="container"]/div[2]/div[2]/a[2]')
+                page += 1
+            else:
+                Click('//*[@id="container"]/div[2]/div[2]/a[3]')
+
+        html = driver.page_source
+        soup = BeautifulSoup(html, 'html.parser')
+
+        TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
+        DateList = soup.select('#container > div.wrap.articles > article > a > time')
+
+        for post in zip(TitleList, DateList):
+            title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
+            try:
+                Click("//h2[contains(text(), '{}')]".format(title))
+            except NoSuchElementException:
+                continue
+            content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
+            driver.back()
+            sleeptime()
+
+            if not (post_df['title'] == title).any():
+                # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
+                content = re.sub(pattern='[^\w\s]', repl='', string=content)
+                post_df = post_df.append(pd.DataFrame([[title, content]],
+                                                      columns=['title', 'content']))
+                # print("{0}. {1} : {2}".format(idx, title, content))
+            print(post[1].text)
+            print(yesterday < "06/02 16:35")
+            exit()
+            if post[1].text <= yesterday:
+                break
+
+    post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
+    print("CVS file saved")
+    with open('data.json', 'w', encoding='utf-8') as file:
+        post_df.to_json(file, force_ascii=False)
+    print("JSON file saved")
\ No newline at end of file
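
The second file wraps the Everytime crawl in GetData(): the browser now runs headless, chromedriver is resolved relative to the script via BASE_DIR instead of a hard-coded Windows path, the CSV is written without the index column, and the JSON is written through an explicit file handle with force_ascii=False so Korean text survives serialization. One caveat: the cutoff compares '%m/%d %H:%M' timestamps as strings (the leftover debug lines print(yesterday < "06/02 16:35") and exit() still short-circuit the loop on the first post), and lexicographic order on such stamps only matches chronological order within a single year. A sketch of a datetime-based check, assuming board timestamps look like '06/02 16:35':

    from datetime import datetime, timedelta

    def is_older_than_yesterday(stamp, now=None):
        # Board timestamps carry no year, so assume the current year;
        # this still mis-orders posts that wrap around a New Year boundary.
        now = now or datetime.now()
        posted = datetime.strptime(stamp, '%m/%d %H:%M').replace(year=now.year)
        return posted <= now - timedelta(days=1)

Separately, DataFrame.append as used in the loop was removed in pandas 2.0; pd.concat([post_df, row_df], ignore_index=True) is the supported replacement.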