박은주

Crawler Modified

......@@ -15,7 +15,6 @@ chromedriver.exe
*.csv
*.xml
*.json
*.png
*.org
/KoreanSentimentAnalyzer/.gitignore
......
......@@ -2,9 +2,6 @@ import os
import csv
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from konlpy.tag import Okt
from konlpy.tag import Komoran
......@@ -13,40 +10,30 @@ from textrank import KeywordSummarizer
okt = Okt()
def Okt_tokenizer(sent):
    """Tokenize a Korean sentence by extracting its nouns with the Okt tagger.

    Relies on the module-level ``okt`` (konlpy Okt) instance.
    """
    # Earlier experiment (kept for reference): POS-tag with
    # okt.pos(sent, join=True, stem=True) and keep Noun/Verb/Adjective tokens.
    return okt.nouns(sent)
komoran = Komoran()
def komoran_tokenizer(sent):
    """Tokenize a Korean sentence with Komoran, keeping only nouns.

    Uses the module-level ``komoran`` (konlpy Komoran) instance. Tokens come
    back joined as ``morpheme/TAG``; only common nouns (NNG) and proper
    nouns (NNP) survive the filter.
    """
    tagged = komoran.pos(sent, join=True)
    return [token for token in tagged if '/NNG' in token or '/NNP' in token]
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
def GetKeywords():
    """Read crawled posts from data.csv and return the top-30 TextRank keywords.

    Each CSV row is expected to hold a post title in column 0 and its body in
    column 1 — TODO confirm against the crawler's output format. Punctuation
    and newlines are stripped, title and body are concatenated into one
    document per post, and the documents are fed to KeywordSummarizer.

    Returns:
        The value of ``KeywordSummarizer.summarize(posts, topk=30)`` —
        presumably a list of (keyword, rank) pairs; verify against the
        textrank library.
    """
    posts = []
    # BASE_DIR is the directory of this script (module-level constant);
    # os.path.join handles the separator, so no manual '/' is needed.
    with open(os.path.join(BASE_DIR, 'data.csv'), 'r', encoding='utf-8') as db:
        reader = csv.reader(db)
        for data in reader:
            # Raw strings for regex patterns avoid invalid-escape warnings.
            data[0] = re.sub(pattern=r'[^\w\s]', repl='', string=data[0]).replace('\n', '')
            data[1] = re.sub(pattern=r'[^\w\s]', repl='', string=data[1]).replace('\n', '')
            posts.append(data[0] + data[1])
    keyword_extractor = KeywordSummarizer(
        tokenize=komoran_tokenizer,
        window=-1,      # NOTE(review): -1 appears to mean whole-sentence co-occurrence — confirm
        verbose=False
    )
    keywords = keyword_extractor.summarize(posts, topk=30)
    return keywords
......
......@@ -10,6 +10,8 @@ from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
def sleeptime():
    """Pause for a random 1–3 second interval (simple crawl throttling)."""
    delay = random.randint(1, 3)
    time.sleep(delay)
......@@ -18,43 +20,44 @@ def Click(xpath):
driver.execute_script("arguments[0].click();", element)
sleeptime()
# NOTE(review): the lines below are the OLD (pre-refactor) side of a diff —
# module-level login/scrape code that the commit moved into GetData().
# They are kept byte-for-byte; the `while swt:` at the end is dangling here
# because its body lives in a later diff hunk.
# Credentials placeholder (masked in this old version).
login_info = {
'userID' : 'id',
'userpw' : '********'
}
options = webdriver.ChromeOptions()
# options.add_argument('headless')
options.add_argument('no-sandbox')
options.add_argument('window-size=1920x1080')
options.add_argument('disable-gpu')
options.add_argument('disable-dev-shm-usage')
options.add_argument('lang=ko_KR')
# Spoofed desktop Chrome user agent to look like a normal browser.
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
# Hard-coded absolute driver path — the new version replaces this with BASE_DIR.
driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
driver.get('about:blank')
# Fake a non-empty navigator.plugins list, presumably to evade bot detection.
driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
sleeptime()
driver.get('https://everytime.kr/login')
sleeptime()
driver.find_element_by_name('userid').send_keys(login_info['userID'])
driver.find_element_by_name('password').send_keys(login_info['userpw'])
driver.find_element_by_class_name('submit').click()
sleeptime()
# International campus free bulletin board
sleeptime()
# Cutoff timestamp: 24 hours ago, formatted like the board's post timestamps.
yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
print(yesterday)
swt = True
page = 0
post_df = pd.DataFrame(columns=['title', 'content'])
while swt:
# NOTE(review): GetData() is INCOMPLETE in this view — the diff jumps from
# file-line ~40 to ~91 at the `......@@` marker below, so the middle of the
# scraping loop is missing. Lines are kept byte-for-byte; only comments added.
def GetData():
# SECURITY: real account credentials committed in plaintext — move these to
# environment variables or a config file outside version control.
login_info = {
'userID' : 'qdw0313',
'userpw' : 'fejUfrQxHWwtcGcP0'
}
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('no-sandbox')
options.add_argument('window-size=1920x1080')
options.add_argument('disable-gpu')
options.add_argument('disable-dev-shm-usage')
options.add_argument('lang=ko_KR')
# Spoofed desktop Chrome user agent to look like a normal browser.
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
# driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
# Driver now resolved relative to this script instead of a hard-coded path.
driver = webdriver.Chrome(BASE_DIR + '/chromedriver.exe', options=options)
driver.get('about:blank')
# Fake a non-empty navigator.plugins list, presumably to evade bot detection.
driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
sleeptime()
driver.get('https://everytime.kr/login')
sleeptime()
driver.find_element_by_name('userid').send_keys(login_info['userID'])
driver.find_element_by_name('password').send_keys(login_info['userpw'])
driver.find_element_by_class_name('submit').click()
sleeptime()
# International campus free bulletin board
sleeptime()
# Cutoff timestamp: 24 hours ago, formatted like the board's post timestamps.
yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
print(yesterday)
swt = True
page = 0
post_df = pd.DataFrame(columns=['title', 'content'])
while swt:
if page < 1:
Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
page += 1
......@@ -91,10 +94,13 @@ while swt:
columns=['title', 'content']))
# print("{0}. {1} : {2}".format(idx, title, content))
print(post[1].text)
# Old-version stop condition (strict '<', debug print, hard exit()):
if post[1].text < yesterday:
print(yesterday < "06/02 16:35")
exit()
# New-version stop condition: lexicographic compare of 'MM/DD HH:MM' strings —
# NOTE(review): this assumes same-year timestamps; verify around year boundaries.
if post[1].text <= yesterday:
break
post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
print("CVS file saved")
post_df.to_json('data.json', orient='records', encoding='utf-8-sig')
print("JSON file saved")
\ No newline at end of file
# New version: index=False drops the pandas index column from the CSV.
post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
print("CVS file saved")
# New version: write JSON via an explicit handle; force_ascii=False keeps Korean text readable.
with open('data.json', 'w', encoding='utf-8') as file:
post_df.to_json(file, force_ascii=False)
print("JSON file saved")
\ No newline at end of file
......