# content.py
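# Logs into Everytime with Selenium, crawls board posts from the last 24 hours,
# cleans the text with hanspell, and saves the results to data.csv and data.txt.
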
import time
import random
import os
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from hanspell import spell_checker
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from pytz import timezone
# from pyvirtualdisplay import Display

BASE_DIR = os.path.dirname(os.path.realpath(__file__))

def sleeptime():
    # Random 1-3 second pause between actions to mimic human browsing
    time.sleep(random.randint(1, 3))

def Click(xpath, driver):
    # Find the element by XPath and click it via JavaScript to avoid click interception
    element = driver.find_element(By.XPATH, xpath)
    driver.execute_script("arguments[0].click();", element)
    sleeptime()

def TextPreprocess(text):
    # Strip punctuation and turn newlines into sentence breaks
    text = re.sub(pattern=r'[^\w\s]', repl='', string=text)
    text = re.sub(pattern=r'\n', repl='. ', string=text)

    # Run the cleaned text through the hanspell spell checker
    spelled_sent = spell_checker.check(text)
    text = spelled_sent.checked

    return text

def GetData():
    # display = Display(visible=0, size=(1920,1080))
    # display.start()

    # Everytime account credentials (redacted)
    login_info = {
        'userID' : '********',
        'userpw' : '********'
    }

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--lang=ko_KR')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')

    # driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
    # driver = webdriver.Chrome(executable_path=BASE_DIR + '/chromedriver.exe', options=options)
    driver = webdriver.Chrome(options=options)
    # Force the browser clock to Asia/Seoul via the Chrome DevTools Protocol
    utc_param = {'timezoneId': 'Asia/Seoul'}
    driver.execute_cdp_cmd('Emulation.setTimezoneOverride', utc_param)

    driver.get('about:blank')
    # Spoof navigator.plugins so naive headless-browser detection is less likely to flag the session
    driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
    sleeptime()
    driver.get('https://everytime.kr/login')

    sleeptime()
    # Fill in the login form and submit
    driver.find_element(By.NAME, 'userid').send_keys(login_info['userID'])
    driver.find_element(By.NAME, 'password').send_keys(login_info['userpw'])
    driver.find_element(By.CLASS_NAME, 'submit').click()
    sleeptime()

    # International campus free board
    KST = timezone('Asia/Seoul')
    today = datetime.now(KST)
    # today = datetime.today()

    sleeptime()
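    # Cut-off timestamp (24 hours ago) in the 'MM/DD HH:MM' form the board's <time> text uses;
    # posts are compared against it as strings further down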
    yesterday = (today - timedelta(1)).strftime('%m/%d %H:%M')
    swt = True
    page = 0

    post_df = pd.DataFrame(columns=['title', 'content'])
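    # Crawl listing pages until a post older than the cut-off turns up:
    # the first iteration opens the board, later iterations click through the paging links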
    while swt:
        if page < 1:
            Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a', driver)
            page += 1
        else:
            if page == 1:
                Click('//*[@id="container"]/div[2]/div[2]/a', driver)
                page += 1
            elif page == 2:
                Click('//*[@id="container"]/div[2]/div[2]/a[2]', driver)
                page += 1
            else:
                Click('//*[@id="container"]/div[2]/div[2]/a[3]', driver)
                page += 1

        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Collect the timestamp and title of every post on the current listing page
        DateList = soup.select('#container > div.wrap.articles > article > a > time')
        TitleList = soup.select('#container > div.wrap.articles > article > a > h2')

        for post in zip(TitleList, DateList):
            title = TextPreprocess(post[0].text)

            try:
                # Open the post by clicking the link whose title matches; skip it if no match is found
                Click("//h2[contains(text(), '{}')]".format(title), driver)
            except NoSuchElementException:
                continue
            content = driver.find_element(By.XPATH, '//*[@id="container"]/div[2]/article/a/p').text
            driver.back()
            sleeptime()

            if not (post_df['title'] == title).any():
                content = TextPreprocess(content)
                # DataFrame.append was removed in pandas 2.0, so build the frame with pd.concat
                post_df = pd.concat([post_df,
                                     pd.DataFrame([[title, content]], columns=['title', 'content'])],
                                    ignore_index=True)
            print(title)
            print(post[1].text)
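            # Stop once a post at or before the 24-hour cut-off is reached
            # ('MM/DD HH:MM' strings only compare correctly within the same year)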
            if post[1].text <= yesterday:
                swt = False
                break
        print('page : {}'.format(page))

    post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
    print("CSV file saved")

    # Record yesterday's date (written in Korean 'YYYY년 MM월 DD일' form)
    with open(os.path.join(BASE_DIR, 'date.txt'), 'w', encoding='utf-8-sig') as txt_file:
        txt_file.write((today - timedelta(1)).strftime("%Y년 %m월 %d일"))

    # Also dump the posts as plain text, one "title content." line per post
    with open('data.txt', 'w', encoding='utf-8-sig') as txt_file:
        for post in zip(post_df['title'].tolist(), post_df['content'].tolist()):
            txt_file.write(post[0] + post[1] + '.\n')
    print("txt file saved")