수정: 전문 크롤링

박은주
Commit 489e0a3e184e2b2797600f3a3a2bb8efc577f466 489e0a3e 2 parents 0bf5691f 659b953b
Showing 4 changed files with 53 additions and 20 deletions
.gitignore
GetTopic.py
README.md
content.py
--- a/.gitignore
View file @489e0a3
+++ b/.gitignore
View file @489e0a3
@@ -19,4 +19,8 @@ chromedriver.exe
 *.org
 
 /KoreanSentimentAnalyzer/.gitignore
- /KoreanSentimentAnalyzer/.git/
\ No newline at end of file
+ /KoreanSentimentAnalyzer/.git/
+ /textrank/
+ /textrank.egg-info/
+ /build/
+ /dist/
--- a/GetTopic.py
View file @489e0a3
+++ b/GetTopic.py
View file @489e0a3
 import os
 import csv
+ import re
 
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from konlpy.tag import Okt
+ from konlpy.tag import Komoran
+ 
+ from textrank import KeywordSummarizer
 
 okt = Okt()
- def DocToNouns(docs):
-     return [{
-         'id': i,
-         'nouns': ' '.join(okt.nouns(doc)),
-     } for i, doc in enumerate(docs)]
+ def Okt_tokenizer(sent):  # Return the nouns that the shared Okt tagger extracts from the string `sent`.
+     words = okt.nouns(sent)  # `okt` is the module-level Okt() instance created just before this function.
+     # words = okt.pos(sent, join=True, stem=True)
+     # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
+     return words  # Result of okt.nouns() is returned unchanged; the POS-based variant above is disabled.
+ 
+ komoran = Komoran()
+ def komoran_tokenizer(sent):  # Tag `sent` with the shared Komoran instance and keep only tokens marked NNG or NNP.
+     # words = []
+     # for sentence in sent:
+     #     words += komoran.pos(sentence, join=True)
+     #     print("check : ", komoran.pos(sentence, join=True))
+     # words = [komoran.pos(sentence, join=True) for sentence in sent]
+     words = komoran.pos(sent, join=True)  # join=True makes each token one string; per the KoNLPy docs the form is "morpheme/TAG".
+     words = [w for w in words if('/NNG' in w or '/NNP' in w)]  # Substring test on the joined token; NNG/NNP are presumably Komoran's common/proper noun tags - confirm against the tagset.
+     return words  # The surviving tokens keep their "/TAG" suffix; it is not stripped here.
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # Directory containing this script; data.csv is looked up inside it.
 
 posts = []
- with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8-sig') as db:
+ with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:  # NOTE(review): content.py writes data.csv with encoding='utf-8-sig'; reading it as plain 'utf-8' would leave the BOM in the first cell - confirm this change is intended.
     reader = csv.reader(db)
     for data in reader:
-         posts.append(data)
+         data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')  # Remove every character that is neither a word character nor whitespace, then drop newlines. NOTE(review): the pattern is a non-raw string containing \w and \s; a raw string would avoid invalid-escape warnings.
+         data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')  # Same cleanup for the second cell; assumes every row has at least two cells - TODO confirm.
+         posts.append(data[0] + data[1])  # Each post becomes one string: the first two cells concatenated with no separator.
+ 
+ # tfidf_vectorizer = TfidfVectorizer()
+ # title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
+ # content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
+ keyword_extractor = KeywordSummarizer(  # KeywordSummarizer comes from the `textrank` package imported at the top of this file.
+     # tokenize=Okt_tokenizer,
+     tokenize=komoran_tokenizer,  # Active tokenizer; the Okt variant on the line above is kept as a commented-out alternative.
+     window = -1,
+     verbose= False
+ )
 
- nouns = DocToNouns(posts)
- print(nouns)
\ No newline at end of file
+ keywords = keyword_extractor.summarize(posts, topk=30)  # topk=30 presumably caps the result at 30 keywords - confirm against the textrank package.
+ print(keywords)
\ No newline at end of file
--- a/README.md
View file @489e0a3
+++ b/README.md
View file @489e0a3
@@ -10,7 +10,12 @@
 * konlpy
 * jpype
 
+ ## Package
+ > TextRank Package https://github.com/lovit/textrank/ <br>
+ > Sentiment Analyzer https://github.com/mrlee23/KoreanSentimentAnalyzer
+ 
 
 ## WORK
- [] everytime.kr(경희대)로부터 24시간 이내의 이슈를 정리 ; 주기능
- [] 질문 입력시 해당하는 게시글 검색 ; 챗봇
\ No newline at end of file
+ - [ ] everytime.kr(경희대)로부터 24시간 이내의 이슈 토픽 정리 ; 주기능<br>
+ - [ ] 질문 입력시 해당하는 게시글 검색<br>
+ - [ ] 지난 24시간 이내의 게시판 분위기 정리
\ No newline at end of file
--- a/content.py
View file @489e0a3
+++ b/content.py
View file @489e0a3
@@ -19,12 +19,13 @@ def Click(xpath):
     sleeptime()
 
 login_info = {
-     'userID' : 'qdw0313',
-     'userpw' : 'fejUfrQxHWwtcGcP0'
+     'userID' : 'id',  # Placeholder value; supply the real account locally rather than committing it.
+     'userpw' : '********'  # NOTE(review): the literal credentials on the removed lines remain visible in this diff/history - rotate that password.
+ 
 }
 
 options = webdriver.ChromeOptions()
- options.add_argument('headless')
+ # options.add_argument('headless')
 options.add_argument('no-sandbox')
 options.add_argument('window-size=1920x1080')
 options.add_argument('disable-gpu')
@@ -32,7 +33,7 @@ options.add_argument('disable-dev-shm-usage')
 options.add_argument('lang=ko_KR')
 options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
 
- driver = webdriver.Chrome(r'C:\Users\Admin\Desktop\OSS\Todays_Issue\chromedriver.exe', options=options)
+ driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
 
 driver.get('about:blank')
 driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
@@ -72,10 +73,7 @@ while swt:
 
     TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
     DateList = soup.select('#container > div.wrap.articles > article > a > time')
-     # ContentList = soup.select('#container > div.wrap.articles > article > a > p')
 
-     # idx = 1
-     # for post in zip(TitleList, ContentList, DateList):
     for post in zip(TitleList, DateList):
         title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
         try:
@@ -94,7 +92,6 @@ while swt:
             # print("{0}. {1} : {2}".format(idx, title, content))
         print(post[1].text)
         if post[1].text < yesterday:
-             swt = False
             break
 
 post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')