수정: 전문 크롤링

박은주
Commit 489e0a3e184e2b2797600f3a3a2bb8efc577f466 (489e0a3e) · 2 parents: 0bf5691f, 659b953b
Showing 4 changed files with 52 additions and 19 deletions
.gitignore
GetTopic.py
README.md
content.py
--- a/.gitignore
View file @489e0a3
+++ b/.gitignore
View file @489e0a3
@@ -20,3 +20,7 @@ chromedriver.exe
 /KoreanSentimentAnalyzer/.gitignore
 /KoreanSentimentAnalyzer/.git/
+/textrank/
+/textrank.egg-info/
+/build/
+/dist/
--- a/GetTopic.py
View file @489e0a3
+++ b/GetTopic.py
View file @489e0a3
 import os
 import csv
+import re
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from konlpy.tag import Okt
+from konlpy.tag import Komoran
+
+from textrank import KeywordSummarizer
 okt = Okt()
-def DocToNouns(docs):
+def Okt_tokenizer(sent):
-    return [{
+    words = okt.nouns(sent)
-        'id': i,
+    # words = okt.pos(sent, join=True, stem=True)
-        'nouns': ' '.join(okt.nouns(doc)),
+    # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
-    } for i, doc in enumerate(docs)]
+    return words
+
+komoran = Komoran()
+def komoran_tokenizer(sent):
+    # words = []
+    # for sentence in sent:
+    #     words += komoran.pos(sentence, join=True)
+    #     print("check : ", komoran.pos(sentence, join=True))
+    # words = [komoran.pos(sentence, join=True) for sentence in sent]
+    words = komoran.pos(sent, join=True)
+    words = [w for w in words if('/NNG' in w or '/NNP' in w)]
+    return words
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 posts = []
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8-sig') as db:
+with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
     reader = csv.reader(db)
     for data in reader:
-        posts.append(data)
+        data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
+        data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
+        posts.append(data[0] + data[1])
+
+# tfidf_vectorizer = TfidfVectorizer()
+# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
+# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
+keyword_extractor = KeywordSummarizer(
+    # tokenize=Okt_tokenizer,
+    tokenize=komoran_tokenizer,
+    window = -1,
+    verbose= False
+)
-nouns = DocToNouns(posts)
-print(nouns)
\ No newline at end of file
+keywords = keyword_extractor.summarize(posts, topk=30)
+print(keywords)
\ No newline at end of file
--- a/README.md
View file @489e0a3
+++ b/README.md
View file @489e0a3
@@ -10,7 +10,12 @@
 * konlpy
 * jpype
+## Package
+> TextRank Package https://github.com/lovit/textrank/ <br>
+> Sentiment Analyzer https://github.com/mrlee23/KoreanSentimentAnalyzer
+
 ## WORK
-[] everytime.kr(경희대)로부터 24시간 이내의 이슈를 정리 ; 주기능
-[] 질문 입력시 해당하는 게시글 검색 ; 챗봇
\ No newline at end of file
+- [ ] everytime.kr(경희대)로부터 24시간 이내의 이슈 토픽 정리 ; 주기능<br>
+- [ ] 질문 입력시 해당하는 게시글 검색<br>
+- [ ] 지난 24시간 이내의 게시판 분위기 정리
\ No newline at end of file
--- a/content.py
View file @489e0a3
+++ b/content.py
View file @489e0a3
@@ -19,12 +19,13 @@ def Click(xpath):
     sleeptime()
 login_info = {
-    'userID' : 'qdw0313',
+    'userID' : 'id',
-    'userpw' : 'fejUfrQxHWwtcGcP0'
+    'userpw' : '********'
+
 }
 options = webdriver.ChromeOptions()
-options.add_argument('headless')
+# options.add_argument('headless')
 options.add_argument('no-sandbox')
 options.add_argument('window-size=1920x1080')
 options.add_argument('disable-gpu')
@@ -32,7 +33,7 @@ options.add_argument('disable-dev-shm-usage')
 options.add_argument('lang=ko_KR')
 options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
-driver = webdriver.Chrome(r'C:\Users\Admin\Desktop\OSS\Todays_Issue\chromedriver.exe', options=options)
+driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
 driver.get('about:blank')
 driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
@@ -72,10 +73,7 @@ while swt:
     TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
     DateList = soup.select('#container > div.wrap.articles > article > a > time')
-    # ContentList = soup.select('#container > div.wrap.articles > article > a > p')
-    # idx = 1
-    # for post in zip(TitleList, ContentList, DateList):
     for post in zip(TitleList, DateList):
         title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
         try:
@@ -94,7 +92,6 @@ while swt:
             # print("{0}. {1} : {2}".format(idx, title, content))
         print(post[1].text)
         if post[1].text < yesterday:
-            swt = False
             break
 post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')