Toggle navigation
Toggle navigation
This project
Loading...
Sign in
2021-1-capstone-design1
/
HCG_Project1
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
양지수
2021-04-18 13:03:23 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
2fb6b76b2d01550c9343994646eefc07fce5882f
2fb6b76b
1 parent
cffaec02
정렬후 가중치삭제후 저장 필요
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
48 additions
and
8 deletions
knp.py
knp.py
View file @
2fb6b76
...
...
@@ -4,6 +4,8 @@ import konlpy
from
konlpy.tag
import
*
import
openpyxl
import
pandas
as
pd
from
math
import
log10
from
operator
import
itemgetter
#형태소분석라이브러리
#okt = Okt()
...
...
@@ -22,24 +24,62 @@ for row in sheet.rows: #data에 크롤링한 뉴스 제목들 저장
)
#print(data)
#print(type(data[1])) #str
#newData=[]
newData2
=
[]
#for i in range(len(data)):
# newData.append(okt.nouns(data[i])) #명사만 추출okt
#print(newData)
for
i
in
range
(
len
(
data
)
-
1
):
newData2
.
append
(
hannanum
.
nouns
(
data
[
i
+
1
]))
#명사만 추출hannanum가 okt보다 성능좋음
print
(
newData2
)
#
print(newData2)
newData3
=
[]
for
i
in
range
(
len
(
newData2
)):
newData3
.
append
([])
for
j
in
newData2
[
i
]:
if
any
(
map
(
str
.
isdigit
,
j
))
==
False
:
if
any
(
map
(
str
.
isdigit
,
j
))
==
False
and
len
(
j
)
>
1
:
#추출한 결과가 숫자포함이거나 한글자 인것 제외
newData3
[
i
]
.
append
(
j
)
print
(
newData3
)
#
print(newData3)
#print(type(newData2))#newData2 데이터 형식은 list
df
=
pd
.
DataFrame
.
from_records
(
newData3
)
#newData2 dataframe으로 변환
df
.
to_excel
(
filename
+
'_명사추출_숫자제외'
+
'.xlsx'
)
#파일명의 엑셀로 변환
#df= pd.DataFrame.from_records(newData3)#newData3 dataframe으로 변환
#df.to_excel(filename+'_명사추출_숫자제외'+'.xlsx') #파일명의 엑셀로 변환
#TF-IDF함수 시작
def f(t, d):
    """Return the raw frequency of term ``t`` in document ``d``.

    The count is delegated to ``d.count``, so ``d`` must be an object
    exposing that method (the script passes lists of extracted nouns).
    """
    occurrences = d.count(t)
    return occurrences
def tf(t, d):
    """Return the augmented term frequency of ``t`` in document ``d``.

    Formula: ``0.5 + 0.5 * f(t, d) / max(f(w, d) for w in d)``, i.e. the
    raw count of ``t`` scaled by the count of the most frequent term in
    ``d``.

    Raises:
        ValueError: if ``d`` is empty, because ``max`` has nothing to
            compare.
    """
    term_count = f(t, d)
    # Count of the most frequent term in the document (normaliser).
    peak_count = max(f(w, d) for w in d)
    return 0.5 + 0.5 * term_count / peak_count
def idf(t, D):
    """Return the inverse document frequency of term ``t`` over corpus ``D``.

    Computed as ``log10(len(D) / (1 + df))`` where ``df`` is the number
    of documents in ``D`` that contain ``t``.
    """
    # Total number of documents in the collection.
    total_docs = len(D)
    # Number of documents containing the term; the extra 1 keeps the
    # divisor from ever being zero.
    docs_with_term = sum(1 for doc in D if t in doc)
    return log10(total_docs / (1 + docs_with_term))
def tfidf(t, d, D):
    """Return the TF-IDF weight of term ``t`` in document ``d`` of corpus ``D``.

    This is the product ``tf(t, d) * idf(t, D)``.
    """
    term_weight = tf(t, d)
    corpus_weight = idf(t, D)
    return term_weight * corpus_weight
def tfidfScorer(D):
    """Score every term of every document in corpus ``D``.

    Returns a list with one entry per document, in corpus order; each
    entry is a list of ``(term, tfidf_weight)`` tuples, one per term
    occurrence in that document (repeated terms yield repeated tuples).
    """
    return [[(t, tfidf(t, d, D)) for t in d] for d in D]
#newData3는 명사추출을 통해 분리되어있음(이미 split상태)
if __name__ == '__main__':
    # newData3 already holds one list of extracted nouns per row, so it
    # can be used as the corpus without further splitting.
    corpus = list(newData3)

    # Per-document scoring results: each entry is a list of
    # (term, weight) pairs.
    TfIf = list(tfidfScorer(corpus))
    print(TfIf)
#TFIF는 (단어,가중치) 조합으로 저장
#df= pd.DataFrame.from_records(TfIf)#TfIf dataframe으로 변환
#df.to_excel(filename+'_가중치추출'+'.xlsx')
...
...
Please
register
or
login
to post a comment