processing.py
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')  # needed once, otherwise NLTK raises "Resource stopwords not found."
nltk.download('punkt')      # tokenizer models used by word_tokenize
romance_plot = pd.read_csv('/Users/yangyoonji/Documents/2020-1/2020-dataCapstone/data/moviedata/moviePlot/romancePlot.csv')
thriller_plot = pd.read_csv('/Users/yangyoonji/Documents/2020-1/2020-dataCapstone/data/moviedata/moviePlot/thrillerPlot.csv')
print(len(romance_plot)) #5699 ==> train 2500 test 2500
print(len(thriller_plot)) #9823 ==> train 2500 test 2500
# Preprocessing (1): convert all text to lowercase
romance_plot.줄거리 = romance_plot.줄거리.str.lower()
thriller_plot.줄거리 = thriller_plot.줄거리.str.lower()
# Preprocessing (1-1): write the data out to train/test CSV files (sketched below)
# romance_plot: 2899 rows to a train CSV || 2800 rows to a test CSV
# thriller_plot: 2800 rows to a train CSV || 2899 rows to a test CSV
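# A minimal sketch of the split described above, assuming the row counts in the
# comments are final; the output file names (romance_train.csv etc.) are placeholders.
romance_train = romance_plot.iloc[:2899]
romance_test = romance_plot.iloc[2899:2899 + 2800]
thriller_train = thriller_plot.iloc[:2800]
thriller_test = thriller_plot.iloc[2800:2800 + 2899]

romance_train.to_csv('romance_train.csv', index=False)
romance_test.to_csv('romance_test.csv', index=False)
thriller_train.to_csv('thriller_train.csv', index=False)
thriller_test.to_csv('thriller_test.csv', index=False)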
# Preprocessing (2): tokenization and stopword removal
stop_words = set(stopwords.words('english'))
# word_tokenize expects a string, not a Series, so join the plots into one string here
# (tokenizing row by row with .apply(word_tokenize) also works).
word_tokens = word_tokenize(' '.join(romance_plot.줄거리.dropna()))
result = []
for w in word_tokens:
    if w not in stop_words:
        result.append(w)
print(word_tokens)
print(result)
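# A small sketch of the same cleanup applied per plot, which is what the later
# train/test split needs; the helper name remove_stopwords is an assumption.
def remove_stopwords(text):
    return [w for w in word_tokenize(text) if w not in stop_words]

romance_plot['tokens'] = romance_plot.줄거리.dropna().apply(remove_stopwords)
thriller_plot['tokens'] = thriller_plot.줄거리.dropna().apply(remove_stopwords)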
# nltk
# TODO: build the train/test split (sketched below):
#   X_train, X_test, y_train, y_test
#   X = 줄거리 (plot text), y = label (romance, thriller, etc.)
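# A minimal sketch of that split, assuming scikit-learn is available and that the
# genre label comes from which file each plot was loaded from; the 0/1 encoding and
# variable names are illustrative assumptions, not the author's final design.
from sklearn.model_selection import train_test_split

romance_plot['label'] = 0   # romance
thriller_plot['label'] = 1  # thriller
all_plots = pd.concat([romance_plot, thriller_plot], ignore_index=True)

X = all_plots.줄거리
y = all_plots.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)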