broad plot.ipynb
13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"브로드웨이 링크 꺼내서 줄거리 크롤링하기.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import csv\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"csv에서 링크를 matrix에 저장"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"with open('/Users/김서영/Desktop/datacap/data/Musicaldata/mulink.csv','r') as f:\n",
" reader = csv.reader(f)\n",
" matrix=[]\n",
" for col in reader:\n",
" matrix.append(col)\n",
" \n",
" \n",
"print(matrix)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"링크 열어서 텍스트 파일로 저장\n",
"\n",
"아래것은 링크 크롤링만."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"반복문으로 링크 열기+저장"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"ename": "HTTPError",
"evalue": "HTTP Error 400: Bad Request",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mHTTPError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-26-99d2481817e5>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m330\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m335\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[0murl\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmatrix\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mreq\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(url, data, timeout, cafile, capath, cadefault, context)\u001b[0m\n\u001b[0;32m 220\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 221\u001b[0m \u001b[0mopener\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_opener\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 222\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mopener\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 223\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 224\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mopen\u001b[1;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[0;32m 529\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mprocessor\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprocess_response\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 530\u001b[0m \u001b[0mmeth\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprocessor\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 531\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 532\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 533\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mhttp_response\u001b[1;34m(self, request, response)\u001b[0m\n\u001b[0;32m 639\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;36m200\u001b[0m \u001b[1;33m<=\u001b[0m \u001b[0mcode\u001b[0m \u001b[1;33m<\u001b[0m \u001b[1;36m300\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 640\u001b[0m response = self.parent.error(\n\u001b[1;32m--> 641\u001b[1;33m 'http', request, response, code, msg, hdrs)\n\u001b[0m\u001b[0;32m 642\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 643\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36merror\u001b[1;34m(self, proto, *args)\u001b[0m\n\u001b[0;32m 567\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhttp_err\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 568\u001b[0m \u001b[0margs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'default'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'http_error_default'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0morig_args\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 569\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_call_chain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 570\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[1;31m# XXX probably also want an abstract factory that knows when it makes\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36m_call_chain\u001b[1;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[0;32m 501\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mhandler\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mhandlers\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 502\u001b[0m \u001b[0mfunc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 503\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 504\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 505\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mhttp_error_default\u001b[1;34m(self, req, fp, code, msg, hdrs)\u001b[0m\n\u001b[0;32m 647\u001b[0m \u001b[1;32mclass\u001b[0m \u001b[0mHTTPDefaultErrorHandler\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mBaseHandler\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 648\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mhttp_error_default\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhdrs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 649\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mHTTPError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfull_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhdrs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 650\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 651\u001b[0m \u001b[1;32mclass\u001b[0m \u001b[0mHTTPRedirectHandler\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mBaseHandler\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mHTTPError\u001b[0m: HTTP Error 400: Bad Request"
]
}
],
"source": [
"import urllib.request\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"import re\n",
"import csv\n",
"\n",
"matrix=[]\n",
"\n",
"with open('/Users/김서영/Desktop/datacap/data/musicalData/mulink.csv','r') as f:\n",
" reader = csv.reader(f)\n",
" for col in reader:\n",
" matrix.append(col)\n",
"\n",
"plotzip= []\n",
"for a in range(330,335):\n",
" url = matrix[0][a]\n",
" req = urllib.request.urlopen(url)\n",
" res = req.read()\n",
"\n",
" soup = BeautifulSoup(res,'html.parser')\n",
" paramcode = soup.find('div',{'class':'col-sm-8'})\n",
" muname = soup.find('h1')\n",
" name = muname.get_text()\n",
" plot = paramcode.get_text()\n",
" #print(name)\n",
" #print(plot)\n",
" parse = plot.replace(',','')\n",
" para = parse.replace(\"\\n\", \"\")\n",
" #plotzip.append(name+','+para)\n",
" plotzip.append(para+\",\")\n",
" \n",
" \n",
"print(plotzip)\n",
" \n",
"\n",
"\n",
"#parse = re.sub(',\\n', '', plot)\n",
"\n",
"\n",
"#파일로 저장\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"f=open(\"/Users/김서영/Desktop/datacap/data/BMplot.csv\",'w',-1, \"utf-8\")\n",
"f.writelines(plotzip)\n",
"#f.write(name+',')\n",
"#f.write(para)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(plotzip[2])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import urllib.request\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"import re\n",
"\n",
"\n",
"url = \"https://broadwaymusicalhome.com/shows/9to5themusical.htm\"\n",
"req = urllib.request.urlopen(url)\n",
"res = req.read()\n",
"\n",
"soup = BeautifulSoup(res,'html.parser')\n",
"paramcode = soup.find('div',{'class':'col-sm-8'})\n",
"muname = soup.find('h1')\n",
"name = muname.get_text()\n",
"plot = paramcode.get_text()\n",
"print(name)\n",
"print(plot)\n",
"\n",
"\n",
"#parse = re.sub(',\\n', '', plot)\n",
"parse = plot.replace(',','')\n",
"para = parse.replace(\"\\n\", \"\")\n",
"\n",
"plotzip= []\n",
"\n",
"plotzip = name+','+para\n",
"\n",
"print(plotzip)\n",
"#파일로 저장\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}