mlink-checkpoint.ipynb 5.06 KB

Raw Blame History Permalink

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request\n",
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "\n",
    "url = \"http://www.playdb.co.kr/playdb/playdblist.asp?Page=40&sReqMainCategory=000001&sReqSubCategory=&sReqDistrict=&sReqTab=2&sPlayType=1&sStartYear=2019&sSelectType=1\"\n",
    "# Download the raw bytes of listing page 40. The context manager closes the\n",
    "# HTTP response as soon as the body is read, and the timeout keeps a stalled\n",
    "# server from hanging the kernel indefinitely.\n",
    "with urllib.request.urlopen(url, timeout=30) as req:\n",
    "    res = req.read()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Parse the downloaded page and print every link that points at a detail page.\n",
    "soup = BeautifulSoup(res,'html.parser')\n",
    "# href=True skips anchors that have no href attribute. For those anchors\n",
    "# link.get('href') returns None, and the substring test on None raises TypeError.\n",
    "for link in soup.find_all('a', href=True):\n",
    "    href = link['href']\n",
    "    if 'PlaydbDetail' in href:\n",
    "        paramcode = href\n",
    "        print(paramcode)\n",
    "\n",
    "##keywords = soup.find_all('span',class_='ah_k')\n",
    "# get_text() == extracts only the text from the data\n",
    "# strip() == removes whitespace on both sides of the data\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "웹 크롤링 후 CSV 파일로 저장.\n",
    "\n",
    "뒷페이지로 갈수록 링크가 사라지는 문제 해결 요청."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import time\n",
    "# NOTE(review): a single 5-second pause before crawling starts; its purpose is\n",
    "# not evident from this cell -- confirm whether a per-request delay was intended.\n",
    "time.sleep(5)\n",
    "\n",
    "# Collect the href of every detail-page link found on the listing pages.\n",
    "toturl = []\n",
    "# The listing has 171 pages.\n",
    "for pageNo in range(1,172):\n",
    "    url = \"http://www.playdb.co.kr/playdb/playdblist.asp?Page=\" + str(pageNo) +\"&sReqMainCategory=000001&sReqSubCategory=&sReqDistrict=&sReqTab=2&sPlayType=1&sStartYear=2019&sSelectType=1\"\n",
    "    # The timeout keeps one stalled request from hanging the whole crawl.\n",
    "    response = requests.get(url, timeout=30)\n",
    "    # Parse each page on its own rather than concatenating all pages into one\n",
    "    # string: markup problems on one page then cannot affect how later pages\n",
    "    # are parsed, and the repeated string concatenation is avoided.\n",
    "    soup = BeautifulSoup(response.text, 'html.parser')\n",
    "    # href=True skips anchors without an href attribute; for those,\n",
    "    # link.get('href') returns None and the substring test raises TypeError.\n",
    "    for link in soup.find_all('a', href=True):\n",
    "        href = link['href']\n",
    "        if 'PlaydbDetail' in href:\n",
    "            print(href)\n",
    "            toturl.append(href)\n",
    "\n",
    "# Open the output only after every page has been fetched, and let the context\n",
    "# manager close it even if a write fails. Links are written on one line, each\n",
    "# followed by a comma.\n",
    "with open(\"/Users/김서영/Desktop/temp/data/playdblink.csv\", \"w\") as f:\n",
    "    for paramcode in toturl:\n",
    "        f.write(paramcode+\",\")\n",
    "# `f` stays bound to the (now closed) file, so a later f.close() is a no-op.\n",
    "\n",
    "# TODO: remove duplicate URLs.\n",
    "# TODO: enter the header part of the data (nothing is written for it here).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Release the CSV file handle `f` if it is still open.\n",
    "if not f.closed:\n",
    "    f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "이 아래로는 연습해 둔 것들. 실행 X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the href of every detail-page link found on the listing pages.\n",
    "toturl = []\n",
    "# The listing has 171 pages.\n",
    "for pageNo in range(1,172):\n",
    "    url = \"http://www.playdb.co.kr/playdb/playdblist.asp?Page=\" + str(pageNo) +\"&sReqMainCategory=000001&sReqSubCategory=&sReqDistrict=&sReqTab=2&sPlayType=1&sStartYear=2019&sSelectType=1\"\n",
    "    # The timeout keeps one stalled request from hanging the whole crawl.\n",
    "    response = requests.get(url, timeout=30)\n",
    "    # Parse each page on its own rather than concatenating all pages into one\n",
    "    # string: markup problems on one page then cannot affect how later pages\n",
    "    # are parsed, and the repeated string concatenation is avoided.\n",
    "    soup = BeautifulSoup(response.text, 'html.parser')\n",
    "    # href=True skips anchors without an href attribute; for those,\n",
    "    # link.get('href') returns None and the substring test raises TypeError.\n",
    "    for link in soup.find_all('a', href=True):\n",
    "        href = link['href']\n",
    "        if 'PlaydbDetail' in href:\n",
    "            toturl.append(href)\n",
    "print(toturl)\n",
    "\n",
    "# TODO: remove duplicate URLs.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Dump the collected links in `toturl` to a CSV file, one list element per\n",
    "# row, with neither a header row nor an index column.\n",
    "csv_path = \"/Users/김서영/Desktop/temp/mUrl.csv\"\n",
    "pd.DataFrame(toturl).to_csv(csv_path, header=False, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}