Korean pre processing.ipynb 11.8 KB
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "stock-prediction.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "9EBLJGRkA7au",
        "colab_type": "code",
        "outputId": "6aa769f2-a86f-463c-893c-7a8b0b3aca08",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 122
        }
      },
      "source": [
        "# Colab helpers: user authentication and Google Drive mounting.\n",
        "from google.colab import auth, drive\n",
        "\n",
        "# Authenticate first, then expose Drive under /content/gdrive.\n",
        "auth.authenticate_user()\n",
        "drive.mount('/content/gdrive')"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
            "\n",
            "Enter your authorization code:\n",
            "··········\n",
            "Mounted at /content/gdrive\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xz6TIi8x-3wI",
        "colab_type": "code",
        "outputId": "47fb88a4-fa0e-4327-b5d0-9ab7f42041f2",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# Ask the runtime for TensorFlow 1.x (the recorded output confirms 1.x was selected).\n",
        "%tensorflow_version 1.x"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "TensorFlow 1.x selected.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ykRUwvkvIoKH",
        "colab_type": "code",
        "outputId": "6d116660-4a7f-4c85-a733-6956210958c9",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 68
        }
      },
      "source": [
        "# List the project folder under the Drive mount to check that the input files (e.g. title.csv) are there.\n",
        "!cd gdrive/'My Drive'/'capstone 2' && ls"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "main.go\t\t       preprocessed_data.gsheet  title.csv     Word2vec.model\n",
            "metadata.tsv\t       title2020.csv\t\t title.gsheet\n",
            "preprocessed_data.csv  title2.csv\t\t word2vec\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "C-V3pgCEX5xR",
        "colab_type": "text"
      },
      "source": [
        "### 데이터 전처리 \n",
        "[\\~ 포토 \\~]가 들어간 기사 제목은 데이터 리스트에서 삭제하고, [\\~]가 들어간 기사 제목은 [\\~] 부분만 삭제한다.\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oZ2Q2_uWViO3",
        "colab_type": "code",
        "outputId": "0fc44251-8920-4e1a-9472-5603d0ce1264",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 785
        }
      },
      "source": [
        "# hanja supplies hanja.translate() and PyKoSpacing supplies spacing(); both are used in the preprocessing cell.\n",
        "# NOTE(review): PyKoSpacing is installed from the repository head (no tag or commit pinned); in the recorded\n",
        "# run this install replaced the preinstalled TensorFlow with 1.6.0.\n",
        "!pip3 install hanja==0.13.0\n",
        "!pip3 install git+https://github.com/haven-jeon/PyKoSpacing.git"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: hanja==0.13.0 in /usr/local/lib/python3.6/dist-packages (0.13.0)\n",
            "Collecting git+https://github.com/haven-jeon/PyKoSpacing.git\n",
            "  Cloning https://github.com/haven-jeon/PyKoSpacing.git to /tmp/pip-req-build-n_sau7zy\n",
            "  Running command git clone -q https://github.com/haven-jeon/PyKoSpacing.git /tmp/pip-req-build-n_sau7zy\n",
            "Requirement already satisfied (use --upgrade to upgrade): pykospacing==0.1 from git+https://github.com/haven-jeon/PyKoSpacing.git in /usr/local/lib/python3.6/dist-packages\n",
            "Collecting tensorflow<=1.6.0,>=1.4.0\n",
            "  Using cached https://files.pythonhosted.org/packages/d9/0f/fbd8bb92459c75db93040f80702ebe4ba83a52cdb6ad930654c31dc0b711/tensorflow-1.6.0-cp36-cp36m-manylinux1_x86_64.whl\n",
            "Requirement already satisfied: keras>=2.1.5 in /usr/local/lib/python3.6/dist-packages (from pykospacing==0.1) (2.2.5)\n",
            "Requirement already satisfied: h5py>=2.7.1 in /usr/local/lib/python3.6/dist-packages (from pykospacing==0.1) (2.10.0)\n",
            "Requirement already satisfied: argparse>=1.4.0 in /usr/local/lib/python3.6/dist-packages (from pykospacing==0.1) (1.4.0)\n",
            "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (1.12.0)\n",
            "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (0.34.2)\n",
            "Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (0.8.1)\n",
            "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (1.18.2)\n",
            "Collecting tensorboard<1.7.0,>=1.6.0\n",
            "  Using cached https://files.pythonhosted.org/packages/b0/67/a8c91665987d359211dcdca5c8b2a7c1e0876eb0702a4383c1e4ff76228d/tensorboard-1.6.0-py3-none-any.whl\n",
            "Requirement already satisfied: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (3.10.0)\n",
            "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (1.1.0)\n",
            "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (1.27.2)\n",
            "Requirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (0.3.3)\n",
            "Requirement already satisfied: absl-py>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (0.9.0)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras>=2.1.5->pykospacing==0.1) (3.13)\n",
            "Requirement already satisfied: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from keras>=2.1.5->pykospacing==0.1) (1.4.1)\n",
            "Requirement already satisfied: keras-applications>=1.0.8 in /usr/local/lib/python3.6/dist-packages (from keras>=2.1.5->pykospacing==0.1) (1.0.8)\n",
            "Requirement already satisfied: keras-preprocessing>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from keras>=2.1.5->pykospacing==0.1) (1.1.0)\n",
            "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (3.2.1)\n",
            "Requirement already satisfied: html5lib==0.9999999 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (0.9999999)\n",
            "Requirement already satisfied: bleach==1.5.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (1.5.0)\n",
            "Requirement already satisfied: werkzeug>=0.11.10 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.7.0,>=1.6.0->tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (1.0.0)\n",
            "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.4.0->tensorflow<=1.6.0,>=1.4.0->pykospacing==0.1) (46.0.0)\n",
            "Building wheels for collected packages: pykospacing\n",
            "  Building wheel for pykospacing (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for pykospacing: filename=pykospacing-0.1-cp36-none-any.whl size=2255598 sha256=249ac07d0d8b26e4b9d4a1821995b953f9f90c6206cb378f183ff2be5001b607\n",
            "  Stored in directory: /tmp/pip-ephem-wheel-cache-th23h_qr/wheels/4d/45/58/e26cb2b7f6a063d234158c6fd1e5700f6e15b99d67154340ba\n",
            "Successfully built pykospacing\n",
            "\u001b[31mERROR: magenta 0.3.19 has requirement tensorflow>=1.12.0, but you'll have tensorflow 1.6.0 which is incompatible.\u001b[0m\n",
            "Installing collected packages: tensorboard, tensorflow\n",
            "  Found existing installation: tensorboard 2.2.0\n",
            "    Uninstalling tensorboard-2.2.0:\n",
            "      Successfully uninstalled tensorboard-2.2.0\n",
            "  Found existing installation: tensorflow 2.2.0rc2\n",
            "    Uninstalling tensorflow-2.2.0rc2:\n",
            "      Successfully uninstalled tensorflow-2.2.0rc2\n",
            "Successfully installed tensorboard-1.6.0 tensorflow-1.6.0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ddf2Fp7-JAYI",
        "colab_type": "code",
        "outputId": "0d3029dd-6db6-41eb-8142-24d42bd516b6",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "import hanja\n",
        "import pandas as pd\n",
        "import re\n",
        "from pykospacing import spacing\n",
        "\n",
        "news_list = pd.read_csv('gdrive/My Drive/capstone 2/title.csv', encoding='utf-8')\n",
        "print(len(news_list))\n",
        "\n",
        "# Raw strings keep the regex backslashes away from Python's own escape handling.\n",
        "# Bracket tags are matched one tag at a time (no greedy .* across the title), so\n",
        "# '[A] text [B]' keeps 'text', and 포토 only counts when it sits inside a tag.\n",
        "photo_regexp = r'\\[[^\\]]*포토[^\\]]*\\]'\n",
        "brac_regexp = r'\\[[^\\]]*\\]'\n",
        "spechar_with_regexp = r'[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9\\s\\+\\.\\-]'\n",
        "spechar_without_regexp = r'[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9\\s\\.]'\n",
        "percentage = r'(\\+\\d*\\.\\d*)|(\\-\\d*\\.\\d*)'\n",
        "is_exist_regexp = r'[가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9\\s\\.]'\n",
        "\n",
        "# NOTE(review): the two symbol literals were empty strings in the saved notebook,\n",
        "# which makes str.replace insert the word between every character of every title.\n",
        "# They are assumed to have been the up/down arrows used in headlines - confirm\n",
        "# against the raw data before relying on this mapping.\n",
        "trend_words = {'↑': '상승', '↓': '하락'}\n",
        "\n",
        "\n",
        "def clean_title(title):\n",
        "    \"\"\"Normalise one headline; return None when the row should be dropped.\n",
        "\n",
        "    A row is dropped when the title is not a string, carries a bracketed 포토\n",
        "    tag, or has nothing left once tags and special characters are removed.\n",
        "    Otherwise the cleaned title is passed through spacing() and returned.\n",
        "    \"\"\"\n",
        "    if not isinstance(title, str):\n",
        "        return None  # e.g. NaN produced by an empty CSV cell\n",
        "    if re.search(photo_regexp, title):\n",
        "        return None\n",
        "\n",
        "    for symbol, word in trend_words.items():\n",
        "        title = title.replace(symbol, word)\n",
        "\n",
        "    title = hanja.translate(title, 'substitution')\n",
        "    title = re.sub(brac_regexp, '', title)\n",
        "    title = re.sub(r'\\.{3}|\\.{2}', '', title)\n",
        "\n",
        "    # Keep + and - only when the title carries a signed decimal such as +1.5 or -0.3.\n",
        "    if re.search(percentage, title):\n",
        "        title = re.sub(spechar_with_regexp, '', title).lstrip()\n",
        "    else:\n",
        "        title = re.sub(spechar_without_regexp, '', title).lstrip()\n",
        "\n",
        "    if len(title) == 0 or not re.search(is_exist_regexp, title):\n",
        "        return None\n",
        "    return spacing(title)\n",
        "\n",
        "\n",
        "# Build the cleaned column in one pass. The previous loop dropped rows one by one\n",
        "# while iterating, still wrote to row i right after dropping it, and assigned\n",
        "# through chained indexing (news_list['title'][i] = ...).\n",
        "cleaned_titles = news_list['title'].map(clean_title)\n",
        "news_list = news_list.assign(title=cleaned_titles).dropna(subset=['title'])\n",
        "\n",
        "df = pd.DataFrame(news_list, columns=['title','date','publication'])\n",
        "df.to_csv('gdrive/My Drive/capstone 2/without_percentage_preprocessed_data.csv',sep=',',encoding='UTF-8',index=False)\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "100960\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}