bert_news_label body.ipynb 43.6 KB

Raw Blame History Permalink

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "bert news label.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "58B51bnMtDVX",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 122
        },
        "outputId": "6e85676a-2b15-4885-b467-3358de1e7189"
      },
      "source": [
        "# Authenticate this Colab session, then mount Google Drive at /content/gdrive\n",
        "# so the project files can be read from the notebook.\n",
        "from google.colab import auth, drive\n",
        "\n",
        "auth.authenticate_user()\n",
        "drive.mount('/content/gdrive')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly\n",
            "\n",
            "Enter your authorization code:\n",
            "··········\n",
            "Mounted at /content/gdrive\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2GWn_WDkvp3g",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import pandas as pd\n",
        "\n",
        "# Project root on the mounted Drive (relative to the notebook's working directory).\n",
        "# It ends with a separator, so file names can be appended directly.\n",
        "path = \"gdrive/My Drive/capstone 2/\"\n",
        "\n",
        "# News headlines/bodies joined with price columns; previewed in the next cell.\n",
        "combined_data = pd.read_csv(path + 'event_embedding/Thesis_data/combined_data3.csv', encoding='utf-8')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ovci8fVpZUmN",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 419
        },
        "outputId": "55dddc67-b92a-4cc6-a9b8-152e594441ce"
      },
      "source": [
        "# Preview the loaded frame; pandas shows only the first and last rows of a long frame.\n",
        "combined_data"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>time</th>\n",
              "      <th>headline</th>\n",
              "      <th>body</th>\n",
              "      <th>Price</th>\n",
              "      <th>Open</th>\n",
              "      <th>High</th>\n",
              "      <th>Low</th>\n",
              "      <th>Vol</th>\n",
              "      <th>Change</th>\n",
              "      <th>index</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>20050107</td>\n",
              "      <td>Stocks End Lower</td>\n",
              "      <td>Monday. Among some of the other highlights, c...</td>\n",
              "      <td>4.93</td>\n",
              "      <td>4.99</td>\n",
              "      <td>5.05</td>\n",
              "      <td>4.85</td>\n",
              "      <td>434.26M</td>\n",
              "      <td>-0.40%</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>20050107</td>\n",
              "      <td>Vital Signs for the Week of Jan. 10</td>\n",
              "      <td>Palo Alto, Calif.                EARNINGS REP...</td>\n",
              "      <td>4.93</td>\n",
              "      <td>4.99</td>\n",
              "      <td>5.05</td>\n",
              "      <td>4.85</td>\n",
              "      <td>434.26M</td>\n",
              "      <td>-0.40%</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>20050110</td>\n",
              "      <td>Tightwad IT Buyers Loosen Up</td>\n",
              "      <td>plain-vanilla desktops, according to NPD Grou...</td>\n",
              "      <td>4.61</td>\n",
              "      <td>4.88</td>\n",
              "      <td>4.94</td>\n",
              "      <td>4.58</td>\n",
              "      <td>654.04M</td>\n",
              "      <td>-6.49%</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>20050110</td>\n",
              "      <td>Stocks Finish Slightly Higher</td>\n",
              "      <td>regular session.       Looking ahead this wee...</td>\n",
              "      <td>4.61</td>\n",
              "      <td>4.88</td>\n",
              "      <td>4.94</td>\n",
              "      <td>4.58</td>\n",
              "      <td>654.04M</td>\n",
              "      <td>-6.49%</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>20050110</td>\n",
              "      <td>Commentary: The New Driver In Chipland</td>\n",
              "      <td>easy to see the consumer influence. Digital c...</td>\n",
              "      <td>4.61</td>\n",
              "      <td>4.88</td>\n",
              "      <td>4.94</td>\n",
              "      <td>4.58</td>\n",
              "      <td>654.04M</td>\n",
              "      <td>-6.49%</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>24046</th>\n",
              "      <td>20150108</td>\n",
              "      <td>Israel's Water Ninja</td>\n",
              "      <td>influenced by his grandfather, who built Tel ...</td>\n",
              "      <td>112.01</td>\n",
              "      <td>112.67</td>\n",
              "      <td>113.25</td>\n",
              "      <td>110.21</td>\n",
              "      <td>53.70M</td>\n",
              "      <td>0.11%</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>24047</th>\n",
              "      <td>20150108</td>\n",
              "      <td>What Drivers Want: Design Lessons From Ford's ...</td>\n",
              "      <td>faster, simpler, and easier to use. Will the ...</td>\n",
              "      <td>112.01</td>\n",
              "      <td>112.67</td>\n",
              "      <td>113.25</td>\n",
              "      <td>110.21</td>\n",
              "      <td>53.70M</td>\n",
              "      <td>0.11%</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>24048</th>\n",
              "      <td>20150108</td>\n",
              "      <td>AT&amp;T May Face FCC Fine Over Mobile Data Slowdo...</td>\n",
              "      <td>halting the practice and millions of dollars ...</td>\n",
              "      <td>112.01</td>\n",
              "      <td>112.67</td>\n",
              "      <td>113.25</td>\n",
              "      <td>110.21</td>\n",
              "      <td>53.70M</td>\n",
              "      <td>0.11%</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>24049</th>\n",
              "      <td>20150108</td>\n",
              "      <td>Is Samsung Feeling the Squeeze From Apple?</td>\n",
              "      <td>UBS Managing Director Steve Milunovich \\ndisc...</td>\n",
              "      <td>112.01</td>\n",
              "      <td>112.67</td>\n",
              "      <td>113.25</td>\n",
              "      <td>110.21</td>\n",
              "      <td>53.70M</td>\n",
              "      <td>0.11%</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>24050</th>\n",
              "      <td>20150108</td>\n",
              "      <td>Company News: Auto Industry, U.S. Steel, Veriz...</td>\n",
              "      <td>billion sale to Apple last year. The complain...</td>\n",
              "      <td>112.01</td>\n",
              "      <td>112.67</td>\n",
              "      <td>113.25</td>\n",
              "      <td>110.21</td>\n",
              "      <td>53.70M</td>\n",
              "      <td>0.11%</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>24051 rows × 10 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "           time  ... index\n",
              "0      20050107  ...     0\n",
              "1      20050107  ...     0\n",
              "2      20050110  ...     0\n",
              "3      20050110  ...     0\n",
              "4      20050110  ...     0\n",
              "...         ...  ...   ...\n",
              "24046  20150108  ...     1\n",
              "24047  20150108  ...     1\n",
              "24048  20150108  ...     1\n",
              "24049  20150108  ...     1\n",
              "24050  20150108  ...     1\n",
              "\n",
              "[24051 rows x 10 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XBgA_6YRv3KB",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "outputId": "73a28fca-497e-4b21-a3f1-1e4de356f3e6"
      },
      "source": [
        "%tensorflow_version 1.x\n",
        "\n",
        "import codecs\n",
        "import os\n",
        "import pickle\n",
        "import re\n",
        "import shutil\n",
        "import warnings\n",
        "\n",
        "# Must be set before TensorFlow is imported; setting it afterwards has no\n",
        "# effect on the native (C++) log level.\n",
        "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import tensorflow as tf\n",
        "from tqdm import tqdm\n",
        "\n",
        "import keras as keras\n",
        "from keras import Input, Model\n",
        "from keras import backend as K\n",
        "from keras import optimizers\n",
        "from keras.models import load_model\n",
        "\n",
        "warnings.filterwarnings(action='ignore')\n",
        "tf.logging.set_verbosity(tf.logging.ERROR)\n",
        "\n",
        "# Pinned to the versions recorded in this cell's install log so that a\n",
        "# fresh runtime resolves the same packages.\n",
        "!pip install keras-bert==0.81.0\n",
        "!pip install keras-radam==0.15.0"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "TensorFlow 1.x selected.\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "Using TensorFlow backend.\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "Collecting keras-bert\n",
            "  Downloading https://files.pythonhosted.org/packages/2c/0f/cdc886c1018943ea62d3209bc964413d5aa9d0eb7e493abd8545be679294/keras-bert-0.81.0.tar.gz\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from keras-bert) (1.18.4)\n",
            "Requirement already satisfied: Keras in /usr/local/lib/python3.6/dist-packages (from keras-bert) (2.3.1)\n",
            "Collecting keras-transformer>=0.30.0\n",
            "  Downloading https://files.pythonhosted.org/packages/22/b9/9040ec948ef895e71df6bee505a1f7e1c99ffedb409cb6eb329f04ece6e0/keras-transformer-0.33.0.tar.gz\n",
            "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from Keras->keras-bert) (2.10.0)\n",
            "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from Keras->keras-bert) (1.1.2)\n",
            "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from Keras->keras-bert) (1.12.0)\n",
            "Requirement already satisfied: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from Keras->keras-bert) (1.4.1)\n",
            "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from Keras->keras-bert) (1.0.8)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from Keras->keras-bert) (3.13)\n",
            "Collecting keras-pos-embd>=0.10.0\n",
            "  Downloading https://files.pythonhosted.org/packages/09/70/b63ed8fc660da2bb6ae29b9895401c628da5740c048c190b5d7107cadd02/keras-pos-embd-0.11.0.tar.gz\n",
            "Collecting keras-multi-head>=0.22.0\n",
            "  Downloading https://files.pythonhosted.org/packages/a5/f0/a9a7528b8fefacaa9c5db736036fd8c061d754830a29c34129f6847bd338/keras-multi-head-0.24.0.tar.gz\n",
            "Collecting keras-layer-normalization>=0.12.0\n",
            "  Downloading https://files.pythonhosted.org/packages/a4/0e/d1078df0494bac9ce1a67954e5380b6e7569668f0f3b50a9531c62c1fc4a/keras-layer-normalization-0.14.0.tar.gz\n",
            "Collecting keras-position-wise-feed-forward>=0.5.0\n",
            "  Downloading https://files.pythonhosted.org/packages/e3/59/f0faa1037c033059e7e9e7758e6c23b4d1c0772cd48de14c4b6fd4033ad5/keras-position-wise-feed-forward-0.6.0.tar.gz\n",
            "Collecting keras-embed-sim>=0.7.0\n",
            "  Downloading https://files.pythonhosted.org/packages/bc/20/735fd53f6896e2af63af47e212601c1b8a7a80d00b6126c388c9d1233892/keras-embed-sim-0.7.0.tar.gz\n",
            "Collecting keras-self-attention==0.41.0\n",
            "  Downloading https://files.pythonhosted.org/packages/1b/1c/01599219bef7266fa43b3316e4f55bcb487734d3bafdc60ffd564f3cfe29/keras-self-attention-0.41.0.tar.gz\n",
            "Building wheels for collected packages: keras-bert, keras-transformer, keras-pos-embd, keras-multi-head, keras-layer-normalization, keras-position-wise-feed-forward, keras-embed-sim, keras-self-attention\n",
            "  Building wheel for keras-bert (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-bert: filename=keras_bert-0.81.0-cp36-none-any.whl size=37913 sha256=f6e87897fa56346f3a9bd0607c976c0fb72e1d4f5d5798159416838347b34b2f\n",
            "  Stored in directory: /root/.cache/pip/wheels/bd/27/da/ffc2d573aa48b87440ec4f98bc7c992e3a2d899edb2d22ef9e\n",
            "  Building wheel for keras-transformer (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-transformer: filename=keras_transformer-0.33.0-cp36-none-any.whl size=13260 sha256=4cf6dcab922b6caf627c1ba6adc5dbe6e8e2e4d7f59247b710d043b3bc5f8da2\n",
            "  Stored in directory: /root/.cache/pip/wheels/26/98/13/a28402939e1d48edd8704e6b02f223795af4a706815f4bf6d8\n",
            "  Building wheel for keras-pos-embd (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-pos-embd: filename=keras_pos_embd-0.11.0-cp36-none-any.whl size=7554 sha256=8d7fac58ed8196ae123121c05fc80e7cdbcd03425613de81b7512c0a270a4ba2\n",
            "  Stored in directory: /root/.cache/pip/wheels/5b/a1/a0/ce6b1d49ba1a9a76f592e70cf297b05c96bc9f418146761032\n",
            "  Building wheel for keras-multi-head (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-multi-head: filename=keras_multi_head-0.24.0-cp36-none-any.whl size=15511 sha256=965f1fd64d0293581290a3590617435dce809574fa0029af5b70f2a827244133\n",
            "  Stored in directory: /root/.cache/pip/wheels/b6/84/01/dbcb50629030c8647a19dd0b7134574fad56c531bdb243bd20\n",
            "  Building wheel for keras-layer-normalization (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-layer-normalization: filename=keras_layer_normalization-0.14.0-cp36-none-any.whl size=5268 sha256=c9f4b2d27ebb8746e641efeaa10ccd6d26ccecf07851d6faebe0ffb4863deaa1\n",
            "  Stored in directory: /root/.cache/pip/wheels/54/80/22/a638a7d406fd155e507aa33d703e3fa2612b9eb7bb4f4fe667\n",
            "  Building wheel for keras-position-wise-feed-forward (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-position-wise-feed-forward: filename=keras_position_wise_feed_forward-0.6.0-cp36-none-any.whl size=5623 sha256=d502009afa989aa58bd189344430c7c5518e9465a0a1c6e4ef21d77a162d9c97\n",
            "  Stored in directory: /root/.cache/pip/wheels/39/e2/e2/3514fef126a00574b13bc0b9e23891800158df3a3c19c96e3b\n",
            "  Building wheel for keras-embed-sim (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-embed-sim: filename=keras_embed_sim-0.7.0-cp36-none-any.whl size=4676 sha256=c7445fbf736a11babf19d02ddb3d76f098a00706c800f3080ebc9a55745ca146\n",
            "  Stored in directory: /root/.cache/pip/wheels/d1/bc/b1/b0c45cee4ca2e6c86586b0218ffafe7f0703c6d07fdf049866\n",
            "  Building wheel for keras-self-attention (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-self-attention: filename=keras_self_attention-0.41.0-cp36-none-any.whl size=17288 sha256=bdeda9b286ae3be34885c5183effca526d866cba7dd00c740f02eb340e1fab42\n",
            "  Stored in directory: /root/.cache/pip/wheels/cc/dc/17/84258b27a04cd38ac91998abe148203720ca696186635db694\n",
            "Successfully built keras-bert keras-transformer keras-pos-embd keras-multi-head keras-layer-normalization keras-position-wise-feed-forward keras-embed-sim keras-self-attention\n",
            "Installing collected packages: keras-pos-embd, keras-self-attention, keras-multi-head, keras-layer-normalization, keras-position-wise-feed-forward, keras-embed-sim, keras-transformer, keras-bert\n",
            "Successfully installed keras-bert-0.81.0 keras-embed-sim-0.7.0 keras-layer-normalization-0.14.0 keras-multi-head-0.24.0 keras-pos-embd-0.11.0 keras-position-wise-feed-forward-0.6.0 keras-self-attention-0.41.0 keras-transformer-0.33.0\n",
            "Collecting keras-radam\n",
            "  Downloading https://files.pythonhosted.org/packages/46/8d/b83ccaa94253fbc920b21981f038393041d92236bb541751b98a66a2ac1d/keras-radam-0.15.0.tar.gz\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from keras-radam) (1.18.4)\n",
            "Requirement already satisfied: Keras in /usr/local/lib/python3.6/dist-packages (from keras-radam) (2.3.1)\n",
            "Requirement already satisfied: h5py in /usr/local/lib/python3.6/dist-packages (from Keras->keras-radam) (2.10.0)\n",
            "Requirement already satisfied: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from Keras->keras-radam) (1.4.1)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from Keras->keras-radam) (3.13)\n",
            "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from Keras->keras-radam) (1.12.0)\n",
            "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from Keras->keras-radam) (1.0.8)\n",
            "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from Keras->keras-radam) (1.1.2)\n",
            "Building wheels for collected packages: keras-radam\n",
            "  Building wheel for keras-radam (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for keras-radam: filename=keras_radam-0.15.0-cp36-none-any.whl size=14685 sha256=60abbb595b856dbbf59934ad85b8754fc6d57e41d84bce2fee5b922a3717fc8a\n",
            "  Stored in directory: /root/.cache/pip/wheels/79/a0/c0/670b0a118e8f078539fafec7bd02eba0af921f745660c7f83f\n",
            "Successfully built keras-radam\n",
            "Installing collected packages: keras-radam\n",
            "Successfully installed keras-radam-0.15.0\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "V7_zjhL5wGeB",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from keras_bert import load_trained_model_from_checkpoint, load_vocabulary\n",
        "from keras_bert import Tokenizer\n",
        "from keras_bert import AdamWarmup, calc_train_steps\n",
        "\n",
        "from keras_radam import RAdam"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RE5pjPZjwG3q",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 102
        },
        "outputId": "2b293bd2-7d77-4a03-a8fe-af5896058933"
      },
      "source": [
        "# Check that the pretrained BERT files are present. `path` already ends with\n",
        "# a separator, so join the components instead of concatenating '/bert'.\n",
        "os.listdir(os.path.join(path, 'bert'))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['bert_config.json',\n",
              " 'vocab.txt',\n",
              " 'bert_model.ckpt.index',\n",
              " 'bert_model.ckpt.data-00000-of-00001',\n",
              " 'bert_model.ckpt.meta']"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yWqOLyGWwIMf",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Fine-tuning settings. SEQ_LEN is the token length used by both the\n",
        "# tokenizer (convert_data) and the BERT loader.\n",
        "SEQ_LEN = 256\n",
        "BATCH_SIZE = 8\n",
        "EPOCHS=2\n",
        "LR=1e-5\n",
        "\n",
        "# Pretrained BERT files. `path` already ends with a separator, so the\n",
        "# components are joined rather than concatenated with '/bert'.\n",
        "pretrained_path = os.path.join(path, 'bert')\n",
        "config_path = os.path.join(pretrained_path, 'bert_config.json')\n",
        "checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')\n",
        "vocab_path = os.path.join(pretrained_path, 'vocab.txt')\n",
        "\n",
        "# DataFrame columns holding the input text and the 0/1 label.\n",
        "DATA_COLUMN = \"body\"\n",
        "LABEL_COLUMN = \"index\""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "G4E3vhF5wKmg",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Build the token -> id mapping handed to keras_bert's Tokenizer.\n",
        "# Each token is assigned the dictionary size at the moment it is stored.\n",
        "token_dict = {}\n",
        "with codecs.open(vocab_path, mode='r', encoding='utf8') as reader:\n",
        "    for line in reader:\n",
        "        token = line.strip()\n",
        "        # Entries containing '_' have it removed and gain the '##' prefix.\n",
        "        token = '##' + token.replace('_', '') if '_' in token else token\n",
        "        token_dict[token] = len(token_dict)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "c5a7hPzfwRcr",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Tokenizer built from the vocabulary mapping; convert_data() uses it to encode text.\n",
        "tokenizer = Tokenizer(token_dict=token_dict)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vehabKa5wTKG",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def convert_data(data_df):\n",
        "    \"\"\"Encode the text column with the BERT tokenizer and collect the labels.\n",
        "\n",
        "    Args:\n",
        "        data_df: DataFrame containing the DATA_COLUMN (text) and\n",
        "            LABEL_COLUMN (target) columns.\n",
        "\n",
        "    Returns:\n",
        "        A pair ([token_ids, segment_ids], targets). token_ids holds one\n",
        "        encoded sequence per input row (encoded with max_len=SEQ_LEN),\n",
        "        segment_ids is an all-zero array of the same shape, and targets is\n",
        "        an array of the label values in row order.\n",
        "    \"\"\"\n",
        "    # Walk the text column directly: materialising a whole row with\n",
        "    # .iloc[i] (twice per record) is far slower and adds nothing.\n",
        "    texts = data_df[DATA_COLUMN]\n",
        "    # encode() returns (ids, segments); only the ids are kept because the\n",
        "    # segment input is rebuilt below as zeros.\n",
        "    indices = np.array([tokenizer.encode(text, max_len=SEQ_LEN)[0]\n",
        "                        for text in tqdm(texts, total=len(data_df))])\n",
        "    targets = np.array(data_df[LABEL_COLUMN].values)\n",
        "    return [indices, np.zeros_like(indices)], targets\n",
        "\n",
        "def load_data(pandas_dataframe):\n",
        "    \"\"\"Convert a DataFrame into (model inputs, labels) via convert_data().\"\"\"\n",
        "    inputs, labels = convert_data(pandas_dataframe)\n",
        "    return inputs, labels"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "V8xrXJlywXG-",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        },
        "outputId": "1c560b33-635a-4eca-df3c-eae387590031"
      },
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Fixed seed so the 80/20 split, and every metric computed on it, is\n",
        "# reproducible across kernel restarts.\n",
        "train, val = train_test_split(combined_data, test_size=0.2, random_state=42)\n",
        "\n",
        "# The held-out 20% is passed to fit() as validation data (test_x / test_y).\n",
        "train_x, train_y = load_data(train)\n",
        "test_x, test_y = load_data(val)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "100%|██████████| 19240/19240 [00:14<00:00, 1307.17it/s]\n",
            "100%|██████████| 4811/4811 [00:03<00:00, 1265.52it/s]\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "VyyTba9swZgM",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "layer_num = 12\n",
        "\n",
        "# Restore the pretrained BERT checkpoint as a trainable Keras model whose\n",
        "# input length is fixed to SEQ_LEN.\n",
        "model = load_trained_model_from_checkpoint(\n",
        "    config_path, checkpoint_path,\n",
        "    training=True, trainable=True, seq_len=SEQ_LEN)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7jO_vzY6w_qa",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from keras.callbacks import ModelCheckpoint, EarlyStopping\n",
        "def recall(y_true, y_pred):\n",
        "    \"\"\"Batch-wise recall computed on column 0 of the targets and predictions.\n",
        "\n",
        "    Values are clipped to [0, 1] and rounded before counting, so the\n",
        "    result is the share of positive targets in the batch whose rounded\n",
        "    prediction is also positive. K.epsilon() guards the division when the\n",
        "    batch has no positive targets.\n",
        "    \"\"\"\n",
        "    truth = y_true[:, 0]\n",
        "    hits = K.sum(K.round(K.clip(truth * y_pred[:, 0], 0, 1)))\n",
        "    actual_positives = K.sum(K.round(K.clip(truth, 0, 1)))\n",
        "    return hits / (actual_positives + K.epsilon())\n",
        "\n",
        "\n",
        "def precision(y_true, y_pred):\n",
        "    \"\"\"Batch-wise precision computed on column 0 of the targets and predictions.\n",
        "\n",
        "    Values are clipped to [0, 1] and rounded before counting, so the\n",
        "    result is the share of rounded-positive predictions in the batch whose\n",
        "    target is also positive. K.epsilon() guards the division when nothing\n",
        "    is predicted positive.\n",
        "    \"\"\"\n",
        "    scores = y_pred[:, 0]\n",
        "    hits = K.sum(K.round(K.clip(y_true[:, 0] * scores, 0, 1)))\n",
        "    flagged = K.sum(K.round(K.clip(scores, 0, 1)))\n",
        "    return hits / (flagged + K.epsilon())\n",
        "\n",
        "\n",
        "def fbeta_score(y_true, y_pred, beta=1):\n",
        "    \"\"\"Batch-wise F-beta score built from precision() and recall().\n",
        "\n",
        "    Args:\n",
        "        y_true: target tensor.\n",
        "        y_pred: prediction tensor.\n",
        "        beta: weight of recall relative to precision; the default of 1\n",
        "            gives the F1 score.\n",
        "\n",
        "    Returns:\n",
        "        A scalar tensor. It is 0 when the batch has no true positives,\n",
        "        because the numerator is then 0 and K.epsilon() keeps the\n",
        "        denominator non-zero.\n",
        "    \"\"\"\n",
        "    # No explicit zero-positives branch: a Python `==` on a symbolic tensor\n",
        "    # is not evaluated against batch data, and returning the int 0 would not\n",
        "    # be a tensor. The epsilon-guarded formula already yields 0 in that case.\n",
        "    p = precision(y_true, y_pred)\n",
        "    r = recall(y_true, y_pred)\n",
        "    bb = beta ** 2\n",
        "    return (1 + bb) * (p * r) / (bb * p + r + K.epsilon())\n",
        "\n",
        "def get_bert_finetuning_model(model, learning_rate=None, weight_decay=0.0025):\n",
        "  \"\"\"Attach a single-unit sigmoid head to a pretrained BERT model and compile it.\n",
        "\n",
        "  Args:\n",
        "    model: Keras model returned by load_trained_model_from_checkpoint.\n",
        "    learning_rate: RAdam learning rate; when None the notebook-level LR is used.\n",
        "    weight_decay: RAdam weight decay.\n",
        "\n",
        "  Returns:\n",
        "    A Keras model compiled with binary cross-entropy and the accuracy,\n",
        "    recall, precision and fbeta_score metrics.\n",
        "  \"\"\"\n",
        "  if learning_rate is None:\n",
        "    learning_rate = LR\n",
        "\n",
        "  # Only the first two inputs are kept; convert_data() supplies exactly two arrays.\n",
        "  inputs = model.inputs[:2]\n",
        "  # NOTE(review): layers[-3] is presumably the pooled sentence representation\n",
        "  # of the training=True graph -- confirm with model.summary().\n",
        "  dense = model.layers[-3].output\n",
        "\n",
        "  outputs = keras.layers.Dense(\n",
        "      1, activation='sigmoid',\n",
        "      kernel_initializer=keras.initializers.TruncatedNormal(stddev=0.02),\n",
        "      name='real_output')(dense)\n",
        "\n",
        "  bert_model = keras.models.Model(inputs, outputs)\n",
        "  bert_model.compile(\n",
        "      optimizer=RAdam(learning_rate=learning_rate, weight_decay=weight_decay),\n",
        "      loss='binary_crossentropy',\n",
        "      metrics=['accuracy', recall, precision, fbeta_score])\n",
        "\n",
        "  return bert_model\n",
        "  \n",
        "model_name = path + \"event_news_label_bert.h5\"\n",
        "\n",
        "# Write the model to model_name only when validation F-score improves.\n",
        "checkpointer = ModelCheckpoint(\n",
        "    filepath=model_name,\n",
        "    monitor='val_fbeta_score', mode='max',\n",
        "    save_best_only=True, verbose=2)\n",
        "\n",
        "# Stop once validation loss has not improved for 20 epochs.\n",
        "earlystopper = EarlyStopping(monitor='val_loss', mode='min', patience=20, verbose=2)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XQDRjG2vbKKs",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 938
        },
        "outputId": "7fbefaa0-2ad0-4c1d-d486-27379af24381"
      },
      "source": [
        "with tf.device('/gpu:0'):\n",
        "  sess = K.get_session()\n",
        "  # Initialise only the variables TensorFlow reports as uninitialised,\n",
        "  # leaving already-initialised ones untouched.\n",
        "  uninitialized_variables = {name.decode('ascii') for name in sess.run(tf.report_uninitialized_variables())}\n",
        "  init = tf.variables_initializer(\n",
        "      [v for v in tf.global_variables() if v.name.split(':')[0] in uninitialized_variables])\n",
        "  sess.run(init)\n",
        "\n",
        "  bert_model = get_bert_finetuning_model(model)\n",
        "  # NOTE(review): epochs and batch_size are literals here and differ from\n",
        "  # EPOCHS=2 / BATCH_SIZE=8 declared with the other settings -- decide which\n",
        "  # values are intended.\n",
        "  history = bert_model.fit(\n",
        "      train_x, train_y,\n",
        "      epochs=30, batch_size=16, verbose=1,\n",
        "      validation_data=(test_x, test_y),\n",
        "      # Keep the best-scoring model and stop when validation loss stalls;\n",
        "      # without these, training runs all epochs regardless of overfitting.\n",
        "      callbacks=[checkpointer, earlystopper])\n",
        "  # These are the weights from the last epoch run, which may be earlier than\n",
        "  # epoch 30 if early stopping triggers.\n",
        "  bert_model.save_weights(\"gdrive/My Drive/body_bert_256_epoch30.h5\")"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Train on 19240 samples, validate on 4811 samples\n",
            "Epoch 1/30\n",
            "19240/19240 [==============================] - 1236s 64ms/step - loss: 0.6922 - accuracy: 0.5271 - recall: 0.9021 - precision: 0.5280 - fbeta_score: 0.6416 - val_loss: 0.6910 - val_accuracy: 0.5340 - val_recall: 1.0000 - val_precision: 0.5341 - val_fbeta_score: 0.6876\n",
            "Epoch 2/30\n",
            "19240/19240 [==============================] - 1228s 64ms/step - loss: 0.6914 - accuracy: 0.5291 - recall: 0.8927 - precision: 0.5204 - fbeta_score: 0.6347 - val_loss: 0.6919 - val_accuracy: 0.5340 - val_recall: 1.0000 - val_precision: 0.5341 - val_fbeta_score: 0.6876\n",
            "Epoch 3/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.6861 - accuracy: 0.5491 - recall: 0.7746 - precision: 0.5634 - fbeta_score: 0.6203 - val_loss: 0.6902 - val_accuracy: 0.5309 - val_recall: 0.7255 - val_precision: 0.5468 - val_fbeta_score: 0.6113\n",
            "Epoch 4/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.6125 - accuracy: 0.6657 - recall: 0.7281 - precision: 0.6842 - fbeta_score: 0.6798 - val_loss: 0.7663 - val_accuracy: 0.5259 - val_recall: 0.4899 - val_precision: 0.5644 - val_fbeta_score: 0.5093\n",
            "Epoch 5/30\n",
            "19240/19240 [==============================] - 1229s 64ms/step - loss: 0.3738 - accuracy: 0.8379 - recall: 0.8502 - precision: 0.8488 - fbeta_score: 0.8387 - val_loss: 1.0253 - val_accuracy: 0.5329 - val_recall: 0.6017 - val_precision: 0.5592 - val_fbeta_score: 0.5647\n",
            "Epoch 6/30\n",
            "19240/19240 [==============================] - 1229s 64ms/step - loss: 0.1909 - accuracy: 0.9276 - recall: 0.9332 - precision: 0.9313 - fbeta_score: 0.9271 - val_loss: 1.3036 - val_accuracy: 0.5319 - val_recall: 0.5900 - val_precision: 0.5597 - val_fbeta_score: 0.5601\n",
            "Epoch 7/30\n",
            "19240/19240 [==============================] - 1229s 64ms/step - loss: 0.1249 - accuracy: 0.9540 - recall: 0.9576 - precision: 0.9573 - fbeta_score: 0.9544 - val_loss: 1.6319 - val_accuracy: 0.5404 - val_recall: 0.6667 - val_precision: 0.5567 - val_fbeta_score: 0.5950\n",
            "Epoch 8/30\n",
            "19240/19240 [==============================] - 1229s 64ms/step - loss: 0.0950 - accuracy: 0.9663 - recall: 0.9678 - precision: 0.9675 - fbeta_score: 0.9655 - val_loss: 1.7987 - val_accuracy: 0.5383 - val_recall: 0.5949 - val_precision: 0.5670 - val_fbeta_score: 0.5654\n",
            "Epoch 9/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.0802 - accuracy: 0.9715 - recall: 0.9726 - precision: 0.9745 - fbeta_score: 0.9717 - val_loss: 1.8214 - val_accuracy: 0.5311 - val_recall: 0.5689 - val_precision: 0.5639 - val_fbeta_score: 0.5503\n",
            "Epoch 10/30\n",
            "19240/19240 [==============================] - 1229s 64ms/step - loss: 0.0726 - accuracy: 0.9730 - recall: 0.9738 - precision: 0.9757 - fbeta_score: 0.9730 - val_loss: 1.9001 - val_accuracy: 0.5417 - val_recall: 0.6549 - val_precision: 0.5639 - val_fbeta_score: 0.5913\n",
            "Epoch 11/30\n",
            "19240/19240 [==============================] - 1229s 64ms/step - loss: 0.0618 - accuracy: 0.9768 - recall: 0.9769 - precision: 0.9794 - fbeta_score: 0.9767 - val_loss: 1.9707 - val_accuracy: 0.5350 - val_recall: 0.6545 - val_precision: 0.5576 - val_fbeta_score: 0.5870\n",
            "Epoch 12/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.0607 - accuracy: 0.9779 - recall: 0.9785 - precision: 0.9805 - fbeta_score: 0.9780 - val_loss: 1.9424 - val_accuracy: 0.5371 - val_recall: 0.5922 - val_precision: 0.5664 - val_fbeta_score: 0.5638\n",
            "Epoch 13/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.0521 - accuracy: 0.9796 - recall: 0.9808 - precision: 0.9814 - fbeta_score: 0.9798 - val_loss: 2.2737 - val_accuracy: 0.5383 - val_recall: 0.6275 - val_precision: 0.5605 - val_fbeta_score: 0.5782\n",
            "Epoch 14/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.0514 - accuracy: 0.9797 - recall: 0.9803 - precision: 0.9818 - fbeta_score: 0.9797 - val_loss: 1.9318 - val_accuracy: 0.5309 - val_recall: 0.5317 - val_precision: 0.5681 - val_fbeta_score: 0.5332\n",
            "Epoch 15/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.0449 - accuracy: 0.9813 - recall: 0.9797 - precision: 0.9844 - fbeta_score: 0.9808 - val_loss: 2.3235 - val_accuracy: 0.5277 - val_recall: 0.4475 - val_precision: 0.5793 - val_fbeta_score: 0.4868\n",
            "Epoch 16/30\n",
            "19240/19240 [==============================] - 1231s 64ms/step - loss: 0.0445 - accuracy: 0.9824 - recall: 0.9824 - precision: 0.9850 - fbeta_score: 0.9827 - val_loss: 2.1759 - val_accuracy: 0.5340 - val_recall: 0.4795 - val_precision: 0.5824 - val_fbeta_score: 0.5076\n",
            "Epoch 17/30\n",
            "19240/19240 [==============================] - 1231s 64ms/step - loss: 0.0412 - accuracy: 0.9827 - recall: 0.9822 - precision: 0.9854 - fbeta_score: 0.9827 - val_loss: 2.1135 - val_accuracy: 0.5390 - val_recall: 0.6302 - val_precision: 0.5630 - val_fbeta_score: 0.5813\n",
            "Epoch 18/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.0418 - accuracy: 0.9828 - recall: 0.9826 - precision: 0.9852 - fbeta_score: 0.9828 - val_loss: 2.2571 - val_accuracy: 0.5394 - val_recall: 0.6241 - val_precision: 0.5648 - val_fbeta_score: 0.5785\n",
            "Epoch 19/30\n",
            "19240/19240 [==============================] - 1229s 64ms/step - loss: 0.0375 - accuracy: 0.9839 - recall: 0.9837 - precision: 0.9863 - fbeta_score: 0.9839 - val_loss: 2.4486 - val_accuracy: 0.5427 - val_recall: 0.6864 - val_precision: 0.5607 - val_fbeta_score: 0.6030\n",
            "Epoch 20/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.0390 - accuracy: 0.9837 - recall: 0.9828 - precision: 0.9865 - fbeta_score: 0.9836 - val_loss: 2.3747 - val_accuracy: 0.5321 - val_recall: 0.5468 - val_precision: 0.5661 - val_fbeta_score: 0.5405\n",
            "Epoch 21/30\n",
            "19240/19240 [==============================] - 1231s 64ms/step - loss: 0.0347 - accuracy: 0.9852 - recall: 0.9846 - precision: 0.9878 - fbeta_score: 0.9854 - val_loss: 2.3107 - val_accuracy: 0.5375 - val_recall: 0.5940 - val_precision: 0.5656 - val_fbeta_score: 0.5647\n",
            "Epoch 22/30\n",
            "19240/19240 [==============================] - 1231s 64ms/step - loss: 0.0356 - accuracy: 0.9854 - recall: 0.9844 - precision: 0.9877 - fbeta_score: 0.9850 - val_loss: 2.4489 - val_accuracy: 0.5371 - val_recall: 0.6188 - val_precision: 0.5599 - val_fbeta_score: 0.5741\n",
            "Epoch 23/30\n",
            "19240/19240 [==============================] - 1230s 64ms/step - loss: 0.0368 - accuracy: 0.9837 - recall: 0.9825 - precision: 0.9863 - fbeta_score: 0.9832 - val_loss: 2.1525 - val_accuracy: 0.5271 - val_recall: 0.4709 - val_precision: 0.5715 - val_fbeta_score: 0.4996\n",
            "Epoch 24/30\n",
            "19240/19240 [==============================] - 1231s 64ms/step - loss: 0.0341 - accuracy: 0.9845 - recall: 0.9841 - precision: 0.9870 - fbeta_score: 0.9846 - val_loss: 2.1537 - val_accuracy: 0.5271 - val_recall: 0.5332 - val_precision: 0.5623 - val_fbeta_score: 0.5319\n",
            "Epoch 25/30\n",
            "19240/19240 [==============================] - 1231s 64ms/step - loss: 0.0313 - accuracy: 0.9857 - recall: 0.9853 - precision: 0.9879 - fbeta_score: 0.9856 - val_loss: 2.4771 - val_accuracy: 0.5309 - val_recall: 0.6418 - val_precision: 0.5529 - val_fbeta_score: 0.5808\n",
            "Epoch 26/30\n",
            "15408/19240 [=======================>......] - ETA: 3:48 - loss: 0.0320 - accuracy: 0.9859 - recall: 0.9857 - precision: 0.9883 - fbeta_score: 0.9861"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jBpYE9eVxfXv",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Read combined_data2015.csv into `test`; later cells tokenize this frame\n",
        "# and score the model's predictions against its 'index' column.\n",
        "test = pd.read_csv(\n",
        "    'gdrive/My Drive/capstone 2/event_embedding/Thesis_data/combined_data2015.csv',\n",
        "    encoding='utf-8',\n",
        ")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NQu0eoaWxfsv",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def predict_convert_data(data_df):\n",
        "    \"\"\"Tokenize every row of ``data_df[DATA_COLUMN]`` into model inputs.\n",
        "\n",
        "    Args:\n",
        "        data_df: DataFrame whose ``DATA_COLUMN`` column holds the texts.\n",
        "\n",
        "    Returns:\n",
        "        A two-element list ``[token_ids, zeros]``: the ids returned by\n",
        "        ``tokenizer.encode(text, max_len=SEQ_LEN)`` stacked into one array,\n",
        "        and an all-zero array with the same shape and dtype.\n",
        "    \"\"\"\n",
        "    # Walk the column values in row order. The earlier\n",
        "    # ``data_df[DATA_COLUMN][i]`` lookup was label-based, so a frame whose\n",
        "    # index is not 0..n-1 (filtered, shuffled, concatenated) either raised\n",
        "    # KeyError or yielded rows in a different order than the frame itself,\n",
        "    # misaligning predictions with labels read from the same frame.\n",
        "    indices = []\n",
        "    for text in tqdm(data_df[DATA_COLUMN]):\n",
        "        # The second value returned by encode() is not needed here.\n",
        "        ids, _ = tokenizer.encode(text, max_len=SEQ_LEN)\n",
        "        indices.append(ids)\n",
        "\n",
        "    indices = np.array(indices)\n",
        "    return [indices, np.zeros_like(indices)]\n",
        "\n",
        "def predict_load_data(x):\n",
        "    \"\"\"Build model inputs from a pandas DataFrame.\n",
        "\n",
        "    Args:\n",
        "        x: pandas DataFrame containing the ``DATA_COLUMN`` text column.\n",
        "\n",
        "    Returns:\n",
        "        Whatever ``predict_convert_data`` returns for the frame after its\n",
        "        ``DATA_COLUMN`` values have been cast to ``str``.\n",
        "    \"\"\"\n",
        "    # Work on a copy: the previous version aliased the argument, so the\n",
        "    # astype(str) assignment silently overwrote the caller's column (and can\n",
        "    # raise SettingWithCopyWarning when ``x`` is a slice of another frame).\n",
        "    data_df = x.copy()\n",
        "    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)\n",
        "    return predict_convert_data(data_df)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DBY60yKJxnKL",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "87137a7f-a38e-4fe4-b29b-cfd867cedd80"
      },
      "source": [
        "# Tokenize the `test` frame once; the prediction cells below reuse `test_set`.\n",
        "test_set = predict_load_data(test)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "100%|██████████| 3692/3692 [00:01<00:00, 2567.73it/s]\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yuZyrVFCo6_9",
        "colab_type": "text"
      },
      "source": [
        "# Body 128"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jf9yeGiVbFxO",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 170
        },
        "outputId": "67adb4d9-670c-41e5-f0ac-d20e1c6caae2"
      },
      "source": [
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Rebuild the fine-tuning model and restore the saved weights before scoring.\n",
        "bert_model = get_bert_finetuning_model(model)\n",
        "bert_model.load_weights(\"gdrive/My Drive/body_bert.h5\")\n",
        "\n",
        "preds = bert_model.predict(test_set)\n",
        "y_true = test['index']\n",
        "\n",
        "# Check the F1 score; predictions are rounded to zero decimals to get labels.\n",
        "pred_labels = np.round(preds, 0)\n",
        "print(classification_report(y_true, pred_labels))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.51      0.24      0.33      1867\n",
            "           1       0.50      0.76      0.60      1825\n",
            "\n",
            "    accuracy                           0.50      3692\n",
            "   macro avg       0.51      0.50      0.47      3692\n",
            "weighted avg       0.51      0.50      0.46      3692\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CNChuUzCbY3t",
        "colab_type": "text"
      },
      "source": [
        "# Body 256 epoch 3"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "y3l9jap3xpFB",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 170
        },
        "outputId": "5c1f17cb-0f0c-4899-b1bf-e4db6dfceb3a"
      },
      "source": [
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Rebuild the fine-tuning model and restore the saved weights before scoring.\n",
        "# NOTE(review): the heading above says \"Body 256 epoch 3\" but the file loaded\n",
        "# here is body_bert_512.h5 -- confirm which checkpoint this cell should use.\n",
        "bert_model = get_bert_finetuning_model(model)\n",
        "bert_model.load_weights(path+\"body_bert_512.h5\")\n",
        "\n",
        "preds = bert_model.predict(test_set)\n",
        "y_true = test['index']\n",
        "\n",
        "# Check the F1 score; predictions are rounded to zero decimals to get labels.\n",
        "pred_labels = np.round(preds, 0)\n",
        "print(classification_report(y_true, pred_labels))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "              precision    recall  f1-score   support\n",
            "\n",
            "           0       0.48      0.22      0.30      1867\n",
            "           1       0.49      0.76      0.59      1825\n",
            "\n",
            "    accuracy                           0.49      3692\n",
            "   macro avg       0.48      0.49      0.45      3692\n",
            "weighted avg       0.48      0.49      0.45      3692\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}