장르 분류 최종.ipynb 85.5 KB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5699\n",
      "9823\n",
      "14020\n",
      "2727\n",
      "1498\n",
      "1464\n",
      "8286\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3063: DtypeWarning: Columns (2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'\\ntrain_data_size = 732*6\\ntest_data_size = 732*6\\n'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "# Root folder of the dataset. Set the DATACAP_DATA_DIR environment variable to\n",
    "# run on another machine instead of editing every path in this cell.\n",
    "DATA_DIR = Path(os.environ.get('DATACAP_DATA_DIR', '/Users/김서영/Desktop/datacap/data'))\n",
    "MOVIE_PLOT_DIR = DATA_DIR / 'moviedata' / 'moviePlot'\n",
    "\n",
    "\n",
    "def read_plot_csv(path, **kwargs):\n",
    "    # low_memory=False makes pandas infer each column dtype from the whole file,\n",
    "    # which removes the mixed-type DtypeWarning raised by the chunked reader.\n",
    "    return pd.read_csv(path, low_memory=False, **kwargs)\n",
    "\n",
    "\n",
    "romance_plot = read_plot_csv(MOVIE_PLOT_DIR / 'romancePlot.csv')\n",
    "thriller_plot = read_plot_csv(MOVIE_PLOT_DIR / 'thrillerPlot.csv')\n",
    "drama_plot = read_plot_csv(MOVIE_PLOT_DIR / 'dramaPlot.csv')\n",
    "fantasy_plot = read_plot_csv(MOVIE_PLOT_DIR / 'fantasyPlot.csv')\n",
    "history_plot = read_plot_csv(MOVIE_PLOT_DIR / 'historyPlot.csv')\n",
    "social_plot = read_plot_csv(MOVIE_PLOT_DIR / 'socialPlot.csv')\n",
    "non_plot = read_plot_csv(MOVIE_PLOT_DIR / 'nonPlot.csv')\n",
    "\n",
    "# The musical plots are stored in cp949, unlike the movie CSVs.\n",
    "musical_plot = read_plot_csv(DATA_DIR / 'musicalData' / 'broadMusicalPlot.csv', encoding='cp949')\n",
    "\n",
    "# Row count of every movie genre table, one per line.\n",
    "for plot_table in (romance_plot, thriller_plot, drama_plot, fantasy_plot,\n",
    "                   history_plot, social_plot, non_plot):\n",
    "    print(len(plot_table))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labels for every sample: 0 = romance, 1 = thriller, 2 = fantasy, 3 = history.\n",
    "each_len = 1350\n",
    "\n",
    "train_labels = []\n",
    "test_labels = []\n",
    "\n",
    "\n",
    "def split_genre(plots, label, train_rows, test_rows):\n",
    "    # Build the train/test lists for one genre. Every sample is a one-element\n",
    "    # list holding the plot text, and `label` is recorded once per sample in the\n",
    "    # module-level train_labels / test_labels lists (train first, then test).\n",
    "    train = [[''.join(plots[row])] for row in train_rows]\n",
    "    test = [[''.join(plots[row])] for row in test_rows]\n",
    "    train_labels.extend([label] * len(train))\n",
    "    test_labels.extend([label] * len(test))\n",
    "    return train, test\n",
    "\n",
    "\n",
    "# Romance: rows 0-1999 train, rows 2000-2699 test.\n",
    "RM_train, RM_test = split_genre(romance_plot.줄거리, 0, range(0, 2000), range(2000, 2700))\n",
    "\n",
    "# Thriller: rows 700-2699 train, rows 0-699 test. The test rows used to be\n",
    "# indexed with i-1999 (rows 1-700), which put row 700 in both splits and never\n",
    "# used row 0; the ranges below are disjoint.\n",
    "TH_train, TH_test = split_genre(thriller_plot.줄거리, 1, range(700, 2700), range(0, 700))\n",
    "\n",
    "# Fantasy: rows 0-1999 train, rows 2000-2699 test.\n",
    "FN_train, FN_test = split_genre(fantasy_plot.줄거리, 2, range(0, 2000), range(2000, 2700))\n",
    "\n",
    "# History: rows 0-999 train, rows 1000-1349 test.\n",
    "HS_train, HS_test = split_genre(history_plot.줄거리, 3, range(0, 1000), range(1000, each_len))\n",
    "\n",
    "# The social, non and drama tables are loaded but not part of the split.\n",
    "\n",
    "# First 307 musical plots, wrapped the same way as the movie samples.\n",
    "Mu = [[''.join(musical_plot.muplot[i])] for i in range(307)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every plot used to build the shared vocabulary: the train and test splits\n",
    "# of the romance, history, thriller and fantasy genres.\n",
    "allplot = [\n",
    "    *RM_train, *RM_test,\n",
    "    *HS_train, *HS_test,\n",
    "    *TH_train, *FN_train,\n",
    "    *TH_test, *FN_test,\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genre order (romance, thriller, fantasy, history) follows the order in which\n",
    "# train_labels / test_labels were filled, so sample k pairs with label k.\n",
    "alltrain = [*RM_train, *TH_train, *FN_train, *HS_train]\n",
    "alltest = [*RM_test, *TH_test, *FN_test, *HS_test]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7000\n",
      "2450\n",
      "['In 1983, off-duty policeman Reiden picks up a suspicious couple, Kei and Ai, who claim to be father and daughter but appear to be close in age. Reiden thus becomes involved in a fight between them and the female fighter K2 who is chasing them. Kei draws out his latent ability to the maximum by a Legendary \"Trigger\" and Ai blows away K2. The night sky splits and strange space is shown. Their enemy is the secret society \"Fraud\" of para-psionics, which is commanded by Kuu Ragua Lee. Kei, Ai and Reiden meet the assassins whom Fraud sends out one after another.']\n"
     ]
    }
   ],
   "source": [
    "# Take the split sizes from the lists themselves so they cannot drift from the\n",
    "# data when the per-genre ranges change (currently 7000 and 2450).\n",
    "train_data_size = len(alltrain)\n",
    "test_data_size = len(alltest)\n",
    "print(train_data_size)\n",
    "print(test_data_size)\n",
    "\n",
    "# Spot-check one history test sample.\n",
    "print(HS_test[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████| 9450/9450 [01:14<00:00, 127.26it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "all_vocab = {}       # word -> number of occurrences over every plot in allplot\n",
    "all_sentences = []   # one list of kept (lower-cased, filtered) words per plot\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "for plot in tqdm(allplot):\n",
    "    # NOTE(review): each item is a one-element list, so str() tokenizes its repr\n",
    "    # (brackets and quotes included). Left unchanged because the encoding cells\n",
    "    # tokenize the same way and must stay consistent with this vocabulary.\n",
    "    tokens = word_tokenize(str(plot))\n",
    "    result = []\n",
    "    for word in tokens:\n",
    "        word = word.lower()  # lower-case to shrink the vocabulary\n",
    "        # Drop English stop words and words of two characters or fewer.\n",
    "        if word not in stop_words and len(word) > 2:\n",
    "            result.append(word)\n",
    "            all_vocab[word] = all_vocab.get(word, 0) + 1\n",
    "    # Tokens live in their own variable, so the accumulator is no longer\n",
    "    # overwritten on every iteration and really collects one entry per plot.\n",
    "    all_sentences.append(result)\n",
    "\n",
    "# Most frequent words first.\n",
    "all_vocab_sorted = sorted(all_vocab.items(), key=lambda item: item[1], reverse=True)\n",
    "\n",
    "# Assign indices starting at 1, in frequency order, skipping words seen only once.\n",
    "all_word_to_index = {}\n",
    "i = 0\n",
    "for (word, frequency) in all_vocab_sorted:\n",
    "    if frequency > 1:\n",
    "        i = i + 1\n",
    "        all_word_to_index[word] = i\n",
    "\n",
    "vocab_size = 15000  # keep only the vocab_size most frequent words\n",
    "# Remove every word whose index is greater than vocab_size.\n",
    "words_frequency = [w for w, c in all_word_to_index.items() if c >= vocab_size + 1]\n",
    "for w in words_frequency:\n",
    "    del all_word_to_index[w]\n",
    "\n",
    "# Out-of-vocabulary marker gets the next free index after the kept words.\n",
    "all_word_to_index['OOV'] = len(all_word_to_index) + 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. 학습 데이터 정리"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 인코딩"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:57<00:00, 121.22it/s]\n"
     ]
    }
   ],
   "source": [
    "vocab = {}       # word -> occurrence count within the training plots\n",
    "sentences = []   # kept words of every training plot, in alltrain order\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "for plot in tqdm(alltrain):\n",
    "    kept = []\n",
    "    for token in word_tokenize(str(plot)):\n",
    "        token = token.lower()\n",
    "        # Skip stop words and words of two characters or fewer.\n",
    "        if token in stop_words or len(token) <= 2:\n",
    "            continue\n",
    "        kept.append(token)\n",
    "        vocab[token] = vocab.get(token, 0) + 1\n",
    "    sentences.append(kept)\n",
    "\n",
    "# Map every kept word to its vocabulary index; unknown words get the OOV index.\n",
    "oov_index = all_word_to_index['OOV']\n",
    "train_encoded = [\n",
    "    [all_word_to_index.get(token, oov_index) for token in kept]\n",
    "    for kept in sentences\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair the first train_data_size encoded plots with 4-way one-hot labels.\n",
    "X_train = [train_encoded[k] for k in range(train_data_size)]\n",
    "\n",
    "Y_train = []\n",
    "for k in range(train_data_size):\n",
    "    one_hot = [0] * 4\n",
    "    one_hot[train_labels[k]] = 1\n",
    "    Y_train.append(one_hot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[692, 9385, 2345, 110, 65, 7, 853, 2276, 4248, 3049, 72, 15001, 252, 8415, 1097, 650, 15001, 102, 2501, 6068, 4184, 15001, 1371, 15001, 15001, 8535, 15001, 6187, 256, 3049, 11382, 2933, 2704, 1655, 638, 3049, 15001, 3443, 5395, 7992, 6877, 4524, 1810, 5157, 5857, 15001, 15001, 7287, 15001, 6816, 15001, 15001, 9320, 15001, 58, 14362, 3443, 15001, 11256, 15001, 30, 3332, 15001, 15001, 80, 15001, 13460, 15001, 15001, 1346, 4880, 647, 3049, 1208, 4407, 15001, 15001, 6326, 15001, 2273, 5330, 507, 1614, 15001, 93, 3147, 2273, 324, 3917, 15001, 8463, 4088, 3917, 15001, 15001, 6192, 132, 14085, 12141, 1985, 3899, 7663, 6634, 15001, 931, 15001, 46, 1710, 2276, 5699, 15001, 4850, 3443, 1104, 2095, 1313, 3407, 15001, 833, 15001, 2456, 221, 4125, 3049, 1451, 436, 15001, 15001, 11221, 3443, 1218, 8415, 436, 13189, 15001, 196, 27, 15001, 252, 3443, 80, 15001, 1363, 13056, 2456, 5536, 3443, 94, 3588, 15001, 7648, 6068, 3999, 7097, 15001, 15001, 2911, 1196, 3917, 15001, 15001, 358, 15001, 199, 2373, 2456, 15001, 108, 15001, 1885, 325, 8256, 9376, 3443, 1371, 2276, 110, 11863, 2345, 2213, 15001, 722, 26, 613, 15001, 1655, 2636, 15001, 103, 1655, 3443, 419, 812, 2276, 535, 3443, 15001, 1394, 15001, 9033, 3978, 1394, 110, 1099, 1399, 306, 15001, 9076, 1150, 4638, 206, 94, 6738, 535, 221, 25, 13189, 1194, 15001, 977, 307, 877, 935, 2345, 3443, 217, 15001, 5118, 1994, 15001, 102, 15001, 1473, 5395, 1371, 3443, 4253, 1473, 192, 15001, 3917, 1093, 15001, 31, 977, 563, 4655, 2386, 2933, 148, 259, 427, 11903, 4751, 573, 15001, 1428, 603, 2933, 1521, 2345, 13264, 110, 1897, 2526, 1874, 1473, 374, 31, 9521, 1056, 116, 5395, 1028, 1473, 46, 260, 5395, 199, 15001, 4677, 563, 10481, 11758, 3166, 1291, 15001, 1473, 10803, 1144, 2847, 94, 188, 199, 549, 5395, 138, 1440, 786, 3772, 199, 148, 1388, 10246, 15001, 409, 9047, 1619, 29, 12550, 5395, 94, 188, 9773, 80, 1, 3166, 1144, 2847, 15001, 31, 1144, 94, 9552, 15001, 700, 15001, 412, 3166, 5395, 175, 31, 1619, 29, 1720, 1028, 1473, 275, 
122, 3166, 3443, 80, 3166, 1641, 85, 94, 484, 2456, 11355, 4861, 1119, 15001, 239, 3166, 2605, 4464, 1473, 9436, 3166, 1282, 35, 2890, 1266, 1026, 43, 1619, 29, 61, 967, 4749, 126, 3443, 2194, 7967, 3443, 239, 2933, 13394, 43, 3443, 5501, 1606, 7496, 15001, 9033, 2933, 3272, 920, 477, 1212, 2516, 2824, 3272, 3443, 304, 168, 70, 15001, 7017, 382, 693, 1664]\n"
     ]
    }
   ],
   "source": [
    "# Spot-check one encoded training sample.\n",
    "sample_index = 6000\n",
    "print(X_train[sample_index])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### [ 학습 데이터 ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "줄거리 최대 길이 :  2324\n",
      "줄거리 평균 길이 :  234.355\n"
     ]
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "# Number of encoded tokens in every training plot.\n",
    "len_result = list(map(len, X_train))\n",
    "print(\"줄거리 최대 길이 : \", max(len_result))\n",
    "print(\"줄거리 평균 길이 : \", sum(len_result) / len(len_result))\n",
    "\n",
    "# Left panel: box plot of the lengths; right panel: 50-bin histogram.\n",
    "fig, (box_ax, hist_ax) = plt.subplots(1, 2)\n",
    "box_ax.boxplot(len_result)\n",
    "hist_ax.hist(len_result, bins=50)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 0, 0, 1]\n",
      "[652, 15001, 130, 607, 576, 2448, 1656, 57, 943, 2956, 14, 157, 1539, 11530, 4330, 1539, 809, 652, 5218, 607, 2453, 2751, 9090, 5156, 652, 1196, 8, 4302, 8345, 15001, 339, 607, 7879, 19, 2575, 2720, 32, 1787, 29, 3, 127, 652, 3074, 15001, 544, 9387, 339, 650, 62, 3490, 9839, 1461, 15001, 21, 523, 17, 2, 1870, 146, 3104, 15001, 117, 235, 235, 53, 15001, 2664, 1237, 1832, 15001, 652, 1656, 1076, 607, 2720, 883, 212, 1095, 15001, 1126, 176, 15001, 8851, 15001, 1320, 15001, 15001, 1231, 19, 661, 393, 12959, 1790, 1372, 766, 652, 55, 1237, 50, 98, 6526, 15001, 36, 15001, 3074, 14, 652, 138, 1427, 1480, 58, 1037, 390, 168, 1818, 26, 599, 5601, 296, 420, 15, 9387, 107, 15001, 862, 181, 604, 55, 15001, 8085, 1608, 10714, 607, 1656, 2741, 1076, 685, 1613, 1351, 439, 406, 5398, 2592, 4495, 1320, 2575, 278, 3450, 607, 1769, 57, 439, 3183, 3426, 7573, 15001, 3646, 652, 61, 387, 607, 254, 57, 1790, 3355, 1987, 1237, 1656, 15001, 16, 103, 607, 502, 29, 226, 4621, 14, 176, 1, 241, 303, 29, 607, 1290, 169, 607, 667, 685, 264, 15001, 8, 15001, 1402, 15001, 275, 11236, 15001, 6530, 15001, 8086, 15001, 178, 239, 607, 2837, 17, 8606, 10, 9387, 1406, 652, 2127, 28, 547, 15, 1692, 652, 3445, 81, 685, 427, 2239, 163, 1237, 171, 2720, 273, 416, 607, 175, 13971, 245, 6940, 18, 66, 15001, 2720, 234, 2131, 387, 450, 652, 83, 607, 3346, 36, 4, 2550, 486, 607, 2080, 64, 14, 652, 5099, 7154, 2597, 652, 385, 6125, 56, 2597, 265, 255, 607, 163, 226, 2720, 10273, 461, 2128, 105, 209, 127, 1889, 5354, 105, 2720, 357, 502, 607, 51, 900, 4416, 652, 69, 9387, 2635, 138, 1260, 2, 607, 320, 1487, 2069, 1785, 607, 68, 118, 2862, 3429, 652, 1, 2321, 8332, 1548, 15001, 5470, 12337, 453, 5554, 58, 33, 702, 687, 15001, 1825, 125, 764, 15001, 15001, 6987, 1237, 9036, 22, 607, 895, 652, 306, 22, 512, 400, 297, 746, 4719, 1376, 3133, 4, 633, 42, 24, 312, 9387, 66, 5040, 522, 9387, 1029, 652, 1715, 1238, 1961, 4075, 253]\n"
     ]
    }
   ],
   "source": [
    "# Spot-check: the label of the last training sample and one encoded plot.\n",
    "print(Y_train[6999], X_train[701], sep='\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 테스트 데이터 정리"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 인코딩"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████| 2450/2450 [00:19<00:00, 123.36it/s]\n"
     ]
    }
   ],
   "source": [
    "vocab1 = {}       # word -> occurrence count within the test plots\n",
    "sentences1 = []   # kept words of every test plot, in alltest order\n",
    "stop_words1 = set(stopwords.words('english'))\n",
    "\n",
    "for plot in tqdm(alltest):\n",
    "    kept = []\n",
    "    for token in word_tokenize(str(plot)):\n",
    "        token = token.lower()\n",
    "        # Skip stop words and words of two characters or fewer.\n",
    "        if token in stop_words1 or len(token) <= 2:\n",
    "            continue\n",
    "        kept.append(token)\n",
    "        vocab1[token] = vocab1.get(token, 0) + 1\n",
    "    sentences1.append(kept)\n",
    "\n",
    "# Encode with the shared vocabulary; unknown words get the OOV index.\n",
    "oov_index = all_word_to_index['OOV']\n",
    "test_encoded = [\n",
    "    [all_word_to_index.get(token, oov_index) for token in kept]\n",
    "    for kept in sentences1\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair the first test_data_size encoded plots with 4-way one-hot labels.\n",
    "X_test = [test_encoded[k] for k in range(test_data_size)]\n",
    "\n",
    "Y_test = []\n",
    "for k in range(test_data_size):\n",
    "    one_hot = [0] * 4\n",
    "    one_hot[test_labels[k]] = 1\n",
    "    Y_test.append(one_hot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### [ 테스트 데이터 ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "줄거리 최대 길이 :  2593\n",
      "줄거리 평균 길이 :  203.25755102040816\n"
     ]
    },
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "# Number of encoded tokens in every test plot.\n",
    "len_result = list(map(len, X_test))\n",
    "print(\"줄거리 최대 길이 : \", max(len_result))\n",
    "print(\"줄거리 평균 길이 : \", sum(len_result) / len(len_result))\n",
    "\n",
    "# Left panel: box plot of the lengths; right panel: 50-bin histogram.\n",
    "fig, (box_ax, hist_ax) = plt.subplots(1, 2)\n",
    "box_ax.boxplot(len_result)\n",
    "hist_ax.hist(len_result, bins=50)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 0, 0, 0]\n",
      "[15001, 11704, 354, 691, 447, 2738, 2441, 15001, 11, 220, 163, 447, 658, 15001, 996, 261, 792, 313, 329, 40, 45, 69, 2738, 4051, 432, 2157, 4968, 70, 2738, 119, 440, 11, 447, 2674, 119, 8, 2108, 257, 386, 1785, 784, 2735, 102, 467, 688, 379, 13040, 15001, 789, 279, 226, 265, 1640, 91, 9333, 113, 18]\n"
     ]
    }
   ],
   "source": [
    "# Spot-check: label and encoded plot of the second test sample.\n",
    "print(Y_test[1], X_test[1], sep='\\n')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 뮤지컬 데이터 정리"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████| 307/307 [00:00<00:00, 653.38it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "Mu_vocab = {}       # word -> occurrence count within the musical plots\n",
    "Mu_sentences = []   # kept words of every musical plot, in Mu order\n",
    "\n",
    "for plot in tqdm(Mu):\n",
    "    kept = []\n",
    "    for token in word_tokenize(str(plot)):\n",
    "        token = token.lower()\n",
    "        # Skip stop words and words of two characters or fewer.\n",
    "        if token in stop_words or len(token) <= 2:\n",
    "            continue\n",
    "        kept.append(token)\n",
    "        Mu_vocab[token] = Mu_vocab.get(token, 0) + 1\n",
    "    Mu_sentences.append(kept)\n",
    "\n",
    "# Encode with the movie vocabulary; unknown words get the OOV index.\n",
    "oov_index = all_word_to_index['OOV']\n",
    "Mu_encoded = [\n",
    "    [all_word_to_index.get(token, oov_index) for token in kept]\n",
    "    for kept in Mu_sentences\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. LSTM 분류 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 7000 samples, validate on 2450 samples\n",
      "Epoch 1/7\n",
      "6976/7000 [============================>.] - ETA: 0s - loss: 1.2966 - acc: 0.4074\n",
      "Epoch 00001: val_acc improved from -inf to 0.46367, saving model to best_model.h5\n",
      "7000/7000 [==============================] - 71s 10ms/sample - loss: 1.2965 - acc: 0.4076 - val_loss: 1.2583 - val_acc: 0.4637\n",
      "Epoch 2/7\n",
      "6976/7000 [============================>.] - ETA: 0s - loss: 0.8872 - acc: 0.6608\n",
      "Epoch 00002: val_acc improved from 0.46367 to 0.58980, saving model to best_model.h5\n",
      "7000/7000 [==============================] - 69s 10ms/sample - loss: 0.8868 - acc: 0.6611 - val_loss: 1.0250 - val_acc: 0.5898\n",
      "Epoch 3/7\n",
      "6976/7000 [============================>.] - ETA: 0s - loss: 0.6036 - acc: 0.7808\n",
      "Epoch 00003: val_acc did not improve from 0.58980\n",
      "7000/7000 [==============================] - 80s 11ms/sample - loss: 0.6033 - acc: 0.7810 - val_loss: 1.2084 - val_acc: 0.5731\n",
      "Epoch 4/7\n",
      " 768/7000 [==>...........................] - ETA: 1:07 - loss: 0.3901 - acc: 0.8736WARNING:tensorflow:Early stopping conditioned on metric `val_loss` which is not available. Available metrics are: loss,acc\n",
      "WARNING:tensorflow:Can save best model only with val_acc available, skipping.\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-58-08c0a794745f>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     26\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     27\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'categorical_crossentropy'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'rmsprop'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'acc'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 28\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m7\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m64\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)\u001b[0m\n\u001b[0;32m    817\u001b[0m         \u001b[0mmax_queue_size\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmax_queue_size\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    818\u001b[0m         \u001b[0mworkers\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mworkers\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 819\u001b[1;33m         use_multiprocessing=use_multiprocessing)\n\u001b[0m\u001b[0;32m    820\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    821\u001b[0m   def evaluate(self,\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\keras\\engine\\training_v2.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)\u001b[0m\n\u001b[0;32m    340\u001b[0m                 \u001b[0mmode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mModeKeys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTRAIN\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    341\u001b[0m                 \u001b[0mtraining_context\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtraining_context\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 342\u001b[1;33m                 total_epochs=epochs)\n\u001b[0m\u001b[0;32m    343\u001b[0m             \u001b[0mcbks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmake_logs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepoch_logs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtraining_result\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mModeKeys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTRAIN\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    344\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\keras\\engine\\training_v2.py\u001b[0m in \u001b[0;36mrun_one_epoch\u001b[1;34m(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)\u001b[0m\n\u001b[0;32m    126\u001b[0m         step=step, mode=mode, size=current_batch_size) as batch_logs:\n\u001b[0;32m    127\u001b[0m       \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 128\u001b[1;33m         \u001b[0mbatch_outs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mexecution_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    129\u001b[0m       \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mStopIteration\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mOutOfRangeError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    130\u001b[0m         \u001b[1;31m# TODO(kaftan): File bug about tf function and errors.OutOfRangeError?\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\keras\\engine\\training_v2_utils.py\u001b[0m in \u001b[0;36mexecution_function\u001b[1;34m(input_fn)\u001b[0m\n\u001b[0;32m     96\u001b[0m     \u001b[1;31m# `numpy` translates Tensors to values in Eager mode.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     97\u001b[0m     return nest.map_structure(_non_none_constant_value,\n\u001b[1;32m---> 98\u001b[1;33m                               distributed_function(input_fn))\n\u001b[0m\u001b[0;32m     99\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    100\u001b[0m   \u001b[1;32mreturn\u001b[0m \u001b[0mexecution_function\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    566\u001b[0m         \u001b[0mxla_context\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mExit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    567\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 568\u001b[1;33m       \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    569\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    570\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mtracing_count\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_tracing_count\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m_call\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    597\u001b[0m       \u001b[1;31m# In this case we have created variables on the first call, so we run the\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    598\u001b[0m       \u001b[1;31m# defunned version which is guaranteed to never create variables.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 599\u001b[1;33m       \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateless_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# pylint: disable=not-callable\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    600\u001b[0m     \u001b[1;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateful_fn\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    601\u001b[0m       \u001b[1;31m# Release the lock early so that multiple threads can perform the call\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   2361\u001b[0m     \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2362\u001b[0m       \u001b[0mgraph_function\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_maybe_define_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2363\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_filtered_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# pylint: disable=protected-access\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2364\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2365\u001b[0m   \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_filtered_call\u001b[1;34m(self, args, kwargs)\u001b[0m\n\u001b[0;32m   1609\u001b[0m          if isinstance(t, (ops.Tensor,\n\u001b[0;32m   1610\u001b[0m                            resource_variable_ops.BaseResourceVariable))),\n\u001b[1;32m-> 1611\u001b[1;33m         self.captured_inputs)\n\u001b[0m\u001b[0;32m   1612\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1613\u001b[0m   \u001b[1;32mdef\u001b[0m \u001b[0m_call_flat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcaptured_inputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcancellation_manager\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_call_flat\u001b[1;34m(self, args, captured_inputs, cancellation_manager)\u001b[0m\n\u001b[0;32m   1690\u001b[0m       \u001b[1;31m# No tape is watching; skip to running the function.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1691\u001b[0m       return self._build_call_outputs(self._inference_function.call(\n\u001b[1;32m-> 1692\u001b[1;33m           ctx, args, cancellation_manager=cancellation_manager))\n\u001b[0m\u001b[0;32m   1693\u001b[0m     forward_backward = self._select_forward_and_backward_functions(\n\u001b[0;32m   1694\u001b[0m         \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\function.py\u001b[0m in \u001b[0;36mcall\u001b[1;34m(self, ctx, args, cancellation_manager)\u001b[0m\n\u001b[0;32m    543\u001b[0m               \u001b[0minputs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    544\u001b[0m               \u001b[0mattrs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"executor_type\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexecutor_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"config_proto\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 545\u001b[1;33m               ctx=ctx)\n\u001b[0m\u001b[0;32m    546\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    547\u001b[0m           outputs = execute.execute_with_cancellation(\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\execute.py\u001b[0m in \u001b[0;36mquick_execute\u001b[1;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[0;32m     59\u001b[0m     tensors = pywrap_tensorflow.TFE_Py_Execute(ctx._handle, device_name,\n\u001b[0;32m     60\u001b[0m                                                \u001b[0mop_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mattrs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 61\u001b[1;33m                                                num_outputs)\n\u001b[0m\u001b[0;32m     62\u001b[0m   \u001b[1;32mexcept\u001b[0m \u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_NotOkStatusException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     63\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense, LSTM, Embedding\n",
    "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "import numpy as np\n",
    "\n",
    "# Train an Embedding -> LSTM -> softmax classifier on the padded sequences.\n",
    "# Hyper-parameters are named here so they can be tuned in one place.\n",
    "max_len = 230                # every sequence is padded/truncated to this length\n",
    "EMBEDDING_INPUT_DIM = 15002  # NOTE(review): presumably tokenizer vocabulary size plus reserved indices - confirm\n",
    "EMBEDDING_OUTPUT_DIM = 120\n",
    "LSTM_UNITS = 128\n",
    "NUM_CLASSES = 4              # width of the softmax output layer\n",
    "\n",
    "# Keep the encoded musical sequences as they are; they are padded in the prediction cell.\n",
    "# pad_sequences accepts a plain list of sequences, so no np.array() conversion is needed\n",
    "# (np.array() on variable-length lists is deprecated and raises in recent NumPy).\n",
    "M_test = Mu_encoded\n",
    "\n",
    "# pad_sequences already returns 2-D NumPy arrays, so X_* need no further conversion.\n",
    "X_train = pad_sequences(X_train, maxlen=max_len)\n",
    "X_test = pad_sequences(X_test, maxlen=max_len)\n",
    "# NOTE(review): categorical_crossentropy expects one-hot labels - confirm Y_* are encoded that way upstream.\n",
    "Y_train = np.asarray(Y_train)\n",
    "Y_test = np.asarray(Y_test)\n",
    "\n",
    "model = Sequential([\n",
    "    Embedding(EMBEDDING_INPUT_DIM, EMBEDDING_OUTPUT_DIM),\n",
    "    LSTM(LSTM_UNITS),\n",
    "    Dense(NUM_CLASSES, activation='softmax'),\n",
    "])\n",
    "\n",
    "# Stop when val_loss has not improved for 4 epochs; save the weights with the best val_acc.\n",
    "# NOTE(review): best_model.h5 is written here, but the prediction cell uses the in-memory\n",
    "# `model` rather than reloading the checkpoint - confirm this is intended.\n",
    "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)\n",
    "mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)\n",
    "\n",
    "model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])\n",
    "# NOTE(review): the test split doubles as validation data for early stopping and checkpointing.\n",
    "model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=7, batch_size=64, callbacks=[es, mc])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 뮤지컬 데이터 분류 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.38870266 0.02433592 0.07328184 0.51367956]\n",
      " [0.6450228  0.05261524 0.04859437 0.25376767]\n",
      " [0.9823596  0.00549819 0.00262331 0.00951902]\n",
      " ...\n",
      " [0.49974144 0.09077708 0.2564418  0.1530396 ]\n",
      " [0.22712483 0.24804659 0.28475055 0.24007803]\n",
      " [0.9531947  0.03475152 0.00676594 0.00528788]]\n"
     ]
    }
   ],
   "source": [
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "import numpy as np\n",
    "\n",
    "# Pad the encoded musical sequences to the training length, then score them with the model.\n",
    "M_test = pad_sequences(M_test, maxlen=max_len)\n",
    "predictions = model.predict(M_test)\n",
    "print(predictions)\n",
    "# The index of the highest softmax score in each row is the predicted label.\n",
    "predict_labels = predictions.argmax(axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how many samples were assigned to each of the four predicted labels (0-3).\n",
    "label_list = list(predict_labels)\n",
    "ro, th, fn, his = (label_list.count(label) for label in range(4))\n",
    "data = [ro, th, fn, his]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# Pie chart of the per-label prediction counts in `data` (label order 0-3).\n",
    "# NOTE(review): the count for label 3 is stored in a variable named `his`, but it is shown as 'etc' here - confirm the intended name.\n",
    "categories = ['romance', 'thriller', 'fantasy', 'etc']\n",
    "# autopct prints each slice's share so the figure can be read without the raw counts.\n",
    "plt.pie(data, autopct='%1.1f%%')\n",
    "plt.legend(categories)\n",
    "plt.title('Predicted genre share')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot every prediction with a single scatter call instead of one call (and one artist) per sample.\n",
    "sample_numbers = np.arange(1, len(predict_labels) + 1)  # 1-based sample number on the x-axis\n",
    "plt.scatter(sample_numbers, predict_labels, color='c')\n",
    "plt.xlabel('sample number')\n",
    "plt.ylabel('predicted label')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3 0 0 0 0 0 2 2 0 0 2 1 3 0 2 0 2 0 0 2 2 0 0 1 0 0 0 0 1 0 2 0 0 2 0 2 0\n",
      " 0 0 0 0 1 2 2 3 0 2 0 0 1 1 0 0 1 0 0 2 0 0 0 0 0 0 0 2 3 3 0 1 2 0 3 0 1\n",
      " 0 3 0 0 2 0 0 2 0 3 0 0 2 0 0 0 0 2 0 2 0 3 0 1 3 2 0 0 2 2 0 0 0 0 3 0 0\n",
      " 2 0 0 0 0 0 0 0 2 0 0 0 2 0 0 0 0 1 0 2 0 0 2 1 0 0 0 2 3 0 0 0 0 0 2 0 0\n",
      " 0 0 0 2 0 0 2 1 0 1 2 2 3 1 0 0 0 0 3 1 0 0 2 0 2 2 0 0 0 0 0 0 2 0 0 2 3\n",
      " 0 0 0 0 0 0 2 0 0 3 2 0 0 0 2 0 2 1 2 0 0 0 1 0 1 0 0 0 2 0 0 0 3 3 0 0 1\n",
      " 3 0 0 2 0 3 2 1 0 0 0 0 0 0 0 3 1 0 3 0 3 1 3 0 1 0 3 0 0 2 2 0 0 2 0 0 0\n",
      " 2 1 1 0 0 1 3 2 0 2 0 0 2 2 0 0 0 1 0 3 3 0 0 1 0 2 1 0 1 0 0 3 0 0 0 0 3\n",
      " 0 0 0 0 2 0 0 1 0 2 0]\n"
     ]
    }
   ],
   "source": [
    "# Print the predicted label of every sample, in input order.\n",
    "print(predict_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 확인 : 오페라의 유령"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['The 1988 Tony Award-winner for Best Musical features the work of Andrew Lloyd Webber and is the longest-running show in Broadway history.Story:It is 1881 and the backdrop in the notoriously haunted Op챕ra Populaire has just mysteriously fallen during rehearsal frightening the star performer from continuing with the show and forcing young Christine to be recast in the role.  After opening the production the Phantom abducts Christine and brings her to his lair and reveals his love for her.  When the Phantom discovers that Christine is already in love with Raoul he vows to destroy him - a promise that leads both him and Christine to a dramatic discovery of the true power of music and love. ']\n"
     ]
    }
   ],
   "source": [
    "# Show the raw synopsis stored at index 226 (The Phantom of the Opera, per the recorded output).\n",
    "print(Mu[226])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "# Predicted label for the same index (226) as the synopsis printed above.\n",
    "print(predict_labels[226])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "장르 색이 강한 오페라의 유령 같은 데이터에 대해서는 항상 정확한 값이 나오는 것을 확인할 수 있음!\n",
    "\n",
    "다만 뮤지컬 데이터는 영화처럼 전체 내용이 다 나와 있는 것이 아니라서 정확도가 떨어지는 경우가 발생함."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## END"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}