{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5699\n",
      "9823\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "import re\n",
    "\n",
    "\n",
    "romance_plot = pd.read_csv('/Users/김서영/Desktop/datacap/data/moviedata/moviePlot/romancePlot.csv')\n",
    "thriller_plot = pd.read_csv('/Users/김서영/Desktop/datacap/data/moviedata/moviePlot/thrillerPlot.csv')\n",
    "\n",
    "print(len(romance_plot)) #5699 ==> train 2500 test 2500\n",
    "print(len(thriller_plot)) #9823 ==> train 2500 test 2500\n",
    "\n",
    "train_data_size = 2899\n",
    "\n",
    "#전처리(1-1) 데이터 csv 파일로 옮기기\n",
    "#romance_plot 2899개 train_data로 to_csv || 2800개 test_data로 to_csv\n",
    "#thriller_plot 2899개 train_data로 to_csv || 2800개 test_data로 to_csv\n"
   ]
  },
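  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the split described above, under the stated sizes (2899 train / 2800 test per genre). The output filenames are placeholders, not the project's real paths."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: write the train/test splits out with to_csv (filenames are placeholders)\n",
    "romance_plot[:train_data_size].to_csv('romance_train.csv', index=False)\n",
    "romance_plot[train_data_size:train_data_size + 2800].to_csv('romance_test.csv', index=False)\n",
    "thriller_plot[:train_data_size].to_csv('thriller_train.csv', index=False)\n",
    "thriller_plot[train_data_size:train_data_size + 2800].to_csv('thriller_test.csv', index=False)"
   ]
  },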
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 토큰화+전처리(3) 불용어 처리\n",
    "# 로맨스 플롯\n",
    "\n",
    "vocab = {} # 파이썬의 dictionary 자료형\n",
    "sentences = []\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "for i in romancePlot_str_list:\n",
    "    #print(type(str(i)))\n",
    "    sentence = word_tokenize(str(i)) # 단어 토큰화를 수행합니다.\n",
    "    result = []\n",
    "\n",
    "    for word in sentence: \n",
    "        word = word.lower() # 모든 단어를 소문자화하여 단어의 개수를 줄입니다.\n",
    "        if word not in stop_words: # 단어 토큰화 된 결과에 대해서 불용어를 제거합니다.\n",
    "            if len(word) > 2: # 단어 길이가 2이하인 경우에 대하여 추가로 단어를 제거합니다.\n",
    "                result.append(word)\n",
    "                if word not in vocab:\n",
    "                    vocab[word] = 0 \n",
    "                vocab[word] += 1\n",
    "    sentences.append(result) \n",
    "#print(sentences)\n",
    "#print(vocab)\n",
    "vocab_sorted = sorted(vocab.items(), key = lambda x:x[1], reverse = True)\n",
    "#print(vocab_sorted)\n",
    "\n",
    "#전처리(4) 인덱스 부여, 정수 인코딩\n",
    "word_to_index = {}\n",
    "i=0\n",
    "for (word, frequency) in vocab_sorted :\n",
    "    if frequency > 1 : # 정제(Cleaning) 챕터에서 언급했듯이 빈도수가 적은 단어는 제외한다.\n",
    "        i=i+1\n",
    "        word_to_index[word] = i\n",
    "#print(word_to_index)\n",
    "\n",
    "vocab_size = 500 #상위 500개 단어만 사용\n",
    "words_frequency = [w for w,c in word_to_index.items() if c >= vocab_size + 1] # 인덱스가 200 초과인 단어 제거\n",
    "for w in words_frequency:\n",
    "    del word_to_index[w] # 해당 단어에 대한 인덱스 정보를 삭제\n",
    "    \n",
    "word_to_index['OOV'] = len(word_to_index) + 1\n",
    "\n",
    "##전체 줄거리 가지고 하는 부분\n",
    "\n",
    "encoded = []\n",
    "for s in sentences:\n",
    "    temp = []\n",
    "    for w in s:\n",
    "        try:\n",
    "            temp.append(word_to_index[w])\n",
    "        except KeyError:\n",
    "            temp.append(word_to_index['OOV'])\n",
    "    encoded.append(temp)\n",
    "#print(encoded)"
   ]
  },
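  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The encoded plots are variable-length, so before they can feed an Embedding/LSTM stack they need padding to a fixed length. A minimal sketch, assuming the `encoded` list from the cell above; max_len=500 mirrors the IMDB practice cell below, but the right value should come from the length statistics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: pad/truncate the integer-encoded plots to a fixed length\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "max_len = 500 # assumption; pick from the length distribution in practice\n",
    "padded = pad_sequences(encoded, maxlen=max_len) # pads/truncates at the start of each sequence by default\n",
    "print(padded.shape) # (number of plots, max_len)"
   ]
  },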
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#전처리 방법에는 NLTK의 FreqDist, 케라스(Keras) 토크나이저도 사용 가능.\n",
    "\n",
    "1.love 1. death\n",
    "1, 로맨스\n",
    "1, 로맨스\n",
    "\n",
    "0,1-> 로맨스, 스릴러\n",
    "로맨스에서 love <- 로맨스\n",
    "스릴러 love, <- 로맨스 정확도의 문제\n",
    "love deth <-\n",
    "로맨스 인덱스\n",
    "스릴러 인덱스"
   ]
  },
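  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the Keras Tokenizer route mentioned above; it replaces the manual `vocab`/`word_to_index` bookkeeping and assumes the `sentences` token lists from the stopword-removal cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: integer-encode with the Keras Tokenizer instead of manual counting\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "\n",
    "tokenizer = Tokenizer(num_words=501, oov_token='OOV') # top 500 words + OOV (index 1)\n",
    "tokenizer.fit_on_texts(sentences) # accepts lists of already-tokenized words\n",
    "encoded_keras = tokenizer.texts_to_sequences(sentences)"
   ]
  },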
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "로맨스 플롯, 스릴러 따로 토큰화 해서 x train에 넣을지... 고민중\n",
    "\n",
    "이번주 : 전처리 완료, \n",
    "이번 달 목표 : 뮤지컬 장르 분류 << 다양한 모델 사용해보기.\n",
    "6월에 교차검증 및 장르 시각화 설계까지.\n",
    "\n",
    "다음주 :  2진분류(LSTM) 완료, RNN 분류기 만들어보기\n",
    "\n",
    "모델 설계할 때 단어 빈도수 참고하도록 만들기. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid character in identifier (<ipython-input-1-8071a46faed2>, line 6)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;36m  File \u001b[1;32m\"<ipython-input-1-8071a46faed2>\"\u001b[1;36m, line \u001b[1;32m6\u001b[0m\n\u001b[1;33m    X_train = train_sc_df.dropna().drop(‘trade_price_idx_value’, axis=1)\u001b[0m\n\u001b[1;37m                                                              ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid character in identifier\n"
     ]
    }
   ],
   "source": [
    "# 영화 줄거리는 X_train에, 장르 정보는 y_train에 저장된다.\n",
    "# 테스트용 줄거리 X_test에, 테스트용 줄거리의 장르 정보는 y_test에 저장된다.\n",
    "\n",
    "#맞춰서 저장하기. (진행중)\n",
    "\n",
    "X_train = train_sc_df.dropna().drop(‘trade_price_idx_value’, axis=1)"
   ]
  },
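  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of the 0/1 labeling plan (0 = romance, 1 = thriller). `romance_encoded` and `thriller_encoded` are hypothetical names for the integer-encoded plot lists produced by running the encoding cell once per genre."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: assemble X/y from the two encoded genre lists (hypothetical names)\n",
    "import numpy as np\n",
    "\n",
    "X_train = romance_encoded[:train_data_size] + thriller_encoded[:train_data_size]\n",
    "y_train = np.array([0] * train_data_size + [1] * train_data_size) # 0 = romance, 1 = thriller\n",
    "\n",
    "X_test = romance_encoded[train_data_size:] + thriller_encoded[train_data_size:train_data_size + 2800]\n",
    "y_test = np.array([0] * 2800 + [1] * 2800)"
   ]
  },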
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz\n",
      "17465344/17464789 [==============================] - 19s 1us/step\n",
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/10\n",
      "24960/25000 [============================>.] - ETA: 1s - loss: 0.4589 - acc: 0.7804\n",
      "Epoch 00001: val_acc improved from -inf to 0.82452, saving model to best_model.h5\n",
      "25000/25000 [==============================] - 803s 32ms/sample - loss: 0.4586 - acc: 0.7806 - val_loss: 0.4044 - val_acc: 0.8245\n",
      "Epoch 2/10\n",
      "24960/25000 [============================>.] - ETA: 1s - loss: 0.3530 - acc: 0.8500\n",
      "Epoch 00002: val_acc did not improve from 0.82452\n",
      "25000/25000 [==============================] - 959s 38ms/sample - loss: 0.3529 - acc: 0.8500 - val_loss: 0.4112 - val_acc: 0.8135\n",
      "Epoch 3/10\n",
      "24960/25000 [============================>.] - ETA: 1s - loss: 0.2544 - acc: 0.8996\n",
      "Epoch 00003: val_acc improved from 0.82452 to 0.83696, saving model to best_model.h5\n",
      "25000/25000 [==============================] - 1055s 42ms/sample - loss: 0.2542 - acc: 0.8997 - val_loss: 0.4095 - val_acc: 0.8370\n",
      "Epoch 4/10\n",
      "24960/25000 [============================>.] - ETA: 1s - loss: 0.2172 - acc: 0.9165\n",
      "Epoch 00004: val_acc improved from 0.83696 to 0.86588, saving model to best_model.h5\n",
      "25000/25000 [==============================] - 936s 37ms/sample - loss: 0.2174 - acc: 0.9164 - val_loss: 0.3471 - val_acc: 0.8659\n",
      "Epoch 5/10\n",
      "24960/25000 [============================>.] - ETA: 1s - loss: 0.1797 - acc: 0.9314\n",
      "Epoch 00005: val_acc did not improve from 0.86588\n",
      "25000/25000 [==============================] - 898s 36ms/sample - loss: 0.1799 - acc: 0.9314 - val_loss: 0.3385 - val_acc: 0.8654\n",
      "Epoch 6/10\n",
      " 1856/25000 [=>............................] - ETA: 10:06 - loss: 0.1396 - acc: 0.9520WARNING:tensorflow:Early stopping conditioned on metric `val_loss` which is not available. Available metrics are: loss,acc\n",
      "WARNING:tensorflow:Can save best model only with val_acc available, skipping.\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-20-fd15450a52a3>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     24\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     25\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'binary_crossentropy'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'adam'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmetrics\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'acc'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 26\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m64\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)\u001b[0m\n\u001b[0;32m    817\u001b[0m         \u001b[0mmax_queue_size\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmax_queue_size\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    818\u001b[0m         \u001b[0mworkers\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mworkers\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 819\u001b[1;33m         use_multiprocessing=use_multiprocessing)\n\u001b[0m\u001b[0;32m    820\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    821\u001b[0m   def evaluate(self,\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\keras\\engine\\training_v2.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)\u001b[0m\n\u001b[0;32m    340\u001b[0m                 \u001b[0mmode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mModeKeys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTRAIN\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    341\u001b[0m                 \u001b[0mtraining_context\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtraining_context\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 342\u001b[1;33m                 total_epochs=epochs)\n\u001b[0m\u001b[0;32m    343\u001b[0m             \u001b[0mcbks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmake_logs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepoch_logs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtraining_result\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mModeKeys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTRAIN\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    344\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\keras\\engine\\training_v2.py\u001b[0m in \u001b[0;36mrun_one_epoch\u001b[1;34m(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)\u001b[0m\n\u001b[0;32m    126\u001b[0m         step=step, mode=mode, size=current_batch_size) as batch_logs:\n\u001b[0;32m    127\u001b[0m       \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 128\u001b[1;33m         \u001b[0mbatch_outs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mexecution_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    129\u001b[0m       \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mStopIteration\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mOutOfRangeError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    130\u001b[0m         \u001b[1;31m# TODO(kaftan): File bug about tf function and errors.OutOfRangeError?\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\keras\\engine\\training_v2_utils.py\u001b[0m in \u001b[0;36mexecution_function\u001b[1;34m(input_fn)\u001b[0m\n\u001b[0;32m     96\u001b[0m     \u001b[1;31m# `numpy` translates Tensors to values in Eager mode.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     97\u001b[0m     return nest.map_structure(_non_none_constant_value,\n\u001b[1;32m---> 98\u001b[1;33m                               distributed_function(input_fn))\n\u001b[0m\u001b[0;32m     99\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    100\u001b[0m   \u001b[1;32mreturn\u001b[0m \u001b[0mexecution_function\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    566\u001b[0m         \u001b[0mxla_context\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mExit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    567\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 568\u001b[1;33m       \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    569\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    570\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mtracing_count\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_tracing_count\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m_call\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    597\u001b[0m       \u001b[1;31m# In this case we have created variables on the first call, so we run the\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    598\u001b[0m       \u001b[1;31m# defunned version which is guaranteed to never create variables.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 599\u001b[1;33m       \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateless_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# pylint: disable=not-callable\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    600\u001b[0m     \u001b[1;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateful_fn\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    601\u001b[0m       \u001b[1;31m# Release the lock early so that multiple threads can perform the call\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   2361\u001b[0m     \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2362\u001b[0m       \u001b[0mgraph_function\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_maybe_define_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2363\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_filtered_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# pylint: disable=protected-access\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2364\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2365\u001b[0m   \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_filtered_call\u001b[1;34m(self, args, kwargs)\u001b[0m\n\u001b[0;32m   1609\u001b[0m          if isinstance(t, (ops.Tensor,\n\u001b[0;32m   1610\u001b[0m                            resource_variable_ops.BaseResourceVariable))),\n\u001b[1;32m-> 1611\u001b[1;33m         self.captured_inputs)\n\u001b[0m\u001b[0;32m   1612\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1613\u001b[0m   \u001b[1;32mdef\u001b[0m \u001b[0m_call_flat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcaptured_inputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcancellation_manager\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_call_flat\u001b[1;34m(self, args, captured_inputs, cancellation_manager)\u001b[0m\n\u001b[0;32m   1690\u001b[0m       \u001b[1;31m# No tape is watching; skip to running the function.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1691\u001b[0m       return self._build_call_outputs(self._inference_function.call(\n\u001b[1;32m-> 1692\u001b[1;33m           ctx, args, cancellation_manager=cancellation_manager))\n\u001b[0m\u001b[0;32m   1693\u001b[0m     forward_backward = self._select_forward_and_backward_functions(\n\u001b[0;32m   1694\u001b[0m         \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\function.py\u001b[0m in \u001b[0;36mcall\u001b[1;34m(self, ctx, args, cancellation_manager)\u001b[0m\n\u001b[0;32m    543\u001b[0m               \u001b[0minputs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    544\u001b[0m               \u001b[0mattrs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"executor_type\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexecutor_type\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"config_proto\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 545\u001b[1;33m               ctx=ctx)\n\u001b[0m\u001b[0;32m    546\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    547\u001b[0m           outputs = execute.execute_with_cancellation(\n",
      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\eager\\execute.py\u001b[0m in \u001b[0;36mquick_execute\u001b[1;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[0;32m     59\u001b[0m     tensors = pywrap_tensorflow.TFE_Py_Execute(ctx._handle, device_name,\n\u001b[0;32m     60\u001b[0m                                                \u001b[0mop_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mattrs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 61\u001b[1;33m                                                num_outputs)\n\u001b[0m\u001b[0;32m     62\u001b[0m   \u001b[1;32mexcept\u001b[0m \u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_NotOkStatusException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     63\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "#LSTM 분류 연습\n",
    "\n",
    "from tensorflow.keras.datasets import imdb\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense, LSTM, Embedding\n",
    "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from tensorflow.keras.models import load_model\n",
    "\n",
    "(X_train, y_train), (X_test, y_test) = \n",
    "\n",
    "max_len = 500\n",
    "X_train = pad_sequences(X_train, maxlen=max_len)\n",
    "X_test = pad_sequences(X_test, maxlen=max_len)\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Embedding(5000, 120))\n",
    "model.add(LSTM(120))\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "\n",
    "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)\n",
    "mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)\n",
    "\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])\n",
    "model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[es, mc])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "loaded_model = load_model('best_model.h5')\n",
    "print(\"\\n 테스트 정확도: %.4f\" % (loaded_model.evaluate(X_test, y_test)[1]))"
   ]
  },
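  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of scoring a new plot once a genre model has been trained on the `word_to_index` encoding above. The example sentence, the 0.5 threshold, and the 0 = romance / 1 = thriller mapping are assumptions, and `loaded_model` at this point is the IMDB practice model, so this is illustrative only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch: encode a raw plot string the same way as the training data, then score it\n",
    "from nltk.tokenize import word_tokenize\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "def encode_plot(text):\n",
    "    tokens = [w.lower() for w in word_tokenize(text)]\n",
    "    ids = [word_to_index.get(w, word_to_index['OOV']) for w in tokens]\n",
    "    return pad_sequences([ids], maxlen=max_len)\n",
    "\n",
    "score = loaded_model.predict(encode_plot('A detective hunts a serial killer.'))[0][0]\n",
    "print('thriller' if score > 0.5 else 'romance') # assumed label mapping"
   ]
  },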
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'X_train' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-16-4d2ea5635e5d>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m#평균 길이 출력\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mlen_result\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0ms\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mX_train\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'리뷰의 최대 길이 : {}'\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen_result\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'리뷰의 평균 길이 : {}'\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen_result\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'X_train' is not defined"
     ]
    }
   ],
   "source": [
    "#평균 길이 출력\n",
    "len_result = [len(s) for s in X_train]\n",
    "\n",
    "print('리뷰의 최대 길이 : {}'.format(np.max(len_result)))\n",
    "print('리뷰의 평균 길이 : {}'.format(np.mean(len_result)))\n",
    "\n",
    "plt.subplot(1,2,1)\n",
    "plt.boxplot(len_result)\n",
    "plt.subplot(1,2,2)\n",
    "plt.hist(len_result, bins=50)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 영화 리뷰 예제\n",
    "# 0. 사용할 패키지 불러오기\n",
    "from keras.utils import np_utils\n",
    "from keras.datasets import mnist\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Activation\n",
    "import numpy as np\n",
    "from numpy import argmax\n",
    "\n",
    "# 1. 데이터셋 생성하기\n",
    "\n",
    "# 훈련셋과 시험셋 불러오기\n",
    "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
    "\n",
    "# 데이터셋 전처리\n",
    "x_train = x_train.reshape(60000, 784).astype('float32') / 255.0\n",
    "x_test = x_test.reshape(10000, 784).astype('float32') / 255.0\n",
    "\n",
    "# 원핫인코딩 (one-hot encoding) 처리\n",
    "y_train = np_utils.to_categorical(y_train)\n",
    "y_test = np_utils.to_categorical(y_test)\n",
    "\n",
    "# 훈련셋과 검증셋 분리\n",
    "x_val = x_train[:42000] # 훈련셋의 30%를 검증셋으로 사용\n",
    "x_train = x_train[42000:]\n",
    "y_val = y_train[:42000] # 훈련셋의 30%를 검증셋으로 사용\n",
    "y_train = y_train[42000:]\n",
    "\n",
    "# 2. 모델 구성하기\n",
    "model = Sequential()\n",
    "model.add(Dense(units=64, input_dim=28*28, activation='relu'))\n",
    "model.add(Dense(units=10, activation='softmax'))\n",
    "\n",
    "# 3. 모델 학습과정 설정하기\n",
    "model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])\n",
    "\n",
    "# 4. 모델 학습시키기\n",
    "model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_val, y_val))\n",
    "\n",
    "# 5. 모델 평가하기\n",
    "loss_and_metrics = model.evaluate(x_test, y_test, batch_size=32)\n",
    "print('')\n",
    "print('loss_and_metrics : ' + str(loss_and_metrics))\n",
    "\n",
    "# 6. 모델 사용하기\n",
    "xhat_idx = np.random.choice(x_test.shape[0], 5)\n",
    "xhat = x_test[xhat_idx]\n",
    "yhat = model.predict_classes(xhat)\n",
    "\n",
    "for i in range(5):\n",
    "    print('True : ' + str(argmax(y_test[xhat_idx[i]])) + ', Predict : ' + str(yhat[i]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}