processing.ipynb 1.95 KB

Raw Blame History Permalink

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "vocab = {} \n",
    "sentences = []\n",
    "\n",
    "def processing(arr,all_word_to_index):\n",
    "    global all_word_to_index\n",
    "    for i in tqdm(arr):\n",
    "        sentence = word_tokenize(str(i)) # 단어 토큰화를 수행합니다.\n",
    "        result = []\n",
    "\n",
    "        for word in RMsentence: \n",
    "            word = word.lower() # 모든 단어를 소문자화하여 단어의 개수를 줄입니다.\n",
    "            if word not in stop_words: # 단어 토큰화 된 결과에 대해서 불용어를 제거합니다.\n",
    "                if len(word) > 2: # 단어 길이가 2이하인 경우에 대하여 추가로 단어를 제거합니다.\n",
    "                    result.append(word)\n",
    "                    if word not in vocab_r:\n",
    "                        vocab[word] = 0 \n",
    "                    vocab[word] += 1\n",
    "        sentences.append(result) \n",
    "\n",
    "    encoded = []\n",
    "    for s in sentences:\n",
    "        temp = []\n",
    "        for w in s:\n",
    "            try:\n",
    "                temp.append(all_word_to_index[w])\n",
    "            except KeyError:\n",
    "                temp.append(all_word_to_index['OOV'])\n",
    "        encoded.append(temp)\n",
    "    return encoded\n",
    "\n",
    "    "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}