Created
October 22, 2020 11:19
-
-
Save MachineLearningIsEasy/375f68fe596ba7efc9b8a50804f8339d to your computer and use it in GitHub Desktop.
NLP векторные представления текстов
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "2 Векторные представления.ipynb", | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5K1WlUmN6pJN" | |
}, | |
"source": [ | |
"\n", | |
"\n", | |
"[перейти](https://www.bigdataschool.ru/)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vI0rSTxNfMXu", | |
"outputId": "fdcd65b6-3e0c-4d91-a8ac-7eb04b1aa29c", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 241 | |
} | |
}, | |
"source": [ | |
"!pip install pymorphy2\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import nltk\n", | |
"import re\n", | |
"import csv\n", | |
"from nltk.stem import WordNetLemmatizer\n", | |
"import sklearn\n", | |
"import codecs\n", | |
"import pymorphy2\n", | |
"import seaborn as sns\n", | |
"sns.set_style(\"darkgrid\")\n", | |
"from nltk.stem.snowball import SnowballStemmer\n", | |
"\n", | |
"from google.colab import drive\n", | |
"drive.mount('/content/drive')" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Collecting pymorphy2\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/07/57/b2ff2fae3376d4f3c697b9886b64a54b476e1a332c67eee9f88e7f1ae8c9/pymorphy2-0.9.1-py3-none-any.whl (55kB)\n", | |
"\u001b[K |████████████████████████████████| 61kB 1.7MB/s \n", | |
"\u001b[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/3a/79/bea0021eeb7eeefde22ef9e96badf174068a2dd20264b9a378f2be1cdd9e/pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2MB)\n", | |
"\u001b[K |████████████████████████████████| 8.2MB 5.9MB/s \n", | |
"\u001b[?25hRequirement already satisfied: docopt>=0.6 in /usr/local/lib/python3.6/dist-packages (from pymorphy2) (0.6.2)\n", | |
"Collecting dawg-python>=0.7.1\n", | |
" Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl\n", | |
"Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2\n", | |
"Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844\n", | |
"Mounted at /content/drive\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "LBF3ntXC5TWG", | |
"outputId": "48e8eaa1-d6f2-4213-e658-1b43c4612c29", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 68 | |
} | |
}, | |
"source": [ | |
"import nltk\n", | |
"nltk.download('stopwords')" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n", | |
"[nltk_data] Unzipping corpora/stopwords.zip.\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 2 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "NrdXlozI5Hev" | |
}, | |
"source": [ | |
"### Функции" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "dRIUfW88fI6A" | |
}, | |
"source": [ | |
"from nltk.corpus import stopwords\n", | |
"stopWords = set(stopwords.words('russian'))\n", | |
"\n", | |
"def csv_to_list(arr):\n", | |
" arr_list = []\n", | |
" for row in arr:\n", | |
" arr_list.append(list_to_str(row))\n", | |
" return arr_list\n", | |
"\n", | |
"def list_to_str(arr):\n", | |
" str_ = ''\n", | |
" for rec in arr:\n", | |
" str_+=rec\n", | |
" return str_\n", | |
"\n", | |
"def df_preprocess(text): \n", | |
" reg = re.compile('[^а-яА-яa-zA-Z0-9 ]') #\n", | |
" text = text.lower().replace(\"ё\", \"е\")\n", | |
" text = text.replace(\"ъ\", \"ь\")\n", | |
" text = text.replace(\"й\", \"и\")\n", | |
" text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', 'сайт', text)\n", | |
" text = re.sub('@[^\\s]+', 'пользователь', text)\n", | |
" text = reg.sub(' ', text)\n", | |
" \n", | |
" # Лемматизация\n", | |
" #morph = pymorphy2.MorphAnalyzer()\n", | |
" #text =[morph.parse(word)[0].normal_form for word in text.split()]\n", | |
"\n", | |
" # Стемминг\n", | |
" # stemmer = SnowballStemmer(\"russian\")\n", | |
" # text =[stemmer.stem(word) for word in text.split()]\n", | |
"\n", | |
" # Стемминг + удаление стоп слов\n", | |
" stemmer = SnowballStemmer(\"russian\")\n", | |
" #text =[stemmer.stem(word) for word in text.split() if word not in stopWords]\n", | |
" text = ' '.join([stemmer.stem(word) for word in text.split() if word not in stopWords])\n", | |
"\n", | |
" return text" | |
], | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "0dCTRpn55ONd" | |
}, | |
"source": [ | |
"### Считываем данные\n", | |
"\n", | |
"Используем корпус с сайта https://study.mokoron.com/#download" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "87Chu_tOYxW2" | |
}, | |
"source": [ | |
"positive_recalls = csv_to_list(csv.reader(codecs.open('/content/drive/My Drive/Colab Notebooks/NLP/positive_recalls.csv', 'rU', 'utf-8', errors='ignore')))\n", | |
"negative_recalls = csv_to_list(csv.reader(codecs.open('/content/drive/My Drive/Colab Notebooks/NLP/negative_recalls.csv', 'rU', 'utf-8', errors='ignore')))" | |
], | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "wF5HZPIb957U" | |
}, | |
"source": [ | |
"### Формируем датасет " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "KjrfTaD_7wo1", | |
"outputId": "d605803a-529a-45dd-f769-3949698f333e", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
} | |
}, | |
"source": [ | |
"df_positive_recalls = pd.DataFrame(positive_recalls, columns=['recall'])\n", | |
"df_positive_recalls['type']=1\n", | |
"df_positive_recalls.head()" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>recall</th>\n", | |
" <th>type</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>@first_timee хоть я и школота но поверь у нас ...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Да все-таки он немного похож на него. Но мой м...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>RT @KatiaCheh: Ну ты идиотка) я испугалась за ...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>RT @digger2912: \"Кто то в углу сидит и погибае...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>@irina_dyshkant Вот что значит страшилка :D\\nН...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" recall type\n", | |
"0 @first_timee хоть я и школота но поверь у нас ... 1\n", | |
"1 Да все-таки он немного похож на него. Но мой м... 1\n", | |
"2 RT @KatiaCheh: Ну ты идиотка) я испугалась за ... 1\n", | |
"3 RT @digger2912: \"Кто то в углу сидит и погибае... 1\n", | |
"4 @irina_dyshkant Вот что значит страшилка :D\\nН... 1" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 6 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "C3oNsEsg-2Yb", | |
"outputId": "1ca2222d-fa21-44a8-c104-6cc3dbb114eb", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
} | |
}, | |
"source": [ | |
"df_negative_recalls = pd.DataFrame(negative_recalls, columns=['recall'])\n", | |
"df_negative_recalls['type']=0\n", | |
"df_negative_recalls.head()" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>recall</th>\n", | |
" <th>type</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>на работе был полный пиддес :| и так каждое за...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Коллеги сидят рубятся в Urban terror а я из-за...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>@elina_4post как говорят обещаного три года жд...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Желаю хорошего полёта и удачной посадкия буду ...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Обновил за каким-то лешим surf теперь не работ...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" recall type\n", | |
"0 на работе был полный пиддес :| и так каждое за... 0\n", | |
"1 Коллеги сидят рубятся в Urban terror а я из-за... 0\n", | |
"2 @elina_4post как говорят обещаного три года жд... 0\n", | |
"3 Желаю хорошего полёта и удачной посадкия буду ... 0\n", | |
"4 Обновил за каким-то лешим surf теперь не работ... 0" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 7 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2H5O4MG1-8uT", | |
"outputId": "73d56848-2634-4299-fa95-395514f8483b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
} | |
}, | |
"source": [ | |
"# Объединяем два датафрейма вместе\n", | |
"df_recalls = pd.concat((df_negative_recalls, df_positive_recalls),axis = 0).sample(frac = 1.0) # объединяем и перемешиваем\n", | |
"df_recalls.index = range(0,len(df_recalls))\n", | |
"df_recalls.head()" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>recall</th>\n", | |
" <th>type</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Сегодня у всех парней обострениеговно изо рта ...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Я никогда не туплю. Я просто делаю все в своем...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>RT @Sugar_Kroshka: Опять сердце ..и голова кру...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>RT @DoDFavorit: @Olga_Wholock Мадам да Вы смущ...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>RT @SachihiroB: 20k Years Into Space — очень з...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" recall type\n", | |
"0 Сегодня у всех парней обострениеговно изо рта ... 1\n", | |
"1 Я никогда не туплю. Я просто делаю все в своем... 1\n", | |
"2 RT @Sugar_Kroshka: Опять сердце ..и голова кру... 0\n", | |
"3 RT @DoDFavorit: @Olga_Wholock Мадам да Вы смущ... 1\n", | |
"4 RT @SachihiroB: 20k Years Into Space — очень з... 1" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "qYV7zbB7_MC-" | |
}, | |
"source": [ | |
"### Удалим стоп слова" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "AZ_qod5R5LpL" | |
}, | |
"source": [ | |
"stopWords = set(stopwords.words('russian'))" | |
], | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "54lNMQdG5QHC", | |
"outputId": "8a9c7b8e-f69c-41a3-9c53-c89705c74dd4", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1000 | |
} | |
}, | |
"source": [ | |
"stopWords" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'а',\n", | |
" 'без',\n", | |
" 'более',\n", | |
" 'больше',\n", | |
" 'будет',\n", | |
" 'будто',\n", | |
" 'бы',\n", | |
" 'был',\n", | |
" 'была',\n", | |
" 'были',\n", | |
" 'было',\n", | |
" 'быть',\n", | |
" 'в',\n", | |
" 'вам',\n", | |
" 'вас',\n", | |
" 'вдруг',\n", | |
" 'ведь',\n", | |
" 'во',\n", | |
" 'вот',\n", | |
" 'впрочем',\n", | |
" 'все',\n", | |
" 'всегда',\n", | |
" 'всего',\n", | |
" 'всех',\n", | |
" 'всю',\n", | |
" 'вы',\n", | |
" 'где',\n", | |
" 'да',\n", | |
" 'даже',\n", | |
" 'два',\n", | |
" 'для',\n", | |
" 'до',\n", | |
" 'другой',\n", | |
" 'его',\n", | |
" 'ее',\n", | |
" 'ей',\n", | |
" 'ему',\n", | |
" 'если',\n", | |
" 'есть',\n", | |
" 'еще',\n", | |
" 'ж',\n", | |
" 'же',\n", | |
" 'за',\n", | |
" 'зачем',\n", | |
" 'здесь',\n", | |
" 'и',\n", | |
" 'из',\n", | |
" 'или',\n", | |
" 'им',\n", | |
" 'иногда',\n", | |
" 'их',\n", | |
" 'к',\n", | |
" 'как',\n", | |
" 'какая',\n", | |
" 'какой',\n", | |
" 'когда',\n", | |
" 'конечно',\n", | |
" 'кто',\n", | |
" 'куда',\n", | |
" 'ли',\n", | |
" 'лучше',\n", | |
" 'между',\n", | |
" 'меня',\n", | |
" 'мне',\n", | |
" 'много',\n", | |
" 'может',\n", | |
" 'можно',\n", | |
" 'мой',\n", | |
" 'моя',\n", | |
" 'мы',\n", | |
" 'на',\n", | |
" 'над',\n", | |
" 'надо',\n", | |
" 'наконец',\n", | |
" 'нас',\n", | |
" 'не',\n", | |
" 'него',\n", | |
" 'нее',\n", | |
" 'ней',\n", | |
" 'нельзя',\n", | |
" 'нет',\n", | |
" 'ни',\n", | |
" 'нибудь',\n", | |
" 'никогда',\n", | |
" 'ним',\n", | |
" 'них',\n", | |
" 'ничего',\n", | |
" 'но',\n", | |
" 'ну',\n", | |
" 'о',\n", | |
" 'об',\n", | |
" 'один',\n", | |
" 'он',\n", | |
" 'она',\n", | |
" 'они',\n", | |
" 'опять',\n", | |
" 'от',\n", | |
" 'перед',\n", | |
" 'по',\n", | |
" 'под',\n", | |
" 'после',\n", | |
" 'потом',\n", | |
" 'потому',\n", | |
" 'почти',\n", | |
" 'при',\n", | |
" 'про',\n", | |
" 'раз',\n", | |
" 'разве',\n", | |
" 'с',\n", | |
" 'сам',\n", | |
" 'свою',\n", | |
" 'себе',\n", | |
" 'себя',\n", | |
" 'сейчас',\n", | |
" 'со',\n", | |
" 'совсем',\n", | |
" 'так',\n", | |
" 'такой',\n", | |
" 'там',\n", | |
" 'тебя',\n", | |
" 'тем',\n", | |
" 'теперь',\n", | |
" 'то',\n", | |
" 'тогда',\n", | |
" 'того',\n", | |
" 'тоже',\n", | |
" 'только',\n", | |
" 'том',\n", | |
" 'тот',\n", | |
" 'три',\n", | |
" 'тут',\n", | |
" 'ты',\n", | |
" 'у',\n", | |
" 'уж',\n", | |
" 'уже',\n", | |
" 'хорошо',\n", | |
" 'хоть',\n", | |
" 'чего',\n", | |
" 'чем',\n", | |
" 'через',\n", | |
" 'что',\n", | |
" 'чтоб',\n", | |
" 'чтобы',\n", | |
" 'чуть',\n", | |
" 'эти',\n", | |
" 'этого',\n", | |
" 'этой',\n", | |
" 'этом',\n", | |
" 'этот',\n", | |
" 'эту',\n", | |
" 'я'}" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 10 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "-mTrgM0E_eAW" | |
}, | |
"source": [ | |
"### Очитска текста приведение слов к стандартному виду" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gHI2wqsn_kMB", | |
"outputId": "a553bb84-2503-4387-c85a-9afdd6d4b871", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 51 | |
} | |
}, | |
"source": [ | |
"%time df_recalls['recall'] = df_recalls['recall'].apply(df_preprocess)" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1min 40s, sys: 79.8 ms, total: 1min 40s\n", | |
"Wall time: 1min 40s\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Kca4OwK8BRPt", | |
"outputId": "f51ad583-e159-4f84-a810-45bb64cf185a", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
} | |
}, | |
"source": [ | |
"df_recalls.head()" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>recall</th>\n", | |
" <th>type</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>сегодн парн обострениеговн из рта льет</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>тупл прост дела сво стил d сайт</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>rt пользовател сердц голов круж</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>rt пользовател пользовател мад смуща деиствите...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>rt пользовател 20k years int spac очен занимат...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" recall type\n", | |
"0 сегодн парн обострениеговн из рта льет 1\n", | |
"1 тупл прост дела сво стил d сайт 1\n", | |
"2 rt пользовател сердц голов круж 0\n", | |
"3 rt пользовател пользовател мад смуща деиствите... 1\n", | |
"4 rt пользовател 20k years int spac очен занимат... 1" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "O2NIMMwd_Cpi" | |
}, | |
"source": [ | |
"### Train/test split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_kkes5A7_CAl" | |
}, | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"X_train, X_test, y_train, y_test = train_test_split(df_recalls['recall'], df_recalls['type'], test_size=.15, random_state=42)\n" | |
], | |
"execution_count": 13, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ImvDPxubh_Su", | |
"outputId": "cb002f7c-caf9-488e-8374-1f46696a7f1b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"type(X_train)" | |
], | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"pandas.core.series.Series" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 14 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "J9nRJ3Z5iEjQ", | |
"outputId": "34b6cfdf-94e5-449a-8524-4cf47f8d1925", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(len(X_train), len(X_test))" | |
], | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"192808 34026\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ipEpIuJwiM0g", | |
"outputId": "5eb9cf10-1d9e-4188-bce5-110655681d3b", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
} | |
}, | |
"source": [ | |
"X_train[100]" | |
], | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
}, | |
"text/plain": [ | |
"'добав стен потеря музык сайт'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 16 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "hTYYSDy5jx_o", | |
"outputId": "7fb84c2e-4e70-4a75-9af3-c574b5762e90", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"y_train[100]" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"1" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 17 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ZBy6f8q6-xzv" | |
}, | |
"source": [ | |
"### BOW" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1PdMo6PY_BXL" | |
}, | |
"source": [ | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"vectorizer = CountVectorizer()\n", | |
"\n", | |
"X_train_BOW = vectorizer.fit_transform(X_train)\n", | |
"X_test_BOW = vectorizer.transform(X_test)" | |
], | |
"execution_count": 18, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "URXczudRj9p2", | |
"outputId": "147ddf3c-d72b-4d2b-891e-c67b86dd1775", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(X_train_BOW.shape, X_test_BOW.shape)" | |
], | |
"execution_count": 19, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(192808, 110125) (34026, 110125)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "6D2JImLqZ-bW", | |
"outputId": "a94654ee-d1b5-457f-a0c2-10dfc4d90048", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
} | |
}, | |
"source": [ | |
"X_train.iloc[200]" | |
], | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
}, | |
"text/plain": [ | |
"'пользовател иогурт эрмигурт очен широк горлышк пит неудобн бутылк убира губ царапа'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 20 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "hb9w4Mgqm2s7", | |
"outputId": "ba29e657-8dcb-46d9-ee8d-1d86cf8e310d", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 51 | |
} | |
}, | |
"source": [ | |
"# Векторное представление\n", | |
"X_train_BOW[200]" | |
], | |
"execution_count": 21, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<1x110125 sparse matrix of type '<class 'numpy.int64'>'\n", | |
"\twith 12 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 21 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "pmOeNxCznybd" | |
}, | |
"source": [ | |
"### TF-IDF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "T3mKWWnwn0im" | |
}, | |
"source": [ | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"vectorizer = TfidfVectorizer()\n", | |
"\n", | |
"X_train_TFIDF = vectorizer.fit_transform(X_train)\n", | |
"X_test_TFIDF = vectorizer.transform(X_test)" | |
], | |
"execution_count": 22, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "NpFtdBSYoBgT", | |
"outputId": "abc70af0-d17a-4cb1-8734-7fb7ef259986", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"print(X_train_TFIDF.shape, X_test_TFIDF.shape)" | |
], | |
"execution_count": 23, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(192808, 110125) (34026, 110125)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "E6ArOkpioIOs", | |
"outputId": "bfc6ee5b-e1c1-4093-d47d-02d20cb2e433", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 51 | |
} | |
}, | |
"source": [ | |
"# Векторное представление\n", | |
"X_train_TFIDF[200]" | |
], | |
"execution_count": 24, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<1x110125 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 12 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 24 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "DVF6O7dsaRjO" | |
}, | |
"source": [ | |
"### Строим простейшую модель" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "hd5OlzOpbxD_" | |
}, | |
"source": [ | |
"#### На данных BOW" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4q3FF9T5dgZZ" | |
}, | |
"source": [ | |
"from sklearn.linear_model import LogisticRegression\n", | |
"from sklearn.metrics import accuracy_score\n" | |
], | |
"execution_count": 25, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2hIL5d9Wa3WV", | |
"outputId": "ae2b187c-4134-4881-a9e6-5cfae750a1d8", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 207 | |
} | |
}, | |
"source": [ | |
"# обучаем классификатор\n", | |
"%time clf = LogisticRegression(random_state=0).fit(X_train_BOW, y_train)" | |
], | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 6.83 s, sys: 4.54 s, total: 11.4 s\n", | |
"Wall time: 5.86 s\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n", | |
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", | |
"\n", | |
"Increase the number of iterations (max_iter) or scale the data as shown in:\n", | |
" https://scikit-learn.org/stable/modules/preprocessing.html\n", | |
"Please also refer to the documentation for alternative solver options:\n", | |
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", | |
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n" | |
], | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1CEVfbAKbYga" | |
}, | |
"source": [ | |
"# вычисляем предсказания\n", | |
"y_predict_BOW = clf.predict(X_test_BOW)" | |
], | |
"execution_count": 27, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cXxx3qZKboL8", | |
"outputId": "aff5779e-817f-4d14-ce09-5fd0544eebc0", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"# вычисляем метрику accuracy\n", | |
"accuracy_score(y_predict_BOW, y_test)" | |
], | |
"execution_count": 28, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.7360253923470288" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 28 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ccgCm6NKb0Wy" | |
}, | |
"source": [ | |
"#### На данных TF-IDF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-Lpnp-YQb20w", | |
"outputId": "c3e068b2-4d00-441f-f3b4-cb318f18deb8", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 207 | |
} | |
}, | |
"source": [ | |
"%time clf = LogisticRegression(random_state=43).fit(X_train_TFIDF, y_train)" | |
], | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 7.34 s, sys: 4.94 s, total: 12.3 s\n", | |
"Wall time: 6.3 s\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n", | |
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", | |
"\n", | |
"Increase the number of iterations (max_iter) or scale the data as shown in:\n", | |
" https://scikit-learn.org/stable/modules/preprocessing.html\n", | |
"Please also refer to the documentation for alternative solver options:\n", | |
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", | |
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n" | |
], | |
"name": "stderr" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ERtxqj6qb7i0" | |
}, | |
"source": [ | |
"y_predict_TFIDF = clf.predict(X_test_TFIDF)" | |
], | |
"execution_count": 30, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wFvxSEK7cBB7", | |
"outputId": "52a9241a-514d-4da6-dc89-ead417ed644d", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
} | |
}, | |
"source": [ | |
"accuracy_score(y_predict_TFIDF, y_test)" | |
], | |
"execution_count": 31, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.7341444777523071" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 31 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "VQxum79ockEI" | |
}, | |
"source": [ | |
"#### На данных BOW с биграммами" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "hDFo5A8wcinG", | |
"outputId": "7a4639e2-1deb-419d-b689-95721daedb00", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 207 | |
} | |
}, | |
"source": [ | |
"#-----------------------------------------------\n", | |
"vectorizer = CountVectorizer(ngram_range=(1, 2))\n", | |
"#-----------------------------------------------\n", | |
"X_train_BOW_bi = vectorizer.fit_transform(X_train)\n", | |
"X_test_BOW_bi = vectorizer.transform(X_test)\n", | |
"#-----------------------------------------------\n", | |
"print(X_train_BOW_bi.shape, X_test_BOW_bi.shape)\n", | |
"#-----------------------------------------------\n", | |
"clf = LogisticRegression(random_state=0).fit(X_train_BOW_bi, y_train)\n", | |
"#-----------------------------------------------\n", | |
"y_predict_BOW_bi = clf.predict(X_test_BOW_bi)\n", | |
"#-----------------------------------------------\n", | |
"accuracy_score(y_predict_BOW_bi, y_test)" | |
], | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(192808, 893652) (34026, 893652)\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n", | |
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", | |
"\n", | |
"Increase the number of iterations (max_iter) or scale the data as shown in:\n", | |
" https://scikit-learn.org/stable/modules/preprocessing.html\n", | |
"Please also refer to the documentation for alternative solver options:\n", | |
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", | |
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n" | |
], | |
"name": "stderr" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"0.7493681302533357" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 32 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment