Created
September 17, 2021 09:21
-
-
Save MachineLearningIsEasy/84f797da6c1dff7313851980459d302f to your computer and use it in GitHub Desktop.
IMDB convolution NN
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5K1WlUmN6pJN" | |
}, | |
"source": [ | |
"\n", | |
"\n", | |
"[перейти](https://www.bigdataschool.ru/)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "Lv1nORvnYVQn" | |
}, | |
"source": [ | |
"## Определение тональности отзывов на фильмы с сайта IMDB" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"colab_type": "code", | |
"executionInfo": { | |
"elapsed": 2463, | |
"status": "ok", | |
"timestamp": 1554888691310, | |
"user": { | |
"displayName": "Максим Батькович", | |
"photoUrl": "", | |
"userId": "08926243841298764575" | |
}, | |
"user_tz": -180 | |
}, | |
"id": "lpsyhVlJX5XH", | |
"outputId": "2ca82cb8-1d57-4a58-d045-b5f24d3b8ceb" | |
}, | |
"outputs": [], | |
"source": [ | |
"from tensorflow.keras.datasets import imdb\n", | |
"from tensorflow.keras.models import Sequential\n", | |
"from tensorflow.keras.layers import Dense\n", | |
"from tensorflow.keras.layers import Flatten\n", | |
"from tensorflow.keras.layers import Conv1D\n", | |
"from tensorflow.keras.layers import MaxPooling1D\n", | |
"from tensorflow.keras.layers import Embedding\n", | |
"from tensorflow.keras.preprocessing import sequence\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "xGDJkdINNWVh" | |
}, | |
"source": [ | |
"## Загружаем данные" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#берем только наиболее частые 7000 слов\n", | |
"top_words = 7000\n", | |
"#разбиваем данные на корпус для обучения и теста\n", | |
"\n", | |
"import ssl\n", | |
"\n", | |
"try:\n", | |
" _create_unverified_https_context = ssl._create_unverified_context\n", | |
"except AttributeError:\n", | |
" pass\n", | |
"else:\n", | |
" ssl._create_default_https_context = _create_unverified_https_context\n", | |
"(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Смотрим на данные" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]\n", | |
"1\n", | |
"Shape of training data: \n", | |
"(25000,)\n", | |
"(25000,)\n", | |
"Shape of test data: \n", | |
"(25000,)\n", | |
"(25000,)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Looking at the nature of training data\n", | |
"print(X_train[0])\n", | |
"print(y_train[0])\n", | |
"print('Shape of training data: ')\n", | |
"print(X_train.shape)\n", | |
"print(y_train.shape)\n", | |
"print('Shape of test data: ')\n", | |
"print(X_test.shape)\n", | |
"print(y_test.shape)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Препроцессинг для нейронной сети" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Padding the data samples to a maximum review length in words\n", | |
"max_words = 450\n", | |
"X_train = sequence.pad_sequences(X_train, maxlen=max_words)\n", | |
"X_test = sequence.pad_sequences(X_test, maxlen=max_words)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Архитектура нейронной сети" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"model = Sequential() \n", | |
"model.add(Embedding(top_words, 32, input_length=max_words))\n", | |
"model.add(Conv1D(32, 3, padding='same', activation='relu'))\n", | |
"model.add(MaxPooling1D())\n", | |
"model.add(Flatten())\n", | |
"model.add(Dense(250, activation='relu'))\n", | |
"model.add(Dense(1, activation='sigmoid'))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Настройки обучения" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Model: \"sequential_1\"\n", | |
"_________________________________________________________________\n", | |
"Layer (type) Output Shape Param # \n", | |
"=================================================================\n", | |
"embedding_1 (Embedding) (None, 450, 32) 224000 \n", | |
"_________________________________________________________________\n", | |
"conv1d_1 (Conv1D) (None, 450, 32) 3104 \n", | |
"_________________________________________________________________\n", | |
"max_pooling1d_1 (MaxPooling1 (None, 225, 32) 0 \n", | |
"_________________________________________________________________\n", | |
"flatten_1 (Flatten) (None, 7200) 0 \n", | |
"_________________________________________________________________\n", | |
"dense_2 (Dense) (None, 250) 1800250 \n", | |
"_________________________________________________________________\n", | |
"dense_3 (Dense) (None, 1) 251 \n", | |
"=================================================================\n", | |
"Total params: 2,027,605\n", | |
"Trainable params: 2,027,605\n", | |
"Non-trainable params: 0\n", | |
"_________________________________________________________________\n" | |
] | |
} | |
], | |
"source": [ | |
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", | |
"model.summary()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Обучаем нейронную сеть" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Epoch 1/2\n", | |
"196/196 - 11s - loss: 0.4628 - accuracy: 0.7404 - val_loss: 0.2765 - val_accuracy: 0.8861\n", | |
"Epoch 2/2\n", | |
"196/196 - 11s - loss: 0.1975 - accuracy: 0.9246 - val_loss: 0.2775 - val_accuracy: 0.8857\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"<keras.callbacks.History at 0x7faabd3f03a0>" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Метрики на тестовой выборке" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Accuracy: 88.57%\n" | |
] | |
} | |
], | |
"source": [ | |
"scores = model.evaluate(X_test, y_test, verbose=0)\n", | |
"print(\"Accuracy: %.2f%%\" % (scores[1]*100))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Задание\n", | |
" - Измените длину вектора в слое Embedding.\n", | |
" - Измените длину ядра в сверточном слое.\n", | |
" - Попробуйте изменить параметры обучения нейронной сети.\n", | |
" - Попробуйте использовать несколько рекуррентных слоев в сети." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"accelerator": "GPU", | |
"colab": { | |
"name": "Copy of ДЗ№10 рекуррентные нейронные сети для обработки текстов Лопатин М.В. .ipynb", | |
"provenance": [ | |
{ | |
"file_id": "1hIoriJFZz0gpsI8X2KPHxThAgz0cNtBX", | |
"timestamp": 1555142559682 | |
}, | |
{ | |
"file_id": "1xo2ktUFhosKo8dwYQNsO9qk-NHM5qBoM", | |
"timestamp": 1554284811093 | |
}, | |
{ | |
"file_id": "14qSjZOTB_ZH-xp8E3ar6v_SmCvyIBAeJ", | |
"timestamp": 1543289943814 | |
}, | |
{ | |
"file_id": "1Bj43ioMWx0OrZSkWGc9p95oaUc2A6olc", | |
"timestamp": 1542707252376 | |
}, | |
{ | |
"file_id": "1Rx2ZCmlL783MdXCgapJEmB7S1k9qC-sc", | |
"timestamp": 1542638541701 | |
} | |
], | |
"version": "0.3.2" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment