{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import numpy as np\n",
"from keras.utils import to_categorical\n",
"from keras import models\n",
"from keras import layers"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from keras.datasets import imdb\n",
"(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000)\n",
"data = np.concatenate((training_data, testing_data), axis=0)\n",
"targets = np.concatenate((training_targets, testing_targets), axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Categories: [0 1]\n",
"Number of unique words: 9998\n"
]
}
],
"source": [
"print(\"Categories:\", np.unique(targets))\n",
"print(\"Number of unique words:\", len(np.unique(np.hstack(data))))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average Review length: 234.75892\n",
"Standard Deviation: 173.0\n"
]
}
],
"source": [
"length = [len(i) for i in data]\n",
"print(\"Average Review length:\", np.mean(length))\n",
"print(\"Standard Deviation:\", round(np.std(length)))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all\n"
]
}
],
"source": [
"index = imdb.get_word_index()\n",
"reverse_index = dict([(value, key) for (key, value) in index.items()]) \n",
"decoded = \" \".join( [reverse_index.get(i - 3, \"#\") for i in data[0]] )\n",
"print(decoded) "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def vectorize(sequences, dimension = 10000):\n",
" results = np.zeros((len(sequences), dimension))\n",
" for i, sequence in enumerate(sequences):\n",
" results[i, sequence] = 1\n",
" return results\n",
" \n",
"data = vectorize(data)\n",
"targets = np.array(targets).astype(\"float32\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"test_x = data[:10000]\n",
"test_y = targets[:10000]\n",
"train_x = data[10000:]\n",
"train_y = targets[10000:]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From /home/vp/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Colocations handled automatically by placer.\n",
"WARNING:tensorflow:From /home/vp/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"dense_1 (Dense) (None, 50) 500050 \n",
"_________________________________________________________________\n",
"dropout_1 (Dropout) (None, 50) 0 \n",
"_________________________________________________________________\n",
"dense_2 (Dense) (None, 50) 2550 \n",
"_________________________________________________________________\n",
"dropout_2 (Dropout) (None, 50) 0 \n",
"_________________________________________________________________\n",
"dense_3 (Dense) (None, 50) 2550 \n",
"_________________________________________________________________\n",
"dense_4 (Dense) (None, 1) 51 \n",
"=================================================================\n",
"Total params: 505,201\n",
"Trainable params: 505,201\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"WARNING:tensorflow:From /home/vp/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"Use tf.cast instead.\n",
"Train on 40000 samples, validate on 10000 samples\n",
"Epoch 1/2\n",
"40000/40000 [==============================] - 7s 182us/step - loss: 0.4049 - acc: 0.8215 - val_loss: 0.2634 - val_acc: 0.8946\n",
"Epoch 2/2\n",
"40000/40000 [==============================] - 5s 126us/step - loss: 0.2121 - acc: 0.9190 - val_loss: 0.2606 - val_acc: 0.8953\n",
"Test-Accuracy: 0.8949500024318695\n"
]
}
],
"source": [
"model = models.Sequential()\n",
"# Input - Layer\n",
"model.add(layers.Dense(50, activation = \"relu\", input_shape=(10000, )))\n",
"# Hidden - Layers\n",
"model.add(layers.Dropout(0.3, noise_shape=None, seed=None))\n",
"model.add(layers.Dense(50, activation = \"relu\"))\n",
"model.add(layers.Dropout(0.2, noise_shape=None, seed=None))\n",
"model.add(layers.Dense(50, activation = \"relu\"))\n",
"# Output- Layer\n",
"model.add(layers.Dense(1, activation = \"sigmoid\"))\n",
"model.summary()\n",
"# compiling the model\n",
"model.compile(\n",
" optimizer = \"adam\",\n",
" loss = \"binary_crossentropy\",\n",
" metrics = [\"accuracy\"]\n",
")\n",
"results = model.fit(\n",
" train_x, train_y,\n",
" epochs= 2,\n",
" batch_size = 500,\n",
" validation_data = (test_x, test_y)\n",
")\n",
"print(\"Test-Accuracy:\", np.mean(results.history[\"val_acc\"]))"
]
},
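{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a sketch, not part of the original gist): encode a new review with the same word index and multi-hot vectorization, then score it with the trained model. `imdb.get_word_index()` indices are offset by 3 in the dataset encoding, and words outside the 10,000-word vocabulary are simply dropped."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical example review; any lowercase text works\n",
"review = \"this film was just brilliant and the casting was amazing\"\n",
"word_index = imdb.get_word_index()\n",
"# shift by 3 to match the dataset encoding; drop out-of-vocabulary words\n",
"encoded = [word_index[w] + 3 for w in review.split()\n",
"           if w in word_index and word_index[w] + 3 < 10000]\n",
"model.predict(vectorize([encoded]))"
]
},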
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.93223476]], dtype=float32)"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.predict(np.array([train_x[6]]))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_y[6]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/vp/nltk_data...\n",
"[nltk_data] Error downloading 'punkt' from\n",
"[nltk_data] <https://raw.githubusercontent.com/nltk/nltk_data/gh-\n",
"[nltk_data] pages/packages/tokenizers/punkt.zip>: HTTP Error\n",
"[nltk_data] 503: first byte timeout\n"
]
},
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"tokens = nltk.word_tokenize('А мне впрочем неважно как это работает, лишь бы работало')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pymorphy2"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"morph = pymorphy2.MorphAnalyzer()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['а',\n",
" 'я',\n",
" 'впрочем',\n",
" 'неважно',\n",
" 'как',\n",
" 'это',\n",
" 'работать',\n",
" ',',\n",
" 'лишь',\n",
" 'бы',\n",
" 'работать']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[morph.parse(word)[0].normal_form for word in tokens]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['и',\n",
" 'в',\n",
" 'во',\n",
" 'не',\n",
" 'что',\n",
" 'он',\n",
" 'на',\n",
" 'я',\n",
" 'с',\n",
" 'со',\n",
" 'как',\n",
" 'а',\n",
" 'то',\n",
" 'все',\n",
" 'она',\n",
" 'так',\n",
" 'его',\n",
" 'но',\n",
" 'да',\n",
" 'ты',\n",
" 'к',\n",
" 'у',\n",
" 'же',\n",
" 'вы',\n",
" 'за',\n",
" 'бы',\n",
" 'по',\n",
" 'только',\n",
" 'ее',\n",
" 'мне',\n",
" 'было',\n",
" 'вот',\n",
" 'от',\n",
" 'меня',\n",
" 'еще',\n",
" 'нет',\n",
" 'о',\n",
" 'из',\n",
" 'ему',\n",
" 'теперь',\n",
" 'когда',\n",
" 'даже',\n",
" 'ну',\n",
" 'вдруг',\n",
" 'ли',\n",
" 'если',\n",
" 'уже',\n",
" 'или',\n",
" 'ни',\n",
" 'быть',\n",
" 'был',\n",
" 'него',\n",
" 'до',\n",
" 'вас',\n",
" 'нибудь',\n",
" 'опять',\n",
" 'уж',\n",
" 'вам',\n",
" 'ведь',\n",
" 'там',\n",
" 'потом',\n",
" 'себя',\n",
" 'ничего',\n",
" 'ей',\n",
" 'может',\n",
" 'они',\n",
" 'тут',\n",
" 'где',\n",
" 'есть',\n",
" 'надо',\n",
" 'ней',\n",
" 'для',\n",
" 'мы',\n",
" 'тебя',\n",
" 'их',\n",
" 'чем',\n",
" 'была',\n",
" 'сам',\n",
" 'чтоб',\n",
" 'без',\n",
" 'будто',\n",
" 'чего',\n",
" 'раз',\n",
" 'тоже',\n",
" 'себе',\n",
" 'под',\n",
" 'будет',\n",
" 'ж',\n",
" 'тогда',\n",
" 'кто',\n",
" 'этот',\n",
" 'того',\n",
" 'потому',\n",
" 'этого',\n",
" 'какой',\n",
" 'совсем',\n",
" 'ним',\n",
" 'здесь',\n",
" 'этом',\n",
" 'один',\n",
" 'почти',\n",
" 'мой',\n",
" 'тем',\n",
" 'чтобы',\n",
" 'нее',\n",
" 'сейчас',\n",
" 'были',\n",
" 'куда',\n",
" 'зачем',\n",
" 'всех',\n",
" 'никогда',\n",
" 'можно',\n",
" 'при',\n",
" 'наконец',\n",
" 'два',\n",
" 'об',\n",
" 'другой',\n",
" 'хоть',\n",
" 'после',\n",
" 'над',\n",
" 'больше',\n",
" 'тот',\n",
" 'через',\n",
" 'эти',\n",
" 'нас',\n",
" 'про',\n",
" 'всего',\n",
" 'них',\n",
" 'какая',\n",
" 'много',\n",
" 'разве',\n",
" 'три',\n",
" 'эту',\n",
" 'моя',\n",
" 'впрочем',\n",
" 'хорошо',\n",
" 'свою',\n",
" 'этой',\n",
" 'перед',\n",
" 'иногда',\n",
" 'лучше',\n",
" 'чуть',\n",
" 'том',\n",
" 'нельзя',\n",
" 'такой',\n",
" 'им',\n",
" 'более',\n",
" 'всегда',\n",
" 'конечно',\n",
" 'всю',\n",
" 'между']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stopwords.words('russian')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['а', 'неважно', 'это', 'работать', 'лишь', 'работать']"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[morph.parse(word)[0].normal_form for word in tokens if word not in stopwords.words('russian') and word not in string.punctuation]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"from keras.preprocessing.text import Tokenizer\n",
"tokenizer = Tokenizer(10000)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"tokenizer.fit_on_texts([' '.join(tokens)])"
]
},
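{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch (not in the original gist): `texts_to_matrix` with `mode='binary'` produces the same multi-hot encoding that the hand-written `vectorize` built for the IMDB data above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# one row per text; column j is 1 iff the word with index j occurs\n",
"tokenizer.texts_to_matrix([' '.join(tokens)], mode='binary')"
]
},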
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'а': 1,\n",
" 'мне': 2,\n",
" 'впрочем': 3,\n",
" 'неважно': 4,\n",
" 'как': 5,\n",
" 'это': 6,\n",
" 'работает': 7,\n",
" 'лишь': 8,\n",
" 'бы': 9,\n",
" 'работало': 10}"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.word_index"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}