Created
April 1, 2021 17:06
-
-
Save MBoustani/3e32e8c503b167128054051128e80662 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Error loading stopwords: <urlopen error [SSL:\n", | |
"[nltk_data] CERTIFICATE_VERIFY_FAILED] certificate verify failed:\n", | |
"[nltk_data] unable to get local issuer certificate (_ssl.c:1038)>\n" | |
] | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import seaborn as sns\n", | |
"import re\n", | |
"\n", | |
"import nltk \n", | |
"nltk.download('stopwords')\n", | |
"from nltk.corpus import stopwords\n", | |
"from nltk.tokenize import word_tokenize \n", | |
"from nltk.stem import SnowballStemmer\n", | |
"\n", | |
"from sklearn import model_selection, metrics, preprocessing, ensemble, model_selection, metrics\n", | |
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", | |
"\n", | |
"\n", | |
"import tensorflow as tf\n", | |
"from tensorflow.keras.models import Model\n", | |
"from tensorflow.keras.preprocessing.text import Tokenizer\n", | |
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n", | |
"from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Dropout, Input, SpatialDropout1D\n", | |
"from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau\n", | |
"from tensorflow.keras.optimizers import Adam\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Read train data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Data points count: 7613\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>id</th>\n", | |
" <th>keyword</th>\n", | |
" <th>location</th>\n", | |
" <th>text</th>\n", | |
" <th>target</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Our Deeds are the Reason of this #earthquake M...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>4</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Forest fire near La Ronge Sask. Canada</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>All residents asked to 'shelter in place' are ...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>6</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>13,000 people receive #wildfires evacuation or...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>7</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Just got sent this photo from Ruby #Alaska as ...</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" id keyword location text \\\n", | |
"0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n", | |
"1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n", | |
"2 5 NaN NaN All residents asked to 'shelter in place' are ... \n", | |
"3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n", | |
"4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n", | |
"\n", | |
" target \n", | |
"0 1 \n", | |
"1 1 \n", | |
"2 1 \n", | |
"3 1 \n", | |
"4 1 " | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Reading train dataset\n", | |
"file_path = \"./train.csv\"\n", | |
"raw_data = pd.read_csv(file_path)\n", | |
"print(\"Data points count: \", raw_data['id'].count())\n", | |
"raw_data.head()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 720x576 with 1 Axes>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Plotting target value counts\n", | |
"plt.figure(figsize=(10,8))\n", | |
"ax = raw_data['target'].value_counts().sort_values().plot(kind=\"bar\")\n", | |
"ax.grid(axis=\"y\")\n", | |
"plt.suptitle(\"Target Value Counts\", fontsize=20)\n", | |
"plt.show()\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Missing Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of missing data for column keyword: 61\n", | |
"Number of missing data for column location: 2533\n", | |
"Number of missing data for column text: 0\n", | |
"Number of missing data for column target: 0\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"Number of missing data for column keyword: \", raw_data['keyword'].isna().sum())\n", | |
"print(\"Number of missing data for column location: \", raw_data['location'].isna().sum())\n", | |
"print(\"Number of missing data for column text: \", raw_data['text'].isna().sum())\n", | |
"print(\"Number of missing data for column target: \", raw_data['target'].isna().sum())\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 1080x576 with 1 Axes>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"plt.figure(figsize=(15,8))\n", | |
"sns.heatmap(raw_data.drop('id', axis=1).isnull(), cbar=False, cmap=\"GnBu\").set_title(\"Missing data for each column\")\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Cleaning Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n", | |
" return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval\n" | |
] | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 1080x576 with 1 Axes>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"plt.figure(figsize=(15,8))\n", | |
"raw_data['word_count'] = raw_data['text'].apply(lambda x: len(x.split(\" \")) )\n", | |
"sns.distplot(raw_data['word_count'].values, hist=True, kde=True, kde_kws={\"shade\": True})\n", | |
"plt.axvline(raw_data['word_count'].describe()['25%'], ls=\"--\")\n", | |
"plt.axvline(raw_data['word_count'].describe()['50%'], ls=\"--\")\n", | |
"plt.axvline(raw_data['word_count'].describe()['75%'], ls=\"--\")\n", | |
"\n", | |
"plt.grid()\n", | |
"plt.suptitle(\"Word count histogram\")\n", | |
"plt.show()\n", | |
"\n", | |
"# remove rows with under 3 words\n", | |
"raw_data = raw_data[raw_data['word_count']>2]\n", | |
"raw_data = raw_data.reset_index()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"25th percentile: 11.0\n", | |
"median: 15.0\n", | |
"75th percentile: 19.0\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"25th percentile: \", raw_data['word_count'].describe()['25%'])\n", | |
"print(\"median: \", raw_data['word_count'].describe()['50%'])\n", | |
"print(\"75th percentile: \", raw_data['word_count'].describe()['75%'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Clean text columns\n", | |
"stop_words = set(stopwords.words('english'))\n", | |
"stemmer = SnowballStemmer('english')\n", | |
"\n", | |
"\n", | |
"def clean_text(each_text):\n", | |
"\n", | |
" # remove URL from text\n", | |
" each_text_no_url = re.sub(r\"http\\S+\", \"\", each_text)\n", | |
" \n", | |
" # remove numbers from text\n", | |
" text_no_num = re.sub(r'\\d+', '', each_text_no_url)\n", | |
"\n", | |
" # tokenize each text\n", | |
" word_tokens = word_tokenize(text_no_num)\n", | |
" \n", | |
" # remove sptial character\n", | |
" clean_text = []\n", | |
" for word in word_tokens:\n", | |
" clean_text.append(\"\".join([e for e in word if e.isalnum()]))\n", | |
"\n", | |
" # remove stop words and lower\n", | |
" text_with_no_stop_word = [w.lower() for w in clean_text if not w in stop_words] \n", | |
"\n", | |
" # do stemming\n", | |
" stemmed_text = [stemmer.stem(w) for w in text_with_no_stop_word]\n", | |
" \n", | |
" return \" \".join(\" \".join(stemmed_text).split())\n", | |
"\n", | |
"\n", | |
"raw_data['clean_text'] = raw_data['text'].apply(lambda x: clean_text(x) )\n", | |
"raw_data['keyword'] = raw_data['keyword'].fillna(\"none\")\n", | |
"raw_data['clean_keyword'] = raw_data['keyword'].apply(lambda x: clean_text(x) )\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Combine column 'clean_keyword' and 'clean_text' into one\n", | |
"raw_data['keyword_text'] = raw_data['clean_keyword'] + \" \" + raw_data[\"clean_text\"]\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Prepare train and test data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"feature = 'keyword_text'\n", | |
"label = \"target\"\n", | |
"\n", | |
"# split train and test\n", | |
"X_train, X_test,y_train, y_test = model_selection.train_test_split(raw_data[feature],\n", | |
" raw_data[label],\n", | |
" test_size=0.3,\n", | |
" random_state=0, \n", | |
" shuffle=True)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Gradient Boosting Classifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_train_GBC = X_train.values.reshape(-1)\n", | |
"x_test_GBC = X_test.values.reshape(-1)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Vectorize text\n", | |
"vectorizer = CountVectorizer()\n", | |
"X_train_GBC = vectorizer.fit_transform(X_train_GBC)\n", | |
"x_test_GBC = vectorizer.transform(x_test_GBC)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", | |
" learning_rate=0.1, loss='deviance', max_depth=9,\n", | |
" max_features=8, max_leaf_nodes=None,\n", | |
" min_impurity_decrease=0.0, min_impurity_split=None,\n", | |
" min_samples_leaf=2, min_samples_split=6,\n", | |
" min_weight_fraction_leaf=0.0, n_estimators=2000,\n", | |
" n_iter_no_change=None, presort='auto',\n", | |
" random_state=None, subsample=0.9, tol=0.0001,\n", | |
" validation_fraction=0.1, verbose=0,\n", | |
" warm_start=False)" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Train the model\n", | |
"model = ensemble.GradientBoostingClassifier(learning_rate=0.1, \n", | |
" n_estimators=2000,\n", | |
" max_depth=9,\n", | |
" min_samples_split=6,\n", | |
" min_samples_leaf=2,\n", | |
" max_features=8,\n", | |
" subsample=0.9)\n", | |
"model.fit(X_train_GBC, y_train)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Test accuracy: 0.7911894273127753\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" 0 0.79 0.87 0.83 1309\n", | |
" 1 0.80 0.68 0.73 961\n", | |
"\n", | |
" accuracy 0.79 2270\n", | |
" macro avg 0.79 0.78 0.78 2270\n", | |
"weighted avg 0.79 0.79 0.79 2270\n", | |
"\n", | |
"Test F-score: 0.733408323959505\n" | |
] | |
} | |
], | |
"source": [ | |
"# Evaluate the model\n", | |
"predicted_prob = model.predict_proba(x_test_GBC)[:,1]\n", | |
"predicted = model.predict(x_test_GBC)\n", | |
"\n", | |
"accuracy = metrics.accuracy_score(predicted, y_test)\n", | |
"print(\"Test accuracy: \", accuracy)\n", | |
"print(metrics.classification_report(y_test, predicted, target_names=[\"0\", \"1\"]))\n", | |
"print(\"Test F-score: \", metrics.f1_score(y_test, predicted))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Plot confusion matrix\n", | |
"conf_matrix = metrics.confusion_matrix(y_test, predicted)\n", | |
"\n", | |
"fig, ax = plt.subplots()\n", | |
"sns.heatmap(conf_matrix, cbar=False, cmap='Reds', annot=True, fmt='d')\n", | |
"ax.set(xlabel=\"Predicted Value\", ylabel=\"True Value\", title=\"Confusion Matrix\")\n", | |
"ax.set_yticklabels(labels=['0', '1'], rotation=0)\n", | |
"\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# LSTM" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Define some hyperparameters\n", | |
"path_to_glove_file = './glove.6B.300d.txt' # download link: http://nlp.stanford.edu/data/glove.6B.zip\n", | |
"embedding_dim = 300\n", | |
"learning_rate = 1e-3\n", | |
"batch_size = 1024\n", | |
"epochs = 20\n", | |
"sequence_len = 100\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Training Y shape: (5294, 1)\n", | |
"Testing Y shape: (2270, 1)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Define train and test labels\n", | |
"y_train_LSTM = y_train.values.reshape(-1,1)\n", | |
"y_test_LSTM = y_test.values.reshape(-1,1)\n", | |
"\n", | |
"print(\"Training Y shape:\", y_train_LSTM.shape)\n", | |
"print(\"Testing Y shape:\", y_test_LSTM.shape)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Vocabulary Size: 11148\n" | |
] | |
} | |
], | |
"source": [ | |
"# Tokenize train data\n", | |
"tokenizer = Tokenizer()\n", | |
"tokenizer.fit_on_texts(X_train)\n", | |
"\n", | |
"word_index = tokenizer.word_index\n", | |
"vocab_size = len(word_index) + 1\n", | |
"print(\"Vocabulary Size: \", vocab_size)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Training X shape: (5294, 100)\n", | |
"Testing X shape: (2270, 100)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Pad train and test \n", | |
"X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=sequence_len)\n", | |
"X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=sequence_len)\n", | |
"\n", | |
"print(\"Training X shape: \", X_train.shape)\n", | |
"print(\"Testing X shape: \", X_test.shape)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Found 400000 word vectors.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Read word embeddings\n", | |
"embeddings_index = {}\n", | |
"with open(path_to_glove_file) as f:\n", | |
" for line in f:\n", | |
" word, coefs = line.split(maxsplit=1)\n", | |
" coefs = np.fromstring(coefs, \"f\", sep=\" \")\n", | |
" embeddings_index[word] = coefs\n", | |
"\n", | |
"print(\"Found %s word vectors.\" % len(embeddings_index))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"WARNING: Logging before flag parsing goes to stderr.\n", | |
"W0401 09:42:41.752741 4526493120 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n" | |
] | |
} | |
], | |
"source": [ | |
"# Define embedding layer in Keras\n", | |
"embedding_matrix = np.zeros((vocab_size, embedding_dim))\n", | |
"for word, i in word_index.items():\n", | |
" embedding_vector = embeddings_index.get(word)\n", | |
" if embedding_vector is not None:\n", | |
" embedding_matrix[i] = embedding_vector\n", | |
" \n", | |
"embedding_layer = tf.keras.layers.Embedding(vocab_size,\n", | |
" embedding_dim,\n", | |
" weights=[embedding_matrix],\n", | |
" input_length=sequence_len,\n", | |
" trainable=False)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"W0401 09:42:41.831162 4526493120 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"W0401 09:42:41.855180 4526493120 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"W0401 09:42:41.856187 4526493120 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n", | |
"W0401 09:42:41.857138 4526493120 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Call initializer instance with the dtype argument instead of passing it to the constructor\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Model: \"model\"\n", | |
"_________________________________________________________________\n", | |
"Layer (type) Output Shape Param # \n", | |
"=================================================================\n", | |
"input_1 (InputLayer) [(None, 100)] 0 \n", | |
"_________________________________________________________________\n", | |
"embedding (Embedding) (None, 100, 300) 3344400 \n", | |
"_________________________________________________________________\n", | |
"conv1d (Conv1D) (None, 96, 128) 192128 \n", | |
"_________________________________________________________________\n", | |
"bidirectional (Bidirectional (None, 256) 263168 \n", | |
"_________________________________________________________________\n", | |
"dense (Dense) (None, 512) 131584 \n", | |
"_________________________________________________________________\n", | |
"dropout (Dropout) (None, 512) 0 \n", | |
"_________________________________________________________________\n", | |
"dense_1 (Dense) (None, 512) 262656 \n", | |
"_________________________________________________________________\n", | |
"dense_2 (Dense) (None, 1) 513 \n", | |
"=================================================================\n", | |
"Total params: 4,194,449\n", | |
"Trainable params: 850,049\n", | |
"Non-trainable params: 3,344,400\n", | |
"_________________________________________________________________\n" | |
] | |
} | |
], | |
"source": [ | |
"# Define model architecture\n", | |
"sequence_input = Input(shape=(sequence_len, ), dtype='int32')\n", | |
"embedding_sequences = embedding_layer(sequence_input)\n", | |
"\n", | |
"x = Conv1D(128, 5, activation='relu')(embedding_sequences)\n", | |
"x = Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.2))(x)\n", | |
"x = Dense(512, activation='relu')(x)\n", | |
"x = Dropout(0.5)(x)\n", | |
"x = Dense(512, activation='relu')(x)\n", | |
"outputs = Dense(1, activation='sigmoid')(x)\n", | |
"model = Model(sequence_input, outputs)\n", | |
"model.summary()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"W0401 09:42:42.555547 4526493120 deprecation.py:323] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", | |
"Instructions for updating:\n", | |
"Use tf.where in 2.0, which has the same broadcast rule as np.where\n" | |
] | |
} | |
], | |
"source": [ | |
"# Optimize the model\n", | |
"model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Train on 5294 samples, validate on 2270 samples\n", | |
"Epoch 1/20\n", | |
"5294/5294 [==============================] - 22s 4ms/sample - loss: 0.6693 - acc: 0.5518 - val_loss: 0.5874 - val_acc: 0.7414\n", | |
"Epoch 2/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.5778 - acc: 0.7263 - val_loss: 0.5394 - val_acc: 0.7502\n", | |
"Epoch 3/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.5177 - acc: 0.7584 - val_loss: 0.5060 - val_acc: 0.7731\n", | |
"Epoch 4/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.4906 - acc: 0.7743 - val_loss: 0.4949 - val_acc: 0.7740\n", | |
"Epoch 5/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.4694 - acc: 0.7860 - val_loss: 0.4802 - val_acc: 0.7767\n", | |
"Epoch 6/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.4505 - acc: 0.7981 - val_loss: 0.4792 - val_acc: 0.7767\n", | |
"Epoch 7/20\n", | |
"5294/5294 [==============================] - 18s 3ms/sample - loss: 0.4289 - acc: 0.8102 - val_loss: 0.4700 - val_acc: 0.7811\n", | |
"Epoch 8/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.4127 - acc: 0.8264 - val_loss: 0.4749 - val_acc: 0.7855\n", | |
"Epoch 9/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.3925 - acc: 0.8298 - val_loss: 0.4807 - val_acc: 0.7907\n", | |
"Epoch 10/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.3733 - acc: 0.8387 - val_loss: 0.4860 - val_acc: 0.7890\n", | |
"Epoch 11/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.3590 - acc: 0.8434 - val_loss: 0.5011 - val_acc: 0.7912\n", | |
"Epoch 12/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.3223 - acc: 0.8663 - val_loss: 0.5146 - val_acc: 0.7846\n", | |
"Epoch 13/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.2980 - acc: 0.8825 - val_loss: 0.5561 - val_acc: 0.7793\n", | |
"Epoch 14/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.2718 - acc: 0.8874 - val_loss: 0.5606 - val_acc: 0.7863\n", | |
"Epoch 15/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.2349 - acc: 0.9073 - val_loss: 0.6098 - val_acc: 0.7828\n", | |
"Epoch 16/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.2270 - acc: 0.9108 - val_loss: 0.6109 - val_acc: 0.7885\n", | |
"Epoch 17/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.1974 - acc: 0.9260 - val_loss: 0.6200 - val_acc: 0.7855\n", | |
"Epoch 18/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.1658 - acc: 0.9367 - val_loss: 0.7181 - val_acc: 0.7784\n", | |
"Epoch 19/20\n", | |
"5294/5294 [==============================] - 16s 3ms/sample - loss: 0.1497 - acc: 0.9409 - val_loss: 0.7746 - val_acc: 0.7797\n", | |
"Epoch 20/20\n", | |
"5294/5294 [==============================] - 17s 3ms/sample - loss: 0.1377 - acc: 0.9528 - val_loss: 0.8005 - val_acc: 0.7727\n" | |
] | |
} | |
], | |
"source": [ | |
"# Train the LSTM Model\n", | |
"history = model.fit(X_train,\n", | |
" y_train,\n", | |
" batch_size=batch_size,\n", | |
" epochs=epochs, \n", | |
" validation_data=(X_test, y_test))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 720x288 with 1 Axes>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Plot train accuracy and loss\n", | |
"accuraties = history.history['acc']\n", | |
"losses = history.history['loss']\n", | |
"accuraties_losses = list(zip(accuraties,losses))\n", | |
"\n", | |
"accuraties_losses_df = pd.DataFrame(accuraties_losses, columns={\"accuraties\", \"losses\"})\n", | |
"\n", | |
"plt.figure(figsize=(10,4))\n", | |
"plt.suptitle(\"Train Accuracy vs Train Loss\")\n", | |
"sns.lineplot(data=accuraties_losses_df)\n", | |
"plt.show()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2270/2270 [==============================] - 2s 951us/sample\n", | |
"2270/2270 [==============================] - 2s 858us/sample - loss: 0.8005 - acc: 0.7727\n", | |
"Test Accuracy: 0.7726872\n", | |
" precision recall f1-score support\n", | |
"\n", | |
" 0 0.78 0.84 0.81 1309\n", | |
" 1 0.76 0.68 0.72 961\n", | |
"\n", | |
" accuracy 0.77 2270\n", | |
" macro avg 0.77 0.76 0.76 2270\n", | |
"weighted avg 0.77 0.77 0.77 2270\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# Evaluate the model\n", | |
"predicted = model.predict(X_test, verbose=1, batch_size=10000)\n", | |
"\n", | |
"y_predicted = [1 if each > 0.5 else 0 for each in predicted]\n", | |
"\n", | |
"score, test_accuracy = model.evaluate(X_test, y_test, batch_size=10000)\n", | |
"\n", | |
"print(\"Test Accuracy: \", test_accuracy)\n", | |
"print(metrics.classification_report(list(y_test), y_predicted))\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Plot confusion matrix\n", | |
"conf_matrix = metrics.confusion_matrix(y_test, y_predicted)\n", | |
"\n", | |
"fig, ax = plt.subplots()\n", | |
"sns.heatmap(conf_matrix, cbar=False, cmap='Reds', annot=True, fmt='d')\n", | |
"ax.set(xlabel=\"Predicted Value\", ylabel=\"True Value\", title=\"Confusion Matrix\")\n", | |
"ax.set_yticklabels(labels=['0', '1'], rotation=0)\n", | |
"plt.show()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.0b2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment