Skip to content

Instantly share code, notes, and snippets.

@luan-cestari
Forked from prhbrt/IMDB + TFIDF + LogReg.ipynb
Last active March 26, 2017 23:02
Show Gist options
  • Save luan-cestari/7ada56a0b4c0991c45cc7dd4fa3f30a0 to your computer and use it in GitHub Desktop.
Save luan-cestari/7ada56a0b4c0991c45cc7dd4fa3f30a0 to your computer and use it in GitHub Desktop.
Notebook of TFIDF Logistic regression on IMDB sentiment dataset (provided by keras)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.metrics import classification_report\n",
"from keras.datasets import imdb\n",
"\n",
"import json\n",
"import numpy\n",
"import nltk"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dictionaries\n",
"\n",
"Create an index -> word mapping for each word"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"word_to_index = imdb.get_word_index()\n",
"index_to_word = [None] * (max(word_to_index.values()) + 1)\n",
"for w, i in word_to_index.items():\n",
" index_to_word[i] = w\n",
"print(index_to_word)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Review reconstruction\n",
"\n",
"Translate the lists of word-indices to strings using the aforementioned dictionaries."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"(X_train, y_train), (X_test, y_test) = imdb.load_data()\n",
"X_train = [\n",
" ' '.join(\n",
" index_to_word[i]\n",
" for i in X_train[i]\n",
" if i < len(index_to_word)\n",
" ) for i in range(X_train.shape[0])\n",
"]\n",
"\n",
"X_test = [\n",
" ' '.join(\n",
" index_to_word[i]\n",
" for i in X_test[i]\n",
" if i < len(index_to_word)\n",
" ) for i in range(X_test.shape[0])\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TFIDF + logistic regression pipeline"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,\n",
" ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
" verbose=0, warm_start=False))])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = Pipeline([\n",
" ('tfidf', TfidfVectorizer(ngram_range=(1,2))),\n",
" ('log', LogisticRegression())\n",
"])\n",
"\n",
"model.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Results"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.88 0.90 0.89 12280\n",
" 1 0.90 0.88 0.89 12720\n",
"\n",
"avg / total 0.89 0.89 0.89 25000\n",
"\n",
"0.88904\n"
]
}
],
"source": [
"y_pred = model.predict(X_test)\n",
"print(classification_report(y_pred, y_test))\n",
"print((y_pred == y_test).mean())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "IPython (Python 3)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment