Last active
February 8, 2025 06:49
-
-
Save prhbrt/92313f15fc814d6eed1e36ab4df1f92d to your computer and use it in GitHub Desktop.
Notebook of TFIDF Logistic regression on IMDB sentiment dataset (provided by keras)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.linear_model import LogisticRegression\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from sklearn.pipeline import Pipeline\n", | |
"from sklearn.metrics import classification_report\n", | |
"from keras.datasets import imdb\n", | |
"\n", | |
"import json\n", | |
"import numpy\n", | |
"import nltk" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Dictionaries\n", | |
"\n", | |
"Create an index -> word mapping for each word" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"word_to_index = imdb.get_word_index()\n", | |
"index_to_word = [None] * (max(word_to_index.values()) + 1)\n", | |
"for w, i in word_to_index.items():\n", | |
" index_to_word[i] = w" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Review reconstruction\n", | |
"\n", | |
"Translate the lists of word-indices to strings using the aforementioned dictionaries." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"(X_train, y_train), (X_test, y_test) = imdb.load_data()\n", | |
"X_train = [\n", | |
" ' '.join(\n", | |
" index_to_word[i]\n", | |
" for i in X_train[i]\n", | |
" if i < len(index_to_word)\n", | |
" ) for i in range(X_train.shape[0])\n", | |
"]\n", | |
"\n", | |
"X_test = [\n", | |
" ' '.join(\n", | |
" index_to_word[i]\n", | |
" for i in X_test[i]\n", | |
" if i < len(index_to_word)\n", | |
" ) for i in range(X_test.shape[0])\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# TFIDF + logistic regression pipeline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n", | |
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", | |
" ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,\n", | |
" ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", | |
" verbose=0, warm_start=False))])" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"model = Pipeline([\n", | |
" ('tfidf', TfidfVectorizer(ngram_range=(1,2))),\n", | |
" ('log', LogisticRegression())\n", | |
"])\n", | |
"\n", | |
"model.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" 0 0.88 0.90 0.89 12280\n", | |
" 1 0.90 0.88 0.89 12720\n", | |
"\n", | |
"avg / total 0.89 0.89 0.89 25000\n", | |
"\n", | |
"0.88904\n" | |
] | |
} | |
], | |
"source": [ | |
"y_pred = model.predict(X_test)\n", | |
"print(classification_report(y_pred, y_test))\n", | |
"print((y_pred == y_test).mean())" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "IPython (Python 3)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment