Last active
June 12, 2018 23:20
-
-
Save rtbs-dev/88c5a6a3cebd98311244c065378892e0 to your computer and use it in GitHub Desktop.
NLP Cross-Val Pipe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import seaborn as sns\n", | |
"import matplotlib.pyplot as plt\n", | |
"from tqdm import tqdm\n", | |
"import textacy\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"mlp\\preprocess.py:34: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", | |
" df[\"labels\"][df[\"labels\"] == missing_label] = -1 # to make a \"non-category\"\n", | |
"mlp\\preprocess.py:37: SettingWithCopyWarning: \n", | |
"A value is trying to be set on a copy of a slice from a DataFrame\n", | |
"\n", | |
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", | |
" df[\"labels\"][df[\"labels\"] > missing_label] = df[\"labels\"][df[\"labels\"] > missing_label].values - 1\n" | |
] | |
} | |
], | |
"source": [ | |
"import mlp.preprocess as pre" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from mlp.embeddings import TopicVectors, SemanticVectors\n", | |
"from sklearn.pipeline import Pipeline\n", | |
"from sklearn.linear_model import SGDClassifier\n", | |
"from sklearn.ensemble import ExtraTreesClassifier\n", | |
"from sklearn.svm import LinearSVC\n", | |
"from sklearn.model_selection import GridSearchCV\n", | |
"from sklearn.preprocessing import FunctionTransformer\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Load the corpus and the DataFrame from the project-local `mlp.preprocess`\n", | |
"# module (imported above as `pre`).\n", | |
"corpus = pre.get_corpus()\n", | |
"df = pre.get_df()\n", | |
"\n", | |
"# Targets: the 'labels' column passed through `pre.get_labeled_data`, the same\n", | |
"# helper the pipelines below apply to X inside a FunctionTransformer.\n", | |
"# NOTE(review): presumably this keeps only the labeled rows -- the recorded\n", | |
"# grid.fit traceback reports 3435 samples for corpus vs 675 for y. Confirm.\n", | |
"y = pre.get_labeled_data(df['labels'].values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"score: 91.407407%\n" | |
] | |
} | |
], | |
"source": [ | |
"sgd_w2v = Pipeline([\n", | |
" ('word2vec embedding', SemanticVectors()),\n", | |
" ('extract_labeled', FunctionTransformer(pre.get_labeled_data, validate=False)), # extract labeled points\n", | |
" ('SGD Lin-SVC w/ElasticNet', SGDClassifier(class_weight='balanced', # compensate for class freqs\n", | |
" penalty='elasticnet', # L1 + L2 regularized\n", | |
" alpha=0.001,\n", | |
" n_iter=10))\n", | |
"])\n", | |
"\n", | |
"# sgd_w2v.set_params(anova__k=10, svc__C=.1).fit(corpus, y)\n", | |
"sgd_w2v.fit(corpus, y)\n", | |
"\n", | |
"print \"score: {:2f}%\".format((sgd_w2v.score(corpus, y))*100.)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"pipe = Pipeline([\n", | |
" ('embedding', SemanticVectors()),\n", | |
" ('extract_labeled', FunctionTransformer(pre.get_labeled_data, validate=False)), # extract labeled points\n", | |
" ('classify', LinearSVC())\n", | |
"])\n", | |
"\n", | |
"# SGDClassifier(class_weight='balanced', # compensate for class freqs\n", | |
"# penalty='elasticnet', # L1 + L2 regularized\n", | |
"# alpha=0.001,\n", | |
"# n_iter=10)\n", | |
"\n", | |
"N_FEATURES_OPTIONS = [100, 200, 300, 400, 500]\n", | |
"# C_OPTIONS = [1, 10, 100, 1000]\n", | |
"TOPIC_OPTIONS = ['lda', 'lsa', 'nmf']\n", | |
"param_grid = [\n", | |
" {\n", | |
" 'embedding': [TopicVectors()],\n", | |
" 'embedding__n_topics': N_FEATURES_OPTIONS,\n", | |
" 'embedding__model': TOPIC_OPTIONS,\n", | |
" 'embedding__bow_kws': [{\n", | |
" 'weighting': 'tfidf', # change to tf if using LDA\n", | |
" 'normalize': True, # turn off if using LDA\n", | |
" 'smooth_idf': True, # turn off if using LDA\n", | |
" },\n", | |
" {\n", | |
" 'weighting': 'tf', # change to tf if using LDA\n", | |
" 'normalize': False, # turn off if using LDA\n", | |
" 'smooth_idf': False, # turn off if using LDA\n", | |
" }]\n", | |
" },\n", | |
" {\n", | |
" 'embedding': [SemanticVectors()]\n", | |
" },\n", | |
"]\n", | |
"# reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']\n", | |
"\n", | |
"grid = GridSearchCV(pipe, cv=3, n_jobs=3, param_grid=param_grid)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"ename": "ValueError", | |
"evalue": "Found input variables with inconsistent numbers of samples: [3435, 675]", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-18-545120320836>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgrid\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32mC:\\Users\\tbs4\\AppData\\Local\\Continuum\\Anaconda2\\lib\\site-packages\\sklearn\\model_selection\\_search.pyc\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 943\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m/\u001b[0m\u001b[0mtest\u001b[0m \u001b[0mset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 944\u001b[0m \"\"\"\n\u001b[0;32m--> 945\u001b[0;31m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mParameterGrid\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 946\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 947\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32mC:\\Users\\tbs4\\AppData\\Local\\Continuum\\Anaconda2\\lib\\site-packages\\sklearn\\model_selection\\_search.pyc\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, groups, parameter_iterable)\u001b[0m\n\u001b[1;32m 540\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mscorer_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck_scoring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mscoring\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 541\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 542\u001b[0;31m \u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgroups\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mindexable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 543\u001b[0m \u001b[0mn_splits\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_n_splits\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 544\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mverbose\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mparameter_iterable\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSized\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32mC:\\Users\\tbs4\\AppData\\Local\\Continuum\\Anaconda2\\lib\\site-packages\\sklearn\\utils\\validation.pyc\u001b[0m in \u001b[0;36mindexable\u001b[0;34m(*iterables)\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 205\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 206\u001b[0;31m \u001b[0mcheck_consistent_length\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 207\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32mC:\\Users\\tbs4\\AppData\\Local\\Continuum\\Anaconda2\\lib\\site-packages\\sklearn\\utils\\validation.pyc\u001b[0m in \u001b[0;36mcheck_consistent_length\u001b[0;34m(*arrays)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0muniques\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m raise ValueError(\"Found input variables with inconsistent numbers of\"\n\u001b[0;32m--> 181\u001b[0;31m \" samples: %r\" % [int(l) for l in lengths])\n\u001b[0m\u001b[1;32m 182\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [3435, 675]" | |
] | |
} | |
], | |
"source": [ | |
"# NOTE(review): as recorded in this cell's output, this call raises\n", | |
"# ValueError: GridSearchCV checks that X and y have the same number of\n", | |
"# samples before any pipeline step runs, and corpus has 3435 items while y\n", | |
"# has 675. Presumably the search needs to be given only the labeled documents\n", | |
"# (with the in-pipeline 'extract_labeled' step removed) so cross-validation\n", | |
"# can index X and y together -- TODO confirm against pre.get_labeled_data.\n", | |
"grid.fit(corpus, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment