feature engineering with scikit-learn
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# model07: feature selection\n", | |
"\n", | |
"## Explain this model\n", | |
"\n", | |
"### Model\n", | |
"* Linear models: Ridge, RidgeCV\n", | |
"\n", | |
"### Features\n", | |
"* uid\n", | |
"* qid\n", | |
"* q_length\n", | |
"* category\n", | |
"* answer\n", | |
"* avg_per_uid: average response time per user\n", | |
"* avg_per_qid: average response time per question" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"# Let's start our experimemt" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step1: Read train and test data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Read files for train and test set\n", | |
"We alread made given csv files as a pickled data for our convenience." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 210, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import gzip\n", | |
"import cPickle as pickle\n", | |
"\n", | |
"\n", | |
"with gzip.open(\"../data/train.pklz\", \"rb\") as train_file:\n", | |
" train_set = pickle.load(train_file)\n", | |
"\n", | |
"with gzip.open(\"../data/test.pklz\", \"rb\") as test_file:\n", | |
" test_set = pickle.load(test_file)\n", | |
"\n", | |
"with gzip.open(\"../data/questions.pklz\", \"rb\") as questions_file:\n", | |
" questions = pickle.load(questions_file)" | |
] | |
}, | |
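{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For reference, the pickles above could have been produced from the original CSV files with something like the sketch below. This is only an illustration: the path `../data/train.csv` and its column names are assumptions, not part of this notebook." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import csv\n", | |
"\n", | |
"\n", | |
"\"\"\"\n", | |
"Hypothetical reconstruction: read a raw CSV (assumed columns: id, uid, qid, answer, position)\n", | |
"into a dict keyed by integer id, then write it out as a gzipped pickle.\n", | |
"\"\"\"\n", | |
"def pickle_csv(csv_path, pklz_path):\n", | |
"    data = {}\n", | |
"    with open(csv_path, \"rb\") as csv_file:\n", | |
"        for row in csv.DictReader(csv_file):\n", | |
"            data[int(row[\"id\"])] = {\"uid\": int(row[\"uid\"]),\n", | |
"                                    \"qid\": int(row[\"qid\"]),\n", | |
"                                    \"answer\": row[\"answer\"],\n", | |
"                                    \"position\": float(row[\"position\"])}\n", | |
"    with gzip.open(pklz_path, \"wb\") as pklz_file:\n", | |
"        pickle.dump(data, pklz_file, protocol=2)\n", | |
"\n", | |
"# pickle_csv(\"../data/train.csv\", \"../data/train.pklz\")" | |
] | |
}, | |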
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### What they have?\n", | |
"Just look at what each set have." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 211, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"* train_set: {'answer': 'cole', 'qid': 1, 'uid': 0, 'position': 61.0}\n", | |
"* test_set: {'qid': 1, 'uid': 6}\n", | |
"* question keys: ['answer', 'category', 'group', 'pos_token', 'question']\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"('* question contents:',\n", | |
" {'answer': 'thomas cole',\n", | |
" 'category': 'Fine Arts',\n", | |
" 'group': 'test',\n", | |
" 'pos_token': {0: '',\n", | |
" 1: 'painters',\n", | |
" 2: 'indulgence',\n", | |
" 4: 'visual',\n", | |
" 5: 'fantasy',\n", | |
" 7: 'appreciation',\n", | |
" 9: 'different',\n", | |
" 10: 'historic',\n", | |
" 11: 'architectural',\n", | |
" 12: 'styles',\n", | |
" 15: 'seen',\n", | |
" 18: '1840',\n", | |
" 19: 'architects',\n", | |
" 20: 'dream',\n", | |
" 23: 'series',\n", | |
" 25: 'paintings',\n", | |
" 28: 'last',\n", | |
" 31: 'mohicans',\n", | |
" 33: 'made',\n", | |
" 35: 'three',\n", | |
" 36: 'year',\n", | |
" 37: 'trip',\n", | |
" 39: 'europe',\n", | |
" 41: '1829',\n", | |
" 45: 'better',\n", | |
" 46: 'known',\n", | |
" 49: 'trip',\n", | |
" 50: 'four',\n", | |
" 51: 'years',\n", | |
" 52: 'earlier',\n", | |
" 56: 'journeyed',\n", | |
" 59: 'hudson',\n", | |
" 60: 'river',\n", | |
" 63: 'catskill',\n", | |
" 64: 'mountains',\n", | |
" 65: 'ftp',\n", | |
" 66: 'name',\n", | |
" 68: 'this_painter',\n", | |
" 71: 'oxbow',\n", | |
" 74: 'voyage',\n", | |
" 76: 'life',\n", | |
" 77: 'series'},\n", | |
" 'question': \"This painter's indulgence of visual fantasy, and appreciation of different historic architectural styles can be seen in his 1840 Architect's Dream. After a series of paintings on The Last of the Mohicans, he made a three year trip to Europe in 1829, but he is better known for a trip four years earlier in which he journeyed up the Hudson River to the Catskill Mountains. FTP, name this painter of The Oxbow and The Voyage of Life series.\"})" | |
] | |
}, | |
"execution_count": 211, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"print \"* train_set:\", train_set[1]\n", | |
"print \"* test_set:\", test_set[7]\n", | |
"print \"* question keys:\", questions[1].keys()\n", | |
"\"* question contents:\", questions[1]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step2: Feature Engineering\n", | |
"We might want to use some set of feature based on given data." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 212, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from collections import defaultdict\n", | |
"\n", | |
"\"\"\"\n", | |
"Calculate average position(response time) per user(uid) and question(qid).\n", | |
"\"\"\"\n", | |
"def get_avg_pos(data):\n", | |
" pos_uid = defaultdict(list)\n", | |
" pos_qid = defaultdict(list)\n", | |
"\n", | |
" for key in data:\n", | |
" uid = data[key]['uid']\n", | |
" qid = data[key]['qid']\n", | |
" pos = data[key]['position']\n", | |
" pos_uid[uid].append(pos)\n", | |
" pos_qid[qid].append(pos)\n", | |
"\n", | |
" avg_pos_uid = {}\n", | |
" avg_pos_qid = {}\n", | |
"\n", | |
" for key in pos_uid:\n", | |
" avg_pos_uid[key] = sum(pos_uid[key]) / len(pos_uid[key])\n", | |
"\n", | |
" for key in pos_qid:\n", | |
" avg_pos_qid[key] = sum(pos_qid[key]) / len(pos_qid[key])\n", | |
" \n", | |
" return [avg_pos_uid, avg_pos_qid]\n", | |
"\n", | |
"\n", | |
"\"\"\"\n", | |
"Make feature vectors for given data set\n", | |
"\"\"\"\n", | |
"def featurize(data, avg_pos):\n", | |
" X = []\n", | |
" avg_pos_uid = avg_pos[0]\n", | |
" avg_pos_qid = avg_pos[1]\n", | |
" for key in data:\n", | |
" uid = data[key]['uid']\n", | |
" qid = data[key]['qid']\n", | |
" q_length = max(questions[qid]['pos_token'].keys())\n", | |
" category = questions[qid]['category'].lower()\n", | |
" answer = questions[qid]['answer'].lower()\n", | |
" if uid in avg_pos_uid:\n", | |
" pos_uid = avg_pos_uid[uid]\n", | |
" else:\n", | |
" pos_uid = sum(avg_pos_uid.values()) / float(len(avg_pos_uid.values()))\n", | |
" \n", | |
" if qid in avg_pos_qid:\n", | |
" pos_qid = avg_pos_qid[qid]\n", | |
" else:\n", | |
" pos_qid = sum(avg_pos_qid.values()) / float(len(avg_pos_qid.values()))\n", | |
" \n", | |
" feat = {\"uid\": str(uid),\n", | |
" \"qid\": str(qid),\n", | |
" \"q_length\": q_length,\n", | |
" \"category\": category,\n", | |
" \"answer\": answer,\n", | |
" \"avg_pos_uid\": pos_uid,\n", | |
" \"avg_pos_qid\": pos_qid\n", | |
" }\n", | |
" X.append(feat)\n", | |
" \n", | |
" return X\n", | |
"\n", | |
"\n", | |
"\"\"\"\n", | |
"Get positions\n", | |
"\"\"\"\n", | |
"def get_positions(data):\n", | |
" Y = []\n", | |
" for key in data:\n", | |
" position = data[key]['position']\n", | |
" Y.append(position)\n", | |
" \n", | |
" return Y" | |
] | |
}, | |
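{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Before featurizing the full training set, here is a quick sanity check of the helpers above on a tiny hand-made dict. It reuses qid 1, which we inspected earlier; the uids and positions are made-up values for illustration only." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Toy input in the same shape as train_set: {key: {'uid', 'qid', 'position'}}\n", | |
"toy = {1: {'uid': 0, 'qid': 1, 'position': 61.0},\n", | |
"       2: {'uid': 6, 'qid': 1, 'position': -30.0}}\n", | |
"\n", | |
"toy_avg = get_avg_pos(toy)\n", | |
"print \"* toy averages:\", toy_avg\n", | |
"print \"* toy features:\", featurize(toy, toy_avg)\n", | |
"print \"* toy targets:\", get_positions(toy)" | |
] | |
}, | |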
{ | |
"cell_type": "code", | |
"execution_count": 218, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"28494\n", | |
"28494\n", | |
"{'category': 'fine arts', 'uid': '0', 'qid': '1', 'avg_pos_uid': 55.708333333333336, 'q_length': 77, 'answer': 'thomas cole', 'avg_pos_qid': 51.0} 61.0\n" | |
] | |
} | |
], | |
"source": [ | |
"import math\n", | |
"from numpy import abs, sqrt\n", | |
"from sklearn.cross_validation import StratifiedKFold\n", | |
"from sklearn.feature_selection import RFECV\n", | |
"from sklearn import linear_model\n", | |
"from sklearn.cross_validation import train_test_split, cross_val_score\n", | |
"from sklearn.feature_extraction import DictVectorizer\n", | |
"\n", | |
"\n", | |
"X_train = featurize(train_set, get_avg_pos(train_set))\n", | |
"Y_train = get_positions(train_set)\n", | |
"print len(X_train)\n", | |
"print len(Y_train)\n", | |
"print X_train[0], Y_train[0]" | |
] | |
}, | |
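{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The feature dicts are turned into a numeric matrix with `DictVectorizer`, which one-hot encodes the string-valued features (uid, qid, category, answer) and passes the numeric ones (q_length, avg_pos_uid, avg_pos_qid) through unchanged. A minimal illustration on two toy dicts (values made up):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction import DictVectorizer\n", | |
"\n", | |
"toy_feats = [{\"category\": \"fine arts\", \"q_length\": 77, \"avg_pos_uid\": 55.7},\n", | |
"             {\"category\": \"history\", \"q_length\": 50, \"avg_pos_uid\": 40.0}]\n", | |
"toy_vec = DictVectorizer()\n", | |
"print toy_vec.fit_transform(toy_feats).toarray()\n", | |
"print toy_vec.get_feature_names()" | |
] | |
}, | |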
{ | |
"cell_type": "code", | |
"execution_count": 219, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"vec = DictVectorizer()\n", | |
"X_train = vec.fit_transform(X_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 220, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[False False True ..., False False False]\n", | |
"125\n", | |
"[4914 737 1 ..., 43 47 293]\n", | |
"sklearn.cross_validation.StratifiedKFold(labels=[ 61. 67. 66. ..., -106. -91. 94.], n_folds=2, shuffle=False, random_state=None)\n" | |
] | |
} | |
], | |
"source": [ | |
"estimator = linear_model.Ridge()\n", | |
"selector = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(Y_train, 2))\n", | |
"X_train_sel = selector.fit_transform(X_train, Y_train)\n", | |
"print selector.support_\n", | |
"print selector.n_features_\n", | |
"print selector.ranking_\n", | |
"print selector.cv" | |
] | |
}, | |
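{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To see which columns survived the elimination, the boolean `support_` mask can be mapped back to the `DictVectorizer` feature names. A small sketch that relies on `vec` and `selector` from the cells above:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"feature_names = np.array(vec.get_feature_names())\n", | |
"print feature_names[selector.support_][:20]" | |
] | |
}, | |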
{ | |
"cell_type": "code", | |
"execution_count": 221, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"alpha: 1000000\n", | |
"score: <bound method RidgeCV.score of RidgeCV(alphas=[1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000],\n", | |
" cv=100, fit_intercept=True, gcv_mode=None, normalize=True,\n", | |
" scoring='mean_squared_error', store_cv_values=False)>\n", | |
"coef: [ 1.75933803e-06 1.48886459e-05 -1.01434067e-04 ..., 6.07399560e-05\n", | |
" 6.08362233e-05 2.65297792e-05]\n" | |
] | |
} | |
], | |
"source": [ | |
"regressor = linear_model.RidgeCV(alphas=[10**x for x in range(10)],\\\n", | |
" normalize=True,\\\n", | |
" cv=100,\\\n", | |
" scoring='mean_squared_error')\n", | |
"regressor.fit(X_train, Y_train)\n", | |
"print \"alpha:\", regressor.alpha_\n", | |
"print \"score:\", regressor.score\n", | |
"print \"coef: \", regressor.coef_" | |
] | |
}, | |
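{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Note that printing `regressor.score` above shows the bound method itself rather than a number; calling it returns the R^2 on the training data. A quick check, not part of the original run:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"print \"training R^2:\", regressor.score(X_train, Y_train)" | |
] | |
}, | |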
{ | |
"cell_type": "code", | |
"execution_count": 223, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 \t70.2511645175\n", | |
"10 \t70.2511644912\n", | |
"100 \t70.2511642276\n", | |
"1000 \t70.2511616064\n", | |
"10000 \t70.2511368389\n", | |
"100000 \t70.2510319681\n", | |
"1000000 \t70.262798834\n", | |
"10000000 \t70.9567819985\n", | |
"100000000 \t77.654246471\n", | |
"1000000000 \t84.0398877839\n" | |
] | |
} | |
], | |
"source": [ | |
"for ii in [10**x for x in range(10)]:\n", | |
" scores = cross_val_score(linear_model.Ridge(alpha=ii, normalize=False),\\\n", | |
" X_train_sel, Y_train,\\\n", | |
" cv=10, scoring='mean_squared_error')\n", | |
" print ii,\"\\t\", sqrt(abs(scores)).mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 224, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 \t70.2511715134\n", | |
"10 \t70.251171487\n", | |
"100 \t70.2511712234\n", | |
"1000 \t70.2511686012\n", | |
"10000 \t70.2511438238\n", | |
"100000 \t70.2510388562\n", | |
"1000000 \t70.2628049139\n", | |
"10000000 \t70.9567855533\n", | |
"100000000 \t77.654244337\n", | |
"1000000000 \t84.0398853661\n" | |
] | |
} | |
], | |
"source": [ | |
"for ii in [10**x for x in range(10)]:\n", | |
" scores = cross_val_score(linear_model.Ridge(alpha=ii, normalize=False),\\\n", | |
" X_train, Y_train,\\\n", | |
" cv=10, scoring='mean_squared_error')\n", | |
" print ii,\"\\t\", sqrt(abs(scores)).mean()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |