feature engineering with scikit-learn
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# model07: feature selection\n", | |
"\n", | |
"## Explain this model\n", | |
"\n", | |
"### Model\n", | |
"* Linear models: Ridge, RidgeCV\n", | |
"\n", | |
"### Features\n", | |
"* uid\n", | |
"* qid\n", | |
"* q_length\n", | |
"* category\n", | |
"* answer\n", | |
"* avg_per_uid: average response time per user\n", | |
"* avg_per_qid: average response time per question" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"# Let's start our experimemt" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step1: Read train and test data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Read files for train and test set\n", | |
"We alread made given csv files as a pickled data for our convenience." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 210, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import gzip\n", | |
"import cPickle as pickle\n", | |
"\n", | |
"\n", | |
"with gzip.open(\"../data/train.pklz\", \"rb\") as train_file:\n", | |
" train_set = pickle.load(train_file)\n", | |
"\n", | |
"with gzip.open(\"../data/test.pklz\", \"rb\") as test_file:\n", | |
" test_set = pickle.load(test_file)\n", | |
"\n", | |
"with gzip.open(\"../data/questions.pklz\", \"rb\") as questions_file:\n", | |
" questions = pickle.load(questions_file)" | |
] | |
}, | |
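{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For reference, the pickles above could have been produced from the original CSV files with something like the sketch below. This is only an illustration: the path `../data/train.csv` and its column names are assumptions, not part of this notebook." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import csv\n", | |
"\n", | |
"\n", | |
"\"\"\"\n", | |
"Hypothetical reconstruction: read a raw CSV (assumed columns: id, uid, qid, answer, position)\n", | |
"into a dict keyed by integer id, then write it out as a gzipped pickle.\n", | |
"\"\"\"\n", | |
"def pickle_csv(csv_path, pklz_path):\n", | |
"    data = {}\n", | |
"    with open(csv_path, \"rb\") as csv_file:\n", | |
"        for row in csv.DictReader(csv_file):\n", | |
"            data[int(row[\"id\"])] = {\"uid\": int(row[\"uid\"]),\n", | |
"                                    \"qid\": int(row[\"qid\"]),\n", | |
"                                    \"answer\": row[\"answer\"],\n", | |
"                                    \"position\": float(row[\"position\"])}\n", | |
"    with gzip.open(pklz_path, \"wb\") as pklz_file:\n", | |
"        pickle.dump(data, pklz_file, protocol=2)\n", | |
"\n", | |
"# pickle_csv(\"../data/train.csv\", \"../data/train.pklz\")" | |
] | |
}, | |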
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### What they have?\n", | |
"Just look at what each set have." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 211, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"* train_set: {'answer': 'cole', 'qid': 1, 'uid': 0, 'position': 61.0}\n", | |
"* test_set: {'qid': 1, 'uid': 6}\n", | |
"* question keys: ['answer', 'category', 'group', 'pos_token', 'question']\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"('* question contents:',\n", | |
" {'answer': 'thomas cole',\n", | |
" 'category': 'Fine Arts',\n", | |
" 'group': 'test',\n", | |
" 'pos_token': {0: '',\n", | |
" 1: 'painters',\n", | |
" 2: 'indulgence',\n", | |
" 4: 'visual',\n", | |
" 5: 'fantasy',\n", | |
" 7: 'appreciation',\n", | |
" 9: 'different',\n", | |
" 10: 'historic',\n", | |
" 11: 'architectural',\n", | |
" 12: 'styles',\n", | |
" 15: 'seen',\n", | |
" 18: '1840',\n", | |
" 19: 'architects',\n", | |
" 20: 'dream',\n", | |
" 23: 'series',\n", | |
" 25: 'paintings',\n", | |
" 28: 'last',\n", | |
" 31: 'mohicans',\n", | |
" 33: 'made',\n", | |
" 35: 'three',\n", | |
" 36: 'year',\n", | |
" 37: 'trip',\n", | |
" 39: 'europe',\n", | |
" 41: '1829',\n", | |
" 45: 'better',\n", | |
" 46: 'known',\n", | |
" 49: 'trip',\n", | |
" 50: 'four',\n", | |
" 51: 'years',\n", | |
" 52: 'earlier',\n", | |
" 56: 'journeyed',\n", | |
" 59: 'hudson',\n", | |
" 60: 'river',\n", | |
" 63: 'catskill',\n", | |
" 64: 'mountains',\n", | |
" 65: 'ftp',\n", | |
" 66: 'name',\n", | |
" 68: 'this_painter',\n", | |
" 71: 'oxbow',\n", | |
" 74: 'voyage',\n", | |
" 76: 'life',\n", | |
" 77: 'series'},\n", | |
" 'question': \"This painter's indulgence of visual fantasy, and appreciation of different historic architectural styles can be seen in his 1840 Architect's Dream. After a series of paintings on The Last of the Mohicans, he made a three year trip to Europe in 1829, but he is better known for a trip four years earlier in which he journeyed up the Hudson River to the Catskill Mountains. FTP, name this painter of The Oxbow and The Voyage of Life series.\"})" | |
] | |
}, | |
"execution_count": 211, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"print \"* train_set:\", train_set[1]\n", | |
"print \"* test_set:\", test_set[7]\n", | |
"print \"* question keys:\", questions[1].keys()\n", | |
"\"* question contents:\", questions[1]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step2: Feature Engineering\n", | |
"We might want to use some set of feature based on given data." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 212, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from collections import defaultdict\n", | |
"\n", | |
"\"\"\"\n", | |
"Calculate average position(response time) per user(uid) and question(qid).\n", | |
"\"\"\"\n", | |
"def get_avg_pos(data):\n", | |
" pos_uid = defaultdict(list)\n", | |
" pos_qid = defaultdict(list)\n", | |
"\n", | |
" for key in data:\n", | |
" uid = data[key]['uid']\n", | |
" qid = data[key]['qid']\n", | |
" pos = data[key]['position']\n", | |
" pos_uid[uid].append(pos)\n", | |
" pos_qid[qid].append(pos)\n", | |
"\n", | |
" avg_pos_uid = {}\n", | |
" avg_pos_qid = {}\n", | |
"\n", | |
" for key in pos_uid:\n", | |
" avg_pos_uid[key] = sum(pos_uid[key]) / len(pos_uid[key])\n", | |
"\n", | |
" for key in pos_qid:\n", | |
" avg_pos_qid[key] = sum(pos_qid[key]) / len(pos_qid[key])\n", | |
" \n", | |
" return [avg_pos_uid, avg_pos_qid]\n", | |
"\n", | |
"\n", | |
"\"\"\"\n", | |
"Make feature vectors for given data set\n", | |
"\"\"\"\n", | |
"def featurize(data, avg_pos):\n", | |
" X = []\n", | |
" avg_pos_uid = avg_pos[0]\n", | |
" avg_pos_qid = avg_pos[1]\n", | |
" for key in data:\n", | |
" uid = data[key]['uid']\n", | |
" qid = data[key]['qid']\n", | |
" q_length = max(questions[qid]['pos_token'].keys())\n", | |
" category = questions[qid]['category'].lower()\n", | |
" answer = questions[qid]['answer'].lower()\n", | |
" if uid in avg_pos_uid:\n", | |
" pos_uid = avg_pos_uid[uid]\n", | |
" else:\n", | |
" pos_uid = sum(avg_pos_uid.values()) / float(len(avg_pos_uid.values()))\n", | |
" \n", | |
" if qid in avg_pos_qid:\n", | |
" pos_qid = avg_pos_qid[qid]\n", | |
" else:\n", | |
" pos_qid = sum(avg_pos_qid.values()) / float(len(avg_pos_qid.values()))\n", | |
" \n", | |
" feat = {\"uid\": str(uid),\n", | |
" \"qid\": str(qid),\n", | |
" \"q_length\": q_length,\n", | |
" \"category\": category,\n", | |
" \"answer\": answer,\n", | |
" \"avg_pos_uid\": pos_uid,\n", | |
" \"avg_pos_qid\": pos_qid\n", | |
" }\n", | |
" X.append(feat)\n", | |
" \n", | |
" return X\n", | |
"\n", | |
"\n", | |
"\"\"\"\n", | |
"Get positions\n", | |
"\"\"\"\n", | |
"def get_positions(data):\n", | |
" Y = []\n", | |
" for key in data:\n", | |
" position = data[key]['position']\n", | |
" Y.append(position)\n", | |
" \n", | |
" return Y" | |
] | |
}, | |
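{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Before featurizing the full training set, here is a quick sanity check of the helpers above on a tiny hand-made dict. It reuses qid 1, which we inspected earlier; the uids and positions are made-up values for illustration only." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Toy input in the same shape as train_set: {key: {'uid', 'qid', 'position'}}\n", | |
"toy = {1: {'uid': 0, 'qid': 1, 'position': 61.0},\n", | |
"       2: {'uid': 6, 'qid': 1, 'position': -30.0}}\n", | |
"\n", | |
"toy_avg = get_avg_pos(toy)\n", | |
"print \"* toy averages:\", toy_avg\n", | |
"print \"* toy features:\", featurize(toy, toy_avg)\n", | |
"print \"* toy targets:\", get_positions(toy)" | |
] | |
}, | |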
{ | |
"cell_type": "code", | |
"execution_count": 218, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"28494\n", | |
"28494\n", | |
"{'category': 'fine arts', 'uid': '0', 'qid': '1', 'avg_pos_uid': 55.708333333333336, 'q_length': 77, 'answer': 'thomas cole', 'avg_pos_qid': 51.0} 61.0\n" | |
] | |
} | |
], | |
"source": [ | |
"import math\n", | |
"from numpy import abs, sqrt\n", | |
"from sklearn.cross_validation import StratifiedKFold\n", | |
"from sklearn.feature_selection import RFECV\n", | |
"from sklearn import linear_model\n", | |
"from sklearn.cross_validation import train_test_split, cross_val_score\n", | |
"from sklearn.feature_extraction import DictVectorizer\n", | |
"\n", | |
"\n", | |
"X_train = featurize(train_set, get_avg_pos(train_set))\n", | |
"Y_train = get_positions(train_set)\n", | |
"print len(X_train)\n", | |
"print len(Y_train)\n", | |
"print X_train[0], Y_train[0]" | |
] | |
}, | |
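{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The feature dicts are turned into a numeric matrix with `DictVectorizer`, which one-hot encodes the string-valued features (uid, qid, category, answer) and passes the numeric ones (q_length, avg_pos_uid, avg_pos_qid) through unchanged. A minimal illustration on two toy dicts (values made up):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction import DictVectorizer\n", | |
"\n", | |
"toy_feats = [{\"category\": \"fine arts\", \"q_length\": 77, \"avg_pos_uid\": 55.7},\n", | |
"             {\"category\": \"history\", \"q_length\": 50, \"avg_pos_uid\": 40.0}]\n", | |
"toy_vec = DictVectorizer()\n", | |
"print toy_vec.fit_transform(toy_feats).toarray()\n", | |
"print toy_vec.get_feature_names()" | |
] | |
}, | |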
{ | |
"cell_type": "code", | |
"execution_count": 219, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"vec = DictVectorizer()\n", | |
"X_train = vec.fit_transform(X_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 220, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[False False True ..., False False False]\n", | |
"125\n", | |
"[4914 737 1 ..., 43 47 293]\n", | |
"sklearn.cross_validation.StratifiedKFold(labels=[ 61. 67. 66. ..., -106. -91. 94.], n_folds=2, shuffle=False, random_state=None)\n" | |
] | |
} | |
], | |
"source": [ | |
"estimator = linear_model.Ridge()\n", | |
"selector = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(Y_train, 2))\n", | |
"X_train_sel = selector.fit_transform(X_train, Y_train)\n", | |
"print selector.support_\n", | |
"print selector.n_features_\n", | |
"print selector.ranking_\n", | |
"print selector.cv" | |
] | |
}, | |
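{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To see which columns survived the elimination, the boolean `support_` mask can be mapped back to the `DictVectorizer` feature names. A small sketch that relies on `vec` and `selector` from the cells above:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"feature_names = np.array(vec.get_feature_names())\n", | |
"print feature_names[selector.support_][:20]" | |
] | |
}, | |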
{ | |
"cell_type": "code", | |
"execution_count": 221, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"alpha: 1000000\n", | |
"score: <bound method RidgeCV.score of RidgeCV(alphas=[1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000],\n", | |
" cv=100, fit_intercept=True, gcv_mode=None, normalize=True,\n", | |
" scoring='mean_squared_error', store_cv_values=False)>\n", | |
"coef: [ 1.75933803e-06 1.48886459e-05 -1.01434067e-04 ..., 6.07399560e-05\n", | |
" 6.08362233e-05 2.65297792e-05]\n" | |
] | |
} | |
], | |
"source": [ | |
"regressor = linear_model.RidgeCV(alphas=[10**x for x in range(10)],\\\n", | |
" normalize=True,\\\n", | |
" cv=100,\\\n", | |
" scoring='mean_squared_error')\n", | |
"regressor.fit(X_train, Y_train)\n", | |
"print \"alpha:\", regressor.alpha_\n", | |
"print \"score:\", regressor.score\n", | |
"print \"coef: \", regressor.coef_" | |
] | |
}, | |
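{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Note that printing `regressor.score` above shows the bound method itself rather than a number; calling it returns the R^2 on the training data. A quick check, not part of the original run:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"print \"training R^2:\", regressor.score(X_train, Y_train)" | |
] | |
}, | |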
{ | |
"cell_type": "code", | |
"execution_count": 223, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 \t70.2511645175\n", | |
"10 \t70.2511644912\n", | |
"100 \t70.2511642276\n", | |
"1000 \t70.2511616064\n", | |
"10000 \t70.2511368389\n", | |
"100000 \t70.2510319681\n", | |
"1000000 \t70.262798834\n", | |
"10000000 \t70.9567819985\n", | |
"100000000 \t77.654246471\n", | |
"1000000000 \t84.0398877839\n" | |
] | |
} | |
], | |
"source": [ | |
"for ii in [10**x for x in range(10)]:\n", | |
" scores = cross_val_score(linear_model.Ridge(alpha=ii, normalize=False),\\\n", | |
" X_train_sel, Y_train,\\\n", | |
" cv=10, scoring='mean_squared_error')\n", | |
" print ii,\"\\t\", sqrt(abs(scores)).mean()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 224, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 \t70.2511715134\n", | |
"10 \t70.251171487\n", | |
"100 \t70.2511712234\n", | |
"1000 \t70.2511686012\n", | |
"10000 \t70.2511438238\n", | |
"100000 \t70.2510388562\n", | |
"1000000 \t70.2628049139\n", | |
"10000000 \t70.9567855533\n", | |
"100000000 \t77.654244337\n", | |
"1000000000 \t84.0398853661\n" | |
] | |
} | |
], | |
"source": [ | |
"for ii in [10**x for x in range(10)]:\n", | |
" scores = cross_val_score(linear_model.Ridge(alpha=ii, normalize=False),\\\n", | |
" X_train, Y_train,\\\n", | |
" cv=10, scoring='mean_squared_error')\n", | |
" print ii,\"\\t\", sqrt(abs(scores)).mean()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |