stevenhao · December 13, 2017 04:54
diff --git a/lasso_and_svm.ipynb b/lasso_and_svm.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import Lasso, LogisticRegression, Ridge\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.utils import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_game_data(year):\n",
    "    \"\"\"\n",
    "    Load one season's game log and split it into regular season and playoffs.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    year: int, between 1997 and 2016\n",
    "\n",
    "    returns\n",
    "    ---------------\n",
    "    regular_season, playoffs: pd.DataFrame, consists of game-by-game data\n",
    "        from the regular season and playoffs in that year, has columns ['Week',\n",
    "        'Day', 'Date', 'Time', 'Winner/tie', 'Loser/tie', 'Home', 'Away', 'PtsW', 'PtsL',\n",
    "        'YdsW', 'TOW', 'YdsL', 'TOL']. When the file has no row whose 'Date' is\n",
    "        'Playoffs', every row is returned as regular season and playoffs is empty.\n",
    "    \"\"\"\n",
    "    df = pd.read_csv('game_data_csv/'+str(year)+\".csv\")\n",
    "    df = df.drop('Unnamed: 7', axis=1)\n",
    "    df = df.rename(columns={'Unnamed: 5': 'at'})\n",
    "    df.insert(loc=7, column='Home', value=None)\n",
    "    df.insert(loc=8, column='Away', value=None)\n",
    "\n",
    "    playoff_cutoff = None\n",
    "    for index, row in df.iterrows():\n",
    "        # '@' in the 'at' column means the listed winner was the visiting team\n",
    "        if row['at'] == '@':\n",
    "            df.loc[index, 'Home'] = row['Loser/tie']\n",
    "            df.loc[index, 'Away'] = row['Winner/tie']\n",
    "        else:\n",
    "            df.loc[index, 'Home'] = row['Winner/tie']\n",
    "            df.loc[index, 'Away'] = row['Loser/tie']\n",
    "        # the separator row is recognised by its 'Date' value\n",
    "        if row['Date'] == 'Playoffs':\n",
    "            playoff_cutoff = index\n",
    "        # only rows before the separator get their week cast to int\n",
    "        if playoff_cutoff is None:\n",
    "            df.loc[index, 'Week'] = int(row['Week'])\n",
    "\n",
    "    df = df.drop('at', axis=1)\n",
    "\n",
    "    if playoff_cutoff is None:\n",
    "        # no separator row: avoid `None - 1` and hand back an empty playoff frame\n",
    "        return df, df.iloc[0:0]\n",
    "\n",
    "    # .loc slicing is label based and inclusive, hence the -1 / +1 around the separator\n",
    "    regular_season = df.loc[:playoff_cutoff-1]\n",
    "    playoffs = df.loc[playoff_cutoff+1:]\n",
    "    return regular_season, playoffs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_teams(year=2016):\n",
    "    \"\"\"\n",
    "    List every team that appears in a season's regular-season games.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    year: int, between 1997 and 2016\n",
    "\n",
    "    returns\n",
    "    ---------------\n",
    "    teams: list of team names, de-duplicated through a set (order not defined)\n",
    "    \"\"\"\n",
    "    regular_season, _ = get_game_data(year)\n",
    "    winners = regular_season['Winner/tie'].tolist()\n",
    "    losers = regular_season['Loser/tie'].tolist()\n",
    "    return list(set(winners + losers))\n",
    "\n",
    "def get_cumulative_history(year):\n",
    "    \"\"\"\n",
    "    Accumulate per-team season totals week by week.\n",
    "\n",
    "    param\n",
    "    --------------\n",
    "    year: int, between 1997 and 2016\n",
    "\n",
    "    returns\n",
    "    --------------\n",
    "    history: list of 18 pd.DataFrame snapshots, indexed by week number (entry 0 is\n",
    "        all zeros). Each snapshot is indexed by team and has columns ['points_forced',\n",
    "        'points_allowed', 'win', 'loss'], cumulative up to and including that week.\n",
    "        A game with equal scores counts as 0.5 win and 0.5 loss for both teams.\n",
    "    \"\"\"\n",
    "    stat_columns = ['points_forced', 'points_allowed', 'win', 'loss']\n",
    "    teams = get_teams(year)\n",
    "    regular_season, _ = get_game_data(year)\n",
    "    res = pd.DataFrame(index=teams, columns=stat_columns)\n",
    "    res = res.fillna(0)\n",
    "\n",
    "    history = []\n",
    "    for week in range(0, 18):\n",
    "        # same rows, in the same order, that a per-row `Week == week` test would pick\n",
    "        this_week = regular_season[regular_season['Week'] == week]\n",
    "        for _, game in this_week.iterrows():\n",
    "            winner, loser = game['Winner/tie'], game['Loser/tie']\n",
    "            pts_w, pts_l = game['PtsW'], game['PtsL']\n",
    "            res.loc[winner, 'points_forced'] += pts_w\n",
    "            res.loc[winner, 'points_allowed'] += pts_l\n",
    "            res.loc[loser, 'points_forced'] += pts_l\n",
    "            res.loc[loser, 'points_allowed'] += pts_w\n",
    "            if pts_w == pts_l:\n",
    "                for team in (winner, loser):\n",
    "                    res.loc[team, 'win'] += 0.5\n",
    "                    res.loc[team, 'loss'] += 0.5\n",
    "            else:\n",
    "                res.loc[winner, 'win'] += 1\n",
    "                res.loc[loser, 'loss'] += 1\n",
    "        # snapshot, so later weeks do not mutate earlier entries\n",
    "        history.append(res.copy())\n",
    "    return history"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_cumulative_X_y(years, yvar='binary'):\n",
    "    \"\"\"\n",
    "    Build a design matrix and targets from cumulative season-to-date stats.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    years: iterable of int seasons to include\n",
    "    yvar: 'binary' (1 if the home team is the listed winner, 0 otherwise, ties\n",
    "        included), 'one_hot' ([home_win, not_home_win]) or 'score_diff' (winning\n",
    "        margin, positive when the home team is the listed winner)\n",
    "\n",
    "    returns\n",
    "    ---------------\n",
    "    data_X: np.array, one row per game: the home team's [points_forced,\n",
    "        points_allowed, win, loss] followed by the away team's, both read from\n",
    "        the history entry of the previous week\n",
    "    data_Y: np.array of targets in the same order\n",
    "\n",
    "    raises\n",
    "    ---------------\n",
    "    ValueError: if yvar is not one of the supported outcome types\n",
    "    \"\"\"\n",
    "    if yvar not in ('binary', 'one_hot', 'score_diff'):\n",
    "        raise ValueError('Outcome type not supported.')\n",
    "\n",
    "    data_X, data_Y = [], []\n",
    "\n",
    "    for year in years:\n",
    "        reg_games, po_games = get_game_data(year)\n",
    "        reg_history = get_cumulative_history(year)\n",
    "        for index, row in reg_games.iterrows():\n",
    "            # week 17 is excluded; this stops at the first week-17 row, so it\n",
    "            # relies on the rows being in week order\n",
    "            if row['Week'] == 17:\n",
    "                break\n",
    "            prior = reg_history[row['Week']-1]\n",
    "            home_features = list(prior.loc[row['Home']])\n",
    "            away_features = list(prior.loc[row['Away']])\n",
    "            data_X.append(home_features + away_features)\n",
    "\n",
    "            home_won = row['Winner/tie'] == row['Home']\n",
    "            if yvar == 'binary':\n",
    "                y = int(home_won)\n",
    "            elif yvar == 'one_hot':\n",
    "                y = [int(home_won), int(row['Winner/tie'] != row['Home'])]\n",
    "            else:\n",
    "                y = (row['PtsW'] - row['PtsL']) * (1 if home_won else -1)\n",
    "\n",
    "            data_Y.append(y)\n",
    "\n",
    "    data_X = np.array(data_X)\n",
    "    data_Y = np.array(data_Y)\n",
    "\n",
    "    return data_X, data_Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "def sort_vegas_lines():\n",
    "    '''\n",
    "    Parse point spreads from Vegas lines into dict w/ keys [year][week][home_team]\n",
    "\n",
    "    Reads betting_lines.pickle from the working directory. Each record is indexed\n",
    "    as (week, year, home, away, points_favored, ...). Records from 2017 onward and\n",
    "    records whose team fields contain '(' or a newline are skipped.\n",
    "    '''\n",
    "    # map a team's location (everything before the last word) to its nickname\n",
    "    team_name_dict = {}\n",
    "    team_list = get_teams(2016)\n",
    "    for team in team_list:\n",
    "        team_data = team.rsplit(\" \",1)\n",
    "        team_name_dict[team_data[0]] = team_data[1]\n",
    "\n",
    "    # pickle data is binary: open in 'rb' and close the handle when done.\n",
    "    # NOTE: pickle.load can execute arbitrary code; only load a trusted file.\n",
    "    with open(\"betting_lines.pickle\", 'rb') as handle:\n",
    "        xx = pickle.load(handle)\n",
    "    sorted_vegas_lines = sorted(xx, key = lambda x: x[1])\n",
    "    vegas_line_dict = {}\n",
    "    vegas_years = range(2006,2017)\n",
    "    for year in vegas_years:\n",
    "        year_dict = {}\n",
    "        for week in range(1,18):\n",
    "            year_dict[week] = {}\n",
    "        vegas_line_dict[year] = year_dict\n",
    "\n",
    "    for game in sorted_vegas_lines:\n",
    "        week = game[0]\n",
    "        year = game[1]\n",
    "        # records are sorted by year, so everything after this is 2017+\n",
    "        if year >= 2017:\n",
    "            break\n",
    "        home_team = str(game[2]).strip()\n",
    "        away_team = str(game[3]).strip()\n",
    "        if '(' in home_team or '\\n' in home_team or '(' in away_team or '\\n' in away_team:\n",
    "            continue\n",
    "        elif home_team == 'NY Giants':\n",
    "            home_team = 'New York Giants'\n",
    "        elif home_team == 'NY Jets':\n",
    "            home_team = 'New York Jets'\n",
    "        elif home_team == 'St. Louis':\n",
    "            home_team = 'Los Angeles Rams'\n",
    "        else:\n",
    "            home_team = home_team + \" \" + team_name_dict[home_team]\n",
    "        points_favored = game[4]\n",
    "        vegas_line_dict[year][week][home_team] = points_favored\n",
    "    return vegas_line_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "#======================EWMA code here======================\n",
    "\n",
    "new_names = {'Tennessee Oilers': 'Tennessee Titans', 'St. Louis Rams': 'Los Angeles Rams'}\n",
    "name_changes = lambda x: new_names.get(x, x)\n",
    "\n",
    "def get_continuous_game_data(start=1997, end=2016, vegas=None):\n",
    "    \"\"\"\n",
    "    Load several seasons into one frame with a running week counter.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    start, end: int, first and last season to load (inclusive)\n",
    "    vegas: optional dict keyed [year][week][home_team] -> point spread. When\n",
    "        given, a 'Vegas_lines' column is added; games missing from the dict\n",
    "        get the fallback value 2.361.\n",
    "\n",
    "    returns\n",
    "    ---------------\n",
    "    data: pd.DataFrame of the games before week 17 of every season, with old\n",
    "        team names mapped through new_names and extra columns 'Year', 'Home',\n",
    "        'Away' and 'Week_id' = (year - start) * 16 + week. None when the\n",
    "        year range is empty.\n",
    "    \"\"\"\n",
    "    frames = []\n",
    "    for year in range(start, end+1):\n",
    "        df = pd.read_csv('game_data_csv/'+str(year)+\".csv\")\n",
    "        df = df.drop('Unnamed: 7', axis=1)\n",
    "        # DataFrame.applymap was renamed to DataFrame.map in newer pandas\n",
    "        elementwise = df.map if hasattr(df, 'map') else df.applymap\n",
    "        df = elementwise(name_changes)\n",
    "        df = df.rename(columns={'Unnamed: 5': 'at'})\n",
    "        df.insert(loc=7, column='Home', value=None)\n",
    "        df.insert(loc=8, column='Away', value=None)\n",
    "        df.insert(loc=0, column='Year', value=year)\n",
    "        df.insert(loc=0, column='Week_id', value=year)\n",
    "        # non-numeric week labels become 0\n",
    "        df['Week'] = pd.to_numeric(df['Week'], errors='coerce').fillna(0).astype(int)\n",
    "        if vegas is not None:\n",
    "            df.insert(loc=12, column='Vegas_lines', value=None)\n",
    "\n",
    "        for index, row in df.iterrows():\n",
    "            # drop week 17 and everything listed after it\n",
    "            if row['Week'] == 17:\n",
    "                df = df.loc[:index-1]\n",
    "                break\n",
    "            # '@' means the listed winner was the visiting team\n",
    "            if row['at'] == '@':\n",
    "                df.loc[index, 'Home'] = row['Loser/tie']\n",
    "                df.loc[index, 'Away'] = row['Winner/tie']\n",
    "            else:\n",
    "                df.loc[index, 'Home'] = row['Winner/tie']\n",
    "                df.loc[index, 'Away'] = row['Loser/tie']\n",
    "            df.loc[index, 'Week_id'] = (row['Year'] - start) * 16 + row['Week']\n",
    "            if vegas is not None:\n",
    "                try:\n",
    "                    # use the argument, not a notebook-level global\n",
    "                    df.loc[index, 'Vegas_lines'] = vegas[row['Year']][row['Week']][df.loc[index, 'Home']]\n",
    "                except KeyError:\n",
    "                    df.loc[index, 'Vegas_lines'] = 2.361 # avg pt spread\n",
    "        df = df.drop('at', axis=1)\n",
    "        frames.append(df)\n",
    "\n",
    "    if not frames:\n",
    "        return None\n",
    "    # one concat instead of DataFrame.append per year (append was removed in pandas 2.0)\n",
    "    return pd.concat(frames, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "def average_pts_yds(games):\n",
    "    \"\"\"\n",
    "    Mean points and mean yards per team per game over all rows of `games`.\n",
    "\n",
    "    Sums the winner and loser columns and divides by twice the number of\n",
    "    games, since each game contributes two team performances.\n",
    "    \"\"\"\n",
    "    total_pts, total_yds = 0, 0\n",
    "    columns = zip(games['PtsW'], games['PtsL'], games['YdsW'], games['YdsL'])\n",
    "    for pts_w, pts_l, yds_w, yds_l in columns:\n",
    "        total_pts += pts_w + pts_l\n",
    "        total_yds += yds_w + yds_l\n",
    "    team_games = len(games) * 2\n",
    "    return total_pts / team_games, total_yds / team_games\n",
    "\n",
    "def generate_EWMA(games, decay, cross_season_decay=0.8):\n",
    "    \"\"\"\n",
    "    Track an exponentially weighted moving average of each team's stats.\n",
    "\n",
    "    param\n",
    "    --------------\n",
    "    games: pd.DataFrame with 'Year', 'Week_id', 'Winner/tie', 'Loser/tie', 'PtsW',\n",
    "        'PtsL', 'YdsW' and 'YdsL' columns and a row labelled 0. Week_id must go up\n",
    "        one at a time starting from 1 and Year one at a time (both are asserted).\n",
    "    decay, cross_season_decay: floats combined into the weight kept on the old\n",
    "        value; each update is old * weight + observed * (1 - weight).\n",
    "\n",
    "    returns\n",
    "    --------------\n",
    "    ewma: list of pd.DataFrame snapshots indexed by team with columns\n",
    "        ['yards_forced', 'yards_allowed', 'points_forced', 'points_allowed', 'win'].\n",
    "        Entry k holds the averages after every game with Week_id <= k; entry 0 is\n",
    "        the initial state (league averages, win = 0.5).\n",
    "    \"\"\"\n",
    "    teams = get_teams()\n",
    "    avg_pts, avg_yds = average_pts_yds(games)\n",
    "    initialization = {'yards_forced':  avg_yds, 'yards_allowed': avg_yds, 'points_forced': avg_pts, 'points_allowed': avg_pts, 'win': 0.5, 'loss': 0.5}\n",
    "    # 'loss' stays in the initial values but is not one of the tracked columns\n",
    "    tracked = ['yards_forced', 'yards_allowed', 'points_forced', 'points_allowed', 'win']\n",
    "    curr = pd.DataFrame(index=teams, columns=tracked, data=initialization)\n",
    "\n",
    "    def blend(team, stat, observed, weight):\n",
    "        # decay the old value, then mix in the new observation\n",
    "        curr.loc[team, stat] *= weight\n",
    "        curr.loc[team, stat] += observed * (1 - weight)\n",
    "\n",
    "    year = games.loc[0, 'Year']\n",
    "    week_id = 0\n",
    "    weight = cross_season_decay\n",
    "\n",
    "    snapshots = []\n",
    "    for _, game in games.iterrows():\n",
    "        # entering a new week: store the state as it was before this week's games\n",
    "        if week_id != game['Week_id']:\n",
    "            assert week_id + 1 == game['Week_id']\n",
    "            snapshots.append(curr.copy())\n",
    "            week_id = game['Week_id']\n",
    "\n",
    "        # NOTE(review): the pure cross-season weight is used only for the single row\n",
    "        # on which Year changes; every other row uses the 50/50 mix - confirm intended\n",
    "        if year != game['Year']:\n",
    "            assert year + 1 == game['Year']\n",
    "            weight = cross_season_decay\n",
    "            year = game['Year']\n",
    "        else:\n",
    "            weight = 0.5 * decay + 0.5 * cross_season_decay\n",
    "\n",
    "        winner, loser = game['Winner/tie'], game['Loser/tie']\n",
    "        blend(winner, 'points_forced', game['PtsW'], weight)\n",
    "        blend(winner, 'points_allowed', game['PtsL'], weight)\n",
    "        blend(winner, 'yards_forced', game['YdsW'], weight)\n",
    "        blend(winner, 'yards_allowed', game['YdsL'], weight)\n",
    "        blend(loser, 'points_forced', game['PtsL'], weight)\n",
    "        blend(loser, 'points_allowed', game['PtsW'], weight)\n",
    "        blend(loser, 'yards_forced', game['YdsL'], weight)\n",
    "        blend(loser, 'yards_allowed', game['YdsW'], weight)\n",
    "\n",
    "        # equal scores count as half a win for both sides\n",
    "        if game['PtsW'] == game['PtsL']:\n",
    "            blend(winner, 'win', 0.5, weight)\n",
    "            blend(loser, 'win', 0.5, weight)\n",
    "        else:\n",
    "            blend(winner, 'win', 1, weight)\n",
    "            blend(loser, 'win', 0, weight)\n",
    "    snapshots.append(curr.copy())\n",
    "    return snapshots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_continuous_X_y(games, stats, yvar='binary'):\n",
    "    \"\"\"\n",
    "    Turn a game table plus per-week team stats into feature/target arrays.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    games: pd.DataFrame with 'Week_id', 'Home', 'Away', 'Winner/tie', 'PtsW' and\n",
    "        'PtsL' columns, optionally a 'Vegas_lines' column\n",
    "    stats: sequence of team-indexed pd.DataFrame; a game with Week_id w reads\n",
    "        stats[w - 1]\n",
    "    yvar: 'binary' (1 if the home team is the listed winner, 0 otherwise, ties\n",
    "        included), 'one_hot' ([home_win, not_home_win]) or 'score_diff' (winning\n",
    "        margin, positive when the home team is the listed winner)\n",
    "\n",
    "    returns\n",
    "    ---------------\n",
    "    X: np.array, home features followed by away features and, when the game\n",
    "        table carries one, the Vegas line as the last feature\n",
    "    y: np.array of targets\n",
    "\n",
    "    raises\n",
    "    ---------------\n",
    "    Exception: if yvar is not a supported outcome type\n",
    "    \"\"\"\n",
    "    X, y = [], []\n",
    "    for _, row in games.iterrows():\n",
    "        week_stats = stats[int(row['Week_id']-1)]\n",
    "        features = list(week_stats.loc[row['Home']]) + list(week_stats.loc[row['Away']])\n",
    "        # the loader names the column 'Vegas_lines'; 'vegas_line' is still accepted\n",
    "        for line_column in ('Vegas_lines', 'vegas_line'):\n",
    "            if line_column in row:\n",
    "                features += [row[line_column]]\n",
    "                break\n",
    "        X.append(features)\n",
    "        if yvar == 'binary':\n",
    "            y.append(int(row['Winner/tie'] == row['Home']))\n",
    "        elif yvar == 'one_hot':\n",
    "            y.append([int(row['Winner/tie'] == row['Home']), int(row['Winner/tie'] != row['Home'])])\n",
    "        elif yvar == 'score_diff':\n",
    "            y.append((row['PtsW'] - row['PtsL']) * (1 if row['Winner/tie'] == row['Home'] else -1))\n",
    "        else:\n",
    "            raise Exception('Outcome type not supported.')\n",
    "    X = np.array(X)\n",
    "    y = np.array(y)\n",
    "    return X, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "vegas_lines = sort_vegas_lines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# games = get_continuous_game_data(1997, 2016)\n",
    "games = get_continuous_game_data(2006, 2016)\n",
    "# games = get_continuous_game_data(2006, 2016, vegas_lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished processing EWMA with decay = 8.70e-01\n",
      "Finished processing EWMA with decay = 9.00e-01\n",
      "Finished processing EWMA with decay = 9.30e-01\n"
     ]
    }
   ],
   "source": [
    "ewma = {}\n",
    "for decay in [0.87, 0.90, 0.93]:\n",
    "    ewma[decay] = generate_EWMA(games, decay, cross_season_decay=0.8)\n",
    "    print('Finished processing EWMA with decay = {:.2e}'.format(decay))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_EWMA_train_test(decay, yvar='binary', random_state=0):\n",
    "    \"\"\"\n",
    "    Shuffle the EWMA feature set and cut it into three CV folds plus a test set.\n",
    "\n",
    "    Reads the notebook-level `games` and `ewma` objects.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    decay: key into `ewma` (asserted)\n",
    "    yvar: outcome type forwarded to get_continuous_X_y\n",
    "    random_state: seed forwarded to sklearn.utils.shuffle. It is fixed by default\n",
    "        so that repeated calls for the same decay return the same split and the\n",
    "        test rows never overlap rows used for validation in an earlier call;\n",
    "        pass None for a fresh shuffle on every call.\n",
    "\n",
    "    returns\n",
    "    ---------------\n",
    "    X_train, Y_train: lists of three folds of len(X) // 4 rows each\n",
    "    X_test, Y_test: the remaining rows\n",
    "    \"\"\"\n",
    "    assert decay in ewma\n",
    "    X, y = get_continuous_X_y(games, ewma[decay], yvar=yvar)\n",
    "    X, y = shuffle(X, y, random_state=random_state)\n",
    "\n",
    "    num = len(X) // 4\n",
    "    X_train = [X[i * num: (i+1) * num] for i in range(3)]\n",
    "    Y_train = [y[i * num: (i+1) * num] for i in range(3)]\n",
    "\n",
    "    X_test = X[3 * num: ]\n",
    "    Y_test = y[3 * num: ]\n",
    "    return X_train, Y_train, X_test, Y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_acc(X, y, predict, yvar='binary'):\n",
    "    \"\"\"\n",
    "    Fraction of samples whose predicted class equals the observed outcome.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    X, y: indexable samples and targets of equal length (must be non-empty)\n",
    "    predict: callable taking a one-sample list [x] and returning the class\n",
    "    yvar: how y is turned into a 0/1 outcome - 'binary' uses y as is,\n",
    "        'score_diff' uses y > 0, 'sigmoid_score_diff' uses y > 0.5\n",
    "\n",
    "    raises\n",
    "    ---------------\n",
    "    ValueError: if yvar is not one of the three supported values\n",
    "    \"\"\"\n",
    "    thresholds = {'score_diff': 0, 'sigmoid_score_diff': 0.5}\n",
    "    if yvar != 'binary' and yvar not in thresholds:\n",
    "        # previously this left `outcome` unbound (or stale from an earlier sample)\n",
    "        raise ValueError('Outcome type not supported: {}'.format(yvar))\n",
    "\n",
    "    correct = 0\n",
    "    for i in range(len(X)):\n",
    "        if yvar == 'binary':\n",
    "            outcome = y[i]\n",
    "        else:\n",
    "            outcome = int(y[i] > thresholds[yvar])\n",
    "        if predict([X[i]]) == outcome:\n",
    "            correct += 1\n",
    "    return 1.*correct / len(X)\n",
    "\n",
    "def predict_func(obj, model):\n",
    "    \"\"\"\n",
    "    Wrap a fitted estimator so that it returns a class for a one-sample batch.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    obj: fitted estimator exposing predict(x)\n",
    "    model: 'svm' or any name starting with 'log_reg' (predict is returned as is),\n",
    "        'linear_binary' / 'linear_sigmoid_score_diff' (1 when predict > 0.5) or\n",
    "        'linear_score_diff' (1 when predict > 0)\n",
    "\n",
    "    raises\n",
    "    ---------------\n",
    "    ValueError: for any other model name (previously None was returned and the\n",
    "        failure only surfaced when the result was called)\n",
    "    \"\"\"\n",
    "    if model == 'svm' or model.startswith('log_reg'):\n",
    "        return lambda x: obj.predict(x)\n",
    "    if model in ('linear_binary', 'linear_sigmoid_score_diff'):\n",
    "        return lambda x: 1 if obj.predict(x) > 0.5 else 0\n",
    "    if model == 'linear_score_diff':\n",
    "        return lambda x: 1 if obj.predict(x) > 0 else 0\n",
    "    raise ValueError('Model type not supported: {}'.format(model))\n",
    "\n",
    "def svm_train(X, y, kernel, C, gamma='auto', max_iter=1e5):\n",
    "    \"\"\"Fit and return an SVC with the given kernel, C, gamma and iteration cap.\"\"\"\n",
    "    # callers pass floats such as 1e5; scikit-learn validates max_iter as an integer\n",
    "    svm = SVC(C=C, kernel=kernel, gamma=gamma, max_iter=int(max_iter))\n",
    "    svm.fit(X, y)\n",
    "    return svm\n",
    "\n",
    "def svm_cv_train(X_train, Y_train, kernel, C, gamma='auto', max_iter=1e5, suppress=False):\n",
    "    \"\"\"\n",
    "    3-fold cross-validate an SVM and return the mean validation accuracy.\n",
    "\n",
    "    X_train / Y_train are lists of three folds. Round i fits on folds i and\n",
    "    (i+1) % 3 and validates on fold (i+2) % 3. Progress is printed unless\n",
    "    suppress is True.\n",
    "    \"\"\"\n",
    "    verbose = not suppress\n",
    "    if verbose:\n",
    "        print('SVM (C = {:.2e})'.format(C))\n",
    "\n",
    "    train_acc = [0] * 3\n",
    "    val_acc = [0] * 3\n",
    "    for fold in range(3):\n",
    "        if verbose:\n",
    "            print('----Cross Validation Set {}----'.format(fold+1))\n",
    "\n",
    "        second, held_out = (fold+1) % 3, (fold+2) % 3\n",
    "        X = np.concatenate([X_train[fold], X_train[second]])\n",
    "        Y = np.concatenate([Y_train[fold], Y_train[second]])\n",
    "        svm = svm_train(X, Y, C=C, kernel=kernel, gamma=gamma, max_iter=max_iter)\n",
    "        train_acc[fold] = svm.score(X, Y)\n",
    "        val_acc[fold] = svm.score(X_train[held_out], Y_train[held_out])\n",
    "\n",
    "        if verbose:\n",
    "            print('train acc: {:.2%}'.format(train_acc[fold]))\n",
    "            print('val acc: {:.2%}\\n'.format(val_acc[fold]))\n",
    "\n",
    "    return 1.*sum(val_acc)/3\n",
    "\n",
    "def lasso_train(X, y, alpha, fit_intercept=True, max_iter=1e6):\n",
    "    \"\"\"Fit and return a Lasso regressor with the given alpha.\"\"\"\n",
    "    # callers pass floats such as 1e6; scikit-learn validates max_iter as an integer\n",
    "    lasso = Lasso(alpha=alpha, fit_intercept=fit_intercept, max_iter=int(max_iter))\n",
    "    lasso.fit(X, y)\n",
    "    return lasso\n",
    "\n",
    "def lasso_cv_train(X_train, Y_train, alpha, fit_intercept=True, max_iter=1e6, suppress=False, yvar='binary'):\n",
    "    \"\"\"\n",
    "    3-fold cross-validate a Lasso model and return the mean validation accuracy.\n",
    "\n",
    "    X_train / Y_train are lists of three folds. Round i fits on folds i and\n",
    "    (i+1) % 3 and validates on fold (i+2) % 3. Predictions are thresholded by\n",
    "    predict_func('linear_' + yvar). Progress is printed unless suppress is True.\n",
    "    \"\"\"\n",
    "    if not suppress:\n",
    "        print('Lasso (alpha = {:.2e})'.format(alpha))\n",
    "\n",
    "    train_acc = [0, 0, 0]\n",
    "    val_acc = [0, 0, 0]\n",
    "    for i in range(3):\n",
    "        if not suppress:\n",
    "            print('----Cross Validation Set {}----'.format(i+1))\n",
    "\n",
    "        X = np.concatenate([X_train[i], X_train[(i+1)%3]])\n",
    "        Y = np.concatenate([Y_train[i], Y_train[(i+1)%3]])\n",
    "        lasso = lasso_train(X, Y, alpha, fit_intercept, max_iter)\n",
    "        predict = predict_func(lasso, 'linear_' + yvar)\n",
    "        # training accuracy is measured on everything the model was fitted on,\n",
    "        # not just the first of its two folds\n",
    "        train_acc[i] = predict_acc(X, Y, predict, yvar=yvar)\n",
    "        val_acc[i] = predict_acc(X_train[(i+2)%3], Y_train[(i+2)%3], predict, yvar=yvar)\n",
    "\n",
    "        if not suppress:\n",
    "            print('train acc: {:.2%}'.format(train_acc[i]))\n",
    "            print('val acc: {:.2%}\\n'.format(val_acc[i]))\n",
    "\n",
    "    return 1.*sum(val_acc)/3\n",
    "\n",
    "def ridge_train(X, y, alpha, fit_intercept=True, max_iter=1e6):\n",
    "    \"\"\"Fit and return a Ridge regressor with the given alpha.\"\"\"\n",
    "    # callers pass floats such as 1e6; scikit-learn validates max_iter as an\n",
    "    # integer (None is also accepted and is passed through untouched)\n",
    "    iterations = None if max_iter is None else int(max_iter)\n",
    "    ridge = Ridge(alpha=alpha, fit_intercept=fit_intercept, max_iter=iterations)\n",
    "    ridge.fit(X, y)\n",
    "    return ridge\n",
    "\n",
    "def ridge_cv_train(X_train, Y_train, alpha, fit_intercept=True, max_iter=1e6, suppress=False, yvar='binary'):\n",
    "    \"\"\"\n",
    "    3-fold cross-validate a Ridge model and return the mean validation accuracy.\n",
    "\n",
    "    X_train / Y_train are lists of three folds. Round i fits on folds i and\n",
    "    (i+1) % 3 and validates on fold (i+2) % 3. Predictions are thresholded by\n",
    "    predict_func('linear_' + yvar). Progress is printed unless suppress is True.\n",
    "    \"\"\"\n",
    "    if not suppress:\n",
    "        print('Ridge (alpha = {:.2e})'.format(alpha))\n",
    "\n",
    "    train_acc = [0, 0, 0]\n",
    "    val_acc = [0, 0, 0]\n",
    "    for i in range(3):\n",
    "        if not suppress:\n",
    "            print('----Cross Validation Set {}----'.format(i+1))\n",
    "\n",
    "        X = np.concatenate([X_train[i], X_train[(i+1)%3]])\n",
    "        Y = np.concatenate([Y_train[i], Y_train[(i+1)%3]])\n",
    "        ridge = ridge_train(X, Y, alpha, fit_intercept, max_iter)\n",
    "        predict = predict_func(ridge, 'linear_' + yvar)\n",
    "        # training accuracy is measured on everything the model was fitted on,\n",
    "        # not just the first of its two folds\n",
    "        train_acc[i] = predict_acc(X, Y, predict, yvar=yvar)\n",
    "        val_acc[i] = predict_acc(X_train[(i+2)%3], Y_train[(i+2)%3], predict, yvar=yvar)\n",
    "\n",
    "        if not suppress:\n",
    "            print('train acc: {:.2%}'.format(train_acc[i]))\n",
    "            print('val acc: {:.2%}\\n'.format(val_acc[i]))\n",
    "\n",
    "    return 1.*sum(val_acc)/3\n",
    "\n",
    "def log_reg_train(X, y, C, penalty='l2', fit_intercept=True, max_iter=1e6):\n",
    "    \"\"\"Fit and return a LogisticRegression classifier with the given C and penalty.\"\"\"\n",
    "    # callers pass floats such as 1e6; scikit-learn validates max_iter as an integer\n",
    "    log_reg = LogisticRegression(C=C, penalty=penalty, fit_intercept=fit_intercept, max_iter=int(max_iter))\n",
    "    log_reg.fit(X, y)\n",
    "    # return the model fitted here (the old code returned the unrelated name `ridge`)\n",
    "    return log_reg\n",
    "\n",
    "def log_reg_cv_train(X_train, Y_train, C, penalty='l2', fit_intercept=True, max_iter=1e6, suppress=False, yvar='binary'):\n",
    "    \"\"\"\n",
    "    3-fold cross-validate logistic regression and return the mean validation accuracy.\n",
    "\n",
    "    X_train / Y_train are lists of three folds. Round i fits on folds i and\n",
    "    (i+1) % 3 and validates on fold (i+2) % 3. The classifier's own predict is\n",
    "    compared with the outcome predict_acc derives from yvar. Progress is\n",
    "    printed unless suppress is True.\n",
    "    \"\"\"\n",
    "    if not suppress:\n",
    "        print('Logistic Regression (reg = {}, C = {:.2e})'.format(penalty, C))\n",
    "\n",
    "    train_acc = [0, 0, 0]\n",
    "    val_acc = [0, 0, 0]\n",
    "    for i in range(3):\n",
    "        if not suppress:\n",
    "            print('----Cross Validation Set {}----'.format(i+1))\n",
    "\n",
    "        X = np.concatenate([X_train[i], X_train[(i+1)%3]])\n",
    "        Y = np.concatenate([Y_train[i], Y_train[(i+1)%3]])\n",
    "        log_reg = log_reg_train(X, Y, C, penalty, fit_intercept, max_iter)\n",
    "        # a classifier already predicts the class label, so no thresholding wrapper\n",
    "        # is needed; training accuracy covers both folds the model was fitted on\n",
    "        train_acc[i] = predict_acc(X, Y, log_reg.predict, yvar=yvar)\n",
    "        val_acc[i] = predict_acc(X_train[(i+2)%3], Y_train[(i+2)%3], log_reg.predict, yvar=yvar)\n",
    "\n",
    "        if not suppress:\n",
    "            print('train acc: {:.2%}'.format(train_acc[i]))\n",
    "            print('val acc: {:.2%}\\n'.format(val_acc[i]))\n",
    "\n",
    "    return 1.*sum(val_acc)/3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress: 5 / 25.\n",
      "Progress: 10 / 25.\n",
      "Progress: 15 / 25.\n",
      "Progress: 20 / 25.\n",
      "Progress: 25 / 25.\n",
      "\n",
      "Best Linear SVM: decay = 9.30e-01, C = 3.91e-03, val acc = 64.80%\n",
      "================END================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# hyperparameter sweep for linear SVM\n",
    "best_lin_C = None\n",
    "best_lin_decay = None\n",
    "best_lin_acc = 0\n",
    "\n",
    "# build the split for each decay once: every C is then scored on the same folds\n",
    "# and the feature matrix is not rebuilt for each of the 25 values of C\n",
    "lin_splits = {decay: get_EWMA_train_test(decay) for decay in ewma}\n",
    "\n",
    "for i, j in enumerate(range(-20, 5)):\n",
    "    if (i+1) % 5 == 0:\n",
    "        print('Progress: {} / 25.'.format(i+1))\n",
    "\n",
    "    C = 2**j\n",
    "    for decay in ewma:\n",
    "        X_train, Y_train, X_test, Y_test = lin_splits[decay]\n",
    "        val_acc = svm_cv_train(X_train, Y_train, kernel='linear', C=C, max_iter= 1e5, suppress=True)\n",
    "\n",
    "        if val_acc > best_lin_acc:\n",
    "            best_lin_decay = decay\n",
    "            best_lin_acc = val_acc\n",
    "            best_lin_C = C\n",
    "\n",
    "print('\\nBest Linear SVM: decay = {:.2e}, C = {:.2e}, val acc = {:.2%}'.format(best_lin_decay, best_lin_C, best_lin_acc))\n",
    "print('================END================\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Linear SVM (decay = 9.30e-01, C = 3.91e-03), training acc: 64.04%, test acc: 63.18%\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Report test accuracy for linear SVM (after retraining on entire training set)\n",
    "decay_opt = best_lin_decay\n",
    "C_opt = best_lin_C\n",
    "\n",
    "X_train, Y_train, X_test, Y_test = get_EWMA_train_test(decay_opt)\n",
    "X = np.concatenate(X_train)\n",
    "Y = np.concatenate(Y_train)\n",
    "svm = svm_train(X, Y, 'linear', C_opt, max_iter=1e6)\n",
    "print('Linear SVM (decay = {:.2e}, C = {:.2e}), training acc: {:.2%}, test acc: {:.2%}\\n'.format(decay_opt, C_opt, svm.score(X, Y), svm.score(X_test, Y_test)))\n",
    "# print('w, w_0 = {}, {}\\n'.format(svm.coef_[0], svm.intercept_))"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "# randomized hyperparameter search for Gaussian SVM\n",
    "\n",
    "best_rbf_C, best_rbf_gamma = None, None\n",
    "best_rbf_acc = 0\n",
    "\n",
    "for i in range(100):\n",
    "    if (i+1) % 10 == 0:\n",
    "        print('Progress: {} / 100.'.format(i+1))\n",
    "        \n",
    "    C = 2**(-random.randint(-20, 5))\n",
    "    gamma = 2**(-random.randint(-20, 5))\n",
    "    val_acc = svm_cv_train(X_train, Y_train, kernel='rbf', C=C, gamma=gamma, max_iter=1e5, suppress=True)\n",
    "    #     print('Gaussian RBF SVM: C = {:.2e}, gamma = {:.2e}'.format(C, gamma))\n",
    "\n",
    "    if val_acc > best_rbf_acc:\n",
    "        best_rbf_acc = val_acc\n",
    "        best_rbf_C, best_rbf_gamma = C, gamma\n",
    "\n",
    "print('\\nBest Gaussian RBF SVM: C = {:.2e}, gamma = {:.2e}, val acc = {:.2%}'.format(best_rbf_C, best_rbf_gamma, best_rbf_acc))\n",
    "print('================END================\\n')"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "X = np.concatenate(X_train)\n",
    "Y = np.concatenate(Y_train)\n",
    "C_rbf_opt = 2.0\n",
    "gamma_opt = 6.25e-02\n",
    "svm2 = svm_train(X, Y, 'rbf', C_rbf_opt, gamma_opt, max_iter=1e5)\n",
    "print('Gaussian RBF SVM (C = {:.2e}, gamma = {:.2e}), test acc: {:.2%}'.format(C_rbf_opt, gamma_opt, svm2.score(X_test, Y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress: 5 / 25.\n",
      "Progress: 10 / 25.\n",
      "Progress: 15 / 25.\n",
      "Progress: 20 / 25.\n",
      "Progress: 25 / 25.\n",
      "\n",
      "Best Lasso Model (yvar='binary'): decay = 9.00e-01, alpha = 6.10e-05, val acc = 64.65%\n",
      "================END================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# hyperparameter sweep for lasso trained on binary outcomes\n",
    "best_lasso_decay = None\n",
    "best_lasso_alpha = None\n",
    "best_lasso_acc = 0\n",
    "\n",
    "# build the split for each decay once: every alpha is then scored on the same\n",
    "# folds and the feature matrix is not rebuilt for each of the 25 values of alpha\n",
    "lasso_splits = {decay: get_EWMA_train_test(decay, yvar='binary') for decay in ewma}\n",
    "\n",
    "for i, j in enumerate(range(-20, 5)):\n",
    "    if (i+1) % 5 == 0:\n",
    "        print('Progress: {} / 25.'.format(i+1))\n",
    "\n",
    "    alpha = 2**j\n",
    "    for decay in ewma:\n",
    "        X_train, Y_train, X_test, Y_test = lasso_splits[decay]\n",
    "        val_acc = lasso_cv_train(X_train, Y_train, alpha=alpha, max_iter=1e6, suppress=True)\n",
    "\n",
    "        if val_acc > best_lasso_acc:\n",
    "            best_lasso_acc = val_acc\n",
    "            best_lasso_alpha = alpha\n",
    "            best_lasso_decay = decay\n",
    "print('\\nBest Lasso Model (yvar=\\'binary\\'): decay = {:.2e}, alpha = {:.2e}, val acc = {:.2%}'.format(best_lasso_decay, best_lasso_alpha, best_lasso_acc))\n",
    "print('================END================\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Lasso (decay = 9.00e-01, alpha = 6.10e-05, yvar='binary'), training acc: 64.04%, test acc: 60.76%\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Report test accuracy for lasso trained on binary outcomes\n",
    "decay_opt = best_lasso_decay\n",
    "alpha_opt = best_lasso_alpha\n",
    "\n",
    "X_train, Y_train, X_test, Y_test = get_EWMA_train_test(decay_opt, yvar='binary')\n",
    "X = np.concatenate(X_train)\n",
    "Y = np.concatenate(Y_train)\n",
    "lasso = lasso_train(X, Y, alpha_opt)\n",
    "\n",
    "print('Lasso (decay = {:.2e}, alpha = {:.2e}, yvar=\\'binary\\'), training acc: {:.2%}, test acc: {:.2%}\\n'.format(decay_opt, alpha_opt,\n",
    "    predict_acc(X, Y, predict_func(lasso, 'linear_binary')), predict_acc(X_test, Y_test, predict_func(lasso, 'linear_binary'))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress: 5 / 25.\n",
      "Progress: 10 / 25.\n",
      "Progress: 15 / 25.\n",
      "Progress: 20 / 25.\n",
      "Progress: 25 / 25.\n",
      "\n",
      "Best Lasso Model (yvar='score_diff'): decay = 8.70e-01, alpha = 9.77e-04, val acc = 65.15%\n",
      "================END================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# hyperparameter sweep for lasso trained on score differentials\n",
    "best_lasso_sd_decay = None\n",
    "best_lasso_sd_alpha = None\n",
    "best_lasso_sd_acc = 0\n",
    "\n",
    "# build the split for each decay once: every alpha is then scored on the same\n",
    "# folds and the feature matrix is not rebuilt for each of the 25 values of alpha\n",
    "lasso_sd_splits = {decay: get_EWMA_train_test(decay, yvar='score_diff') for decay in ewma}\n",
    "\n",
    "for i, j in enumerate(range(-20, 5)):\n",
    "    if (i+1) % 5 == 0:\n",
    "        print('Progress: {} / 25.'.format(i+1))\n",
    "\n",
    "    alpha = 2**j\n",
    "\n",
    "    for decay in ewma:\n",
    "        X_train, Y_train, X_test, Y_test = lasso_sd_splits[decay]\n",
    "        val_acc = lasso_cv_train(X_train, Y_train, alpha=alpha, max_iter=1e6, suppress=True, yvar='score_diff')\n",
    "\n",
    "        if val_acc > best_lasso_sd_acc:\n",
    "            best_lasso_sd_acc = val_acc\n",
    "            best_lasso_sd_alpha = alpha\n",
    "            best_lasso_sd_decay = decay\n",
    "\n",
    "print('\\nBest Lasso Model (yvar=\\'score_diff\\'): decay = {:.2e}, alpha = {:.2e}, val acc = {:.2%}'.format(best_lasso_sd_decay,\n",
    "                                                                                                          best_lasso_sd_alpha, best_lasso_sd_acc))\n",
    "print('================END================\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Lasso (decay = 8.70e-01, alpha = 9.77e-04, yvar='score_diff'), training acc: 62.98%, test acc: 64.70%\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Final report for the lasso trained on score differentials: refit on the\n",
    "# concatenated training data with the best_lasso_sd_* hyperparameters, then\n",
    "# print training and test accuracy.\n",
    "decay_opt, alpha_opt = best_lasso_sd_decay, best_lasso_sd_alpha\n",
    "\n",
    "X_train, Y_train, X_test, Y_test = get_EWMA_train_test(decay_opt, yvar='score_diff')\n",
    "X, Y = np.concatenate(X_train), np.concatenate(Y_train)\n",
    "lasso = lasso_train(X, Y, alpha_opt, max_iter=1e7)\n",
    "\n",
    "sd_train_acc = predict_acc(X, Y, predict_func(lasso, 'linear_score_diff'), yvar='score_diff')\n",
    "sd_test_acc = predict_acc(X_test, Y_test, predict_func(lasso, 'linear_score_diff'), yvar='score_diff')\n",
    "sd_report = 'Lasso (decay = {:.2e}, alpha = {:.2e}, yvar=\\'score_diff\\'), training acc: {:.2%}, test acc: {:.2%}\\n'\n",
    "print(sd_report.format(decay_opt, alpha_opt, sd_train_acc, sd_test_acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sigmoid(z, k=1):\n",
    "    \"\"\"\n",
    "    Logistic sigmoid 1 / (1 + exp(-k*z)), applied elementwise.\n",
    "\n",
    "    param\n",
    "    ---------------\n",
    "    z: scalar or array-like of numbers\n",
    "    k: number, steepness of the curve (default 1)\n",
    "\n",
    "    returns\n",
    "    ---------------\n",
    "    float (scalar z) or np.ndarray of floats with the shape of z, values in [0, 1]\n",
    "    \"\"\"\n",
    "    # 1/(1+exp(-x)) == exp(-log(1+exp(-x))). np.logaddexp evaluates the log\n",
    "    # term without overflowing, so large |k*z| saturates to 0 or 1 instead of\n",
    "    # raising overflow warnings from np.exp.\n",
    "    x = np.asarray(z, dtype=float) * k\n",
    "    return np.exp(-np.logaddexp(0.0, -x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### hyperparameter sweep for lasso trained on sigmoid of score differentials\n",
    "# Searches sigmoid steepness k = 2**j (j from -20 to 4), alpha = 2**h\n",
    "# (h from -5 to 4) and every decay in `ewma`; keeps the combination with the\n",
    "# highest accuracy returned by lasso_cv_train.\n",
    "best_lasso_sig_decay = None\n",
    "best_lasso_sig_alpha = None\n",
    "best_lasso_sig_k = None\n",
    "best_lasso_sig_acc = 0\n",
    "\n",
    "def sig(k):\n",
    "    \"\"\"Return a one-argument function that applies sigmoid with steepness k.\"\"\"\n",
    "    return lambda x: sigmoid(x, k)\n",
    "\n",
    "# Each decay needs its own training data. Load it once per decay up front so\n",
    "# the k / alpha loops score every decay on the data built for that decay,\n",
    "# without depending on X_train / Y_train left over from another cell.\n",
    "sig_sweep_data = []\n",
    "for decay in ewma:\n",
    "    sig_split = get_EWMA_train_test(decay, yvar='score_diff')\n",
    "    # sig_split is (X_train, Y_train, X_test, Y_test); only the training part is swept.\n",
    "    sig_sweep_data.append((decay, sig_split[0], sig_split[1]))\n",
    "\n",
    "for i, j in enumerate(range(-20, 5)):\n",
    "    if (i+1) % 5 == 0:\n",
    "        print('Progress: {} / 25.'.format(i+1))\n",
    "\n",
    "    k = 2**j\n",
    "    # The sigmoid targets depend on k and decay only, so build them once per k.\n",
    "    sig_sweep_targets = [(d, X_d, list(map(sig(k), Y_d))) for d, X_d, Y_d in sig_sweep_data]\n",
    "\n",
    "    for h in range(-5, 5):\n",
    "        alpha = 2**h\n",
    "\n",
    "        for decay, X_train_decay, Y_train_sig in sig_sweep_targets:\n",
    "            val_acc = lasso_cv_train(X_train_decay, Y_train_sig, alpha=alpha, max_iter=1e5, suppress=True, yvar='sigmoid_score_diff')\n",
    "\n",
    "            # Strict comparison: on a tie the earliest (k, alpha, decay) is kept.\n",
    "            if val_acc > best_lasso_sig_acc:\n",
    "                best_lasso_sig_acc = val_acc\n",
    "                best_lasso_sig_alpha = alpha\n",
    "                best_lasso_sig_k = k\n",
    "                best_lasso_sig_decay = decay\n",
    "\n",
    "print('\\nBest Lasso Model (yvar=\\'sigmoid_score_diff\\'): decay = {:.2e}, alpha = {:.2e}, sigmoid_k = {:.2e} val acc = {:.2%}'.format(\n",
    "                best_lasso_sig_decay, best_lasso_sig_alpha, best_lasso_sig_k, best_lasso_sig_acc))\n",
    "print('================END================\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final report for the lasso trained on sigmoid-transformed score\n",
    "# differentials: refit with the best_lasso_sig_* hyperparameters, then print\n",
    "# training and test accuracy.\n",
    "decay_opt, alpha_opt, k_opt = best_lasso_sig_decay, best_lasso_sig_alpha, best_lasso_sig_k\n",
    "\n",
    "X_train, Y_train, X_test, Y_test = get_EWMA_train_test(decay_opt, yvar='score_diff')\n",
    "X, Y = np.concatenate(X_train), np.concatenate(Y_train)\n",
    "# The model is fit on sig(k_opt)(Y); predict_acc below still receives the raw Y.\n",
    "lasso = lasso_train(X, sig(k_opt)(Y), alpha_opt)\n",
    "\n",
    "sig_train_acc = predict_acc(X, Y, predict_func(lasso, 'linear_sigmoid_score_diff'), yvar='sigmoid_score_diff')\n",
    "sig_test_acc = predict_acc(X_test, Y_test, predict_func(lasso, 'linear_sigmoid_score_diff'), yvar='sigmoid_score_diff')\n",
    "sig_report = 'Lasso (decay = {:.2e}, alpha = {:.2e}, sigmoid_k = {:.2e}, yvar=\\'sigmoid_score_diff\\'), training acc: {:.2%}, test acc: {:.2%}'\n",
    "print(sig_report.format(decay_opt, alpha_opt, k_opt, sig_train_acc, sig_test_acc))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "### Trained on EWMA of weekly stats from 1997 to 2016 (win/pts_forced/pts_allowed/yds_forced/yds_allowed)\n",
    "\n",
    "#### Validation Results\n",
    "- Best Linear SVM: decay = 8.70e-01, C = 1.56e-02, val acc = 64.39%\n",
    "- Best Lasso Model (yvar='binary'): decay = 9.00e-01, alpha = 7.63e-06, val acc = 64.90%\n",
    "- Best Lasso Model (yvar='score_diff'): decay = 8.70e-01, alpha = 6.10e-05, val acc = 64.44%\n",
    "\n",
    "#### Test Results\n",
    "- Linear SVM (decay = 8.70e-01, C = 1.56e-02), training acc: 63.03%, test acc: 63.33%\n",
    "- Lasso (decay = 9.00e-01, alpha = 7.63e-06, yvar='binary'), training acc: 64.04%, test acc: 62.42%\n",
    "- Lasso (decay = 9.30e-01, alpha = 6.25e-02, yvar='score_diff'), training acc: 63.23%, test acc: 64.39%\n",
    "\n",
    "---\n",
    "### Trained on data between 2006 and 2016\n",
    "\n",
    "#### Validation Results\n",
    "- Best Linear SVM: decay = 9.30e-01, C = 3.91e-03, val acc = 64.80%\n",
    "- Best Lasso Model (yvar='binary'): decay = 9.00e-01, alpha = 6.10e-05, val acc = 64.65%\n",
    "- Best Lasso Model (yvar='score_diff'): decay = 8.70e-01, alpha = 9.77e-04, val acc = 65.15%\n",
    "\n",
    "#### Test Results\n",
    "- Linear SVM (decay = 9.30e-01, C = 3.91e-03), training acc: 64.04%, test acc: 63.18%\n",
    "- Lasso (decay = 9.00e-01, alpha = 6.10e-05, yvar='binary'), training acc: 64.04%, test acc: 60.76%\n",
    "- Lasso (decay = 8.70e-01, alpha = 9.77e-04, yvar='score_diff'), training acc: 62.98%, test acc: 64.70%\n",
    "\n",
    "---\n",
    "### + Vegas point spreads\n",
    "\n",
    "#### Validation Results\n",
    "- Best Linear SVM: decay = 9.30e-01, C = 1.95e-03, val acc = 64.70%\n",
    "- Best Lasso Model (yvar='binary'): decay = 9.30e-01, alpha = 7.63e-06, val acc = 64.29%\n",
    "- Best Lasso Model (yvar='score_diff'): decay = 9.30e-01, alpha = 6.25e-02, val acc = 64.95%\n",
    "\n",
    "#### Test Results\n",
    "- Linear SVM (decay = 9.30e-01, C = 1.95e-03), training acc: 63.69%, test acc: 64.55%\n",
    "- Lasso (decay = 9.30e-01, alpha = 7.63e-06, yvar='binary'), training acc: 63.64%, test acc: 63.33%\n",
    "- Lasso (decay = 9.30e-01, alpha = 5.00e-01, yvar='score_diff'), training acc: 64.19%, test acc: 61.82%"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "### Trained on cumulative weekly stats from 1997 to 2016 (win/loss/points forced/points allowed), reset at the start of every season\n",
    "\n",
    "#### Validation Results\n",
    "- Best Linear SVM: C = 1.53e-05, val acc = 62.41%\n",
    "- Best Gaussian RBF SVM: C = 1.00e+00, gamma = 1.25e-01, val acc = 57.71%\n",
    "- Best Lasso Model (yvar='binary'): alpha = 2.44e-04, val acc = 63.36%\n",
    "- Best Lasso Model (yvar='score_diff'): alpha = 3.13e-02, val acc = 63.25%\n",
    "- Best Lasso Model (yvar='sigmoid_score_diff'): alpha = 2.44e-04, val acc = 63.19%\n",
    "- Best Lasso Model (yvar='sigmoid_score_diff'): alpha = 2.44e-04, sigmoid_k = 4.00e+00 val acc = 63.30%\n",
    "\n",
    "#### Test Results\n",
    "- Linear SVM (C = 1.53e-05) test acc: 62.53%\n",
    "- Gaussian RBF SVM (C = 1.0, gamma = 0.125) test acc: 55.83%\n",
    "- Lasso (alpha = 2.44e-04, yvar='binary'), training acc: 63.78%, test acc: 63.37%\n",
    "- Lasso (alpha = 3.13e-02, yvar='score_diff'), training acc: 63.53%, test acc: 62.62%\n",
    "- Lasso (alpha = 2.44e-04, yvar='sigmoid_score_diff'), training acc: 63.64%, test acc: 62.78%\n",
    "- Lasso (alpha = 2.44e-04, sigmoid_k = 4.00e+00, yvar='sigmoid_score_diff'), training acc: 63.72%, test acc: 63.29%"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
diff --git a/neural_net.ipynb b/neural_net.ipynb