Last active
          November 27, 2017 04:08 
        
      - 
      
- 
        Save JnBrymn-EB/89b472bfafa6cf75eeaf45883da98eef to your computer and use it in GitHub Desktop. 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Data Nerds Project\n", | |
| "Corporación Favorita Grocery Sales Forecasting\n", | |
| "\n", | |
| "https://www.kaggle.com/c/favorita-grocery-sales-forecasting" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "\n", | |
| "%matplotlib inline\n", | |
| "import matplotlib\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "plt.rcParams['axes.labelsize'] = 14\n", | |
| "plt.rcParams['xtick.labelsize'] = 12\n", | |
| "plt.rcParams['ytick.labelsize'] = 12\n", | |
| "\n", | |
| "import os" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "ROOT_DIR = '/Users/johnb/Personal/data_science/kaggle/corporación_favorita/data'\n", | |
| "HOLIDAYS_EVENTS = os.path.join(ROOT_DIR, 'holidays_events.csv')\n", | |
| "TEST = os.path.join(ROOT_DIR, 'test.csv')\n", | |
| "ITEMS = os.path.join(ROOT_DIR, 'items.csv')\n", | |
| "SAMPLE_SUBMISSION = os.path.join(ROOT_DIR, 'sample_submission.csv')\n", | |
| "TRAIN = os.path.join(ROOT_DIR, 'train.csv')\n", | |
| "OIL = os.path.join(ROOT_DIR, 'oil.csv')\n", | |
| "STORES = os.path.join(ROOT_DIR, 'stores.csv')\n", | |
| "TRANSACTIONS = os.path.join(ROOT_DIR, 'transactions.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/Users/johnb/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n", | |
| " interactivity=interactivity, compiler=compiler, result=result)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# this takes a long time to read\n", | |
| "train = pd.read_csv(TRAIN)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "test = pd.read_csv(TEST)\n", | |
| "sample_submission = pd.read_csv(SAMPLE_SUBMISSION)\n", | |
| "holidays_events = pd.read_csv(HOLIDAYS_EVENTS)\n", | |
| "items = pd.read_csv(ITEMS)\n", | |
| "oil = pd.read_csv(OIL)\n", | |
| "stores = pd.read_csv(STORES)\n", | |
| "transactions = pd.read_csv(TRANSACTIONS)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Look at data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>date</th>\n", | |
| " <th>store_nbr</th>\n", | |
| " <th>item_nbr</th>\n", | |
| " <th>unit_sales</th>\n", | |
| " <th>onpromotion</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2013-01-01</td>\n", | |
| " <td>25</td>\n", | |
| " <td>103665</td>\n", | |
| " <td>7.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2013-01-01</td>\n", | |
| " <td>25</td>\n", | |
| " <td>105574</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2013-01-01</td>\n", | |
| " <td>25</td>\n", | |
| " <td>105575</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>3</td>\n", | |
| " <td>2013-01-01</td>\n", | |
| " <td>25</td>\n", | |
| " <td>108079</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>4</td>\n", | |
| " <td>2013-01-01</td>\n", | |
| " <td>25</td>\n", | |
| " <td>108701</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " id date store_nbr item_nbr unit_sales onpromotion\n", | |
| "0 0 2013-01-01 25 103665 7.0 NaN\n", | |
| "1 1 2013-01-01 25 105574 1.0 NaN\n", | |
| "2 2 2013-01-01 25 105575 2.0 NaN\n", | |
| "3 3 2013-01-01 25 108079 1.0 NaN\n", | |
| "4 4 2013-01-01 25 108701 1.0 NaN" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "train.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>date</th>\n", | |
| " <th>store_nbr</th>\n", | |
| " <th>item_nbr</th>\n", | |
| " <th>unit_sales</th>\n", | |
| " <th>onpromotion</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>count</th>\n", | |
| " <td>1.254970e+08</td>\n", | |
| " <td>125497040</td>\n", | |
| " <td>1.254970e+08</td>\n", | |
| " <td>1.254970e+08</td>\n", | |
| " <td>1.254970e+08</td>\n", | |
| " <td>103839389</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>unique</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1684</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>top</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2017-07-01</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>freq</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>118194</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>96028767</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>mean</th>\n", | |
| " <td>6.274852e+07</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2.746458e+01</td>\n", | |
| " <td>9.727692e+05</td>\n", | |
| " <td>8.554865e+00</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>std</th>\n", | |
| " <td>3.622788e+07</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.633051e+01</td>\n", | |
| " <td>5.205336e+05</td>\n", | |
| " <td>2.360515e+01</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>min</th>\n", | |
| " <td>0.000000e+00</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.000000e+00</td>\n", | |
| " <td>9.699500e+04</td>\n", | |
| " <td>-1.537200e+04</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25%</th>\n", | |
| " <td>3.137426e+07</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.200000e+01</td>\n", | |
| " <td>5.223830e+05</td>\n", | |
| " <td>2.000000e+00</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>50%</th>\n", | |
| " <td>6.274852e+07</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2.800000e+01</td>\n", | |
| " <td>9.595000e+05</td>\n", | |
| " <td>4.000000e+00</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>75%</th>\n", | |
| " <td>9.412278e+07</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>4.300000e+01</td>\n", | |
| " <td>1.354380e+06</td>\n", | |
| " <td>9.000000e+00</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>max</th>\n", | |
| " <td>1.254970e+08</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>5.400000e+01</td>\n", | |
| " <td>2.127114e+06</td>\n", | |
| " <td>8.944000e+04</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " id date store_nbr item_nbr unit_sales \\\n", | |
| "count 1.254970e+08 125497040 1.254970e+08 1.254970e+08 1.254970e+08 \n", | |
| "unique NaN 1684 NaN NaN NaN \n", | |
| "top NaN 2017-07-01 NaN NaN NaN \n", | |
| "freq NaN 118194 NaN NaN NaN \n", | |
| "mean 6.274852e+07 NaN 2.746458e+01 9.727692e+05 8.554865e+00 \n", | |
| "std 3.622788e+07 NaN 1.633051e+01 5.205336e+05 2.360515e+01 \n", | |
| "min 0.000000e+00 NaN 1.000000e+00 9.699500e+04 -1.537200e+04 \n", | |
| "25% 3.137426e+07 NaN 1.200000e+01 5.223830e+05 2.000000e+00 \n", | |
| "50% 6.274852e+07 NaN 2.800000e+01 9.595000e+05 4.000000e+00 \n", | |
| "75% 9.412278e+07 NaN 4.300000e+01 1.354380e+06 9.000000e+00 \n", | |
| "max 1.254970e+08 NaN 5.400000e+01 2.127114e+06 8.944000e+04 \n", | |
| "\n", | |
| " onpromotion \n", | |
| "count 103839389 \n", | |
| "unique 2 \n", | |
| "top False \n", | |
| "freq 96028767 \n", | |
| "mean NaN \n", | |
| "std NaN \n", | |
| "min NaN \n", | |
| "25% NaN \n", | |
| "50% NaN \n", | |
| "75% NaN \n", | |
| "max NaN " | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "train.describe(include='all')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "False 96028767\n", | |
| "True 7810622\n", | |
| "Name: onpromotion, dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "train['onpromotion'].value_counts()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>date</th>\n", | |
| " <th>store_nbr</th>\n", | |
| " <th>item_nbr</th>\n", | |
| " <th>onpromotion</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>125497040</td>\n", | |
| " <td>2017-08-16</td>\n", | |
| " <td>1</td>\n", | |
| " <td>96995</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>125497041</td>\n", | |
| " <td>2017-08-16</td>\n", | |
| " <td>1</td>\n", | |
| " <td>99197</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>125497042</td>\n", | |
| " <td>2017-08-16</td>\n", | |
| " <td>1</td>\n", | |
| " <td>103501</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>125497043</td>\n", | |
| " <td>2017-08-16</td>\n", | |
| " <td>1</td>\n", | |
| " <td>103520</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>125497044</td>\n", | |
| " <td>2017-08-16</td>\n", | |
| " <td>1</td>\n", | |
| " <td>103665</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " id date store_nbr item_nbr onpromotion\n", | |
| "0 125497040 2017-08-16 1 96995 False\n", | |
| "1 125497041 2017-08-16 1 99197 False\n", | |
| "2 125497042 2017-08-16 1 103501 False\n", | |
| "3 125497043 2017-08-16 1 103520 False\n", | |
| "4 125497044 2017-08-16 1 103665 False" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "test.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>date</th>\n", | |
| " <th>store_nbr</th>\n", | |
| " <th>item_nbr</th>\n", | |
| " <th>onpromotion</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>count</th>\n", | |
| " <td>3.370464e+06</td>\n", | |
| " <td>3370464</td>\n", | |
| " <td>3.370464e+06</td>\n", | |
| " <td>3.370464e+06</td>\n", | |
| " <td>3370464</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>unique</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>16</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>top</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2017-08-28</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>freq</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>210654</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>3171867</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>mean</th>\n", | |
| " <td>1.271823e+08</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2.750000e+01</td>\n", | |
| " <td>1.244798e+06</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>std</th>\n", | |
| " <td>9.729693e+05</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.558579e+01</td>\n", | |
| " <td>5.898362e+05</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>min</th>\n", | |
| " <td>1.254970e+08</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.000000e+00</td>\n", | |
| " <td>9.699500e+04</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25%</th>\n", | |
| " <td>1.263397e+08</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.400000e+01</td>\n", | |
| " <td>8.053210e+05</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>50%</th>\n", | |
| " <td>1.271823e+08</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2.750000e+01</td>\n", | |
| " <td>1.294665e+06</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>75%</th>\n", | |
| " <td>1.280249e+08</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>4.100000e+01</td>\n", | |
| " <td>1.730015e+06</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>max</th>\n", | |
| " <td>1.288675e+08</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>5.400000e+01</td>\n", | |
| " <td>2.134244e+06</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " id date store_nbr item_nbr onpromotion\n", | |
| "count 3.370464e+06 3370464 3.370464e+06 3.370464e+06 3370464\n", | |
| "unique NaN 16 NaN NaN 2\n", | |
| "top NaN 2017-08-28 NaN NaN False\n", | |
| "freq NaN 210654 NaN NaN 3171867\n", | |
| "mean 1.271823e+08 NaN 2.750000e+01 1.244798e+06 NaN\n", | |
| "std 9.729693e+05 NaN 1.558579e+01 5.898362e+05 NaN\n", | |
| "min 1.254970e+08 NaN 1.000000e+00 9.699500e+04 NaN\n", | |
| "25% 1.263397e+08 NaN 1.400000e+01 8.053210e+05 NaN\n", | |
| "50% 1.271823e+08 NaN 2.750000e+01 1.294665e+06 NaN\n", | |
| "75% 1.280249e+08 NaN 4.100000e+01 1.730015e+06 NaN\n", | |
| "max 1.288675e+08 NaN 5.400000e+01 2.134244e+06 NaN" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "test.describe(include='all')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "False 3171867\n", | |
| "True 198597\n", | |
| "Name: onpromotion, dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "test['onpromotion'].value_counts()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>unit_sales</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>125497040</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>125497041</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>125497042</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>125497043</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>125497044</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " id unit_sales\n", | |
| "0 125497040 0\n", | |
| "1 125497041 0\n", | |
| "2 125497042 0\n", | |
| "3 125497043 0\n", | |
| "4 125497044 0" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sample_submission.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>date</th>\n", | |
| " <th>type</th>\n", | |
| " <th>locale</th>\n", | |
| " <th>locale_name</th>\n", | |
| " <th>description</th>\n", | |
| " <th>transferred</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2012-03-02</td>\n", | |
| " <td>Holiday</td>\n", | |
| " <td>Local</td>\n", | |
| " <td>Manta</td>\n", | |
| " <td>Fundacion de Manta</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2012-04-01</td>\n", | |
| " <td>Holiday</td>\n", | |
| " <td>Regional</td>\n", | |
| " <td>Cotopaxi</td>\n", | |
| " <td>Provincializacion de Cotopaxi</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2012-04-12</td>\n", | |
| " <td>Holiday</td>\n", | |
| " <td>Local</td>\n", | |
| " <td>Cuenca</td>\n", | |
| " <td>Fundacion de Cuenca</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2012-04-14</td>\n", | |
| " <td>Holiday</td>\n", | |
| " <td>Local</td>\n", | |
| " <td>Libertad</td>\n", | |
| " <td>Cantonizacion de Libertad</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2012-04-21</td>\n", | |
| " <td>Holiday</td>\n", | |
| " <td>Local</td>\n", | |
| " <td>Riobamba</td>\n", | |
| " <td>Cantonizacion de Riobamba</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " date type locale locale_name description \\\n", | |
| "0 2012-03-02 Holiday Local Manta Fundacion de Manta \n", | |
| "1 2012-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi \n", | |
| "2 2012-04-12 Holiday Local Cuenca Fundacion de Cuenca \n", | |
| "3 2012-04-14 Holiday Local Libertad Cantonizacion de Libertad \n", | |
| "4 2012-04-21 Holiday Local Riobamba Cantonizacion de Riobamba \n", | |
| "\n", | |
| " transferred \n", | |
| "0 False \n", | |
| "1 False \n", | |
| "2 False \n", | |
| "3 False \n", | |
| "4 False " | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "holidays_events.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>date</th>\n", | |
| " <th>type</th>\n", | |
| " <th>locale</th>\n", | |
| " <th>locale_name</th>\n", | |
| " <th>description</th>\n", | |
| " <th>transferred</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>count</th>\n", | |
| " <td>350</td>\n", | |
| " <td>350</td>\n", | |
| " <td>350</td>\n", | |
| " <td>350</td>\n", | |
| " <td>350</td>\n", | |
| " <td>350</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>unique</th>\n", | |
| " <td>312</td>\n", | |
| " <td>6</td>\n", | |
| " <td>3</td>\n", | |
| " <td>24</td>\n", | |
| " <td>103</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>top</th>\n", | |
| " <td>2014-06-25</td>\n", | |
| " <td>Holiday</td>\n", | |
| " <td>National</td>\n", | |
| " <td>Ecuador</td>\n", | |
| " <td>Carnaval</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>freq</th>\n", | |
| " <td>4</td>\n", | |
| " <td>221</td>\n", | |
| " <td>174</td>\n", | |
| " <td>174</td>\n", | |
| " <td>10</td>\n", | |
| " <td>338</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " date type locale locale_name description transferred\n", | |
| "count 350 350 350 350 350 350\n", | |
| "unique 312 6 3 24 103 2\n", | |
| "top 2014-06-25 Holiday National Ecuador Carnaval False\n", | |
| "freq 4 221 174 174 10 338" | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "holidays_events.describe(include='all')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>item_nbr</th>\n", | |
| " <th>family</th>\n", | |
| " <th>class</th>\n", | |
| " <th>perishable</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>96995</td>\n", | |
| " <td>GROCERY I</td>\n", | |
| " <td>1093</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>99197</td>\n", | |
| " <td>GROCERY I</td>\n", | |
| " <td>1067</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>103501</td>\n", | |
| " <td>CLEANING</td>\n", | |
| " <td>3008</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>103520</td>\n", | |
| " <td>GROCERY I</td>\n", | |
| " <td>1028</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>103665</td>\n", | |
| " <td>BREAD/BAKERY</td>\n", | |
| " <td>2712</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " item_nbr family class perishable\n", | |
| "0 96995 GROCERY I 1093 0\n", | |
| "1 99197 GROCERY I 1067 0\n", | |
| "2 103501 CLEANING 3008 0\n", | |
| "3 103520 GROCERY I 1028 0\n", | |
| "4 103665 BREAD/BAKERY 2712 1" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "items.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>date</th>\n", | |
| " <th>dcoilwtico</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2013-01-01</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2013-01-02</td>\n", | |
| " <td>93.14</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2013-01-03</td>\n", | |
| " <td>92.97</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2013-01-04</td>\n", | |
| " <td>93.12</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2013-01-07</td>\n", | |
| " <td>93.20</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " date dcoilwtico\n", | |
| "0 2013-01-01 NaN\n", | |
| "1 2013-01-02 93.14\n", | |
| "2 2013-01-03 92.97\n", | |
| "3 2013-01-04 93.12\n", | |
| "4 2013-01-07 93.20" | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "oil.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>store_nbr</th>\n", | |
| " <th>city</th>\n", | |
| " <th>state</th>\n", | |
| " <th>type</th>\n", | |
| " <th>cluster</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>Quito</td>\n", | |
| " <td>Pichincha</td>\n", | |
| " <td>D</td>\n", | |
| " <td>13</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2</td>\n", | |
| " <td>Quito</td>\n", | |
| " <td>Pichincha</td>\n", | |
| " <td>D</td>\n", | |
| " <td>13</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>3</td>\n", | |
| " <td>Quito</td>\n", | |
| " <td>Pichincha</td>\n", | |
| " <td>D</td>\n", | |
| " <td>8</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4</td>\n", | |
| " <td>Quito</td>\n", | |
| " <td>Pichincha</td>\n", | |
| " <td>D</td>\n", | |
| " <td>9</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5</td>\n", | |
| " <td>Santo Domingo</td>\n", | |
| " <td>Santo Domingo de los Tsachilas</td>\n", | |
| " <td>D</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " store_nbr city state type cluster\n", | |
| "0 1 Quito Pichincha D 13\n", | |
| "1 2 Quito Pichincha D 13\n", | |
| "2 3 Quito Pichincha D 8\n", | |
| "3 4 Quito Pichincha D 9\n", | |
| "4 5 Santo Domingo Santo Domingo de los Tsachilas D 4" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "stores.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>date</th>\n", | |
| " <th>store_nbr</th>\n", | |
| " <th>transactions</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2013-01-01</td>\n", | |
| " <td>25</td>\n", | |
| " <td>770</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2013-01-02</td>\n", | |
| " <td>1</td>\n", | |
| " <td>2111</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2013-01-02</td>\n", | |
| " <td>2</td>\n", | |
| " <td>2358</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2013-01-02</td>\n", | |
| " <td>3</td>\n", | |
| " <td>3487</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2013-01-02</td>\n", | |
| " <td>4</td>\n", | |
| " <td>1922</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " date store_nbr transactions\n", | |
| "0 2013-01-01 25 770\n", | |
| "1 2013-01-02 1 2111\n", | |
| "2 2013-01-02 2 2358\n", | |
| "3 2013-01-02 3 3487\n", | |
| "4 2013-01-02 4 1922" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "transactions.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Joining Stuff\n", | |
| "\n", | |
| "* train is the main table possible join rows `store_nbr`, `item_nbr`, and `date`\n", | |
| "* stores can be joined by `store_nbr`\n", | |
| "* transactions can be joined by `store_nbr`\n", | |
| "* items can be joined in by `item_nbr`\n", | |
| "* oil can be joined in by `date`\n", | |
| "--------\n", | |
| "* holiday can be joined in by `date` and possibly `locale_name` but it might be complicated - maybe we can augment this dataset with mapzen WhosOnFirst." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 286, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "12550" | |
| ] | |
| }, | |
| "execution_count": 286, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# get a sub set of the data so that it's easier to play with `merge`\n", | |
| "train_small = train.iloc[range(0,125497040, 10000)]\n", | |
| "len(train_small)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## merge easy things" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 289, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "features = train_small \\\n", | |
| " .merge(stores, on='store_nbr', how='left') \\\n", | |
| " .merge(transactions, on=['store_nbr','date'], how='left') \\\n", | |
| " .merge(items, on='item_nbr', how='left') \\\n", | |
| " .merge(oil, on='date', how='left')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## merge holidays\n", | |
| "Goal - to add 3 fields to each row which represent if the {store, date} is associated with a local, regional, or national holiday" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 290, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# These holidays matter because they exclude holidays \"transferred\" to another day and \n", | |
| "# \"Work Days\" which aren't holidays at all\n", | |
| "holidays_events_that_matter = holidays_events[\n", | |
| " (holidays_events['type'] != 'Work Day') \n", | |
| " | (holidays_events['transferred'] != True)\n", | |
| "]\n", | |
| "# drop redundant holidays (occasionally there are multiple local holidays on the same day)\n", | |
| "holidays_events_that_matter.drop_duplicates(keep='first', subset=['date','locale'], inplace=True)\n", | |
| "\n", | |
| "# create dummy fields in the tables we're going to join\n", | |
| "features['s'] = features.apply(lambda r: 's_{}'.format(r['state']), axis=1)\n", | |
| "features['c'] = features.apply(lambda r: 'c_{}'.format(r['city']), axis=1)\n", | |
| "def make_location_column(row):\n", | |
| " if row['locale'] == \"Regional\":\n", | |
| " return 's_{}'.format(row['locale_name'])\n", | |
| " elif row['locale'] == \"Local\":\n", | |
| " return 'c_{}'.format(row['locale_name'])\n", | |
| "holidays_events_that_matter['place'] = holidays_events_that_matter.apply(make_location_column, axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 295, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# make local, regional, and national holiday fields\n", | |
| "temp = features.merge(holidays_events_that_matter, left_on=['c', 'date'], right_on=['place', 'date'], how='left')\n", | |
| "features['local_holiday'] = temp['locale'] == 'Local'\n", | |
| "\n", | |
| "temp = features.merge(holidays_events_that_matter, left_on=['s', 'date'], right_on=['place', 'date'], how='left')\n", | |
| "features['regional_holiday'] = temp['locale'] == 'Regional'\n", | |
| "\n", | |
| "temp = features.merge(\n", | |
| " holidays_events_that_matter[holidays_events_that_matter['locale'] == 'National'],\n", | |
| " on='date', how='left')\n", | |
| "features['national_holiday'] = temp['locale'] == 'National'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# todo\n", | |
| "* figure out how to impute missing data in each set before joining them\n" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } | 
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment