pandas
## FUC (Frequently Used Commands)
df.isnull().sum() # num of missing values per column
## USEFUL FUNCTIONS
# Binning
pd.cut
pd.qcut # quantile-based pd.cut
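# A minimal binning sketch (the ages Series and bin edges below are made up for illustration):
ages = pd.Series([3, 17, 25, 41, 62])
pd.cut(ages, bins=[0, 18, 65, 100]) # fixed bin edges -> (0, 18], (18, 65], (65, 100]
pd.qcut(ages, q=2) # two quantile-based bins with roughly equal counts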
## IDIOMATIC pandas
# - work with SORTED index
# - use explicit .loc and .iloc for indexing (especially with integer indices)
# SERIES as a specialized dict: index -> value
# DATAFRAME as a specialized dict: column name -> Series of column data
# INDEX: immutable array | ordered set
## DATA INDEXING AND SELECTION
# indexing arr[2, 1]
# slicing arr[:, 1:5]
# masking arr[arr > 0]
# fancy indexing arr[0, [1, 5]]
# with .loc final index is INCLUDED
# with .iloc - EXCLUDED
# In a DataFrame:
# - .iloc does not allow masking
# - integer indices are not allowed
# - direct slices, like df[1:3], are row-wise (vs column-wise)
# - direct masking, df[df.col1 < 3], is row-wise
## Call .reindex() to arrange data to a NEW index
## Hierarchical Indexing
# With **sorted** indices, partial indexing can be performed on lower levels
# by passing an empty slice in the first index
s[:, 2010]
s[:, 2010, 'b']
df['top_level_col'] # level 0
# Use IndexSlice to slice a DataFrame
idx = pd.IndexSlice
df.loc[idx[:, lvl1_row_val], idx[:, lvl1_col_val]]
## COMBINING DATASETS
## CONCAT AND APPEND
# Options to handle duplicate indices:
pd.concat([x, y], verify_integrity=True) # try-except ValueError
pd.concat([x, y], ignore_index=True) # a new integer index will be created
pd.concat([x, y], keys=['x', 'y']) # keys option specifies labels for the data source
# pd.concat([df1, df2]) is equivalent to df1.append(df2)
# pd.concat more efficient for more than 2 DFs
## MERGE AND JOIN
# Categories of Joins:
# - one-to-one
# - many-to-one: one of the two key columns contains duplicate entries
# - many-to-many: key column in both the left and right arrays contains duplicates
## .join() is a convenience method: performs a merge that defaults to joining on indices
## AGGREGATION AND GROUPING
df.groupby('key') # lazy evaluation
# <pandas.core.groupby.DataFrameGroupBy object at 0x110fd5358>
df.groupby('key')['data'] # column indexing
# <pandas.core.groupby.SeriesGroupBy object at 0x11110e390>
# GroupBy --> collection of DFs
for gkey, g in df.groupby('key'): # iterating over groups
    pass
## Operations available on the GroupBy object:
# - aggregate: reduces each group to a scalar value
# - filter: reduces shape of df
# - transform: df in and out shapes are the same
# - apply
## Split key
# It's possible to use dict or series mapping INDEX to group,
# or any Python func
## PIVOT TABLES
# Pivot table is a multi-dimensional version of GroupBy aggregation
## TIME SERIES
| Class | How to create |
|-------------------|----------------------------------------|
| pd.Timestamp | to_datetime, Timestamp |
| pd.DatetimeIndex | to_datetime, date_range, DatetimeIndex | # Group of Timestamps
| pd.Period | Period |
| pd.PeriodIndex | period_range, PeriodIndex |
# Regular sequences
pd.period_range('2015-07', periods=8, freq='M')
# PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
# '2016-01', '2016-02'],
# dtype='period[M]', freq='M')
## Frequencies and Offsets
# NOTE: by adding an ``S`` suffix to monthly, quarterly, and annual
# frequencies, they instead will be marked at the beginning
| Code | Description | Code | Description |
|--------|---------------------|--------|----------------------|
| ``D`` | Calendar day | ``B`` | Business day |
| ``W`` | Weekly | | |
| ``M`` | Month end | ``BM`` | Business month end |
| ``Q`` | Quarter end | ``BQ`` | Business quarter end |
| ``A`` | Year end | ``BA`` | Business year end |
| ``H`` | Hours | ``BH`` | Business hours |
| ``T`` | Minutes | | |
| ``S`` | Seconds | | |
| ``L``  | Milliseconds        |        |                      |
| ``U``  | Microseconds        |        |                      |
| ``N``  | Nanoseconds         |        |                      |
## Time-shifts
# shift() shifts the DATA
# tshift() shifts the INDEX
## HIGH-PERFORMANCE PANDAS: eval() and query()
# The eval() function in Pandas uses string expressions to efficiently
# compute operations using DataFrames.
# pd.eval()
pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
# df.eval()
df.eval('(A + B) / (C - 1)')
df.eval('D = (A + B) / C', inplace=True)
# Local vars in df.eval()
column_mean = df.mean(axis='columns')
df.eval('A + @column_mean') # @ marks Python namespace vs namespace of columns
# pandas.eval() function only has access to the one (Python) namespace
# df.query()
df.query('A < 0.5 and B < 0.5')
# source:
# https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.09-Pivot-Tables.ipynb
births.head(2)
# year month day gender births
# 0 1969 1 1 F 4046
# 1 1969 1 1 M 4440
# Calculate the decade
births['decade'] = 10 * (births['year'] // 10) # integer division
# Remove outliers
quartiles = np.percentile(births['births'], [25, 50, 75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0]) # robust estimate of the sample standard deviation
# 0.74 comes from the interquartile range
# of a Gaussian distribution
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')
# Create a datetime index from the year, month, day
births.index = pd.to_datetime(10000 * births.year +
100 * births.month +
births.day, format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek
# The mean number of births by the day of the *year*.
births_by_date = births.pivot_table('births',
[births.index.month, births.index.day])
births_by_date.head(3) # multi-index over months and days
# 1 1 4009.225
# 2 4247.400
# 3 4500.900
# Name: births, dtype: float64
# To make this easily plottable, let's turn these months and days into a date
# by associating them with a dummy year variable (making sure to choose a leap year
# so February 29th is correctly handled!)
births_by_date.index = [pd.datetime(2012, month, day)
for (month, day) in births_by_date.index]
births_by_date.head(3) # TimeSeries
# 2012-01-01 4009.225
# 2012-01-02 4247.400
# 2012-01-03 4500.900
# Name: births, dtype: float64
# source:
# https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.10-Working-With-Strings.ipynb
# Which recipe has the longest ingredient list
recipes.name[np.argmax(recipes.ingredients.str.len())]
# argmax returns the index of the maximum value along an axis
s = pd.Series([.25, .5, .75, 1], index=list('abcd'))
s
# a 0.25
# b 0.50
# c 0.75
# d 1.00
# dtype: float64
s.index
# Index(['a', 'b', 'c', 'd'], dtype='object')
## Series as a specialized dict
# By default, a series will be created where the index is drawn from
# the SORTED keys
population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)
population
# California 38332521
# Florida 19552860
# Illinois 12882135
# New York 19651127
# Texas 26448193
# dtype: int64
population.keys() # equiv to population.index
# Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')
population['California']
# 38332521
# slicing
# NOTE: index MUST be sorted
population['California':'Illinois']
# California 38332521
# Florida 19552860
# Illinois 12882135
# dtype: int64
## DataFrame
# - a sequence of **aligned** Series objects, i.e. they share the same index
# - a specialized dict: column name -> Series of column data
df.columns # Index object holding the column labels
ind = pd.Index([2, 3, 5, 7, 11])
# Int64Index([2, 3, 5, 7, 11], dtype='int64')
## Index as immutable array
ind[::2]
# Int64Index([2, 5, 11], dtype='int64')
print(ind.size, ind.shape, ind.ndim, ind.dtype)
# 5 (5,) 1 int64
## Index as ordered set
# Index object supports set operations:
# - union |
# - intersection &
# - difference -
# - symmetric difference ^
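# A quick sketch (indA/indB are made-up examples; newer pandas prefers the
# .intersection()/.union()/.symmetric_difference() methods over the operators):
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
indA & indB # intersection: {3, 5, 7}
indA | indB # union: {1, 2, 3, 5, 7, 9, 11}
indA ^ indB # symmetric difference: {1, 2, 9, 11}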
## DATA INDEXING AND SELECTION
# indexing arr[2, 1]
# slicing arr[:, 1:5]
# masking arr[arr > 0]
# fancy indexing arr[0, [1, 5]]
## DATA SELECTION IN SERIES
s = pd.Series([.25, .5, .75, 1], index=list('abcd'))
# slicing by explicit index
s['a':'c'] # the final index is INCLUDED
# a 0.25
# b 0.50
# c 0.75
# dtype: float64
# slicing by implicit integer index
s[0:2] # the final index is EXCLUDED
# a 0.25
# b 0.50
# dtype: float64
# masking
s[(s > 0.3) & (s < 0.8)]
# b 0.50
# c 0.75
# dtype: float64
# fancy indexing
s[['a', 'e']]
# a 0.25
# e NaN
# dtype: float64
## INDEXERS: loc, iloc and ix
# .loc always references the explicit index
# .iloc always references the implicit, Python-style index
# WARNING: avoid using .ix (it tries to index by label before trying to
# index by position)
s = pd.Series(list('abcde'), index=range(1,6))
s
# 1 a
# 2 b
# 3 c
# 4 d
# 5 e
# dtype: object
s.ix[2:4] # equivalent to s.loc[2:4]
# 2 b
# 3 c
# 4 d
# dtype: object
s.iloc[2:4]
# 3 c
# 4 d
# dtype: object
## DATA SELECTION IN DATAFRAME
# - .iloc does not allow masking
# - integer indices are not allowed
# - direct slices, like df[1:3], are row-wise (vs column-wise)
# - direct masking, df[df.col1 < 3], is row-wise
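# A minimal sketch of these rules (df_demo is a hypothetical frame):
df_demo = pd.DataFrame({'col1': [1, 2, 3, 4], 'col2': [10, 20, 30, 40]})
df_demo['col1'] # direct indexing is column-wise...
df_demo[1:3] # ...but direct slices are row-wise
df_demo[df_demo.col1 < 3] # ...and direct masks are row-wise
df_demo.loc[1:2, 'col1'] # explicit labels, final index INCLUDED
df_demo.iloc[1:3, 0] # implicit positions, final index EXCLUDED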
## HIERARCHICAL INDEXING
## MultiIndex construction
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
# MultiIndex(levels=[['a', 'b'], [1, 2]],
# labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
# MultiIndex(levels=[['a', 'b'], [1, 2]],
# labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
# MultiIndex(levels=[['a', 'b'], [1, 2]],
# labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
## Data Selection
# Many of the MultiIndex slicing operations will fail if the index
# is not sorted
df.sort_index()
df.sortlevel() # older method; sort_index() is preferred
## Data Selection: Series
index = [('California', 2000, 'a'), ('California', 2010, 'a'),
('New York', 2000, 'a'), ('New York', 2010, 'b'),
('Texas', 2000, 'b'), ('Texas', 2010, 'b')]
populations = [33871648, 37253956,
18976457, 19378102,
20851820, 25145561]
s = pd.Series(populations, index=pd.MultiIndex.from_tuples(index))
s.index.names = ['state', 'year', 'letter'] # level names
s
# state year letter
# California 2000 a 33871648
# 2010 a 37253956
# New York 2000 a 18976457
# 2010 b 19378102
# Texas 2000 b 20851820
# 2010 b 25145561
# dtype: int64
# Texas
s['Texas']
# 2000
# With **sorted** indices, partial indexing can be performed on lower levels
# by passing an empty slice in the first index
s[:, 2000]
# b
s[:, :, 'b']
# 2010, b
s[:, 2010, 'b']
# Texas, b
s[:, :, 'b']['Texas']
# New York, a
s[:, :, 'a']['New York']
# California, 2010, a
s['California', 2010, 'a']
## Data Selection: DataFrame
health_data
# subject Bob Guido Sue
# type HR Temp HR Temp HR Temp
# year visit
# 2013 1 39.0 38.0 53.0 35.8 28.0 37.5
# 2 44.0 36.3 35.0 36.5 31.0 37.1
# 2014 1 29.0 35.8 25.0 38.3 31.0 37.0
# 2 23.0 35.2 36.0 38.5 22.0 37.4
idx = pd.IndexSlice
# Guido
health_data['Guido'] # level 0
# Guido, Temp
health_data[('Guido', 'Temp')]
# Guido, 1 visit
health_data.loc[idx[:, 1], 'Guido']
# Guido, 1 visit, Temp
health_data.loc[idx[:, 1], ('Guido', 'Temp')] # or idx['Guido', 'Temp']
# Guido, 2014, HR
health_data.loc[2014, ('Guido', 'HR')]
# 2014
health_data.loc[2014]
# 2014, 2 visit
health_data.loc[(2014, 2)]
# 2014, 2 visit, Temp
health_data.loc[(2014, 2), idx[:, 'Temp']]
## Call .reindex() to arrange data to a new index
# Reindexing rows
df = df.reindex(index=['A','B','C','D','E','F'])
# ... columns
df = df.reindex(columns=['col1','col2','col3'])
# ... or both
df = df.reindex(index=list('ABCD'), columns=['col1', 'col2'])
# You can also fill values for new indexes
s.reindex(['A','B','C','D','E','F','G'], fill_value=0)
# Or use a particular method for filling values
s.reindex(range(15), method='ffill')
## COMBINING DATASETS: CONCAT AND APPEND
# Handling duplicate indices
x
# A B
# 0 A0 B0
# 1 A1 B1
y
# A B
# 0 A2 B2
# 1 A3 B3
# pd.concat **preserves** indices, even if the result will have duplicate indices
pd.concat([x, y])
# A B
# 0 A0 B0
# 1 A1 B1
# 0 A2 B2
# 1 A3 B3
# Catching the repeats as an error
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    pass
# Ignoring the index -> a new integer index will be created
pd.concat([x, y], ignore_index=True)
# A B
# 0 A0 B0
# 1 A1 B1
# 2 A2 B2
# 3 A3 B3
# Adding MultiIndex keys
pd.concat([x, y], keys=['x', 'y']) # keys option specifies labels for the data source
# A B
# x 0 A0 B0
# 1 A1 B1
# y 0 A2 B2
# 1 A3 B3
# The .append() method
# pd.concat([df1, df2]) is equivalent to df1.append(df2)
# NOTE: .append() is not very efficient, because it involves creating a new
# index and data buffer.
# Instead of multiple .append() operations, build a list of DataFrames and pass
# them all at once to the concat() function.
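# A minimal sketch of that pattern (df_list contents are hypothetical):
df_list = [pd.DataFrame({'A': [i]}) for i in range(3)]
combined = pd.concat(df_list, ignore_index=True) # one concat instead of repeated .append()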
## COMBINING DATASETS: MERGE AND JOIN
# Categories of Joins:
# - one-to-one
# - many-to-one: one of the two key columns contains duplicate entries
# - many-to-many: key column in both the left and right arrays contains duplicates
# many-to-one example
df1
# employee group
# 0 Bob Accounting
# 1 Jake Engineering
# 2 Lisa Engineering
# 3 Sue HR
df2
# group supervisor
# 0 Accounting Carly
# 1 Engineering Guido
# 2 HR Steve
pd.merge(df1, df2)
# employee group supervisor
# 0 Bob Accounting Carly
# 1 Jake Engineering Guido
# 2 Lisa Engineering Guido
# 3 Sue HR Steve
# Set arithmetic for joins:
# inner -> intersection of two sets of inputs
# outer -> union
# left and right
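# A hedged sketch of the how= keyword (df_left/df_right are made-up frames):
df_left = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
df_right = pd.DataFrame({'key': ['b', 'c'], 'y': [3, 4]})
pd.merge(df_left, df_right, how='inner') # only key 'b'
pd.merge(df_left, df_right, how='outer') # keys 'a', 'b', 'c' (NaN where missing)
pd.merge(df_left, df_right, how='left') # keys 'a', 'b'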
## .join()
# Convenience method: performs a merge that defaults to joining on indices
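# A minimal .join() sketch (df1a/df2a are hypothetical, index-aligned frames):
df1a = pd.DataFrame({'group': ['Accounting', 'Engineering']}, index=['Bob', 'Jake'])
df2a = pd.DataFrame({'hire_date': [2008, 2012]}, index=['Bob', 'Jake'])
df1a.join(df2a) # joins on the shared index
# roughly equivalent to: pd.merge(df1a, df2a, left_index=True, right_index=True)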
## AGGREGATION AND GROUPING
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data': range(6)}, columns=['key', 'data'])
df
# key data
# 0 A 0
# 1 B 1
# 2 C 2
# 3 A 3
# 4 B 4
# 5 C 5
df.groupby('key') # lazy evaluation
# <pandas.core.groupby.DataFrameGroupBy object at 0x110fd5358>
df.groupby('key')['data'] # column indexing
# <pandas.core.groupby.SeriesGroupBy object at 0x11110e390>
## Iteration over groups
for gkey, g in df.groupby('key'):
    print('{:3} shape={}'.format(gkey, g.shape))
# A shape=(2, 2)
# B shape=(2, 2)
# C shape=(2, 2)
## Operations available on the GroupBy object:
# - aggregate
# - filter
# - transform
# - apply
# GroupBy object, in many ways, can be treated as a collection of DFs
# Aggregation
df.groupby('key').agg({'data': ['min', 'count', lambda g: g.sum() + 1]})
# data
# min count <lambda>
# key
# A 0 2 4
# B 1 2 6
# C 2 2 8
# Filtering allows you to drop data based on the group properties
df.groupby('key').sum()
# data
# key
# A 3
# B 5
# C 7
# The filter func should return a Bool, specifying whether the group passes the filtering
# Below, because group A does not have sum > 3, it is dropped from the result.
df.groupby('key').filter(lambda f: f['data'].sum() > 3)
#   key  data
# 1   B     1
# 2   C     2
# 4   B     4
# 5   C     5
# Transformation: the output is the same shape as the input
# data in DF is transformed in some way
df.groupby('key').transform(lambda x: x - x.mean())
# data
# 0 -1.5
# 1 -1.5
# 2 -1.5
# 3 1.5
# 4 1.5
# 5 1.5
# Apply lets you apply an arbitrary function to the group results.
# In: DF, Out: DF/Series or a scalar
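# A minimal apply() sketch on the same df: normalize 'data' by the group sum
# (returns a Series here; apply can also return a DF or a scalar)
df.groupby('key').apply(lambda g: g['data'] / g['data'].sum())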
## Specifying the split key
# A dict or series mapping INDEX to group
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
df2.groupby(mapping).sum()
# data
# consonant 12
# vowel 3
# A more complicated example of a mapping used on the index:
# http://stackoverflow.com/a/19616072
# Any Python func
df2.groupby(str.lower).sum()
# data
# a 3
# b 5
# c 7
# A list of valid keys
df2.groupby([str.lower, mapping]).mean()
# data
# a vowel 1.5
# b consonant 2.5
# c consonant 3.5
## PIVOT TABLES
import seaborn as sns
titanic = sns.load_dataset('titanic')
# Pivot table as a multi-dimensional version of GroupBy aggregation
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()
# Same result using the pivot table syntax
titanic.pivot_table('survived', index='sex', columns='class')
## Multi-level pivot tables
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
# class First Second Third
# sex age
# female (0, 18] 0.909091 1.000000 0.511628
# (18, 80] 0.972973 0.900000 0.423729
# male (0, 18] 0.800000 0.600000 0.215686
# (18, 80] 0.375000 0.071429 0.133663
fare = pd.qcut(titanic['fare'], 2) # automatically compute quantiles
titanic.pivot_table('survived', ['sex', age], [fare, 'class']).round(1)
# fare [0, 14.454] (14.454, 512.329]
# class First Second Third First Second Third
# sex age
# female (0, 18] NaN 1.0 0.7 0.9 1.0 0.3
# (18, 80] NaN 0.9 0.4 1.0 0.9 0.4
# male (0, 18] NaN 0.0 0.3 0.8 0.8 0.2
# (18, 80] 0.0 0.1 0.1 0.4 0.0 0.2
# aggfunc can be specified as a dict mapping a column name to any agg func
# NOTE: the values keyword is omitted here;
# when aggfunc is specified as a dict, the values columns are determined automatically
titanic.pivot_table(index='sex', columns='class',
aggfunc={'survived':sum, 'fare':'mean'}).round(1)
# survived fare
# class First Second Third First Second Third
# sex
# female 91.0 70.0 72.0 106.1 22.0 16.1
# male       45.0    17.0   47.0   67.2    19.7   12.7
## VECTORIZED STRING OPERATIONS
# Extract dummy variables as a dataframe
s = pd.Series(['a|b', 'a', None, 'a|c'])
# 0 a|b
# 1 a
# 2 None
# 3 a|c
# dtype: object
s.str.get_dummies(sep='|')
# a b c
# 0 1 1 0
# 1 1 0 0
# 2 0 0 0
# 3 1 0 1
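# A few other common .str methods, sketched on the same Series:
s.str.lower() # vectorized lowercase; the None entry stays missing (NaN)
s.str.contains('b') # boolean mask (NaN for the missing entry)
s.str.split('|') # list of pieces per element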
## WORKING WITH TIME SERIES
# Date and time data:
# - time stamps: reference particular moments in time (July 4th, 2015 at 7:00am)
# - time intervals and periods: reference a length of time between
# a particular beginning and end point: eg, the year 2015.
# Periods usually reference a special case of time intervals
# in which each interval is of uniform length and does not overlap
# (eg, 24 hour-long periods comprising days)
# - time deltas (or durations) (eg 22.5 seconds)
# Time intervals: assignments
# 1) sd: 2016-01-01, ed: 2016-01-15 # have start and end dates
# 2) sd: 2016-01-10, ed: 2016-02-03 # can overlap
date = pd.to_datetime('15th Dec, 2016')
date
# Timestamp('2016-12-15 00:00:00')
# From a group of these Timestamp objects, Pandas can construct
# a DatetimeIndex that can be used to index data in a Series or DF
## Pandas Time Series Data Structures
# - time stamps: Timestamp and DatetimeIndex
# - time periods: Period and PeriodIndex
# - time deltas: Timedelta and TimedeltaIndex
# The most fundamental of these date/time objects are the
# Timestamp and DatetimeIndex objects
from datetime import datetime
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
'2015-Jul-6', '07-07-2015', '20150708'])
dates
# DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
# '2015-07-08'],
# dtype='datetime64[ns]', freq=None)
dates.to_period('D')
# PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
# '2015-07-08'],
# dtype='period[D]', freq='D')
dates - dates[0]
# TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)
## Regular sequences
# pd.date_range() -> DatetimeIndex
# pd.period_range() -> PeriodIndex
# pd.timedelta_range() -> TimedeltaIndex
pd.period_range('2015-07', periods=8, freq='M')
# PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
# '2016-01', '2016-02'],
# dtype='period[M]', freq='M')
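# Hedged sketches of the other regular-sequence constructors:
pd.date_range('2015-07-03', periods=4, freq='D') # DatetimeIndex of 4 calendar days
pd.timedelta_range(0, periods=4, freq='H') # TimedeltaIndex: 0, 1, 2, 3 hours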
## Frequencies and Offsets
| Code | Description | Code | Description |
|--------|---------------------|--------|----------------------|
| ``D`` | Calendar day | ``B`` | Business day |
| ``W`` | Weekly | | |
| ``M`` | Month end | ``BM`` | Business month end |
| ``Q`` | Quarter end | ``BQ`` | Business quarter end |
| ``A`` | Year end | ``BA`` | Business year end |
| ``H`` | Hours | ``BH`` | Business hours |
| ``T`` | Minutes | | |
| ``S`` | Seconds | | |
| ``L``  | Milliseconds        |        |                      |
| ``U``  | Microseconds        |        |                      |
| ``N``  | Nanoseconds         |        |                      |
# By adding an ``S`` suffix to monthly, quarterly, and annual frequencies, they
# will instead be marked at the beginning:
| Code    | Description    | Code    | Description             |
|---------|----------------|---------|-------------------------|
| ``MS``  | Month start    | ``BMS`` | Business month start    |
| ``QS``  | Quarter start  | ``BQS`` | Business quarter start  |
| ``AS``  | Year start     | ``BAS`` | Business year start     |
pd.timedelta_range(0, periods=9, freq="2H30T") # freq 2h 30 mins
## Time-shifts
# shift() shifts the DATA
# tshift() shifts the INDEX
# a minimal sketch below; for fuller examples see the links in the next section
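# A minimal sketch (ts is a made-up daily series):
ts = pd.Series(range(4), index=pd.date_range('2015-01-01', periods=4, freq='D'))
ts.shift(1) # data moves forward one step; the first value becomes NaN
ts.tshift(1) # index moves forward one day; values are unchanged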
## Resampling, Shifting and Windowing
# see: https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.11-Working-with-Time-Series.ipynb
# and: http://pandas.pydata.org/pandas-docs/stable/timeseries.html
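# A hedged resample/rolling sketch, reusing the hypothetical ts from above:
ts.resample('2D').mean() # downsample into 2-day bins, aggregating with the mean
ts.rolling(window=2).mean() # rolling mean over a 2-sample window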
## HIGH-PERFORMANCE PANDAS: eval() and query()
# The eval() function in Pandas uses string expressions to efficiently
# compute operations using DataFrames.
rng = np.random.RandomState(42)
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3)))
for i in range(5))
# pd.eval()
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2) # True
# It supports the use of the literal `and` and `or` in Boolean expressions
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3) # True
# DataFrame.eval() for Column-Wise Operations
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head(3)
# A B C
# 0 0.821257 0.110632 0.846452
# 1 0.127489 0.397287 0.797295
# 2 0.149917 0.229251 0.722253
result1 = pd.eval("(df.A + df.B) / (df.C - 1)")
result2 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result2) # True
# Supports assignment
df.eval('D = (A + B) / C', inplace=True)
df.head(3)
# A B C D
# 0 0.821257 0.110632 0.846452 1.100935
# 1 0.127489 0.397287 0.797295 0.658195
# 2 0.149917 0.229251 0.722253 0.524981
# Local variables in DataFrame.eval()
column_mean = df.mean(axis='columns')
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2) # True
# The ``@`` character here marks a variable name rather than a column name,
# and lets you efficiently evaluate expressions involving the two "namespaces":
# the namespace of columns, and the namespace of Python objects.
# NOTE: ``@`` character is only supported by the ``DataFrame.eval()`` method,
# not by the ``pandas.eval()`` function, because the ``pandas.eval()`` function
# only has access to the one (Python) namespace.
# DataFrame.query() Method
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2) # True
# Performance: When to Use These Functions
x = df[(df.A < 0.5) & (df.B < 0.5)]
# is roughly equivalent to:
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]
# If the size of the temporary DFs is significant compared to your
# available system memory, then it's a good idea to use an
# eval() or query() expression.
df.values.nbytes # check approx size of your array in bytes
# eval/query sometimes offers a cleaner syntax