pandas
## FUC (Frequently Used Commands)
df.isnull().sum()  # num of missing values per column
## USEFUL FUNCTIONS
# Binning
pd.cut
pd.qcut  # quantile-based pd.cut
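# A minimal binning sketch (the ages and bin edges are made-up examples,
# not from the original notes):
ages = pd.Series([2, 15, 22, 37, 45, 63, 80])
pd.cut(ages, bins=[0, 18, 65, 100])  # fixed, explicit bin edges
pd.qcut(ages, q=4)                   # 4 bins with ~equal counts (quartiles)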
## IDIOMATIC pandas
# - work with a SORTED index
# - use explicit .loc and .iloc for indexing (especially with integer indices)
# SERIES as a specialized dict: index -> value
# DATAFRAME as a specialized dict: column name -> Series of column data
# INDEX: immutable array | ordered set
## DATA INDEXING AND SELECTION
# indexing        arr[2, 1]
# slicing         arr[:, 1:5]
# masking         arr[arr > 0]
# fancy indexing  arr[0, [1, 5]]
# with .loc the final index is INCLUDED
# with .iloc - EXCLUDED
# In a DataFrame:
# - .iloc does not allow masking
# - integer indices are not allowed
# - direct slices, like df[1:3], are row-wise (vs column-wise)
# - direct masking, df[df.col1 < 3], is row-wise
## Call .reindex() to arrange data to a NEW index
## Hierarchical Indexing
# With **sorted** indices, partial indexing can be performed on lower levels
# by passing an empty slice in the first index
s[:, 2010]
s[:, 2010, 'b']
df['top_level_col']  # level 0
# Use IndexSlice to slice a DataFrame
idx = pd.IndexSlice
df.loc[idx[:, lvl1_row_val], idx[:, lvl1_col_val]]
## COMBINING DATASETS
## CONCAT AND APPEND
# Options to handle duplicate indices:
pd.concat([x, y], verify_integrity=True)  # try-except ValueError
pd.concat([x, y], ignore_index=True)      # a new integer index will be created
pd.concat([x, y], keys=['x', 'y'])        # keys option specifies labels for the data source
# pd.concat([df1, df2]) is equivalent to df1.append(df2)
# pd.concat is more efficient for more than 2 DFs
## MERGE AND JOIN
# Categories of Joins:
# - one-to-one
# - many-to-one: one of the two key columns contains duplicate entries
# - many-to-many: the key column in both the left and right arrays contains duplicates
## .join() is a convenience method: performs a merge that defaults to joining on indices
## AGGREGATION AND GROUPING
df.groupby('key')  # lazy evaluation
# <pandas.core.groupby.DataFrameGroupBy object at 0x110fd5358>
df.groupby('key')['data']  # column indexing
# <pandas.core.groupby.SeriesGroupBy object at 0x11110e390>
# GroupBy --> collection of DFs
for gkey, g in df.groupby('key'):  # iterating over groups
    pass
## Operations available on the GroupBy object:
# - aggregate: reduces each group to a scalar value
# - filter: reduces the shape of the df
# - transform: input and output shapes are the same
# - apply
## Split key
# It's possible to use a dict or Series mapping INDEX to group,
# or any Python func
## PIVOT TABLES
# A pivot table is a multi-dimensional version of GroupBy aggregation
## TIME SERIES
| Class            | How to create                          |
|------------------|----------------------------------------|
| pd.Timestamp     | to_datetime, Timestamp                 |
| pd.DatetimeIndex | to_datetime, date_range, DatetimeIndex |  # group of Timestamps
| pd.Period        | Period                                 |
| pd.PeriodIndex   | period_range, PeriodIndex              |
# Regular sequences
pd.period_range('2015-07', periods=8, freq='M')
# PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
#              '2016-01', '2016-02'],
#             dtype='period[M]', freq='M')
## Frequencies and Offsets
# NOTE: by adding an ``S`` suffix to the monthly, quarterly, and annual
# frequencies, they instead will be marked at the beginning
| Code  | Description  | Code   | Description          |
|-------|--------------|--------|----------------------|
| ``D`` | Calendar day | ``B``  | Business day         |
| ``W`` | Weekly       |        |                      |
| ``M`` | Month end    | ``BM`` | Business month end   |
| ``Q`` | Quarter end  | ``BQ`` | Business quarter end |
| ``A`` | Year end     | ``BA`` | Business year end    |
| ``H`` | Hours        | ``BH`` | Business hours       |
| ``T`` | Minutes      |        |                      |
| ``S`` | Seconds      |        |                      |
| ``L`` | Milliseconds |        |                      |
| ``U`` | Microseconds |        |                      |
| ``N`` | Nanoseconds  |        |                      |
## Time-shifts
# shift() shifts the DATA
# tshift() shifts the INDEX
## HIGH-PERFORMANCE PANDAS: eval() and query()
# The eval() function in pandas uses string expressions to efficiently
# compute operations using DataFrames.
# pd.eval()
pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
# df.eval()
df.eval('(A + B) / (C - 1)')
df.eval('D = (A + B) / C', inplace=True)
# Local vars in df.eval()
column_mean = df.mean(axis='columns')
df.eval('A + @column_mean')  # @ marks the Python namespace vs the namespace of columns
# the pandas.eval() function only has access to the one (Python) namespace
# df.query()
df.query('A < 0.5 and B < 0.5')
# source:
# https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.09-Pivot-Tables.ipynb
births.head(2)
#    year  month  day gender  births
# 0  1969      1    1      F    4046
# 1  1969      1    1      M    4440
# Calculate the decade
births['decade'] = 10 * (births['year'] // 10)  # integer division
# Remove outliers
quartiles = np.percentile(births['births'], [25, 50, 75])
mu = quartiles[1]
sig = 0.74 * (quartiles[2] - quartiles[0])  # robust estimate of the sample std
# 0.74 ~= 1/1.349 comes from the interquartile range
# of a Gaussian distribution (IQR ~= 1.349 sigma)
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')
# Create a datetime index from the year, month, day
births.index = pd.to_datetime(10000 * births.year +
                              100 * births.month +
                              births.day, format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek
# The mean number of births by the day of the *year*.
births_by_date = births.pivot_table('births',
                                    [births.index.month, births.index.day])
births_by_date.head(3)  # multi-index over months and days
# 1  1    4009.225
#    2    4247.400
#    3    4500.900
# Name: births, dtype: float64
# To make this easily plottable, let's turn these months and days into a date
# by associating them with a dummy year variable (making sure to choose a leap year
# so February 29th is correctly handled!)
births_by_date.index = [pd.datetime(2012, month, day)
                        for (month, day) in births_by_date.index]
births_by_date.head(3)  # TimeSeries
# 2012-01-01    4009.225
# 2012-01-02    4247.400
# 2012-01-03    4500.900
# Name: births, dtype: float64
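# A minimal plotting sketch (assumes matplotlib is available; the figure
# size and label are illustrative, not from the original notes):
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12, 4))
births_by_date.plot(ax=ax)  # Series.plot draws against the datetime index
ax.set(ylabel='mean births per day')
plt.show()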
# source:
# https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.10-Working-With-Strings.ipynb
# Which recipe has the longest ingredient list?
recipes.name[np.argmax(recipes.ingredients.str.len())]
# argmax returns the index of the maximum value along an axis
s = pd.Series([.25, .5, .75, 1], index=list('abcd'))
s
# a    0.25
# b    0.50
# c    0.75
# d    1.00
# dtype: float64
s.index
# Index(['a', 'b', 'c', 'd'], dtype='object')
## Series as a specialized dict
# By default, a Series will be created where the index is drawn from
# the SORTED keys
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population
# California    38332521
# Florida       19552860
# Illinois      12882135
# New York      19651127
# Texas         26448193
# dtype: int64
population.keys()  # equiv to population.index
# Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')
population['California']
# 38332521
# slicing
# NOTE: the index MUST be sorted
population['California':'Illinois']
# California    38332521
# Florida       19552860
# Illinois      12882135
# dtype: int64
## DataFrame
# - a sequence of **aligned** Series objects, i.e. they share the same index
# - a specialized dict: column name -> Series of column data
df.columns  # Index object holding the column labels
ind = pd.Index([2, 3, 5, 7, 11])
# Int64Index([2, 3, 5, 7, 11], dtype='int64')
## Index as immutable array
ind[::2]
# Int64Index([2, 5, 11], dtype='int64')
print(ind.size, ind.shape, ind.ndim, ind.dtype)
# 5 (5,) 1 int64
## Index as ordered set
# The Index object supports set operations, as sketched below:
# - union                  |
# - intersection           &
# - difference             -
# - symmetric difference   ^
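# A quick sketch of Index set arithmetic (indA/indB are made-up examples;
# newer pandas prefers the .union()/.intersection() methods over operators):
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
indA & indB  # intersection         -> Int64Index([3, 5, 7], dtype='int64')
indA | indB  # union                -> Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
indA ^ indB  # symmetric difference -> Int64Index([1, 2, 9, 11], dtype='int64')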
## DATA INDEXING AND SELECTION
# indexing        arr[2, 1]
# slicing         arr[:, 1:5]
# masking         arr[arr > 0]
# fancy indexing  arr[0, [1, 5]]
## DATA SELECTION IN SERIES
s = pd.Series([.25, .5, .75, 1], index=list('abcd'))
# slicing by explicit index
s['a':'c']  # the final index is INCLUDED
# a    0.25
# b    0.50
# c    0.75
# dtype: float64
# slicing by implicit integer index
s[0:2]  # the final index is EXCLUDED
# a    0.25
# b    0.50
# dtype: float64
# masking
s[(s > 0.3) & (s < 0.8)]
# b    0.50
# c    0.75
# dtype: float64
# fancy indexing
s[['a', 'e']]
# a    0.25
# e     NaN
# dtype: float64
## INDEXERS: loc, iloc and ix
# .loc always references the explicit index
# .iloc always references the implicit, Python-style index
# WARNING: avoid using .ix (it tries to index by label before trying to
# index by position)
s = pd.Series(list('abcde'), index=range(1, 6))
s
# 1    a
# 2    b
# 3    c
# 4    d
# 5    e
# dtype: object
s.ix[2:4]  # equivalent to s.loc[2:4]
# 2    b
# 3    c
# 4    d
# dtype: object
s.iloc[2:4]
# 3    c
# 4    d
# dtype: object
## DATA SELECTION IN DATAFRAME
# - .iloc does not allow masking
# - integer indices are not allowed
# - direct slices, like df[1:3], are row-wise (vs column-wise)
# - direct masking, df[df.col1 < 3], is row-wise; see the sketch below
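# A minimal sketch of the row-wise behavior (df here is a made-up frame):
df = pd.DataFrame({'col1': [1, 2, 3, 4], 'col2': list('wxyz')})
df['col1']       # direct indexing selects a COLUMN
df[1:3]          # direct slicing selects ROWS 1-2 (implicit positions)
df[df.col1 < 3]  # direct masking selects ROWS where the condition holds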
## HIERARCHICAL INDEXING
## MultiIndex construction
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
# MultiIndex(levels=[['a', 'b'], [1, 2]],
#            labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
# MultiIndex(levels=[['a', 'b'], [1, 2]],
#            labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
# MultiIndex(levels=[['a', 'b'], [1, 2]],
#            labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
## Data Selection
# Many of the MultiIndex slicing operations will fail if the index
# is not sorted
df.sort_index()
df.sortlevel()  # older alias; prefer sort_index()
## Data Selection: Series
index = [('California', 2000, 'a'), ('California', 2010, 'a'),
         ('New York', 2000, 'a'), ('New York', 2010, 'b'),
         ('Texas', 2000, 'b'), ('Texas', 2010, 'b')]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
s = pd.Series(populations, index=pd.MultiIndex.from_tuples(index))
s.index.names = ['state', 'year', 'letter']  # level names
s
# state       year  letter
# California  2000  a         33871648
#             2010  a         37253956
# New York    2000  a         18976457
#             2010  b         19378102
# Texas       2000  b         20851820
#             2010  b         25145561
# dtype: int64
# Texas
s['Texas']
# With **sorted** indices, partial indexing can be performed on lower levels
# by passing an empty slice in the first index
# 2000
s[:, 2000]
# b
s[:, :, 'b']
# 2010, b
s[:, 2010, 'b']
# Texas, b
s[:, :, 'b']['Texas']
# New York, a
s[:, :, 'a']['New York']
# California, 2010, a
s['California', 2010, 'a']
## Data Selection: DataFrame
health_data
# subject      Bob       Guido         Sue
# type          HR  Temp    HR  Temp    HR  Temp
# year visit
# 2013 1      39.0  38.0  53.0  35.8  28.0  37.5
#      2      44.0  36.3  35.0  36.5  31.0  37.1
# 2014 1      29.0  35.8  25.0  38.3  31.0  37.0
#      2      23.0  35.2  36.0  38.5  22.0  37.4
idx = pd.IndexSlice
# Guido
health_data['Guido']  # level 0
# Guido, Temp
health_data[('Guido', 'Temp')]
# Guido, 1 visit
health_data.loc[idx[:, 1], 'Guido']
# Guido, 1 visit, Temp
health_data.loc[idx[:, 1], ('Guido', 'Temp')]  # or idx['Guido', 'Temp']
# Guido, 2014, HR
health_data.loc[2014, ('Guido', 'HR')]
# 2014
health_data.loc[2014]
# 2014, 2 visit
health_data.loc[(2014, 2)]
# 2014, 2 visit, Temp
health_data.loc[(2014, 2), idx[:, 'Temp']]
## Call .reindex() to arrange data to a new index
# Reindexing rows
df = df.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'])
# ... columns
df = df.reindex(columns=['col1', 'col2', 'col3'])
# ... or both
df = df.reindex(index=list('ABCD'), columns=['col1', 'col2'])
# You can also fill values for new indexes
s.reindex(['A', 'B', 'C', 'D', 'E', 'F', 'G'], fill_value=0)
# Or use a particular method for filling values
s.reindex(range(15), method='ffill')
## COMBINING DATASETS: CONCAT AND APPEND
# Handling duplicate indices
x
#     A   B
# 0  A0  B0
# 1  A1  B1
y
#     A   B
# 0  A2  B2
# 1  A3  B3
# pd.concat **preserves** indices, even if the result will have duplicate indices
pd.concat([x, y])
#     A   B
# 0  A0  B0
# 1  A1  B1
# 0  A2  B2
# 1  A3  B3
# Catching the repeats as an error
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    pass
# Ignoring the index -> a new integer index will be created
pd.concat([x, y], ignore_index=True)
#     A   B
# 0  A0  B0
# 1  A1  B1
# 2  A2  B2
# 3  A3  B3
# Adding MultiIndex keys
pd.concat([x, y], keys=['x', 'y'])  # keys option specifies labels for the data source
#       A   B
# x 0  A0  B0
#   1  A1  B1
# y 0  A2  B2
#   1  A3  B3
# The .append() method
# pd.concat([df1, df2]) is equivalent to df1.append(df2)
# NOTE: .append() is not very efficient, because it involves creating a new
# index and data buffer.
# Instead of multiple .append() operations, build a list of DataFrames and pass
# them all at once to the concat() function.
## COMBINING DATASETS: MERGE AND JOIN
# Categories of Joins:
# - one-to-one
# - many-to-one: one of the two key columns contains duplicate entries
# - many-to-many: the key column in both the left and right arrays contains duplicates
# many-to-one example
df1
#   employee        group
# 0      Bob   Accounting
# 1     Jake  Engineering
# 2     Lisa  Engineering
# 3      Sue           HR
df2
#          group supervisor
# 0   Accounting      Carly
# 1  Engineering      Guido
# 2           HR      Steve
pd.merge(df1, df2)
#   employee        group supervisor
# 0      Bob   Accounting      Carly
# 1     Jake  Engineering      Guido
# 2     Lisa  Engineering      Guido
# 3      Sue           HR      Steve
# Set arithmetic for joins:
# inner -> intersection of the two sets of inputs
# outer -> union
# left and right
## .join()
# Convenience method: performs a merge that defaults to joining on indices,
# as sketched below
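# A minimal .join() sketch (left/right are made-up, index-aligned frames):
left = pd.DataFrame({'group': ['Accounting', 'Engineering']},
                    index=['Bob', 'Jake'])
right = pd.DataFrame({'hire_date': [2008, 2012]},
                     index=['Bob', 'Jake'])
left.join(right)  # joins on the shared index, like
                  # pd.merge(left, right, left_index=True, right_index=True)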
## AGGREGATION AND GROUPING
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])
df
#   key  data
# 0   A     0
# 1   B     1
# 2   C     2
# 3   A     3
# 4   B     4
# 5   C     5
df.groupby('key')  # lazy evaluation
# <pandas.core.groupby.DataFrameGroupBy object at 0x110fd5358>
df.groupby('key')['data']  # column indexing
# <pandas.core.groupby.SeriesGroupBy object at 0x11110e390>
## Iteration over groups
for gkey, g in df.groupby('key'):
    print('{:3} shape={}'.format(gkey, g.shape))
# A   shape=(2, 2)
# B   shape=(2, 2)
# C   shape=(2, 2)
## Operations available on the GroupBy object:
# - aggregate
# - filter
# - transform
# - apply
# A GroupBy object, in many ways, can be treated as a collection of DFs
# Aggregation
df.groupby('key').agg({'data': ['min', 'count', lambda g: g.sum() + 1]})
#     data
#      min count <lambda>
# key
# A      0     2        4
# B      1     2        6
# C      2     2        8
# Filtering allows you to drop data based on the group properties
df.groupby('key').sum()
#      data
# key
# A       3
# B       5
# C       7
# The filter func should return a boolean, specifying whether the group passes the filtering
# Below, because group A does not have sum > 3, it is dropped from the result.
df.groupby('key').filter(lambda f: f['data'].sum() > 3)
#   key  data
# 1   B     1
# 2   C     2
# 4   B     4
# 5   C     5
# Transformation: the output is the same shape as the input;
# the data in the DF is transformed in some way
df.groupby('key').transform(lambda x: x - x.mean())
#    data
# 0  -1.5
# 1  -1.5
# 2  -1.5
# 3   1.5
# 4   1.5
# 5   1.5
# Apply lets you apply an arbitrary function to the group results,
# as sketched below. In: DF, Out: DF/Series or a scalar
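# A minimal apply() sketch on the df above: reduce each group's DF to a scalar
df.groupby('key').apply(lambda g: g['data'].sum())
# key
# A    3
# B    5
# C    7
# dtype: int64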
## Specifying the split key
# A dict or Series mapping INDEX to group
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
df2.groupby(mapping).sum()
#            data
# consonant    12
# vowel         3
# A more complicated example of a mapping used on the index:
# http://stackoverflow.com/a/19616072
# Any Python func
df2.groupby(str.lower).sum()
#    data
# a     3
# b     5
# c     7
# A list of valid keys
df2.groupby([str.lower, mapping]).mean()
#              data
# a vowel       1.5
# b consonant   2.5
# c consonant   3.5
## PIVOT TABLES
titanic = sns.load_dataset('titanic')
# Pivot table as a multi-dimensional version of GroupBy aggregation
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()
# Same result using the pivot table syntax
titanic.pivot_table('survived', index='sex', columns='class')
## Multi-level pivot tables
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')
# class               First    Second     Third
# sex    age
# female (0, 18]   0.909091  1.000000  0.511628
#        (18, 80]  0.972973  0.900000  0.423729
# male   (0, 18]   0.800000  0.600000  0.215686
#        (18, 80]  0.375000  0.071429  0.133663
fare = pd.qcut(titanic['fare'], 2)  # automatically compute quantiles
titanic.pivot_table('survived', ['sex', age], [fare, 'class']).round(1)
# fare             [0, 14.454]         (14.454, 512.329]
# class            First Second Third  First Second Third
# sex    age
# female (0, 18]     NaN    1.0   0.7    0.9    1.0   0.3
#        (18, 80]    NaN    0.9   0.4    1.0    0.9   0.4
# male   (0, 18]     NaN    0.0   0.3    0.8    0.8   0.2
#        (18, 80]    0.0    0.1   0.1    0.4    0.0   0.2
# aggfunc can be specified as a dict mapping a column to any agg func
# NOTE: the values keyword is omitted; when aggfunc is a dict,
# the values are determined automatically
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': sum, 'fare': 'mean'}).round(1)
#         survived                fare
# class      First Second Third   First Second Third
# sex
# female      91.0   70.0  72.0   106.1   22.0  16.1
# male        45.0   17.0  47.0    67.2   19.7  12.7
## VECTORIZED STRING OPERATIONS
# Extract dummy variables as a dataframe
s = pd.Series(['a|b', 'a', None, 'a|c'])
# 0     a|b
# 1       a
# 2    None
# 3     a|c
# dtype: object
s.str.get_dummies(sep='|')
#    a  b  c
# 0  1  1  0
# 1  1  0  0
# 2  0  0  0
# 3  1  0  1
## WORKING WITH TIME SERIES
# Date and time data:
# - time stamps: reference particular moments in time (July 4th, 2015 at 7:00am)
# - time intervals and periods reference a length of time between
#   a particular beginning and end point: e.g., the year 2015.
#   Periods usually reference a special case of time intervals
#   in which each interval is of uniform length and does not overlap
#   (e.g., 24 hour-long periods comprising days)
# - time deltas (or durations) (e.g., 22.5 seconds)
# Time intervals: examples
# 1) sd: 2016-01-01, ed: 2016-01-15  # have start and end dates
# 2) sd: 2016-01-10, ed: 2016-02-03  # can overlap
date = pd.to_datetime('15th Dec, 2016')
date
# Timestamp('2016-12-15 00:00:00')
# From a group of these Timestamp objects, pandas can construct
# a DatetimeIndex that can be used to index data in a Series or DF,
# as sketched below
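# A minimal sketch of indexing by a DatetimeIndex (toy data, made-up dates):
index = pd.DatetimeIndex(['2014-07-04', '2014-08-04', '2015-07-04', '2015-08-04'])
data = pd.Series([0, 1, 2, 3], index=index)
data['2014-07-04':'2015-07-04']  # slice with explicit dates (final label INCLUDED)
data['2015']                     # partial string indexing: everything from 2015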
## Pandas Time Series Data Structures
# - time stamps: Timestamp and DatetimeIndex
# - time periods: Period and PeriodIndex
# - time deltas: Timedelta and TimedeltaIndex
# The most fundamental of these date/time objects are the
# Timestamp and DatetimeIndex objects
from datetime import datetime
dates = pd.to_datetime([datetime(2015, 7, 3), '4th of July, 2015',
                        '2015-Jul-6', '07-07-2015', '20150708'])
dates
# DatetimeIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
#                '2015-07-08'],
#               dtype='datetime64[ns]', freq=None)
dates.to_period('D')
# PeriodIndex(['2015-07-03', '2015-07-04', '2015-07-06', '2015-07-07',
#              '2015-07-08'],
#             dtype='period[D]', freq='D')
dates - dates[0]
# TimedeltaIndex(['0 days', '1 days', '3 days', '4 days', '5 days'], dtype='timedelta64[ns]', freq=None)
## Regular sequences
# pd.date_range()      -> DatetimeIndex
# pd.period_range()    -> PeriodIndex
# pd.timedelta_range() -> TimedeltaIndex
pd.period_range('2015-07', periods=8, freq='M')
# PeriodIndex(['2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12',
#              '2016-01', '2016-02'],
#             dtype='period[M]', freq='M')
## Frequencies and Offsets
| Code  | Description  | Code   | Description          |
|-------|--------------|--------|----------------------|
| ``D`` | Calendar day | ``B``  | Business day         |
| ``W`` | Weekly       |        |                      |
| ``M`` | Month end    | ``BM`` | Business month end   |
| ``Q`` | Quarter end  | ``BQ`` | Business quarter end |
| ``A`` | Year end     | ``BA`` | Business year end    |
| ``H`` | Hours        | ``BH`` | Business hours       |
| ``T`` | Minutes      |        |                      |
| ``S`` | Seconds      |        |                      |
| ``L`` | Milliseconds |        |                      |
| ``U`` | Microseconds |        |                      |
| ``N`` | Nanoseconds  |        |                      |
# By adding an ``S`` suffix to the monthly, quarterly, and annual frequencies,
# they instead will be marked at the beginning:
| Code   | Description   | Code    | Description            |
|--------|---------------|---------|------------------------|
| ``MS`` | Month start   | ``BMS`` | Business month start   |
| ``QS`` | Quarter start | ``BQS`` | Business quarter start |
| ``AS`` | Year start    | ``BAS`` | Business year start    |
pd.timedelta_range(0, periods=9, freq="2H30T")  # freq 2h 30 mins
## Time-shifts
# shift() shifts the DATA
# tshift() shifts the INDEX
# for example, see the sketch and links below
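# A minimal shift/tshift sketch (toy daily series; note that tshift was
# later deprecated in favor of shifting with freq=):
ts = pd.Series([1, 2, 3], index=pd.date_range('2016-01-01', periods=3, freq='D'))
ts.shift(1)   # values move forward one step; index unchanged; first value -> NaN
ts.tshift(1)  # index moves forward one day; values unchanged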
## Resampling, Shifting and Windowing
# see: https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.11-Working-with-Time-Series.ipynb
# and: http://pandas.pydata.org/pandas-docs/stable/timeseries.html
## HIGH-PERFORMANCE PANDAS: eval() and query()
# The eval() function in pandas uses string expressions to efficiently
# compute operations using DataFrames.
rng = np.random.RandomState(42)
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3)))
                           for i in range(5))
# pd.eval()
result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)  # True
# pd.eval() supports the use of the literal `and` and `or` in Boolean expressions
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)  # True
# DataFrame.eval() for Column-Wise Operations
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head(3)
#           A         B         C
# 0  0.821257  0.110632  0.846452
# 1  0.127489  0.397287  0.797295
# 2  0.149917  0.229251  0.722253
result1 = pd.eval("(df.A + df.B) / (df.C - 1)")
result2 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result2)  # True
# Supports assignment
df.eval('D = (A + B) / C', inplace=True)
df.head(3)
#           A         B         C         D
# 0  0.821257  0.110632  0.846452  1.100935
# 1  0.127489  0.397287  0.797295  0.658195
# 2  0.149917  0.229251  0.722253  0.524981
# Local variables in DataFrame.eval()
column_mean = df.mean(axis='columns')
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)  # True
# The ``@`` character here marks a variable name rather than a column name,
# and lets you efficiently evaluate expressions involving the two "namespaces":
# the namespace of columns, and the namespace of Python objects.
# NOTE: the ``@`` character is only supported by the ``DataFrame.eval()`` method,
# not by the ``pandas.eval()`` function, because the ``pandas.eval()`` function
# only has access to the one (Python) namespace.
# DataFrame.query() Method
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)  # True
# Performance: When to Use These Functions
x = df[(df.A < 0.5) & (df.B < 0.5)]
# is roughly equivalent to:
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]
# If the size of the temporary DFs is significant compared to your
# available system memory, then it's a good idea to use an
# eval() or query() expression.
df.values.nbytes  # check the approx size of your array in bytes
# eval/query sometimes also offers a cleaner syntax