Last active
January 26, 2023 07:21
-
-
Save yoki/32eacf92ce58c8d15e12226f11088cc7 to your computer and use it in GitHub Desktop.
Pandas tutorial
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 10_generation | |
# 11_filter | |
# 12_setting_value | |
# 13_io | |
# 15_groupby | |
# 16_merge | |
# 17_pivot_reshape | |
# 18_misc | |
# 19_type | |
# 21_category | |
# 25_time_series |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---------------------------------------------------------------
# Quick-reference summary (each topic is detailed in later files).
# ---------------------------------------------------------------
# creation
df = pd.DataFrame(data=BabyDataSet, columns=['Names', 'Births'])
df = pd.read_csv('file.csv')
df.to_csv('name.csv', encoding='utf-8')
# df info
df.columns.values            # column names
nrow = df.shape[0]           # number of rows
# filter
df.loc[:, ['col_1', 'col_2']]   # select columns by label
df.iloc[[2, 3]]                 # select rows by position; double bracket returns a DataFrame, not a Series
df.iloc[:, [1, 2]]              # select columns by position
# update
df.loc[df.AAA >= 5, 'BBB'] = -1   # AAA: condition, BBB: column to be changed
# iteration
for idx, row in df.iterrows():
    print([row.value1, row["value2"]])
df.value3 = df.apply(lambda x: x.value1 + x.value2, axis=1)
grp = iris.groupby('Species').apply(np.sum)   # pass the function object directly; eval() is unnecessary and unsafe
# manage dataframe
df.reset_index()
df.sort_values(by=[('Group1', 'C')], ascending=False)   # multi-index sort
df1.dropna(how='any')
df['col'] = df['col'].fillna(-1).astype(int)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# creation of a series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# dataframe from a series (fix: the constructor is pd.DataFrame, not pd.Dataframe;
# a dict keyed by column name is the unambiguous way to name the column)
df = pd.DataFrame({'supercolname': s})
# creation with index, numpy data, and column names
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
# create from data without index
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
births = [968, 155, 77, 578, 973]
BabyDataSet = list(zip(names, births))
df = pd.DataFrame(data=BabyDataSet, columns=['Names', 'Births'])
# creation from dictionary (scalars are broadcast to every row;
# fix: closing parenthesis was missing in the original)
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
# output
#    A          B  C  D      E    F
# 0  1 2013-01-02  1  3   test  foo
# 1  1 2013-01-02  1  3  train  foo
# 2  1 2013-01-02  1  3   test  foo
# 3  1 2013-01-02  1  3  train  foo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# loc, iloc
# loc selects by row/column *labels*
df.loc[['row_2', 'row_3']]        # select rows by label (fix: comment previously said columns)
df.loc[:, ['col_1', 'col_2']]     # select columns by label (fix: comment previously said rows)
# iloc selects by integer *position*
df.iloc[[2, 3]]
df.iloc[:, [1, 2]]
# mixed positional rows + labelled column (.ix was removed in pandas 1.0;
# go through df.index to translate positions to labels)
df.loc[df.index[[3, 7]], "row_b"]
# simple conditions
df3 = df2[df2['Time0'] == 2012]
df3 = df3[df3['Value1'] > 20000]
df3 = df3[df3['Country0'] == df3['Country1']]
df = df[pd.notnull(df['latitude'])]              # not null
df[df['category'].str.contains('national')]      # string condition
# multiple conditions
df[df['id'].isin(['109', '673'])]
df[(df['category'] == 'national') & (df['is_removed'] == '1')]
# negative conditions
df = df[~df['id'].isin(['1', '2', '3'])]
# by index
df['20130102':'20130104']
df[0:3]
# selected columns
df[['A', 'B']]
# combine index and column selection
df.loc['20130102':'20130104', ['A', 'B']]
df.loc['20130102', 'A']   # return scalar (fix: loc takes (row, column), not three arguments)
# select rows closest to some value (.ix removed; argsort yields positions, so use iloc)
df.iloc[(df.CCC - aValue).abs().argsort()]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# column names
df.columns.values
# column rename
df.rename(columns={"uname": "username"}, inplace=True)
# select rows by label (fix: comment previously said columns; use df[['col']] for columns)
df.loc[['row_2', 'row_3']]
# drop columns
df.drop(columns=['B', 'C'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# single value
df.at[dates[0], 'A'] = 0
# whole column
df.loc[:, 'D'] = np.array([5] * len(df))
df['logic'] = np.where(df['AAA'] > 5, 'high', 'low')
# column rename
df.rename(columns={"uname": "username"}, inplace=True)
# update information
df.loc[df.AAA >= 5, 'BBB'] = -1   # AAA: condition, BBB: column to be changed
# positional row update (.ix was removed; translate positions via df.index)
df.loc[df.index[[3, 7]], 'BBB'] = -1
# group-by update 1: update only the row that satisfies a per-group condition
moras = moras.reset_index()
tmp = moras.groupby(['wid'])["mid"].idxmax()   # idxmax returns labels, so .loc is correct here
moras.loc[tmp, "pitch_change"] = np.nan
# group-by update 2: assign a unique id within each group
df['domainId'] = df.groupby('orgid')['orgid'].rank(method='first')
# ----------
# deleting
# ----------
# column
del df[name]
# row
df.drop(df.index[[1, 3]], inplace=True)
df = df[(df.a == 2)]
df = df[~np.isnan(df['corr'])]   # delete rows where 'corr' is NaN (column must be numeric)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------
# Import
# ------------
# CSV
df = pd.read_csv('file.csv')
df = pd.read_csv('file.csv', header=0, index_col=0, quotechar='"', sep=':',
                 na_values=['na', '-', '.', ''])
# from series
df = pd.concat([s1, s2], axis=1)
# Python dictionary by column (fix: qualify DataFrame with pd. — no bare import is in scope)
df = pd.DataFrame({'col0': [1.0, 2.0, 3.0, 4.0], 'col1': [100, 200, 300, 400]})
# Python dictionary by row
df = pd.DataFrame.from_dict({'row0': {'col0': 0, 'col1': 'A'},
                             'row1': {'col0': 1, 'col1': 'B'}}, orient='index')
df = pd.DataFrame.from_dict({'row0': [1, 1 + 1j, 'A'],
                             'row1': [2, 2 + 2j, 'B']}, orient='index')
# HDF5
pd.read_hdf('foo.h5', 'df')
# -------
# save
# -------
df.to_csv('name.csv', encoding='utf-8')
df.to_hdf('foo.h5', 'df')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://sinhrks.hatenablog.com/entry/2014/10/13/005327
grp = iris.groupby('Species')
grp['len'].sum()              # single column
grp.sum()                     # all columns
grp[['width', 'len']].sum()   # multiple columns
# custom functions — the custom function's input is a Series
grp['Length'].apply(np.sum)
grp['Length'].apply(lambda x: ",".join(x))
# multiple custom functions per column
grp.agg({'Length': [np.sum, np.mean], 'Width': [np.sum, np.mean]})
# add a new column based on groupby
iris["avgren"] = grp["Length"].transform("sum")
# Complex cumulation:
# using quarterly-frequency y-o-y growth rates, create level data.
# (fix: the names are column headers, so pass columns=, not data=)
df = pd.DataFrame(columns=['country_id', 'quarter', 'year', 'yoygrowth'])
df = df.sort_values(["country_id", "quarter", "year"])

def cumulative_func(df):
    """Compound each (country_id, quarter) group's growth into levels aligned to df.index."""
    results = []
    # .indices maps each group key to the *positions* of its rows
    for group in df.groupby(["country_id", "quarter"]).indices.values():
        level = 1
        result = []
        for val in df.iloc[group].yoygrowth.values:
            level *= 1 + 0.01 * val
            result.append(level)
        # positions equal labels here because the caller resets to a RangeIndex
        results.append(pd.Series(result, index=group))
    return pd.concat(results).reindex(df.index)

df = df.reset_index(drop=True)
df["level"] = cumulative_func(df)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://pandas.pydata.org/pandas-docs/stable/merging.html
# --------------
# Concat
# --------------
# vertical concat — stacks rows (fix: the original comment said "horizontal")
frames = [df1, df2, df3]
result = pd.concat(frames)
# if the index is not meaningful, better to ignore it
# (fix: DataFrame.append was removed in pandas 2.0 — use pd.concat)
result = pd.concat([df1, df2], ignore_index=True)
result = pd.concat([df1, df2, df3], ignore_index=True)
# full signature (note: join_axes was removed in pandas 1.0; reindex the result instead)
pd.concat(objs, axis=0, join='outer', ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
# add a single row (wrap it in a one-row DataFrame first)
result = pd.concat([df1, pd.DataFrame([one_row])], ignore_index=True)
# Join
result = pd.merge(left, right, how='left', on=['key1', 'key2'])
# how: left / right / outer / inner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot.html
long_df = pd.DataFrame({'vname': ['one', 'one', 'one', 'two', 'two', 'two',
                                  'one', 'one', 'one', 'two', 'two', 'two'],
                        'country': ['A', 'B', 'C', 'A', 'B', 'C',
                                    'A', 'B', 'C', 'A', 'B', 'C'],
                        'year': ['x1', 'x1', 'x1', 'x1', 'x1', 'x1',
                                 'x2', 'x2', 'x2', 'x2', 'x2', 'x2'],
                        'val': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]})
# reshape from long to wide (list-valued index requires pandas >= 1.1)
wide_df = long_df.pivot(index=['country', 'year'], columns='vname', values="val").reset_index()
# reshape from wide to long — neither "vname" nor "val" exists in wide_df;
# the final DataFrame contains the columns [country, year, vname, val]
new_long_df = wide_df.melt(id_vars=['country', 'year'], var_name='vname', value_name="val")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Iteration
for idx, row in df.iterrows():
    print([row.value1, row["value2"]])
df.value3 = df.apply(lambda x: x.value1 + x.value2, axis=1)
# reset index
df.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reset_index.html
df.sort_values(by=[('Group1', 'C')], ascending=False)   # multi-index sort
# na
df1.dropna(how='any')
df1.fillna(value=5)
pd.isnull(df1)
df1[df1.x.notna()]   # fix: "df1.[...]" was a syntax error
# histogram of values
s.value_counts()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# change column type
df['name'] = df['name'].astype('str')
df['col'] = df['col'].fillna(-1).astype(int)   # fill NaN first: int columns cannot hold NaN
df['col'] = pd.to_numeric(df['col'])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")
# fix: assigning to .cat.categories was removed in pandas 2.0 — use rename_categories
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://pandas.pydata.org/pandas-docs/stable/timeseries.html#timeseries
# (these snippets also need: from datetime import datetime, time, timedelta)
# easy samples
# resample (fix: rng must be created before ts, which is indexed on it)
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min').sum()
# time zone
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts_utc = ts.tz_localize('UTC')
ts_utc.tz_convert('US/Eastern')
# convert monthly data to quarterly data
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ps = ts.to_period()
ps.to_timestamp()
## full examples
##
# generate timestamp
pd.Timestamp(datetime(2012, 5, 1))
pd.Timestamp('2012-05-01')
pd.Timestamp(2012, 5, 1)
# generate Period (time span)
pd.Period('2011-01')
pd.Period('2012-05', freq='D')
# generate series using timestamps
dates = [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02'), pd.Timestamp('2012-05-03')]
ts = pd.Series(np.random.randn(3), dates)
# convert date-like objects to timestamps
pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None]))
pd.to_datetime(['2005/11/23', '2010.12.31'])
pd.to_datetime(['04-01-2012 10:00'], dayfirst=True)
pd.to_datetime('2010/11/12')
pd.Timestamp('2010/11/12')
# convert pandas columns to a timestamp index
df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5], 'hour': [2, 3]})
# to_datetime error handling
pd.to_datetime(['2009/07/31', 'asd'], errors='raise')
pd.to_datetime(['2009/07/31', 'asd'], errors='ignore')   # NOTE: 'ignore' is deprecated in pandas 2.x
pd.to_datetime(['2009/07/31', 'asd'], errors='coerce')   # convert errors to NaT
# get a range of timestamps
index = pd.date_range('2000-1-1', periods=1000, freq='M')
index = pd.bdate_range('2012-1-1', periods=250)          # business days
rng = pd.date_range(datetime(2011, 1, 1), datetime(2012, 1, 1))
pd.Timestamp.min
pd.Timestamp.max
rng = pd.date_range(start, end, freq='BM')
# access a range
ts['1/31/2011']
ts[datetime(2011, 12, 25):]
ts['10/31/2011':'12/31/2011']
ts['2011']
ts['2011-6']
dft['2013-1':'2013-2-28 00:00:00']
dft['2013-1-15':'2013-1-15 12:30:00']
# date shift
d + pd.tseries.offsets.DateOffset(months=4, days=5)
from pandas.tseries.offsets import *
d + DateOffset(months=4, days=5)
d - 5 * BDay()
d + BMonthEnd()
offset = BMonthEnd()
offset.rollforward(d)
offset.rollback(d)
d + Week()
d + Week(weekday=4)
d + YearEnd()
d + YearEnd(month=6)
rng = pd.date_range('2012-01-01', '2012-01-03')
s = pd.Series(rng)
rng + DateOffset(months=2)
s + DateOffset(months=2)
s - Day(2)
td = s - pd.Series(pd.date_range('2011-12-29', '2011-12-31'))
# Holiday
from pandas.tseries.holiday import USFederalHolidayCalendar
bday_us = CustomBusinessDay(calendar=USFederalHolidayCalendar())
dt = datetime(2014, 1, 17)
dt + bday_us
# business hour
bh = BusinessHour()
pd.Timestamp('2014-08-01 10:00') + bh
pd.Timestamp('2014-08-01 10:00') + BusinessHour(2)
# custom business hour
bh = BusinessHour(start='11:00', end=time(20, 0))
pd.Timestamp('2014-08-01 09:00') + bh
# anchored offsets (fix: original said "annoted")
pd.Timestamp('2014-01-02') + MonthBegin(n=4)
pd.Timestamp('2014-01-31') + MonthEnd(n=1)
pd.Timestamp('2014-01-02') + MonthEnd(n=0)
# shift/lag
ts = ts[:5]
ts.shift(1)
ts.shift(5, freq=BDay())   # fix: bare "offsets" name was never imported; BDay comes from the * import
ts.shift(5, freq='BM')
# frequency conversion
dr = pd.date_range('1/1/2010', periods=3, freq=3 * BDay())
ts = pd.Series(np.random.randn(3), index=dr)   # fix: bare randn was undefined
ts.asfreq(BDay())
ts.asfreq(BDay(), method='pad')
# resampling (e.g. 1s -> 5min)
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
df.resample('M', on='date')
ts.resample('5Min').sum()
ts.resample('5Min').mean()
ts.resample('5Min').ohlc()                # open / high / low / close
ts.resample('5Min', closed='right').mean()
ts.resample('5Min', closed='left').mean()
# offset aliases
# B       business day frequency
# D       calendar day frequency
# W       weekly frequency
# M       month end frequency
# BM      business month end frequency
# MS      month start frequency
# BMS     business month start frequency
# Q       quarter end frequency
# A       year end frequency
# H       hourly frequency
# T, min  minutely frequency
# S       secondly frequency
# upsampling (e.g. 5min to 250ms)
ts[:2].resample('250L').ffill()
# sparse resampling
# omitted
################
## Time span
################
# period
p = pd.Period('2012', freq='A-DEC')
p + 1   # Period('2013', 'A-DEC')
p = pd.Period('2014-07-01 09:00', freq='H')
p + Hour(2)
p + timedelta(minutes=120)
# period range
prng = pd.period_range('1/1/2011', '1/1/2012', freq='M')
pd.PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')
# fix: the PeriodIndex(start=..., periods=...) constructor form was removed — use period_range
pd.period_range(start='2014-01', freq='3M', periods=4)
ps = pd.Series(np.random.randn(len(prng)), prng)
idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H')
# PeriodIndex partial string indexing
ps['2011-01']
ps[datetime(2011, 12, 25):]
ps['10/31/2011':'12/31/2011']
ps['2011']
dfp['2013-01-01 10H']
dfp['2013-01-01 10H':'2013-01-01 11H']
p = pd.Period('2011', freq='A-DEC')
# convert period to another frequency
p.asfreq('M', how='start')
# time zone
rng_pytz = pd.date_range('3/6/2012 00:00', periods=10, freq='D', tz='Europe/London')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts_utc = ts.tz_localize('UTC')
ts_utc.tz_convert('US/Eastern')
rng_eastern = rng_utc.tz_convert('US/Eastern')
didx.tz_localize(None)   # remove the timezone
# convert timezone from US/Eastern to Asia/Tokyo
df['timestamp'] = pd.to_datetime(df.DATE)   # convert to datetime
df.index = df.timestamp                     # convert to DatetimeIndex
ind = df.index
ind = ind.tz_localize('US/Eastern')         # convert to tz-aware US time
ind = pd.to_datetime(ind.values)            # convert to UTC without timezone
ind = ind + pd.tseries.offsets.DateOffset(hours=9)   # shift to Tokyo wall time (UTC+9, naive)
df.index = ind
df.timestamp = ind
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# interpolation — fills NaN values (linear by default)
ts.interpolate()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment