pandas | series | dataframes | index hierarchy | GroupBy, aggregate functions | Merging, joining, concatenating | operations | IO
> import numpy as np
> import pandas as pd
> lbls = ['a', 'b', 'c'] # or np.array
> data = [123, 423, 456]
> d = {}
> for i in np.arange(len(lbls)):
>     d[lbls[i]] = data[i]
> pd.Series(data = data)
0 123
1 423
2 456
dtype: int64
> pd.Series(data = data, index = lbls)
a 123
b 423
c 456
dtype: int64
> s = pd.Series([sum]) # a pd Series can HOLD FUNCTIONS!!!! It can hold a set of transformations to exec
> s[0](np.arange(1, 101))
5050
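# since d (built above) maps labels to values, a Series can also be built straight from the dict
> pd.Series(d)
a 123
b 423
c 456
dtype: int64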
> import numpy as np
> from numpy.random import randn
> import pandas as pd
> import string as st
> np.random.seed(101) # set the SEED so the generated data is reproducible
> nRows = 2
> nCols = 2
> rows = st.ascii_lowercase[:nRows]
> cols = st.ascii_uppercase[nRows:nRows + nCols]
> df = pd.DataFrame(data = randn(nRows, nCols) \
    , index = [rows[_] for _ in np.arange(nRows)] \
    , columns = [cols[_] for _ in np.arange(nCols)])
> df # Columns and rows are Series!!!
C D
a 2.706850 0.628133
b 0.907969 0.503826
> df[[cols[_] for _ in np.arange(nCols)]] # select multiple columns
C D
a 2.706850 0.628133
b 0.907969 0.503826
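# quick check that a single column really is a Series
> type(df['C'])
pandas.core.series.Series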
> df['E'] = '' # ADD COLUMN
> df
C D E
a 2.706850 0.628133
b 0.907969 0.503826
> df.drop('a', inplace = True) # drops along axis 0 (rows) by default
> df.drop('E', axis = 1, inplace = True) # inplace = True to mutate the DS itself
> df
C D
b 0.907969 0.503826
# row selection (shown on the original 2x2 df, before the drops above)
> df.loc['b'] # by label
C 0.907969
D 0.503826
Name: b, dtype: float64
> df.iloc[1] # by integer position
C 0.907969
D 0.503826
Name: b, dtype: float64
> df.loc['a', 'C']
2.706849839399938
> df.loc[['a', 'b'], ['C', 'D']]
C D
a 2.706850 0.628133
b 0.907969 0.503826
# conditional selection (outputs below use a fresh df with nRows = nCols = 3, i.e. cols D, E, F)
> df > 0 # element-wise comparison returns a boolean DataFrame
D E F
a True True True
b True True False
c False True False
> f = df > 0 # f is a boolean DataFrame (a mask), not a function
> df[f] # cells where the mask is False become NaN
D E F
a 2.706850 0.628133 0.907969
b 0.503826 0.651118 NaN
c NaN 0.605965 NaN
> df['D'] > 0 # comparing a column returns a boolean Series
a True
b True
c False
Name: D, dtype: bool
> df['D'][df['D'] > 0] # a boolean Series as a filter keeps only the rows where it is True
a 2.706850
b 0.503826
Name: D, dtype: float64
> df[df['D'] > 0]
D E F
a 2.706850 0.628133 0.907969
b 0.503826 0.651118 -0.319318
> df[(df['D'] > 0) & (df['D'] < 2)] # element-wise AND is & (op 'and' expects two booleans, not two Series of booleans)
D E F
b 0.503826 0.651118 -0.319318
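# element-wise OR is | (same caveat as &); values from the df above
> df[(df['D'] > 2) | (df['D'] < 0)]
D E F
a 2.706850 0.628133 0.907969
c -0.848077 0.605965 -2.018168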
> df.reset_index() # resets the index to int values; returns a copy unless inplace = True
index D E F
0 a 2.706850 0.628133 0.907969
1 b 0.503826 0.651118 -0.319318
2 c -0.848077 0.605965 -2.018168
> df['G'] = 'A B C'.split() # needs to match shape
> df
D E F G
a 2.706850 0.628133 0.907969 A
b 0.503826 0.651118 -0.319318 B
c -0.848077 0.605965 -2.018168 C
> df.set_index('G')
D E F
G
A 2.706850 0.628133 0.907969
B 0.503826 0.651118 -0.319318
C -0.848077 0.605965 -2.018168
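# like drop, set_index returns a copy; pass inplace = True to mutate df itself
> df.set_index('G', inplace = True)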
# **************************************************************************
# INDEX HIERARCHY
# **************************************************************************
> import numpy as np
> import pandas as pd
> A = 'G1 G1 G1 G2 G2 G2'.split()
> B = [1, 2, 3, 1, 2, 3]
> Z = pd.MultiIndex.from_tuples(list(zip(A, B)))
> df = pd.DataFrame(np.random.randn(6, 2), Z, ['A', 'B'])
> df.index.names = ['I1', 'I2']
> df
A B
I1 I2
G1 1 2.302987 0.729024
   2 -0.863091 0.305632
   3 0.243178 0.864165
G2 1 -1.560931 -0.251897
   2 -0.578120 0.236996
   3 0.200780 0.327845
> df.loc['G1'].loc[1]['B'] # drill down level by level
0.729024
# CROSS SECTIONS (XS)
> df.xs('G1') # or df.loc['G1']
A B
I2
1 2.302987 0.729024
2 -0.863091 0.305632
3 0.243178 0.864165
> df.xs(1, level = 'I2') # slices an inner level, which plain .loc cannot do directly
A B
I1
G1 2.302987 0.729024
G2 -1.560931 -0.251897
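# a tuple key cross-sects both levels at once (values from the df above)
> df.xs(('G1', 1))
A 2.302987
B 0.729024
Name: (G1, 1), dtype: float64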
> d = {'A': [1, 2, np.nan], 'B': [5, np.nan, np.nan], 'C': [1, 2, 3]} # don't shadow the built-in dict
> df = pd.DataFrame(d)
> df.dropna() # drop all rows with NaN values
A B C
0 1.0 5.0 1
> df.dropna(axis = 1) # drop all cols with NaN values
C
0 1
1 2
2 3
> df.fillna(value = 'NO_DATA') # replace NaN
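# a more useful fill: the column mean (1.5 for column A here)
> df['A'].fillna(value = df['A'].mean())
0 1.0
1 2.0
2 1.5
Name: A, dtype: float64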
> import numpy as np
> import pandas as pd
> d = {'COMPANY': ['Palantir', 'Airbus', 'Klarna', 'Klarna']\
    , 'PERSON': ['P1', 'P2', 'P3', 'Diogo']\
    , 'IQ': [101, 202, 303, 404]}
> df = pd.DataFrame(d)
> comp = df.groupby('COMPANY')
> comp.mean() # ignores non-numeric columns
IQ
COMPANY
Airbus 202.0
Klarna 353.5
Palantir 101.0
> comp.describe() # summary statistics for the numeric columns
IQ
count mean std min 25% 50% 75% max
COMPANY
Airbus 1.0 202.0 NaN 202.0 202.00 202.0 202.00 202.0
Klarna 2.0 353.5 71.417785 303.0 328.25 353.5 378.75 404.0
Palantir 1.0 101.0 NaN 101.0 101.00 101.0 101.00 101.0
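# aggregate first, then index into the result (303 + 404 = 707)
> comp.sum().loc['Klarna']
IQ 707
Name: Klarna, dtype: int64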
# **************************************************************************
# CONCATENATION: merging, joining (inner, outer, left, right)
# **************************************************************************
> import numpy as np
> import pandas as pd
> import string as st
> upChars = list(st.ascii_uppercase)
> loChars = list(st.ascii_lowercase)
> cols = upChars[:4]
> counter = 3
> O1 = {l: [l + str(index) for index in range(counter + 1)[-len(cols):]] for l in cols}
> counter += 4
> O2 = {l: [l + str(index) for index in range(counter + 1)[-len(cols):]] for l in cols}
> counter += 4
> O3 = {l: [l + str(index) for index in range(counter + 1)[-len(cols):]] for l in cols}
> getIndexes = lambda vals: [v[1:] for v in vals] # strip the letter, keep the digits
> df1 = pd.DataFrame(data = O1, index = getIndexes(O1[cols[0]]))
> df2 = pd.DataFrame(data = O2, index = getIndexes(O2[cols[0]]))
> df3 = pd.DataFrame(data = O3, index = getIndexes(O3[cols[0]]))
> df1
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
> pd.concat([df1, df2, df3]) # stack vertically, in list order
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7
8 A8 B8 C8 D8
9 A9 B9 C9 D9
10 A10 B10 C10 D10
11 A11 B11 C11 D11
> pd.concat([df1, df2, df3], axis = 1) # side-by-side, aligned on the index; each df covers only 4 of the 12 indexes (string indexes sort lexicographically, so '10' < '2')
A B C D A B C D A B C D
0 A0 B0 C0 D0 NaN NaN NaN NaN NaN NaN NaN NaN
1 A1 B1 C1 D1 NaN NaN NaN NaN NaN NaN NaN NaN
10 NaN NaN NaN NaN NaN NaN NaN NaN A10 B10 C10 D10
11 NaN NaN NaN NaN NaN NaN NaN NaN A11 B11 C11 D11
2 A2 B2 C2 D2 NaN NaN NaN NaN NaN NaN NaN NaN
3 A3 B3 C3 D3 NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN A4 B4 C4 D4 NaN NaN NaN NaN
5 NaN NaN NaN NaN A5 B5 C5 D5 NaN NaN NaN NaN
6 NaN NaN NaN NaN A6 B6 C6 D6 NaN NaN NaN NaN
7 NaN NaN NaN NaN A7 B7 C7 D7 NaN NaN NaN NaN
8 NaN NaN NaN NaN NaN NaN NaN NaN A8 B8 C8 D8
9 NaN NaN NaN NaN NaN NaN NaN NaN A9 B9 C9 D9
> df1 = pd.DataFrame({'A': [1] \
,'B': [1] \
,'C': [2]})
> df2 = pd.DataFrame({'D': [1, 0, 9] \
,'E': [1, 0, 0] \
,'C': [2, 3, 1]})
> df3 = pd.DataFrame({'F': ['a', 'b', 'c'] \
,'E': [1, 0, 1] \
,'C': [2, 3, 1]})
# Inner merging keeps only rows with matching key values
> pd.merge(df1, df2, how = 'inner', on = 'C') # how defaults to 'inner'
A B C D E
0 1 1 2 1 1
> pd.merge(df2, df3, on = ['C', 'E']) # merging on multiple keys
C D E F
0 2 1 1 a
1 3 0 0 b
# Outer merging on mutual keys does not exclude the non-matching rows
> pd.merge(df2, df3, how = 'outer', on = ['C', 'E'])
C D E F
0 2 1.0 1 a
1 3 0.0 0 b
2 1 9.0 0 NaN
3 1 NaN 1 c
# 'left'/'right' keep every row of the respective DS, matching or not
> pd.merge(df2, df3, how = 'left', on = ['C', 'E'])
C D E F
0 2 1 1 a
1 3 0 0 b
2 1 9 0 NaN
> pd.merge(df2, df3, how = 'right', on = ['C', 'E'])
C D E F
0 2 1.0 1 a
1 3 0.0 0 b
2 1 NaN 1 c
# Join defaults to a merge taking the **INDEX** values as keys
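# a minimal join sketch (the frames left/right here are assumed, not defined above)
> left = pd.DataFrame({'X': [1, 2]}, index = ['k0', 'k1'])
> right = pd.DataFrame({'Y': [3, 4]}, index = ['k0', 'k2'])
> left.join(right) # left join on the index by default
X Y
k0 1 3.0
k1 2 NaN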
# **************************************************************************
# OPERATIONS
# **************************************************************************
> import numpy as np
> import pandas as pd
> import string as st
> df = pd.DataFrame(data = {'A' : [_ + 1 for _ in range(4)] \
, 'B' : [444, 555, 666, 444] \
, 'C' : ['abc', 'def', 'ghi', 'xyz'] })
> df.head()
A B C
0 1 444 abc
1 2 555 def
2 3 666 ghi
3 4 444 xyz
> df['B'].unique()
array([444, 555, 666], dtype=int64)
> df['B'].nunique()
3
> df['B'].value_counts()
444 2
555 1
666 1
Name: B, dtype: int64
> df[(df['B'] % 2 == 0) & (df['A'] > 2)]
A B C
2 3 666 ghi
3 4 444 xyz
> f = lambda x: x**2
> df['A'].apply(f)
0 1
1 4
2 9
3 16
Name: A, dtype: int64
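# apply also works with built-ins (every string in C has 3 chars)
> df['C'].apply(len)
0 3
1 3
2 3
3 3
Name: C, dtype: int64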
> df.columns
Index(['A', 'B', 'C'], dtype='object')
> df.index
RangeIndex(start=0, stop=4, step=1)
> df.sort_values('B')
A B C
0 1 444 abc
3 4 444 xyz
1 2 555 def
2 3 666 ghi
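> df.sort_values('A', ascending = False) # descending order
A B C
3 4 444 xyz
2 3 666 ghi
1 2 555 def
0 1 444 abc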
> df.isnull()
A B C
0 False False False
1 False False False
2 False False False
3 False False False
# PIVOT TABLES
# hierarchical index on A, B (df reconstructed here from the output below)
> df = pd.DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] \
    , 'B': ['one', 'one', 'two', 'two', 'one', 'one'] \
    , 'C': ['x', 'y', 'x', 'y', 'x', 'y'] \
    , 'D': [1, 3, 2, 5, 4, 1]})
> df.pivot_table(values = 'D', index = ['A', 'B'], columns = ['C'])
C x y
A B
bar one 4.0 1.0
    two NaN 5.0
foo one 1.0 3.0
    two 2.0 NaN
# **************************************************************************
# IO
# **************************************************************************
> import pandas as pd
> df = pd.read_csv('data/data.csv')
> df
A B C D E
0 1 2 3 4 5
1 6 7 8 9 10
> df.to_csv('data/data_2.csv') # writes the index as an extra column unless index = False
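# Excel IO works analogously (assumes an engine such as xlrd/openpyxl is installed;
# the file and sheet names here are placeholders)
> df = pd.read_excel('data/data.xlsx', sheet_name = 'Sheet1')
> df.to_excel('data/data_2.xlsx', sheet_name = 'Sheet1')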