Last active
February 2, 2019 23:30
-
-
Save diogobaltazar/321dabdf5b00b4ef636196b1dc704ad3 to your computer and use it in GitHub Desktop.
pandas | series | dataframes | index hierarchy | GroupBy, aggregate functions | Merging, joining, concatenating | operations | IO
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
> lbls = ['a', 'b', 'c'] # or np.array | |
> data = [123, 423, 456] | |
> d = {} | |
> for i in np.arange(len(lbls)): | |
> d[lbls[i]] = data[i] | |
> pd.Series(data = data) | |
0 123 | |
1 423 | |
2 456 | |
dtype: int64 | |
> pd.Series(data = data, index = lbls) | |
a 123 | |
b 423 | |
c 456 | |
dtype: int64 | |
> s = pd.Series([sum]) # HOLDING FUNCTION inside a pd Series!!!! It can hold a set of transformations to exec | |
> s[0](np.arange(1, 101)) | |
5050 | |
>import numpy as np | |
> from numpy.random import randn | |
> import pandas as pd | |
> import string as st | |
> np.random.seed(101) # set the SEED to GEN DATA | |
> nRows = 2 | |
> nCols = 2 | |
> rows = st.ascii_lowercase[:nRows] | |
> cols = st.ascii_uppercase[nRows:nRows + nCols] | |
> pd.DataFrame(data = randn(nRows, nCols) \ | |
, index = [rows[_] for _ in np.arange(nRows)] \ | |
, columns = [cols[_] for _ in np.arange(nRows)]) | |
> df # Columns and rows are Series!!! | |
C D | |
a 2.706850 0.628133 | |
b 0.907969 0.503826 | |
> df[[cols[_] for _ in np.arange(nRows)]] | |
C D | |
a 2.706850 0.628133 | |
b 0.907969 0.503826 | |
> df['E'] = '' # ADD COLUMN | |
> df | |
C D E | |
a 2.706850 0.628133 | |
b 0.907969 0.503826 | |
> df.drop('a', inplace = True) # axis 0 by default | |
> df.drop('E', axis=1, inplace = True) # inplace to affect the DS | |
> df | |
C D | |
b 0.907969 0.503826 | |
> df.loc['b'] | |
C 0.907969 | |
D 0.503826 | |
E | |
Name: b, dtype: object | |
> df.iloc[1] | |
C 0.907969 | |
D 0.503826 | |
E | |
Name: b, dtype: object | |
> df.loc['a', 'C'] | |
2.706849839399938 | |
> df.loc[['a', 'b'], ['C', 'D']] | |
C D | |
a 2.706850 0.628133 | |
b 0.907969 0.503826 | |
> df > 0 | |
C D E | |
a True True True | |
b True True True | |
> f = df > 0 # save function in variable?! not exactly, there are no args | |
> df[f] # for nRows = nCols = 3 | |
D E F | |
a 2.706850 0.628133 0.907969 | |
b 0.503826 0.651118 NaN | |
c NaN 0.605965 NaN | |
> df['D'] > 0 # a column with a comparison operator returns a column of boolean values | |
a True | |
b True | |
c False | |
Name: D, dtype: bool | |
> df['D'][df['D'] > 0] # passing a col of bool values, filters only the rows whose index matches the indexes of True bool values | |
a 2.706850 | |
b 0.503826 | |
Name: D, dtype: float64 | |
> df[df['D'] > 0] | |
D E F | |
a 2.706850 0.628133 0.907969 | |
b 0.503826 0.651118 -0.319318 | |
> df[(df['D'] > 0) & (df['D'] < 2)] # element-wise logical AND operator '&'!!! (op 'and' expects two booleans, not two Series of booleans) | |
D E F | |
b 0.503826 0.651118 -0.319318 | |
> df.reset_index() # resetting the index to int values | |
index D E F | |
0 a 2.706850 0.628133 0.907969 | |
1 b 0.503826 0.651118 -0.319318 | |
2 c -0.848077 0.605965 -2.018168 | |
> df['G'] = 'A B C'.split() # needs to match shape | |
> df | |
D E F G | |
a 2.706850 0.628133 0.907969 A | |
b 0.503826 0.651118 -0.319318 B | |
c -0.848077 0.605965 -2.018168 C | |
> df.set_index('G') | |
G D E F | |
A 2.706850 0.628133 0.907969 | |
B 0.503826 0.651118 -0.319318 | |
C -0.848077 0.605965 -2.018168 | |
# ************************************************************************** | |
# INDEX HIERARCHY | |
# ************************************************************************** | |
> import numpy as np | |
> import pandas as pd | |
> A = 'G1 G1 G1 G2 G2 G2'.split() | |
> B = [1, 2, 3, 1, 2, 3] | |
> Z = pd.MultiIndex.from_tuples(list(zip(A, B))) | |
> df = pd.DataFrame(np.random.randn(6, 2), Z, ['A', 'B']) | |
> df.index.names = ['I1', 'I2'] | |
> df | |
I1 I2 A B | |
G1 1 2.302987 0.729024 | |
2 -0.863091 0.305632 | |
3 0.243178 0.864165 | |
G2 1 -1.560931 -0.251897 | |
2 -0.578120 0.236996 | |
3 0.200780 0.327845 | |
> df.loc['G1'].loc[1]['B'] | |
0.187125 | |
# CROSS SECTIONS (XS) | |
> df.xs('G1') # or df.loc['G1'] | |
I2 A B | |
1 1.062710 1.752014 | |
2 0.695547 0.153661 | |
3 0.167638 -0.765930 | |
> df.xs(1, level = 'I2') | |
I1 A B | |
G1 -0.447898 0.266207 | |
G2 0.846904 -0.298436 | |
> dict = {'A': [1, 2, np.nan], 'B': [5, np.nan, np.nan], 'C': [1, 2, 3]} | |
> df = pd.DataFrame(dict) | |
> df.dropna() # drop all rows with NaN values | |
A B C | |
0 1.0 5.0 1 | |
> df.dropna(axis = 1) # drop all cols with NaN values | |
> df.fillna(value = 'NO_DATA') # replace NaN | |
> import numpy as np | |
> import pandas as pd | |
> dict = {'COMPANY': ['Palantir', 'Airbus', 'Klarna', 'Klarna']\ | |
, 'PERSON': ['P1', 'P2', 'P3', 'Diogo']\ | |
, 'IQ': [101, 202, 303, 404]} | |
> df = pd.DataFrame(dict) | |
> comp = df.groupby('COMPANY') | |
> comp.mean() # ignores non-numeric columns | |
COMPANY IQ | |
Airbus 202.0 | |
Klarna 353.5 | |
Palantir 101.0 | |
> comp.describe() # data for numeric columns | |
IQ | |
COMPANY count mean std min 25% 50% 75% max | |
Airbus 1.0 202.0 NaN 202.0 202.00 202.0 202.00 202.0 | |
Klarna 2.0 353.5 71.417785 303.0 328.25 353.5 378.75 404.0 | |
Palantir 1.0 101.0 NaN 101.0 101.00 101.0 101.00 101.0 | |
# ************************************************************************** | |
# CONCATENATION: merging, joining (inner, outer, left, right) | |
# ************************************************************************** | |
> import numpy as np | |
> import pandas as pd | |
> import string as st | |
> upChars = list(st.ascii_uppercase) | |
> loChars = list(st.ascii_lowercase) | |
> cols = upChars[:4] | |
> counter = 3 | |
> O1 = {l: [str(l) + str(index) for index in range(counter + 1)[-len(cols):]] for _, l in enumerate(cols)} | |
counter += 4 | |
> O2 = {l: [str(l) + str(index) for index in range(counter + 1)[-len(cols):]] for _, l in enumerate(cols)} | |
counter += 4 | |
> O3 = {l: [str(l) + str(index) for index in range(counter + 1)[-len(cols):]] for _, l in enumerate(cols)} | |
> getIndexes = lambda X: list(map(lambda X: X[1:], X)) | |
> df1 = pd.DataFrame(data = O1, index = getIndexes(O1[cols[0]])) | |
> df2 = pd.DataFrame(data = O2, index = getIndexes(O2[cols[0]])) | |
> df3 = pd.DataFrame(data = O3, index = getIndexes(O3[cols[0]])) | |
> df1 | |
A B C D | |
0 A0 B0 C0 D0 | |
1 A1 B1 C1 D1 | |
2 A2 B2 C2 D2 | |
3 A3 B3 C3 D3 | |
> pd.concat([df3, df2, df1]) | |
A B C D | |
0 A0 B0 C0 D0 | |
1 A1 B1 C1 D1 | |
2 A2 B2 C2 D2 | |
3 A3 B3 C3 D3 | |
4 A4 B4 C4 D4 | |
5 A5 B5 C5 D5 | |
6 A6 B6 C6 D6 | |
7 A7 B7 C7 D7 | |
8 A8 B8 C8 D8 | |
9 A9 B9 C9 D9 | |
10 A10 B10 C10 D10 | |
11 A11 B11 C11 D11 | |
> pd.concat([df3, df2, df1], axis = 1) # concatenating along columns, aligning on index (df1 has no values for indexes 4 - 11) | |
A B C D A B C D A B C D | |
0 A0 B0 C0 D0 NaN NaN NaN NaN NaN NaN NaN NaN | |
1 A1 B1 C1 D1 NaN NaN NaN NaN NaN NaN NaN NaN | |
10 NaN NaN NaN NaN NaN NaN NaN NaN A10 B10 C10 D10 | |
11 NaN NaN NaN NaN NaN NaN NaN NaN A11 B11 C11 D11 | |
2 A2 B2 C2 D2 NaN NaN NaN NaN NaN NaN NaN NaN | |
3 A3 B3 C3 D3 NaN NaN NaN NaN NaN NaN NaN NaN | |
4 NaN NaN NaN NaN A4 B4 C4 D4 NaN NaN NaN NaN | |
5 NaN NaN NaN NaN A5 B5 C5 D5 NaN NaN NaN NaN | |
6 NaN NaN NaN NaN A6 B6 C6 D6 NaN NaN NaN NaN | |
7 NaN NaN NaN NaN A7 B7 C7 D7 NaN NaN NaN NaN | |
8 NaN NaN NaN NaN NaN NaN NaN NaN A8 B8 C8 D8 | |
9 NaN NaN NaN NaN NaN NaN NaN NaN A9 B9 C9 D9 | |
> df1 = pd.DataFrame({'A': [1] \ | |
,'B': [1] \ | |
,'C': [2]}) | |
> df2 = pd.DataFrame({'D': [1, 0, 9] \ | |
,'E': [1, 0, 0] \ | |
,'C': [2, 3, 1]}) | |
> df3 = pd.DataFrame({'F': ['a', 'b', 'c'] \ | |
,'E': [1, 0, 1] \ | |
,'C': [2, 3, 1]}) | |
# Inner merging merges on mutual key values | |
> pd.merge(df2, df3, how = 'inner', on = ['C', 'E']) # defaults to 'inner' | |
C D E F | |
0 2 1 1 a | |
1 3 0 0 b | |
> pd.merge(df2, df3, on = ['E', 'C']) | |
C D E F | |
0 2 1 1 a | |
1 3 0 0 b | |
# Outer merging on mutual keys does not exclude the non-matching rows | |
> pd.merge(df2, df3, how = 'outer', on = ['C', 'E']) | |
C D E F | |
0 2 1.0 1 a | |
1 3 0.0 0 b | |
2 1 9.0 0 NaN | |
3 1 NaN 1 c | |
# Non-matching rows are from either of the DS, named 'left' and 'right' DSs | |
> pd.merge(df2, df3, how = 'left', on = ['C', 'E']) | |
C D E F | |
0 2 1 1 a | |
1 3 0 0 b | |
2 1 9 0 NaN | |
> pd.merge(df2, df3, how = 'right', on = ['C', 'E']) | |
C D E F | |
0 2 1.0 1 a | |
1 3 0.0 0 b | |
2 1 NaN 1 c | |
# Join defaults to a merge taking the **INDEX** values as keys | |
# ************************************************************************** | |
# OPERATIONS | |
# ************************************************************************** | |
> import numpy as np | |
> import pandas as pd | |
> import string as st | |
> df = pd.DataFrame(data = {'A' : [_ + 1 for _ in range(4)] \ | |
, 'B' : [444, 555, 666, 444] \ | |
, 'C' : ['abc', 'def', 'ghi', 'xyz'] }) | |
> df.head() | |
A B C | |
0 1 444 abc | |
1 2 555 def | |
2 3 666 ghi | |
3 4 444 xyz | |
> df['B'].unique() | |
array([444, 555, 666], dtype=int64) | |
> df['B'].nunique() | |
3 | |
> df['B'].value_counts() | |
444 2 | |
555 1 | |
666 1 | |
Name: B, dtype: int64 | |
> df[(df['B'] % 2 == 0) & (df['A'] > 2)] | |
A B C | |
2 3 666 ghi | |
> f = lambda x: x**2 | |
> df['A'].apply(f) | |
0 1 | |
1 4 | |
2 9 | |
3 16 | |
Name: A, dtype: int64 | |
3 4 444 xyz | |
> df.columns | |
Index(['A', 'B', 'C'], dtype='object') | |
> df.index | |
RangeIndex(start=0, stop=4, step=1) | |
> df.sort_values('B') | |
A B C | |
0 1 444 abc | |
3 4 444 xyz | |
1 2 555 def | |
2 3 666 ghi | |
> df.isnull() | |
A B C | |
0 False False False | |
1 False False False | |
2 False False False | |
3 False False False | |
# PIVOT TABLES | |
# hierarchy index on A, B | |
> df.pivot_table(values = 'D', index = ['A', 'B'], columns = ['C']) | |
C | |
A B x y | |
bar one 4.0 1.0 | |
two NaN 5.0 | |
foo one 1.0 3.0 | |
two 2.0 NaN | |
# ************************************************************************** | |
# IO | |
# ************************************************************************** | |
> import pandas as pd | |
> df = pd.read_csv('data/data.csv') | |
> df | |
A B C D E | |
0 1 2 3 4 5 | |
1 6 7 8 9 10 | |
> df.to_csv('data/data_2.csv') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment