Last active
June 26, 2019 13:05
-
-
Save prerakmody/7b4d852d0f9d82225298e611b351615a to your computer and use it in GitHub Desktop.
Pandas Hacks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pdb  # pdb.set_trace()

import numpy as np
import pandas as pd
from IPython.display import display
# 1. Kick Off
# Load the CSV and take a first look at its contents.
df = pd.read_csv('myfile.csv')
print(df.head())
print(df.head(n=10))
print(df.columns)
col_Y = ''  # presumably a placeholder for the target column's name
# Show the distinct values found in every column.
for col in df.columns:
    print(' - Col : ', col, ' || Unique Vals : ', df[col].unique())
display(df)  # `display` is imported from IPython.display
## 2. Handling nan values
# Inspect the missing values first: dropping them up front would make every
# report below come out empty.
# Positional indices of the rows holding at least one NaN. `axis` is passed by
# keyword and the mask goes through `.values`, because Series.nonzero() no
# longer exists in pandas.
nanidxs = df.isnull().any(axis=1).values.nonzero()[0]
print('Total NaN vals : ', len(df) - len(df.dropna()), ' || NaN Idxs : ', nanidxs)
print('Data with NaN vals : ', df.iloc[nanidxs])
# Per-column count of rows with a missing value (only columns that have any).
for col in df.columns:
    tmp_len = int(df[col].isnull().sum())
    if tmp_len:
        print('Col : ', col, ' || NaN Rows :', tmp_len)
# True when the frame contains at least one NaN anywhere.
bool_NaN = df.isnull().values.any()
# Finally remove every row that contains a NaN.
df.dropna(inplace=True)
## 3. Dropping rows/cols
# Drop rows by value before dropping the column they are matched on: once
# 'col1' is gone, the df['col1'] lookup raises KeyError.
df.drop(df[df['col1'] == 'col1_value'].index, axis=0, inplace=True)
df.drop('col1', axis=1, inplace=True)
# 4. Studying columns | |
def getCounter(df, col):
    """Print the (value, count) pairs of one column.

    Pairs come out in the order produced by ``Series.value_counts()``.

    Args:
        df: DataFrame to inspect.
        col: Label of the column whose values are counted.
    """
    counts = df[col].value_counts()
    print(' --> ', list(zip(counts.keys(), counts.tolist())))
getCounter(df, 'col1')  # column label is passed as a string, as everywhere else in this file
## 5. Convert to matrix
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` gives the
# underlying NumPy array.
data = df.values
X = data[:, :-1]  # for ML purposes: every column except the last
Y = data[:, -1]   # for ML purposes: the last column
## 6. GroupBy commands
# Number of non-null 'col2' entries per 'col1' group.
stats1 = df.groupby(['col1'])['col2'].count()
# Sum and count of 'col2' per 'col1' group, largest count first.
# NOTE(review): the original called .agg([...]) on the bare Series, which
# yields a Series that cannot be sorted by a 'count' column; grouping by
# 'col1' is the presumed intent -- confirm.
df.groupby('col1')['col2'].agg(['sum', 'count']).sort_values('count', ascending=False)
# Different aggregation per column, grouped on two keys.
df.groupby(['col1', 'col2']).agg({'col3': 'count', 'col4': 'sum'})
# The 10 most frequent 'col1' values.
df.groupby('col1')['col1'].agg(['count']).nlargest(10, 'count')
# Count of 'col3' per (col1, col2) pair, largest first.
df.groupby(['col1', 'col2']).agg({'col3': 'count'}).sort_values('col3', ascending=False)
## 7. Datetime
# Parse the column into datetimes, then extract the hour component.
df['col1'] = pd.to_datetime(df['col1'])
df['hrs'] = df['col1'].dt.hour
## 8. Categorical/One hot encoding
# Replace each category by its integer code.
df['col1'] = df['col1'].astype("category").cat.codes
# Store the column as a pandas Categorical, then one-hot encode it.
df['col1'] = pd.Categorical(df['col1'])
df1 = pd.get_dummies(df['col1'], prefix='col1')
print(' - Col :', 'col1', ' || Extra cols added : ', len(df1.columns))
df = pd.concat([df, df1], axis=1)
# Flag the rows whose 'col1' value is NOT one of the values in `some_list`.
# NOTE(review): `some_list` is not defined anywhere in this file; it has to be
# supplied before this snippet runs.
col1_unique = np.array(df['col1'].unique().tolist())
# Test the unique values against some_list (not the other way round) so the
# mask lines up with col1_unique.
col1_not = col1_unique[~np.isin(col1_unique, some_list)]
df['col1_none'] = np.where(np.isin(df['col1'], col1_not), 1, 0)
## 9. Finding/Indexing
# Binary indicator: 1 where 'col1' equals 'col1_value', otherwise 0.
df['Y'] = np.where(df['col1'] == 'col1_value', 1, 0)
df['col2'] = df['col1'].copy()
# Index labels of the rows whose 'col2' equals 'val2'.
idxs = df[df['col2'] == 'val2'].index.tolist()
# Overwrite only those rows with 'col1' scaled by `some_var`.
# NOTE(review): `some_var` is not defined anywhere in this file; it has to be
# supplied before this snippet runs.
df.loc[idxs, 'col2'] = df.loc[idxs, 'col1'] * some_var
## 10. Misc
# Correlate only numeric/bool columns: recent pandas no longer silently skips
# non-numeric columns in DataFrame.corr() and raises on them instead.
numeric_df = df.select_dtypes(include=['number', 'bool'])
numeric_df.corr()
import seaborn as sns
sns.heatmap(numeric_df.corr(), cmap=sns.cm.vlag)
## 11. Rearranging Columns
# Move the 'Y' column to the last position.
cols = df.columns.tolist()
cols.remove('Y')  # raises ValueError when there is no 'Y' column
df = df[cols + ['Y']]
## 12. Writing to a .csv
# Build a single-column frame of seven 1s; note this rebinds `df`.
tmp = np.array([1, 1, 1, 1, 1, 1, 1]).reshape(-1, 1)
df = pd.DataFrame(tmp)
# Write the bare values: no index column and no header row.
df.to_csv('file.csv', sep=',', index=False, header=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment