Last active
July 21, 2016 09:28
-
-
Save fabsta/cfd79f0b453b6310d53ce1160548905d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[TOC] | |
# numerical value
# Impute missing ages with the column mean (NaN entries are skipped when
# computing the mean, which is also the pandas default).
age_mean = df['Age'].mean(skipna=True)
df['Age'] = df['Age'].fillna(value=age_mean)
# categorical value | |
from scipy.stats import mode | |
# Most frequent category. Series.mode() ignores missing values and works on
# string columns; scipy.stats.mode no longer accepts non-numeric data and, with
# its current defaults, returns a scalar that cannot be indexed with [0][0].
embarked_modes = df['Embarked'].mode()
# An all-missing column has no mode; in that case leave the column untouched.
mode_embarked = embarked_modes.iloc[0] if not embarked_modes.empty else None
if mode_embarked is not None:
    df['Embarked'] = df['Embarked'].fillna(mode_embarked)
# missing values (replace)
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form acts on an intermediate object and does not
# reliably update the parent DataFrame (it never does under Copy-on-Write).
train_file['Name'] = train_file['Name'].fillna("Nameless")
#####
# drop
#####
# Each call below returns a new object; the original is only replaced where
# the result is explicitly assigned back.

# dataframe
df.fillna(value=0)  # Fill in missing data with zeros
df.drop_duplicates(keep='first')  # Drop duplicates (first occurrence is kept)

# column
data.drop_duplicates(subset=['k1'])  # duplicate in column k1 only
df.dropna(axis='columns', how='all')  # Drop columns that only contain missing values
# Keep only the columns whose name does not start with 'pre' (case-insensitive)
cols = [col_name for col_name in df.columns
        if not col_name.lower().startswith('pre')]
df = df[cols]

# rows
df_no_missing = df.dropna()  # Drop every row that has at least one missing value
df.dropna(thresh=5)  # Keep rows with at least five non-missing values
df.drop(index=['Cochice', 'Pima'])  # Drop rows by index label
df_cleaned = df.dropna(how='all')  # Drop rows where every cell is NA
df = df[df.name.ne('Tina')]  # Drop rows whose 'name' equals a certain value
df.drop(columns='reports')  # Drop a column
# NOTE(review): the next two lines use R syntax (`<-`, `c()`, `$`, `levels()`),
# unlike the pandas (Python) snippets around them -- confirm which language
# this file is meant to target before running it as a script.
# Assign a character vector of four city names to `cities`.
cities <- c("Adelaide", "Brisbane", "Canberra", "Darwin") | |
# Call levels() on the `location` column of `ds` (presumably a factor -- TODO confirm).
levels(ds$location) | |
# fill in missing data
# Assign the filled column back rather than calling fillna(inplace=True) on
# df[...]: the in-place call works on an intermediate Series, triggers a
# chained-assignment warning in recent pandas, and leaves df unchanged under
# Copy-on-Write.
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
# mean by other column category: transform("mean") yields a Series aligned with
# df's index holding each row's per-"sex" group mean, used as the fill value.
df["postTestScore"] = df["postTestScore"].fillna(
    df.groupby("sex")["postTestScore"].transform("mean")
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment