fabsta · July 21, 2016 09:28
diff --git a/4. cleaning - missing data,duplicates,dropping (python data science).py b/4. cleaning - missing data,duplicates,dropping (python data science).py

 [TOC]

 # numerical column: replace missing ages with the mean of the observed ages
 age_mean = df['Age'].mean()
 df['Age'] = df['Age'].fillna(value=age_mean)

 # categorical value
 from scipy.stats import mode
 # Use pandas' own Series.mode(): it works on string columns and ignores NaN.
 # scipy.stats.mode only accepts numeric data in recent SciPy releases and
 # returns a scalar for 1-D input, so the old `mode(...)[0][0]` breaks there.
 # Series.mode() returns the modes sorted; take the first one.
 mode_embarked = df['Embarked'].mode().iloc[0]
 df['Embarked'] = df['Embarked'].fillna(mode_embarked)

 # missing values (replace)
 # Assign the result back instead of calling fillna(inplace=True) on the
 # selected column: the in-place form is chained assignment, which warns in
 # pandas 2.x and leaves train_file unchanged under Copy-on-Write.
 train_file['Name'] = train_file['Name'].fillna("Nameless")



 #####
 # drop
 #####

 # whole-DataFrame operations (each call returns a new DataFrame)
 df.fillna(value=0)  # replace every missing value with 0
 df.drop_duplicates(keep='first')  # remove repeated rows, keeping the first occurrence

 # column-level operations
 data.drop_duplicates(subset=['k1'])  # rows are duplicates when their k1 value repeats
 df.dropna(axis='columns', how='all')  # drop columns that contain only missing values
 # keep only the columns whose lower-cased name does not start with 'pre'
 cols = [col_name for col_name in df.columns if not col_name.lower().startswith('pre')]
 df = df[cols]


 # row-level operations
 df_no_missing = df.dropna(axis=0)  # drop every row that has at least one missing value
 df.dropna(axis=0, thresh=5)  # keep only rows with at least five non-missing values
 df.drop(labels=['Cochice', 'Pima'], axis=0)  # drop rows by index label
 df_cleaned = df.dropna(axis=0, how='all')  # drop rows in which every cell is NA
 df = df.loc[df.name != 'Tina']  # drop rows whose `name` value is 'Tina'
 df.drop(labels='reports', axis=1)  # drop a column (axis=1), shown next to the row form


 # NOTE(review): the next two lines are R, not Python/pandas.
 # `c(...)` builds a character vector of city names and assigns it to `cities`.
 cities <- c("Adelaide", "Brisbane", "Canberra", "Darwin")
 # Lists the levels of `ds$location` (presumably a factor column; `ds` is not defined in this file).
 levels(ds$location)




 # fill in missing data
 # Results are assigned back rather than using fillna(inplace=True) on a
 # selected column: the in-place form is chained assignment, which warns in
 # pandas 2.x and leaves df unchanged under Copy-on-Write.
 df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
 # mean by other column category: each gap gets the mean postTestScore of its `sex` group
 df["postTestScore"] = df["postTestScore"].fillna(df.groupby("sex")["postTestScore"].transform("mean"))

	[TOC]

	# numerical column: replace missing ages with the mean of the observed ages
	age_mean = df['Age'].mean()
	df['Age'] = df['Age'].fillna(value=age_mean)

	# categorical value
	from scipy.stats import mode
	# Use pandas' own Series.mode(): it works on string columns and ignores NaN.
	# scipy.stats.mode only accepts numeric data in recent SciPy releases and
	# returns a scalar for 1-D input, so the old `mode(...)[0][0]` breaks there.
	# Series.mode() returns the modes sorted; take the first one.
	mode_embarked = df['Embarked'].mode().iloc[0]
	df['Embarked'] = df['Embarked'].fillna(mode_embarked)

	# missing values (replace)
	# Assign the result back instead of calling fillna(inplace=True) on the
	# selected column: the in-place form is chained assignment, which warns in
	# pandas 2.x and leaves train_file unchanged under Copy-on-Write.
	train_file['Name'] = train_file['Name'].fillna("Nameless")



	#####
	# drop
	#####

	# whole-DataFrame operations (each call returns a new DataFrame)
	df.fillna(value=0)  # replace every missing value with 0
	df.drop_duplicates(keep='first')  # remove repeated rows, keeping the first occurrence

	# column-level operations
	data.drop_duplicates(subset=['k1'])  # rows are duplicates when their k1 value repeats
	df.dropna(axis='columns', how='all')  # drop columns that contain only missing values
	# keep only the columns whose lower-cased name does not start with 'pre'
	cols = [col_name for col_name in df.columns if not col_name.lower().startswith('pre')]
	df = df[cols]


	# row-level operations
	df_no_missing = df.dropna(axis=0)  # drop every row that has at least one missing value
	df.dropna(axis=0, thresh=5)  # keep only rows with at least five non-missing values
	df.drop(labels=['Cochice', 'Pima'], axis=0)  # drop rows by index label
	df_cleaned = df.dropna(axis=0, how='all')  # drop rows in which every cell is NA
	df = df.loc[df.name != 'Tina']  # drop rows whose `name` value is 'Tina'
	df.drop(labels='reports', axis=1)  # drop a column (axis=1), shown next to the row form


	# NOTE(review): the next two lines are R, not Python/pandas.
	# `c(...)` builds a character vector of city names and assigns it to `cities`.
	cities <- c("Adelaide", "Brisbane", "Canberra", "Darwin")
	# Lists the levels of `ds$location` (presumably a factor column; `ds` is not defined in this file).
	levels(ds$location)




	# fill in missing data
	# Results are assigned back rather than using fillna(inplace=True) on a
	# selected column: the in-place form is chained assignment, which warns in
	# pandas 2.x and leaves df unchanged under Copy-on-Write.
	df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
	# mean by other column category: each gap gets the mean postTestScore of its `sex` group
	df["postTestScore"] = df["postTestScore"].fillna(df.groupby("sex")["postTestScore"].transform("mean"))