for col, tp in nullcols:
    if (tp == bool):
        dfX.loc[dfX[col]==True, col] = 'TRUE'
        dfX.loc[dfX[col]==False, col] = 'FALSE'
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'
# Find columns with Null values
nullcols = []
for col in dfX.columns:
    nbnull = (dfX[col].isnull()*1).sum()
    if (nbnull > 0):
        tp = type(dfX[dfX[col].notnull()][col].iat[0])  # type of first non-null value
        nullcols.append([col, tp])
        print(col, nbnull, tp)
# Output
import pandas as pd
dfX = pd.read_csv('PUMP_training_set_values.csv')  # predictive variables
dfY = pd.read_csv('PUMP_training_set_labels.csv')  # target variable
# bounds of min/max latitude/longitude/height valid for Tanzania
bound_df = dfX[(dfX['latitude']<-0.5)&(dfX['longitude']>25)&(dfX['gps_height']>0)]
# mean of geographical data in each bucket
mean_geo_df = bound_df.groupby('basin')[['latitude','longitude','gps_height']].mean()
assert(mean_geo_df.shape[0] == len(dfX['basin'].unique()))
# Out[31]: mean_geo_df
#           latitude  longitude  gps_height
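The per-basin means above can then be used to impute the rows whose coordinates fall outside those bounds. A minimal sketch of that step, on made-up toy data (the placeholder-coordinate handling here is my own; only the column names come from the dataset):

```python
import pandas as pd

# Toy frame mimicking the pump data: the second row has placeholder coordinates.
dfX = pd.DataFrame({
    'basin': ['Lake Victoria', 'Lake Victoria', 'Rufiji'],
    'latitude': [-2.0, -2e-08, -8.0],
    'longitude': [33.0, 0.0, 38.0],
    'gps_height': [1200.0, 0.0, 300.0],
})

# Same bounds as in the post: rows inside them are trusted
valid = (dfX['latitude'] < -0.5) & (dfX['longitude'] > 25) & (dfX['gps_height'] > 0)
mean_geo_df = dfX[valid].groupby('basin')[['latitude', 'longitude', 'gps_height']].mean()

# Replace out-of-bound values with the mean of their basin bucket
for col in ['latitude', 'longitude', 'gps_height']:
    dfX.loc[~valid, col] = dfX.loc[~valid, 'basin'].map(mean_geo_df[col])
```

`Series.map` with a Series argument looks each basin name up in the bucket-means index, so the assignment stays aligned by row label.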
# Split date_recorded stamp (string) into year, month, day of month, day of week features
from dateutil import parser
dfX['date_recorded_year'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[0]))
dates.append('date_recorded_year')
dfX['date_recorded_month'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[1]))
dates.append('date_recorded_month')
# WARNING: probably not useful for this dataset
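The comment above also mentions day-of-month and day-of-week features, which the snippet does not show. A sketch of those two extractions in the same style (column names are my own choice; `dateutil.parser` is the import already used above):

```python
from dateutil import parser
import pandas as pd

dfX = pd.DataFrame({'date_recorded': ['2013-02-04', '2011-07-13']})  # toy data
dates = []

# Day of month can be read straight off the YYYY-MM-DD string
dfX['date_recorded_day'] = dfX['date_recorded'].apply(lambda x: int(x.split('-')[2]))
dates.append('date_recorded_day')

# Day of week (Monday=0) needs real date parsing, not string splitting
dfX['date_recorded_weekday'] = dfX['date_recorded'].apply(lambda x: parser.parse(x).weekday())
dates.append('date_recorded_weekday')
```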
# before filling the nulls, keep track of them
dfX['construction_year_missing'] = (dfX['construction_year']==0)*1
dates.append('construction_year_missing')  # list of date-related fields
# to fill missing dates, one can use: mean, median, or oldest
mean_year = dfX[dfX['construction_year']>0]['construction_year'].mean()
dfX.loc[dfX['construction_year']==0, 'construction_year'] = int(mean_year)
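The median and oldest-year alternatives mentioned in the comment are one-line swaps. A toy illustration (the year values are made up):

```python
import pandas as pd

dfX = pd.DataFrame({'construction_year': [0, 1995, 2005, 2010, 0]})
known = dfX[dfX['construction_year'] > 0]['construction_year']

median_year = int(known.median())  # robust to skew from a few very old pumps
oldest_year = int(known.min())     # pessimistic: treat unknown as oldest

dfX.loc[dfX['construction_year'] == 0, 'construction_year'] = median_year
```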
for col, tp in nullcols:
    if (tp == str):
        dfX.loc[dfX[col].isnull(), col] = 'MISSING'
# Search for pairs of variables that are very similar
def show_similars(cols, threshold=0.90):
    for i1, col1 in enumerate(cols):
        for i2, col2 in enumerate(cols):
            if (i1 < i2):
                cm12 = pd.crosstab(dfX[col1], dfX[col2]).values  # contingency table
                cv12 = cramers_corrected_stat(cm12)  # Cramer's V statistic
                if (cv12 > threshold):
                    print((col1, col2), int(cv12*100))
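`cramers_corrected_stat` is not shown in the snippet. A common implementation is the bias-corrected Cramér's V of Bergsma (2013), sketched here with `scipy.stats.chi2_contingency`; this is my reconstruction under that assumption, not necessarily the author's exact function:

```python
import numpy as np
from scipy.stats import chi2_contingency

def cramers_corrected_stat(confusion_matrix):
    """Bias-corrected Cramér's V for a contingency table (Bergsma 2013)."""
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # subtract the bias term, floored at zero
    phi2corr = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))
    rcorr = r - (r - 1) ** 2 / (n - 1)
    kcorr = k - (k - 1) ** 2 / (n - 1)
    return np.sqrt(phi2corr / max(1e-12, min(kcorr - 1, rcorr - 1)))
```

A perfectly associated table scores near 1; an independent one scores 0, which is what `show_similars` relies on when comparing against the threshold.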
for col in categories:
    cs = dfX[col].value_counts(normalize=False, sort=True, ascending=False)
    rare_values = [k for k in cs.keys() if cs[k] < 40]  # threshold = 40 occurrences
    if len(rare_values) > 0:
        print('Trim values : ', col, len(rare_values))
        dfX.loc[dfX[col].isin(rare_values), col] = col + '_rare'
# Output :
# Trim values : funder 1730
# Trim values : installer 1982
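On a toy column the trimming behaves like this (threshold lowered to 2 occurrences so the example stays small; the category values are made up):

```python
import pandas as pd

dfX = pd.DataFrame({'funder': ['Gov', 'Gov', 'Gov', 'Unicef', 'SomeNGO']})
cs = dfX['funder'].value_counts()
rare_values = [k for k in cs.keys() if cs[k] < 2]  # threshold = 2 for the toy data
dfX.loc[dfX['funder'].isin(rare_values), 'funder'] = 'funder_rare'
```

Collapsing the long tail into one `<col>_rare` bucket keeps the later one-hot encoding from exploding into thousands of near-empty columns.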
exports.handler = (event, context, callback) => {
    var csd = new AWS.CloudSearchDomain({
        endpoint: CS_NAME + '.' + SERVICES_REGION + '.cloudsearch.amazonaws.com',
        apiVersion: '2013-01-01'
    });
    var params = {
        query: event.query,
        sort: '_score desc',