notha99y’s gists

notha99y / pearson_cc_heatmap.py

Created December 30, 2019 08:35

	# Plot an annotated, square-celled heatmap of corr_df.corr() using a
	# diverging palette, on a 14x12 figure.
	colormap = sns.diverging_palette(10, 220, as_cmap=True)

	plt.figure(figsize=(14, 12))
	plt.title('Pearson Correlation of Features', size=15)
	sns.heatmap(
	    corr_df.corr(),
	    cmap=colormap,
	    vmax=1.0,
	    square=True,
	    annot=True,
	    annot_kws={'fontsize': 12},
	    linewidths=0.1,
	    linecolor='white',
	)
	plt.show()

notha99y / hierarchical_clustering_titanic.py

Created December 21, 2018 10:47

	from scipy.cluster.hierarchy import linkage
	from scipy.cluster.hierarchy import dendrogram
	# Split train_df (with its 'Survived' column as the target) using
	# test_size=0.05 and a fixed random_state so the split is repeatable.
	sample_train, sample_val, gt_train, gt_val = train_test_split(
	    train_df,
	    train_df['Survived'],
	    test_size=0.05,
	    random_state=99,
	)

	# Preprocess the held-out sample, scale it, then compute the linkage
	# (method='complete') over the scaled rows.
	sample_val_processed = scaler.fit_transform(
	    simple_preprocessing(sample_val, train=False)
	)
	mergings = linkage(sample_val_processed, method='complete')

notha99y / simple_preprocessing_titanic.py

Created December 21, 2018 10:25

	def simple_preprocessing(dataframe, train=True):
	le = LabelEncoder()
	X = dataframe.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], axis=1)
	X['Age'] = X['Age'].fillna(value=X['Age'].mode()[0])
	X['Embarked'] = le.fit_transform(X['Embarked'].fillna(value=X['Embarked'].mode()[0]))
	X['Sex'] = np.where(X['Sex'] == 'male', 1, 0)

	if train:
	X = X.drop(['Survived'], axis=1)
	y = np.where(dataframe['Survived'] == 1, 'Alive', 'Dead')

notha99y / xgb_clf.py

Created December 19, 2018 09:03

	from xgboost import XGBClassifier

	# Fit an XGBoost classifier and predict on the validation features.
	# np.argmax(..., axis=1) reduces each row of y_train to the index of its
	# largest entry -- presumably y_train is one-hot encoded; TODO confirm.
	xgb_clf = XGBClassifier(
	    max_depth=12,
	    learning_rate=1e-4,
	    n_estimators=500,
	)
	xgb_clf.fit(
	    X_train,
	    np.argmax(np.array(y_train), axis=1),
	)
	xgb_y_pred = xgb_clf.predict(X_val)

	# Horizontal bar chart of the 12 largest feature importances; the y axis
	# is inverted so the largest value is drawn at the top.
	(
	    pd.Series(xgb_clf.feature_importances_, index=X_train.columns)
	    .nlargest(12)
	    .plot(
	        kind='barh',
	        figsize=(10, 10),
	        title='Feature importance from XGBoost',
	    )
	    .invert_yaxis()
	)

notha99y / rf_clf.py

Created December 19, 2018 08:59

	from sklearn.ensemble import RandomForestClassifier
	# Fit a random forest (500 trees, depth capped at 12) and predict on the
	# validation features.
	rf_clf = RandomForestClassifier(
	    n_estimators=500,
	    max_depth=12,
	)
	rf_clf.fit(X_train, y_train)
	rf_y_pred = rf_clf.predict(X_val)

	# Horizontal bar chart of the 12 largest feature importances; the y axis
	# is inverted so the largest value is drawn at the top.
	(
	    pd.Series(rf_clf.feature_importances_, index=X_train.columns)
	    .nlargest(12)
	    .plot(
	        kind='barh',
	        figsize=(10, 10),
	        title='Feature importance from RandomForest',
	    )
	    .invert_yaxis()
	)

notha99y / quantitative_analysis_age_embarked_pclass.py

Created December 18, 2018 09:19

	# Multivariate analysis: Age on the vertical axis, Embarked on the
	# horizontal axis, with Pclass as the hue variable.
	quantitative_summarized(
	    dataframe=train_df,
	    x='Embarked',
	    y='Age',
	    hue='Pclass',
	    palette=c_palette3,
	    verbose=False,
	    swarm=False,
	)

notha99y / quantitative_analysis_age_survived.py

Created December 18, 2018 09:17

	# Bivariate analysis: Age on the vertical axis against the target
	# variable Survived on the horizontal axis.
	quantitative_summarized(
	    dataframe=train_df,
	    x='Survived',
	    y='Age',
	    palette=c_palette,
	    verbose=False,
	    swarm=True,
	)

notha99y / quantitative_analysis_age.py

Created December 18, 2018 09:16

	# Univariate analysis: Age on its own, with no x or hue variable.
	quantitative_summarized(
	    dataframe=train_df,
	    y='Age',
	    palette=c_palette,
	    verbose=False,
	    swarm=True,
	)

notha99y / quantitative_summarized.py

Created December 18, 2018 07:13

	def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
	'''
	Helper function that gives a quick summary of quantitative data

	Arguments
	=========
	dataframe: pandas dataframe
	x: str. horizontal axis to plot the labels of categorical data (usually the target variable)
	y: str. vertical axis to plot the quantitative data
	hue: str. if you want to compare it with another categorical variable (usually the target variable if x is another variable)

notha99y / categoical_analysis_gender.py

Created December 17, 2018 09:35

	# Feature variable: gender ('Sex'), with Survived as the hue variable.
	categorical_summarized(
	    train_df,
	    y='Sex',
	    hue='Survived',
	    palette=c_palette,
	)

Ren Jie notha99y