This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Annotated heatmap of the pairwise correlations between the columns of
# corr_df (DataFrame.corr() computes Pearson correlation by default).
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', size=15)
colormap = sns.diverging_palette(10, 220, as_cmap=True)
sns.heatmap(
    corr_df.corr(),
    cmap=colormap,
    # Pin the colour scale to the full correlation range and centre it on 0 so
    # the neutral colour of the diverging palette means "no correlation".
    # With only vmax set, the lower bound (and therefore the midpoint) was
    # taken from the smallest value present in the data.
    vmin=-1.0,
    vmax=1.0,
    center=0,
    square=True,
    annot=True,
    linewidths=0.1,
    linecolor='white',
    annot_kws={'fontsize': 12},
)
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.cluster.hierarchy import dendrogram, linkage

# Split off 5% of train_df (test_size=0.05) as the sample that gets clustered;
# the fixed random_state keeps the split repeatable.
sample_train, sample_val, gt_train, gt_val = train_test_split(
    train_df,
    train_df['Survived'],
    test_size=0.05,
    random_state=99,
)

# NOTE(review): with train=False the visible part of simple_preprocessing does
# not drop 'Survived', so the label may end up among the clustered features --
# confirm this is intended.
sample_val_processed = scaler.fit_transform(
    simple_preprocessing(sample_val, train=False)
)

# Complete-linkage hierarchical clustering of the transformed sample.
mergings = linkage(sample_val_processed, method='complete')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def simple_preprocessing(dataframe, train=True): | |
le = LabelEncoder() | |
X = dataframe.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], axis=1) | |
X['Age'] = X['Age'].fillna(value=X['Age'].mode()[0]) | |
X['Embarked'] = le.fit_transform(X['Embarked'].fillna(value=X['Embarked'].mode()[0])) | |
X['Sex'] = np.where(X['Sex'] == 'male', 1, 0) | |
if train: | |
X = X.drop(['Survived'], axis=1) | |
y = np.where(dataframe['Survived'] == 1, 'Alive', 'Dead') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(n_estimators=500, max_depth=12, learning_rate=1e-4)

# The target is reduced to one integer class index per row via argmax over
# axis 1 (presumably y_train is one-hot encoded -- TODO confirm).
xgb_clf.fit(X_train, np.argmax(np.array(y_train), axis=1))
xgb_y_pred = xgb_clf.predict(X_val)

# Horizontal bar chart of the 12 largest feature importances; the y axis is
# inverted so the most important feature is drawn at the top.
xgb_importances = pd.Series(xgb_clf.feature_importances_, index=X_train.columns)
xgb_importance_ax = xgb_importances.nlargest(12).plot(
    kind='barh',
    figsize=(10, 10),
    title='Feature importance from XGBoost',
)
xgb_importance_ax.invert_yaxis()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: without random_state the bootstrap samples and feature
# subsets differ on every run, so the predictions and the importance ranking
# plotted below were not reproducible. 99 matches the seed already used for
# train_test_split in this notebook.
rf_clf = RandomForestClassifier(n_estimators=500, max_depth=12, random_state=99)

# NOTE(review): the XGBoost cell fits on argmax(y_train, axis=1) while this
# cell passes y_train unchanged -- confirm both target encodings are intended.
rf_clf.fit(X_train, y_train)
rf_y_pred = rf_clf.predict(X_val)

# Horizontal bar chart of the 12 largest feature importances; the y axis is
# inverted so the most important feature is drawn at the top.
rf_importances = pd.Series(rf_clf.feature_importances_, index=X_train.columns)
rf_importance_ax = rf_importances.nlargest(12).plot(
    kind='barh',
    figsize=(10, 10),
    title='Feature importance from RandomForest',
)
rf_importance_ax.invert_yaxis()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Multivariate analysis: 'Age' on the vertical axis, 'Embarked' categories on
# the horizontal axis, compared across 'Pclass' via hue.
quantitative_summarized(
    dataframe=train_df,
    x='Embarked',
    y='Age',
    hue='Pclass',
    palette=c_palette3,
    verbose=False,
    swarm=False,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bivariate analysis against the target: 'Age' on the vertical axis, one
# group per 'Survived' value on the horizontal axis.
quantitative_summarized(
    dataframe=train_df,
    x='Survived',
    y='Age',
    palette=c_palette,
    verbose=False,
    swarm=True,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Univariate analysis: 'Age' on its own, with no grouping variable.
quantitative_summarized(
    dataframe=train_df,
    y='Age',
    palette=c_palette,
    verbose=False,
    swarm=True,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False): | |
''' | |
Helper function that gives a quick summary of quantitative data | |
Arguments | |
========= | |
dataframe: pandas dataframe | |
x: str. horizontal axis to plot the labels of categorical data (usually the target variable) | |
y: str. vertical axis to plot the quantitative data | |
hue: str. if you want to compare it with another categorical variable (usually the target variable if x is another variable) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Feature variable: Gender -- summarise 'Sex' with 'Survived' as the hue.
categorical_summarized(
    train_df,
    y='Sex',
    hue='Survived',
    palette=c_palette,
)
NewerOlder