Skip to content

Instantly share code, notes, and snippets.

View FelixChop's full-sized avatar

Félix Revert FelixChop

View GitHub Profile
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Fit both scalers on the `train` frame: Age is standardised, Fare is min-max rescaled.
# `fit` returns the estimator itself, so construction and fitting can be chained.
standardise_age = StandardScaler().fit(train[['Age']])
rescale_fare = MinMaxScaler().fit(train[['Fare']])
# Wrap the imputer output in a DataFrame labelled 'Age' / 'Age_missing'.
transformed_age = pd.DataFrame(
    imputer_age.transform(train[['Age']]),
    index=train.index,  # essential: reuse train's index so the join below aligns row by row
    columns=['Age', 'Age_missing'],
)
# Swap the raw Age column for the imputed columns.
train = train.drop('Age', axis=1).join(transformed_age)
transformed_age = \
pd.DataFrame(imputer_age.transform(validation[['Age']]),
columns=['Age', 'Age_missing'],
# NOTE(review): no index= is passed here, so the resulting frame gets a default RangeIndex instead of train's index.
pd.DataFrame(imputer_age.transform(train[['Age']]),
columns=['Age', 'Age_missing'])
from sklearn.impute import SimpleImputer
# Imputer that fills missing values with the median; add_indicator=True
# requests an extra missing-value indicator output alongside the imputed one.
imputer_age = SimpleImputer(
    add_indicator=True,
    strategy='median',
)
from sklearn.ensemble import IsolationForest
# Select the numeric features once and drop incomplete rows, so the exact same
# frame is used both for fitting and for scoring.
data = titanic[['Fare', 'SibSp', 'Parch', 'Age']].dropna()

# The former `behaviour="new"` argument is not passed: it was deprecated and
# later removed from IsolationForest, so passing it raises TypeError on
# current scikit-learn releases.
outlier_detection = IsolationForest(random_state=1)
outlier_detection.fit(data)

# score_samples is evaluated before the new column is attached, so the model
# only ever sees the four feature columns. Lower scores mean more abnormal rows.
data['anomaly_score'] = outlier_detection.score_samples(data)

# Returns a sorted copy (most abnormal rows first) for display in a notebook;
# `data` itself is not reordered.
data.sort_values('anomaly_score')
# Remove the Ticket, Cabin and Name columns from `titanic` in place.
# The loop body must be indented; without it the file fails with an IndentationError.
for column_to_delete in ['Ticket', 'Cabin', 'Name']:
    del titanic[column_to_delete]
# Build a profiling report for `data` in minimal mode with the progress bar enabled.
profile = ProfileReport(
    data,
    minimal=True,
    progress_bar=True,
)
from sklearn.model_selection import train_test_split
# Name of the label column; used to stratify the split below.
target = 'Survived'

# Set aside 20% of `titanic` as a holdout sample, stratified on the target
# column, with a fixed seed for reproducibility.
intermediate_sample, holdout = train_test_split(
    titanic,
    stratify=titanic[target],
    random_state=2020,
    test_size=.2,
)
train, validation = train_test_split(intermediate_sample,
test_size=.2,
random_state=2020,
# Keep only the rows whose 'column_with_outliers' value is strictly below
# ten times the column's mean; everything else is filtered out.
data = data[data['column_with_outliers'].lt(data['column_with_outliers'].mean() * 10)]
# Discard every row labelled 'imbalanced_class', then renumber the index from 0.
data = data[data['column_class'].ne('imbalanced_class')].reset_index(drop=True)