# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
# Drop the 'Customer Id' column from both frames, in place.
# `columns=` already selects the column axis, so the redundant `axis=1` is omitted.
train.drop(columns='Customer Id', inplace=True)
test.drop(columns='Customer Id', inplace=True)

# Fill missing values with the datasist helpers: categorical columns first,
# then numeric ones. Each helper's result is assigned back to the frame.
train = ds.feature_engineering.fill_missing_cats(train)
train = ds.feature_engineering.fill_missing_num(train)
test = ds.feature_engineering.fill_missing_cats(test)
test = ds.feature_engineering.fill_missing_num(test)
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
import pandas as pd
import numpy as np
import datasist as ds

# Load the training and test sets from the working directory.
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

# Presumably the column/variable descriptions for the data set — confirm.
vardef = pd.read_csv("variabledef.csv")
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

# Hold out 30% of the encoded training data for evaluation; the fixed seed
# makes the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    train_enc, target, test_size=0.3, random_state=1
)

# Three small tree ensembles configured with the same size and depth.
rf_classifier = RandomForestClassifier(n_estimators=20, max_depth=4)
lgb_classifier = lgb.LGBMClassifier(n_estimators=20, max_depth=4)
xgb_classifier = xgb.XGBClassifier(n_estimators=20, max_depth=4)
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
# Pick one model from the list of returned models.
model = models[1]

# Feature names come from the processed (encoded) training data.
features = train_enc.columns

# Plot the selected model's feature importances with datasist.
ds.model.plot_feature_importance(model, features)
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Load the loan demographics data set.
loan_demographics = pd.read_csv('traindemographics.csv')
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
# Fill missing employment status with 'Permanent', which the original author
# identified as the column's mode. The value is hard-coded, not recomputed.
loan_demographics['employment_status_clients'] = (
    loan_demographics['employment_status_clients'].fillna(value='Permanent')
)
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
# Candidate fill values for 'Temperature', each rounded to the nearest integer.
# All three are computed before any filling happens.
temperature = sendy_data['Temperature']
mean_df = round(temperature.mean())
mode_df = round(temperature.mode()[0])  # first value if several modes tie
median_df = round(temperature.median())

# Fill with mean
print("Filling with mean value of {}".format(mean_df))
sendy_data['Temperature'] = sendy_data['Temperature'].fillna(mean_df)

# Fill with mode
# NOTE(review): the snippet appears truncated here; only the message is
# visible, not the modal fill itself.
print("Filling with modal value of {}".format(mode_df))
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

# Columns used for the regression; the first one is the target to be imputed.
to_train = ['Precipitation in millimeters', 'Destination Lat', 'Destination Long', 'Time from Pickup to Arrival']
temp_df = sendy_data[to_train]

# Rows where precipitation is present form the training set; rows where it is
# missing are the ones to predict. The mask is computed once and reused.
known_rows = temp_df['Precipitation in millimeters'].notnull()
x_train = temp_df[known_rows].drop(columns='Precipitation in millimeters')
y_train = temp_df[known_rows]['Precipitation in millimeters']
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
# Record the index labels of rows whose 'Temperature' is missing, so that some
# of the values used for filling can be inspected afterwards.
missing_indx = list(sendy_data.index[sendy_data['Temperature'].isna()])
# NOTE: GitHub viewer notice (not part of the code): this file may contain hidden or
# bidirectional Unicode text; open it in an editor that reveals hidden Unicode characters.
# IterativeImputer is experimental: this enabling import must run before
# IterativeImputer is imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Run the imputer with a simple Random Forest estimator (5 trees, 5 rounds).
imp = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=5),
    max_iter=5,
    random_state=1,
)

# Columns handed to the imputer.
to_train = ['Temperature', 'Destination Lat', 'Destination Long', 'Time from Pickup to Arrival']

# Perform filling
# NOTE(review): the snippet appears truncated here; the fit/transform call is not visible.