Skip to content

Instantly share code, notes, and snippets.

View risenW's full-sized avatar

Rising Odegua risenW

View GitHub Profile
# Remove the 'Customer Id' column from both frames (each frame is mutated in place)
for frame in (train, test):
    frame.drop(columns='Customer Id', inplace=True)

# Fill missing values with datasist: the categorical helper runs first and
# its result is handed to the numerical helper.
train = ds.feature_engineering.fill_missing_num(
    ds.feature_engineering.fill_missing_cats(train)
)
test = ds.feature_engineering.fill_missing_num(
    ds.feature_engineering.fill_missing_cats(test)
)
import pandas as pd
import numpy as np
import datasist as ds
# Read the three CSV files, in order, into `train`, `test` and `vardef`
train, test, vardef = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv', 'variabledef.csv')
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
# Split train_enc / target, holding out 30% of the rows; the seed is fixed
# so the split is repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    train_enc,
    target,
    test_size=0.3,
    random_state=1,
)

# All three classifiers are built with the same size settings
shared_params = {'n_estimators': 20, 'max_depth': 4}
rf_classifier = RandomForestClassifier(**shared_params)
lgb_classifier = lgb.LGBMClassifier(**shared_params)
xgb_classifier = xgb.XGBClassifier(**shared_params)
# Take the entry at index 1 of the returned models list, and use the column
# labels of the processed data as feature names.
model, features = models[1], train_enc.columns

# Plot the importance the chosen model gives to each feature
ds.model.plot_feature_importance(model, features)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session
warnings.filterwarnings(action='ignore')

# Load the loan demographics data
loan_demographics = pd.read_csv('traindemographics.csv')
# Replace missing employment statuses with 'Permanent' (noted as the mode of
# the column in the original comment).
employment_status = loan_demographics['employment_status_clients']
loan_demographics['employment_status_clients'] = employment_status.fillna('Permanent')
# Central-tendency statistics of the Temperature column, each passed through round()
temperature = sendy_data['Temperature']
mean_df = round(temperature.mean())
mode_df = round(temperature.mode()[0])  # first entry when several modes exist
median_df = round(temperature.median())

# Fill with mean
print("Filling with mean value of {}".format(mean_df))
sendy_data['Temperature'] = temperature.fillna(mean_df)

# Fill with mode
print("Filling with modal value of {}".format(mode_df))
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

# Columns used for the regression; the first one is the column being modelled
to_train = ['Precipitation in millimeters', 'Destination Lat', 'Destination Long', 'Time from Pickup to Arrival']
temp_df = sendy_data[to_train]

# Rows with an observed precipitation value form the training set: the
# remaining columns are the predictors and precipitation is the target.
has_precipitation = temp_df['Precipitation in millimeters'].notnull()
observed_rows = temp_df[has_precipitation]
x_train = observed_rows.drop(columns='Precipitation in millimeters')
y_train = observed_rows['Precipitation in millimeters']
# Collect the index labels of rows whose Temperature is missing, so that some
# of the values used for filling can be inspected.
temperature = sendy_data['Temperature']
missing_indx = list(temperature[temperature.isna()].index)
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
# Iterative imputer driven by a small random forest (5 trees), limited to
# 5 imputation rounds, with a fixed seed on the imputer.
forest = RandomForestRegressor(n_estimators=5)
imp = IterativeImputer(estimator=forest, max_iter=5, random_state=1)

# Columns handed to the imputer
to_train = ['Temperature', 'Destination Lat', 'Destination Long','Time from Pickup to Arrival']
#perform filling