Created
June 27, 2018 13:22
-
-
Save vb100/10bfabacc78920480ec8ebea15db2062 to your computer and use it in GitHub Desktop.
This is one of the biggest Machine learning project 100 % made by me. This module read periodically updated Training set, analyze it by performing Hyperparameter Tuning for Decision Tree/Random Forest and set the best selected hyperparameters to the classifier. Then calculate probabilities for a property to be a comps, construct Panda dataframe …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Thu Jun 21 14:26:09 2018 | |
@author: Vytautas.Bielinskas | |
Definitions: | |
JN - Jupyter Notebook | |
ML - Machine learning | |
BOG - Bag Of Words | |
RF - Random Forest | |
""" | |
# Get current directory | |
import os | |
cwd = os.getcwd() | |
print('The current directory is {}.'.format(cwd)) | |
# ::: Use Pandas to load the data in a dataframe ::: | |
import pandas as pd | |
training_set = pd.read_excel('training_set_20180625.xlsx', index_col = 0) | |
# ::: Clean dataset ::: | |
# Data pre-process and data cleaning was performed in JP | |
print('The shape of training set after data cleaning is {}.'.format(training_set.shape)) | |
# ::: Extract target from the features ::: | |
y = training_set['Comps'].values | |
X = training_set[['Price per bedroom', 'Min. dist. to station', 'Evaluation']].values | |
# ----------------------------------------------------------------------------- | |
# ::: ------------ RANDOM FOREST: PREPARATION FOR ACTION ----------------- ::: | |
# ::: Feature scaling ::: | |
from sklearn.preprocessing import StandardScaler | |
sc = StandardScaler() | |
X_train_RF = sc.fit_transform(X) | |
#X_test_RF = sc.transform(X_test) | |
# ::: Split dataset into train and test sets ::: | |
""" This part is neccessary for checking accuracies """ | |
from sklearn.model_selection import train_test_split | |
X_train, X_test, y_train, y_test = train_test_split(X_train_RF, y, test_size = 0.2, random_state = 42) | |
print('Size of train dataset {} rows.'.format(X_train.shape[0])) | |
print('Size of test dataset {} rows.'.format(X_test.shape[0])) | |
N = 50 | |
accur = [] | |
record = {} | |
best_hyper = {} | |
this_acc = 0.0 | |
# ::: Import Classifiers ::: | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier | |
# ::: Array of Machine learning classifiers if experimenting with more than one ::: | |
ML_classifiers = [] | |
ML_classifiers.append(DecisionTreeClassifier()) | |
ML_classifiers.append(RandomForestClassifier()) | |
iteration = 0 | |
while iteration < N: | |
# ::: Hyperparameter Tuning ::: | |
from scipy.stats import randint | |
param_dist = {'max_depth' : [3, None], | |
'min_samples_leaf' : randint(1, 9), | |
'criterion' : ['gini', 'entropy']} | |
# ::: Instantiate a Decision Tree classifier : tree ::: | |
tree = ML_classifiers[1] # <------ Choose your ML classifier here! | |
# ::: Instantiate the RandomizedSearchCV object: tree_cv ::: | |
from sklearn.model_selection import RandomizedSearchCV | |
tree_cv = RandomizedSearchCV(tree, param_dist, cv = 5) | |
# ::: Fit it to the data ::: | |
tree_cv.fit(X_train, y_train) | |
# ::: Print the tuned parameters and score ::: | |
print(' ---------------') | |
print('Sample: {}.'.format(iteration)) | |
print('Tuned Decision Tree Parameters: {}.'.format(tree_cv.best_params_)) | |
print('Best score is {}.'.format(tree_cv.best_score_)) | |
record["Best score"] = tree_cv.best_score_ | |
accur.append(dict(record)) | |
# ::: Predicting the Test set results ::: | |
y_pred = tree_cv.predict(X_test) | |
# ::: Making Confusion Matrix ::: | |
from sklearn.metrics import confusion_matrix | |
cm = confusion_matrix(y_test, y_pred) | |
if tree_cv.best_score_ > this_acc: | |
print('!!! Current acc: {}, got new value is {}.'.format(this_acc, tree_cv.best_score_)) | |
best_hyper['criterion'] = tree_cv.best_params_['criterion'] | |
best_hyper['max_depth'] = tree_cv.best_params_['max_depth'] | |
best_hyper['min_samples_leaf'] = tree_cv.best_params_['min_samples_leaf'] | |
best_hyper['_best_score_'] = tree_cv.best_score_ | |
this_acc = tree_cv.best_score_ | |
# ::: Iteration +1 ::: | |
iteration = iteration + 1 | |
acc_df = pd.DataFrame(accur) | |
print('\nAverage accuracy is: {}.'.format(acc_df['Best score'].mean())) | |
print('Maximum accuracy is: {}.'.format(acc_df['Best score'].max())) | |
print('The difference is: {}.'.format(acc_df['Best score'].mean() - acc_df['Best score'].max())) | |
print('Selected features: {}'.format(best_hyper)) | |
# -------------------------------> | |
# Plot accuracies | |
def plot_acc(acc_df): | |
N = len(acc_df) | |
import seaborn as sns | |
import numpy as np | |
import matplotlib.patches as mpatches | |
import matplotlib.pyplot as plt | |
# Generate plot | |
sns.set_style('darkgrid') | |
# Set plot size | |
_ = plt.rcParams['figure.figsize'] = (7,5) | |
_ = plt.hist(acc_df.values, color = '#1f4e79') | |
_ = plt.margins(0.02) | |
# Label the axes | |
_ = plt.xlabel('Test Accuracy', fontsize = 16, family='Arial') | |
_ = plt.ylabel('Samples', fontsize = 16) | |
# Display the plot | |
plt.show() | |
return None | |
""" !!! Open following line only for testing !!! """ | |
plot_acc(acc_df) | |
# ----------------------------------------------------------------------------- | |
# ---------------- SET THE BEST HYPERPARAMETERS TO THE CLASSIFIER ------------- | |
param_dist = {'max_depth' : best_hyper['max_depth'], | |
'min_samples_leaf' : best_hyper['min_samples_leaf'], | |
'criterion' : best_hyper['criterion']} | |
if 'DecisionTree'.upper() in str(tree).split('(')[0].upper(): | |
classifier = DecisionTreeClassifier(max_depth = param_dist['max_depth'], | |
min_samples_leaf = param_dist['min_samples_leaf'], | |
criterion = param_dist['criterion']) | |
print('\nDecision Tree classifier is using now...') | |
elif 'RandomForest'.upper() in str(tree).split('(')[0].upper(): | |
classifier = RandomForestClassifier(max_depth = param_dist['max_depth'], | |
min_samples_leaf = param_dist['min_samples_leaf'], | |
criterion = param_dist['criterion']) | |
print('\nRandom Forest Classifier is using now...\n') | |
# ::: Import Test dataset ::: | |
test_set = pd.read_csv(cwd + '/Tests/TestSet_20180626.csv') | |
# --------------------- RANDOM FOREST IMPLEMENTATION -------------------------- | |
# ----------------------------------------------------------------------------- | |
# ---------------------RE-ARANGE TRAINING AND TEST SETS ----------------------- | |
# ::: Extract features from Test set ::: | |
X_test_RF = test_set[['Price per bedroom', 'Min. dist. to station', 'Evaluation']] # <-- Pre-process for EVALUATION in JN! | |
# ::: Feature Scaling to Test set (Real data) ::: | |
X_test_RF_scaled = sc.fit_transform(X_test_RF) | |
# ::: Add extra column for Comps identification ::: | |
import numpy as np | |
z = np.zeros((len(X_test_RF), 1), dtype = 'float64') | |
""" Here zero (0) is default value for Comps. Later this field can be changed based on prediction on RF. """ | |
# <-----X_test_RF_scaled = np.append(X_test_RF_scaled, z, axis = 1)-----> | |
# ::: Fit classifier to the Train set. Let algorithm learn right now! | |
classifier.fit(X_train_RF, y) | |
# ::: Predicting the Test set results ::: | |
y_pred = classifier.predict(X_test_RF_scaled) | |
# ::: Get values of Probabilities ::: | |
probas = classifier.predict_proba(X_test_RF_scaled) | |
# ::: Get importances of features ::: | |
importances = classifier.feature_importances_ | |
# ::: Print the feature ranking ::: | |
indices = np.argsort(importances)[::1] | |
print('Feature ranking:') | |
for f in range(X_test_RF.shape[1]): | |
print('%d. feature %d (%f)' % (f + 1, indices[f], importances[indices[f]])) | |
# ----------------------- SOME RESULTS VISUALIZATIONS ------------------------- | |
# ::: ------------- Plot the feature importances of the forest -------------::: | |
import matplotlib.pyplot as plt | |
try: | |
plt.figure() | |
plt.title('Feature importances') | |
std = np.std([tree.feature_importances_ for tree in classifier.estimators_], | |
axis = 0) | |
plt.bar(range(X_test_RF.shape[1]), importances[indices], | |
color = '#1f4e79', | |
yerr = std[indices], | |
align = 'center') | |
plt.xticks(range(X_test_RF.shape[1]), indices) | |
plt.xlim([-1, X_test_RF.shape[1]]) | |
plt.show() | |
except: | |
print('\n Feature importance graph can be plotted only for Random Forest Classifier. \n') | |
# ::: ------------------------ Plot the Scatter -------------------------::: | |
fig, ax = plt.subplots() | |
x = X_test_RF_scaled[:, 1] | |
y = X_test_RF_scaled[:, 2] | |
size_multiplier = 500 | |
for point in range(0, len(X_test_RF_scaled), 1): | |
if y_pred[point] == 1.0: | |
ax.scatter(x[point], y[point], | |
c = '#5b9bd5', | |
s = probas[point][1] * size_multiplier, | |
label = 'Result', | |
alpha = 0.50, | |
edgecolors = 'none') | |
else: | |
ax.scatter(x[point], | |
y[point], | |
c = '#ff8080', | |
s = probas[point][0] * size_multiplier, | |
label = 'Result', | |
alpha = 0.50, | |
edgecolors = 'none') | |
plt.xlabel('Distance to the nearest transport hub', family = 'Arial') | |
plt.ylabel('Evaluation of description', family = 'Arial') | |
ax.grid(True) | |
plt.show() | |
# -------------------------- STRUCTURING DATA --------------------------------- | |
# ::: Create Pandas Dataframe for writing results in Excel ::: | |
list_of_data = [] | |
record_data = {} | |
index_of_ID = test_set.columns.get_loc('ID') | |
index_of_Address = test_set.columns.get_loc('Address') | |
index_of_Bedrooms = test_set.columns.get_loc('Bedrooms') | |
index_of_Bathrooms = test_set.columns.get_loc('Bathrooms') | |
index_of_Reception = test_set.columns.get_loc('Reception') | |
index_of_Size = test_set.columns.get_loc('Size') | |
index_of_flisted = test_set.columns.get_loc('First Listed') | |
index_of_OrigPrice = test_set.columns.get_loc('Original price') | |
index_of_CurrPrice = test_set.columns.get_loc('Current price') | |
index_of_URL = test_set.columns.get_loc('URL') | |
index_of_FloorLink = test_set.columns.get_loc('FLOORPLAN LINK') | |
index_of_New = test_set.columns.get_loc('New') | |
# Set list of possible areas | |
areas = [] | |
areas.append('West Drayton') | |
areas.append('Ilford') | |
area_value = areas[1] # <----- Choose your area index here! | |
for row in range(0, len(X_test_RF_scaled), 1): | |
record_data['ID'] = test_set.iat[row, index_of_ID] | |
record_data['AREA'] = area_value | |
record_data['ADDRESS'] = test_set.iat[row, index_of_Address] | |
record_data['BEDROOMS'] = test_set.iat[row, index_of_Bedrooms] | |
record_data['BATHROOMS'] = test_set.iat[row, index_of_Bathrooms] | |
record_data['RECEPTION'] = test_set.iat[row, index_of_Reception] | |
record_data['SIZE'] = test_set.iat[row, index_of_Size] | |
record_data['FIRST LISTED'] = test_set.iat[row, index_of_flisted] | |
record_data['ORIGINAL PRICE'] = test_set.iat[row, index_of_OrigPrice] | |
record_data['CURRENT PRICE'] = test_set.iat[row, index_of_CurrPrice] | |
record_data['WEB'] = test_set.iat[row, index_of_URL] | |
record_data['FLOORPLAN LINK'] = test_set.iat[row, index_of_FloorLink] | |
record_data['NEW'] = test_set.iat[row, index_of_New] | |
record_data['0'] = probas[row][0] | |
record_data['1'] = probas[row][1] | |
list_of_data.append(dict(record_data)) | |
DF = pd.DataFrame(list_of_data) | |
DF = DF[['ID', 'AREA', 'ADDRESS', 'BEDROOMS', 'BATHROOMS', 'RECEPTION', 'SIZE', | |
'FIRST LISTED', 'ORIGINAL PRICE', 'CURRENT PRICE', 'WEB', | |
'FLOORPLAN LINK', 'NEW', '1', '0']] | |
DF = DF.sort_values(by=['0']) | |
#------------------------------------------------------------------------------ | |
#--------------------------- UPDATING EXCEL FILE ------------------------------ | |
""" | |
All new data will be written to File_before.xlsx Excel file | |
""" | |
def updateExcel(dataset, area_value): | |
print('# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #') | |
print('Writing data to Excel file...') | |
import requests, re, os | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
import openpyxl, os, datetime | |
from pandas import ExcelWriter as ewriter | |
from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font, Fill | |
from openpyxl.comments import Comment | |
""" checking if property is NOT LAND : start """ | |
def ifNotLand(url): | |
land = False | |
r = requests.get(url) | |
c = r.content | |
soup = BeautifulSoup(c, "html.parser") | |
title = soup.find("title").text | |
if "land for sale".upper() in title.upper(): | |
land = True | |
return land | |
""" checking if property is NOT LAND : end """ | |
directory = (cwd + '/Excel') | |
os.chdir(directory) | |
os.getcwd() | |
filename = "File_before.xlsx" | |
wb = openpyxl.load_workbook(filename, data_only = False) | |
#print(wb.get_sheet_names) | |
print("---- Start update Excel file ----") | |
sheet = wb.get_sheet_by_name("Data Sales") | |
rowForSearching = 10 | |
while len(str(sheet.cell(row = rowForSearching, column = 2).value)) > 4: | |
rowForSearching = rowForSearching + 1 | |
end_of_table = rowForSearching - 1 # Get the end of current table | |
""" Define some Excel cell styles and formatting """ | |
fillDefault = PatternFill(fill_type = None, | |
start_color = "FFFFFFFF", | |
end_color = "FF000000") | |
fillGreen = PatternFill(fill_type = "solid", | |
start_color = "56af11", | |
end_color = "56af11") | |
HyperlinkBlue = Font(color = "0563c1", | |
underline = "single") | |
rightAligment = Alignment(horizontal = "right") | |
centerAligment = Alignment(horizontal = "center") | |
""" Starting writing data to Excel file """ | |
id_column = dataset.columns.get_loc("ID") # get index of ID column | |
id_area = dataset.columns.get_loc("AREA") # get index of STATION column | |
id_address = dataset.columns.get_loc("ADDRESS") # get index of ADDRESS column | |
id_bedrooms = dataset.columns.get_loc("BEDROOMS") # get index of BEDROOMS column | |
id_bathrooms = dataset.columns.get_loc("BATHROOMS") # get index of BATHROOMS column | |
id_reception = dataset.columns.get_loc("RECEPTION") # get index of RECEPTION column | |
id_size = dataset.columns.get_loc("SIZE") # get index of SIZE column | |
id_firstListed = dataset.columns.get_loc("FIRST LISTED") # get index of FIRST LISTED column | |
id_origPrice = dataset.columns.get_loc("ORIGINAL PRICE")# get index of ORIGINAL PRICE column | |
id_currPrice = dataset.columns.get_loc("CURRENT PRICE") # get index of CURRENT PRICE column | |
id_link = dataset.columns.get_loc("WEB") # get index of WEB column | |
id_floorplan = dataset.columns.get_loc("FLOORPLAN LINK")# get index of FLOORPLAN LINK column | |
id_new = dataset.columns.get_loc("NEW") # get index of NEW column | |
id_1 = dataset.columns.get_loc("1") # get index of <<1>> column | |
id_0 = dataset.columns.get_loc("0") # get index of <<0>> column | |
for x in range(rowForSearching, rowForSearching + len(dataset), 1): | |
for j in range(1, sheet.max_column, 1): | |
sheet.cell(row = x, column = j).fill = fillDefault | |
dataset_index = 0 | |
property_index = rowForSearching | |
for i in range(rowForSearching, rowForSearching + len(dataset), 1): | |
link_to_property = dataset.iat[dataset_index, id_link] | |
land = ifNotLand(link_to_property) | |
duplicates = 0 | |
if land == False and "POA" not in str(dataset.iat[dataset_index, id_currPrice]).upper(): | |
""" writing MONTH column """ | |
d = datetime.datetime.strptime(str(datetime.date.today()), '%Y-%m-%d') | |
d= d.strftime('%d-%b-%Y').replace("-20", "-") | |
sheet.cell(row = property_index, column = 2).value = d | |
sheet.cell(row = property_index, column = 2).alignment = rightAligment | |
""" writing SIZE columns """ | |
size = dataset.iat[dataset_index, id_size] | |
if str(size).replace(" ", "") == "0": | |
sheet.cell(row = property_index, column = 9).value = "" | |
else: | |
sheet.cell(row = property_index, column = 9).value = dataset.iat[dataset_index, id_size] | |
""" writing ORIGINAL PRICE columns """ | |
sheet.cell(row = property_index, column = 11).value = dataset.iat[dataset_index, id_origPrice] | |
sheet.cell(row = property_index, column = 11).alignment = rightAligment | |
""" writing ID column """ | |
sheet.cell(row = property_index, column = 3).value = dataset.iat[dataset_index, id_column] | |
sheet.cell(row = property_index, column = 3).alignment = rightAligment | |
""" checking past data by ID : start --> """ | |
for past_row in range(10, property_index-1, 1): | |
past_id = str(sheet.cell(row = past_row, column = 3).value) | |
if str(sheet.cell(row = property_index, column = 3).value) == past_id: | |
duplicates = duplicates + 1 | |
sheet.cell(row = property_index, column = 3).fill = fillGreen | |
formula_for_comps = '=IF(AND(F' + str(property_index) + '<>2, F' + str(property_index) + '<>4), "",IFERROR(VLOOKUP(C' + str(property_index) + ',$C$3:$V$' + str(end_of_table) + ',20,FALSE), "new"))' | |
size = sheet.cell(row = past_row, column = 9).value | |
sheet.cell(row = property_index, column = 9).value = size | |
sheet.cell(row = property_index, column = 9).fill = fillGreen | |
print("Duplicate on row:", past_row, ", new property is on", property_index, "row") | |
""" checking past data by ID : end --> """ | |
""" writing AREA column """ | |
sheet.cell(row = property_index, column = 4).value = dataset.iat[dataset_index, id_area] | |
""" writing ADDRESS column """ | |
sheet.cell(row = property_index, column = 5).value = dataset.iat[dataset_index, id_address] | |
""" writing BEDROOMS column """ | |
sheet.cell(row = property_index, column = 6).value = dataset.iat[dataset_index, id_bedrooms] | |
""" writing BATHROOMS columns """ | |
sheet.cell(row = property_index, column = 7).value = dataset.iat[dataset_index, id_bathrooms] | |
""" writing RECEPTION columns """ | |
sheet.cell(row = property_index, column = 8).value = dataset.iat[dataset_index, id_reception] | |
""" writing FIRST LISTED columns """ | |
sheet.cell(row = property_index, column = 10).value = dataset.iat[dataset_index, id_firstListed] | |
sheet.cell(row = property_index, column = 10).alignment = rightAligment | |
""" writing FLOOR PLAN LINK columns """ | |
FullLink = "" | |
if ".com" in dataset.iat[dataset_index, id_floorplan]: | |
FullLink = '=HYPERLINK("' + dataset.iat[dataset_index, id_floorplan] + '","' + "Floor" + '")' | |
sheet.cell(row = property_index, column = 15).font = HyperlinkBlue | |
else: | |
FullLink = "" | |
sheet.cell(row = property_index, column = 15).value = FullLink | |
sheet.cell(row = property_index, column = 15).alignment = centerAligment | |
""" writing CURRENT PRICE to columns """ | |
sheet.cell(row = property_index, column = 12).value = dataset.iat[dataset_index, id_currPrice] | |
sheet.cell(row = property_index, column = 12).alignment = rightAligment | |
""" writing LINK to column """ | |
FullLink = '=HYPERLINK("' + dataset.iat[dataset_index, id_link] + '","' + "Link" + '")' | |
sheet.cell(row = property_index, column = 16).value = FullLink | |
sheet.cell(row = property_index, column = 16).font = HyperlinkBlue | |
""" writing LINK URL to column """ | |
sheet.cell(row = property_index, column = 18).value = dataset.iat[dataset_index, id_link] | |
""" writing NEW columns """ | |
sheet.cell(row = property_index, column = 21).value = dataset.iat[dataset_index, id_new] | |
""" writing Prob <<1>> column """ | |
sheet.cell(row = property_index, column = 28).value = dataset.iat[dataset_index, id_1] | |
sheet.cell(row = property_index, column = 28).fill = fillDefault | |
""" writing Prob <<0>> column """ | |
sheet.cell(row = property_index, column = 29).value = dataset.iat[dataset_index, id_0] | |
sheet.cell(row = property_index, column = 29).fill = fillDefault | |
dataset_index = dataset_index + 1 #Go to next row | |
property_index = property_index + 1 | |
""" Write formulas : start """ | |
formula_for_comps = '=IF(AND(F' + str(property_index) + '<>2, F' + str(property_index) + '<>4), "",IFERROR(VLOOKUP(C' + str(property_index) + ',$C$3:$V$' + str(end_of_table) + ',20,FALSE), "new"))' | |
sheet.cell(row = property_index, column = 27).value = formula_for_comps | |
sheet.cell(row = property_index, column = 27).fill = fillGreen | |
formula_check_size = '=IF(AND(Y' + str(property_index) + '="new", I' + str(property_index) + '=""), "check", IF(I' + str(property_index) + '="", Y' + str(property_index) + ',I' + str(property_index) + '))' | |
sheet.cell(row = property_index, column = 26).value = formula_check_size | |
sheet.cell(row = property_index, column = 26).fill = fillGreen | |
""" Write formulas : end """ | |
""" Calculation: DISCOUNT column """ | |
sheet.cell(row = property_index, column = 13).value = '=IFERROR((L' + str(property_index) + '-K' + str(property_index) + ')/K' + str(property_index)+ ',"")' | |
sheet.cell(row = property_index, column = 13).number_format = "0%" | |
""" Calculation: PSF column """ | |
sheet.cell(row = property_index, column = 14).value = '=IFERROR(L' + str(property_index) + '/I' + str(property_index) + ',"")' | |
""" Calculation: Type column """ | |
sheet.cell(row = property_index, column = 24).value = '=IF(F' + str(property_index) + '=0,"Studio",IF(F' + str(property_index) + '>=5,"5+",F' + str(property_index) + '))' | |
else: | |
print("Row", i, "- this is LAND FOR SALE") | |
dataset_index = dataset_index + 1 | |
""" SAVE DATA TO EXCEL """ | |
import datetime | |
from datetime import date | |
timeNow = str(datetime.datetime.now()).replace(":","")[:-10].replace(' ', '') | |
wb.save("File_After_SALES - " + area_value + "_" + timeNow + ".xlsx") | |
print('Finished.') | |
return None | |
updateExcel(DF, area_value) | |
# ------------------------- Update EXCEL file : end --------------------------- | |
#------------------------------------------------------------------------------ | |
""" | |
!!! Part below is for EXPERIMENTING !!! | |
""" | |
# ::: -------------------------- XGBOOST ---------------------------------- ### | |
# ::: Ready to train! ::: | |
import xgboost as xgb | |
# ::: Set-up XGBoost classifier ::: | |
classifier = xgb.sklearn.XGBClassifier(nthread = -1, seed = 42) | |
# ::: Train the model with parameters ::: | |
classifier.fit(X_train, y_train) | |
# ::: Evaluation ::: | |
predictions = classifier.predict(X_test) | |
#------------------------------------------------------------------------------ | |
# ::: Handy XGBoost methods | |
%matplotlib inline | |
import matplotlib.pyplot as plt | |
# Plot Feature Importances >>> | |
plt.figure(figsize=(8,5)) | |
xgb.plot_importance(classifier, ax = plt.gca()) | |
# Plot Tress that were built by XGBoost | |
plt.figure(figsize=(8,5)) | |
xgb.plot_tree(classifier, ax = plt.gca()) | |
# Access the characteristics of the model | |
print('Number of boosting trees: {}.'.format(classifier.n_estimators)) | |
print('Max depth of trees: {}.'.format(classifier.max_depth)) | |
print('Objective function: {}.'.format(classifier.objective)) | |
# ----------------------------------------------------------------------------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment