vb100 · June 27, 2018 13:22
diff --git a/Comps_identifiactor_MachineLearning.py b/Comps_identifiactor_MachineLearning.py
 # -*- coding: utf-8 -*-
 """
 Created on Thu Jun 21 14:26:09 2018

 @author: Vytautas.Bielinskas

 Definitions:
    JN - Jupyter Notebook
    ML - Machine learning
    BOW - Bag Of Words
    RF - Random Forest
 """
 # Report the working directory (relative data paths below depend on it)
 import os
 cwd = os.getcwd()
 print('The current directory is {}.'.format(cwd))

 # ::: Load the training data into a pandas DataFrame :::
 import pandas as pd

 training_set = pd.read_excel('training_set_20180625.xlsx', index_col = 0)

 # ::: Clean dataset :::
 # Per the original note, pre-processing and cleaning were done beforehand in
 # the JN (Jupyter Notebook), so nothing is cleaned here.

 rows_and_columns = training_set.shape
 print('The shape of training set after data cleaning is {}.'.format(rows_and_columns))

 # ::: Separate the target column from the feature columns :::
 feature_columns = ['Price per bedroom', 'Min. dist. to station', 'Evaluation']
 X = training_set[feature_columns].values
 y = training_set['Comps'].values

 # -----------------------------------------------------------------------------
 # ::: ------------ RANDOM FOREST: PREPARATION FOR ACTION  ----------------- :::
 # ::: Feature scaling for the final model :::
 # `sc` is fitted on ALL labelled rows and `X_train_RF` is the scaled result;
 # both are reused further down the file to scale the real test data and to
 # train the final classifier.
 from sklearn.preprocessing import StandardScaler
 sc = StandardScaler()
 X_train_RF = sc.fit_transform(X)

 # ::: Split dataset into train and test sets :::
 # This hold-out split is only needed for checking accuracies.
 from sklearn.model_selection import train_test_split
 X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

 # Scale the evaluation split with statistics learned from its training part
 # only, so that nothing about the held-out rows leaks into X_train / X_test.
 split_scaler = StandardScaler()
 X_train = split_scaler.fit_transform(X_train_raw)
 X_test = split_scaler.transform(X_test_raw)

 print('Size of train dataset {} rows.'.format(X_train.shape[0]))
 print('Size of test dataset {} rows.'.format(X_test.shape[0]))

 N = 50                  # number of independent randomized searches to run
 accur = []              # one {'Best score': ...} record per search
 record = {}

 best_hyper = {}         # parameters of the best search seen so far
 this_acc = 0.0          # best cross-validation score seen so far

 # ::: Import Classifiers and search utilities (once, not on every iteration) :::
 from scipy.stats import randint
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import RandomizedSearchCV

 # ::: Array of Machine learning classifiers if experimenting with more than one :::
 ML_classifiers = [DecisionTreeClassifier(), RandomForestClassifier()]

 # ::: Hyperparameter search space (the same for every search) :::
 param_dist = {'max_depth' : [3, None],
               'min_samples_leaf' : randint(1, 9),
               'criterion' : ['gini', 'entropy']}

 # ::: Estimator handed to the search; also inspected later in the file :::
 tree = ML_classifiers[1]            # <------ Choose your ML classifier here!

 for iteration in range(N):
    # ::: Instantiate the RandomizedSearchCV object and fit it to the data :::
    tree_cv = RandomizedSearchCV(tree, param_dist, cv = 5)
    tree_cv.fit(X_train, y_train)

    # ::: Print the tuned parameters and score :::
    print('       ---------------')
    print('Sample: {}.'.format(iteration))
    print('Tuned Decision Tree Parameters: {}.'.format(tree_cv.best_params_))
    print('Best score is {}.'.format(tree_cv.best_score_))

    record = {"Best score": tree_cv.best_score_}
    accur.append(record)

    # ::: Predict the hold-out set and build its confusion matrix :::
    # `cm` is not used further in this block; it is kept so the matrix of the
    # last search remains available for inspection.
    y_pred = tree_cv.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    if tree_cv.best_score_ > this_acc:

        print('!!! Current acc: {}, got new value is {}.'.format(this_acc, tree_cv.best_score_))

        # Copy every tuned parameter, so extending param_dist needs no change here
        best_hyper = dict(tree_cv.best_params_)
        best_hyper['_best_score_'] = tree_cv.best_score_

        this_acc = tree_cv.best_score_

 acc_df = pd.DataFrame(accur)
 best_scores = acc_df['Best score']

 print('\nAverage accuracy is: {}.'.format(best_scores.mean()))
 print('Maximum accuracy is: {}.'.format(best_scores.max()))
 # Mean minus maximum, so the printed difference is never positive
 print('The difference is: {}.'.format(best_scores.mean() - best_scores.max()))

 print('Selected features: {}'.format(best_hyper))

 # ------------------------------->
 # Plot accuracies
 def plot_acc(acc_df):
    """Show a histogram of the scores held in ``acc_df``.

    Parameters
    ----------
    acc_df : object exposing ``.values``
        The caller in this file passes the DataFrame of 'Best score' records.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Note: assigning ``plt.rcParams['figure.figsize']`` changes the default
    figure size for figures created afterwards as well.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Generate plot
    sns.set_style('darkgrid')

    # Set plot size
    plt.rcParams['figure.figsize'] = (7,5)

    plt.hist(acc_df.values, color = '#1f4e79')
    plt.margins(0.02)

    # Label the axes
    plt.xlabel('Test Accuracy', fontsize = 16, family='Arial')
    plt.ylabel('Samples', fontsize = 16)

    # Display the plot
    plt.show()
    return None

 """ !!! Open following line only for testing !!! """
 plot_acc(acc_df)

 # -----------------------------------------------------------------------------
 # ---------------- SET THE BEST HYPERPARAMETERS TO THE CLASSIFIER -------------
 # From here on `param_dist` holds concrete values (the best ones found), not
 # search distributions.
 param_dist = {'max_depth' : best_hyper['max_depth'],
               'min_samples_leaf' : best_hyper['min_samples_leaf'],
               'criterion' : best_hyper['criterion']}

 # Build a fresh classifier of the same kind as the estimator that was tuned.
 # The type is checked directly instead of parsing the estimator's repr.
 if isinstance(tree, DecisionTreeClassifier):
    classifier = DecisionTreeClassifier(max_depth = param_dist['max_depth'],
                                        min_samples_leaf = param_dist['min_samples_leaf'],
                                        criterion = param_dist['criterion'])

    print('\nDecision Tree classifier is using now...')
 elif isinstance(tree, RandomForestClassifier):
    classifier = RandomForestClassifier(max_depth = param_dist['max_depth'],
                                        min_samples_leaf = param_dist['min_samples_leaf'],
                                        criterion = param_dist['criterion'])

    print('\nRandom Forest Classifier is using now...\n')
 else:
    # Fail here with a clear message rather than with a NameError on the
    # first later use of `classifier`.
    raise TypeError('Unsupported classifier type: {}.'.format(type(tree).__name__))

 # ::: Import Test dataset :::
 test_set = pd.read_csv(cwd + '/Tests/TestSet_20180626.csv')

 # --------------------- RANDOM FOREST IMPLEMENTATION --------------------------
 # -----------------------------------------------------------------------------
 # ---------------------RE-ARRANGE TRAINING AND TEST SETS ----------------------

 # ::: Extract features from Test set :::
 X_test_RF = test_set[['Price per bedroom', 'Min. dist. to station', 'Evaluation']]       # <-- Pre-process for EVALUATION in JN!

 # ::: Feature Scaling to Test set (Real data) :::
 # Only transform: the scaler must keep the statistics learned from the
 # training data, otherwise test features land on a different scale than the
 # one the classifier is trained on. `.values` matches how `sc` was fitted
 # (on a plain array).
 X_test_RF_scaled = sc.transform(X_test_RF.values)

 # ::: Add extra column for Comps identification :::
 import numpy as np

 # Zero (0) is the default value for Comps; per the original note this field
 # may later be changed based on the prediction. `z` is currently not appended.
 z = np.zeros((len(X_test_RF), 1), dtype = 'float64')

 # <-----X_test_RF_scaled = np.append(X_test_RF_scaled, z, axis = 1)----->

 # ::: Fit classifier to the Train set (all labelled rows). Let algorithm learn right now!
 classifier.fit(X_train_RF, y)

 # ::: Predicting the Test set results :::
 y_pred = classifier.predict(X_test_RF_scaled)

 # ::: Get values of Probabilities :::
 probas = classifier.predict_proba(X_test_RF_scaled)

 # ::: Get importances of features :::
 importances = classifier.feature_importances_

 # ::: Print the feature ranking, most important feature first :::
 indices = np.argsort(importances)[::-1]
 print('Feature ranking:')
 for f in range(X_test_RF.shape[1]):
    print('%d. feature %d (%f)' % (f + 1, indices[f], importances[indices[f]]))
    
 # ----------------------- SOME RESULTS VISUALIZATIONS -------------------------
 # ::: ------------- Plot the feature importances of the forest -------------:::
 import matplotlib.pyplot as plt

 try:
    # `estimators_` exists on the forest but not on a single decision tree,
    # so compute the spread first and only then open a figure.
    std = np.std([member.feature_importances_ for member in classifier.estimators_],
                 axis = 0)
 except AttributeError:
    print('\n Feature importance graph can be plotted only for Random Forest Classifier. \n')
 else:
    plt.figure()
    plt.title('Feature importances')
    plt.bar(range(X_test_RF.shape[1]), importances[indices],
            color = '#1f4e79',
            yerr = std[indices],
            align = 'center')
    plt.xticks(range(X_test_RF.shape[1]), indices)
    plt.xlim([-1, X_test_RF.shape[1]])
    plt.show()

 # ::: ------------------------ Plot the Scatter -------------------------:::
 fig, ax = plt.subplots()

 # Columns 1 and 2 of the scaled test features. Dedicated names are used so
 # the module-level training target `y` is not overwritten.
 x_coords = X_test_RF_scaled[:, 1]
 y_coords = X_test_RF_scaled[:, 2]

 size_multiplier = 500

 # True where the predicted label equals 1.0 (compared element by element)
 predicted_one = np.array([label == 1.0 for label in y_pred], dtype = bool)

 # Marker size follows probas column 1 for rows predicted as 1.0 and column 0
 # for all other rows.
 # NOTE(review): assumes probas columns are ordered (class 0, class 1) - confirm.
 ax.scatter(x_coords[predicted_one], y_coords[predicted_one],
            c = '#5b9bd5',
            s = probas[predicted_one, 1] * size_multiplier,
            label = 'Result',
            alpha = 0.50,
            edgecolors = 'none')
 ax.scatter(x_coords[~predicted_one], y_coords[~predicted_one],
            c = '#ff8080',
            s = probas[~predicted_one, 0] * size_multiplier,
            label = 'Result',
            alpha = 0.50,
            edgecolors = 'none')

 plt.xlabel('Distance to the nearest transport hub', family = 'Arial')
 plt.ylabel('Evaluation of description', family = 'Arial')
 ax.grid(True)

 plt.show()

 # -------------------------- STRUCTURING DATA ---------------------------------
 # ::: Collect one record per test row, then build the DataFrame written to Excel :::
 list_of_data = []
 record_data = {}

 # (output column name, column name in test_set), looked up once by position
 source_columns = (('ID',             'ID'),
                   ('ADDRESS',        'Address'),
                   ('BEDROOMS',       'Bedrooms'),
                   ('BATHROOMS',      'Bathrooms'),
                   ('RECEPTION',      'Reception'),
                   ('SIZE',           'Size'),
                   ('FIRST LISTED',   'First Listed'),
                   ('ORIGINAL PRICE', 'Original price'),
                   ('CURRENT PRICE',  'Current price'),
                   ('WEB',            'URL'),
                   ('FLOORPLAN LINK', 'FLOORPLAN LINK'),
                   ('NEW',            'New'))
 column_positions = {target: test_set.columns.get_loc(source)
                     for target, source in source_columns}

 # Set list of possible areas
 areas = ['West Drayton', 'Ilford']

 area_value = areas[1]            # <----- Choose your area index here!

 for row in range(len(X_test_RF_scaled)):
    # Values copied from the test set, plus the area and both probabilities
    record_data = {target: test_set.iat[row, position]
                   for target, position in column_positions.items()}
    record_data['AREA'] = area_value
    record_data['0']    = probas[row][0]
    record_data['1']    = probas[row][1]

    list_of_data.append(record_data)

 # Fixed column order for the output, then ascending sort on column '0'
 output_order = ['ID', 'AREA', 'ADDRESS', 'BEDROOMS', 'BATHROOMS', 'RECEPTION', 'SIZE',
                 'FIRST LISTED', 'ORIGINAL PRICE', 'CURRENT PRICE', 'WEB',
                 'FLOORPLAN LINK', 'NEW', '1', '0']
 DF = pd.DataFrame(list_of_data)
 DF = DF[output_order]

 DF = DF.sort_values(by=['0'])

 #------------------------------------------------------------------------------
 #--------------------------- UPDATING EXCEL FILE ------------------------------

 """ 
            The workbook is read from File_before.xlsx; all new data is saved to a new File_After_SALES Excel file
            
 """
 def updateExcel(dataset, area_value):
    """Write the rows of ``dataset`` below the "Data Sales" table and save a new workbook.

    ``File_before.xlsx`` is opened from the ``Excel`` sub-folder of the
    module-level ``cwd``. The result is saved under a new name starting with
    "File_After_SALES", so this function does not save over the input file.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'ID', 'AREA', 'ADDRESS', 'BEDROOMS',
        'BATHROOMS', 'RECEPTION', 'SIZE', 'FIRST LISTED', 'ORIGINAL PRICE',
        'CURRENT PRICE', 'WEB', 'FLOORPLAN LINK', 'NEW', '1' and '0'.
    area_value : str
        Only used as part of the output file name.

    Returns
    -------
    None

    Side effects: changes the process working directory to ``cwd + '/Excel'``,
    performs one HTTP GET per dataset row (the 'WEB' link) and prints progress.
    Rows whose page title marks them as land, or whose current price contains
    "POA", are skipped.
    """
    print('# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #')
    print('Writing data to Excel file...')

    import datetime
    import os

    import openpyxl
    import requests
    from bs4 import BeautifulSoup
    from openpyxl.styles import PatternFill, Alignment, Font

    def ifNotLand(url):
        """Return True when the page title at ``url`` contains "land for sale".

        NOTE(review): despite the name, True means the property IS land.
        """
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        title = soup.find("title").text
        return "land for sale".upper() in title.upper()

    def put(row_number, column_number, value, alignment = None):
        """Set a cell's value (and alignment, when given); return the cell."""
        target = sheet.cell(row = row_number, column = column_number)
        target.value = value
        if alignment is not None:
            target.alignment = alignment
        return target

    directory = (cwd + '/Excel')
    os.chdir(directory)

    filename = "File_before.xlsx"
    wb = openpyxl.load_workbook(filename, data_only = False)

    print("---- Start update Excel file ----")

    # Index access replaces the deprecated Workbook.get_sheet_by_name()
    sheet = wb["Data Sales"]

    # Starting at row 10, find the first row whose column-2 text is at most
    # 4 characters long; str(None) is 'None', so an empty cell stops the scan.
    rowForSearching = 10
    while len(str(sheet.cell(row = rowForSearching, column = 2).value)) > 4:
        rowForSearching = rowForSearching + 1

    end_of_table = rowForSearching - 1                          # Get the end of current table

    # ---- Excel cell styles and formatting ----
    fillDefault = PatternFill(fill_type = None,
                              start_color = "FFFFFFFF",
                              end_color = "FF000000")

    fillGreen = PatternFill(fill_type = "solid",
                            start_color = "56af11",
                            end_color = "56af11")

    HyperlinkBlue = Font(color = "0563c1",
                         underline = "single")

    rightAligment = Alignment(horizontal = "right")
    centerAligment = Alignment(horizontal = "center")

    # ---- Positions of the dataset columns (KeyError early if one is missing) ----
    column_of = {name: dataset.columns.get_loc(name)
                 for name in ("ID", "AREA", "ADDRESS", "BEDROOMS", "BATHROOMS",
                              "RECEPTION", "SIZE", "FIRST LISTED",
                              "ORIGINAL PRICE", "CURRENT PRICE", "WEB",
                              "FLOORPLAN LINK", "NEW", "1", "0")}

    # Reset the fill of the rows about to be written.
    # NOTE(review): range() stops before sheet.max_column, so the last column
    # is not reset - confirm this is intended.
    for x in range(rowForSearching, rowForSearching + len(dataset), 1):
        for j in range(1, sheet.max_column, 1):
            sheet.cell(row = x, column = j).fill = fillDefault

    # Date label for the MONTH column, e.g. "27-Jun-18"; same for every row
    today_label = datetime.date.today().strftime('%d-%b-%Y').replace("-20", "-")

    property_index = rowForSearching          # next sheet row to receive data
    for dataset_index in range(len(dataset)):
        i = rowForSearching + dataset_index   # row number reported when skipping
        record = {name: dataset.iat[dataset_index, position]
                  for name, position in column_of.items()}

        land = ifNotLand(record["WEB"])
        if land:
            print("Row", i, "- this is LAND FOR SALE")
            continue
        if "POA" in str(record["CURRENT PRICE"]).upper():
            # Previously these rows were also reported as land
            print("Row", i, "- price is POA, row skipped")
            continue

        data_row = property_index

        # MONTH column
        put(data_row, 2, today_label, rightAligment)

        # SIZE column: a size of "0" is written as an empty string
        size = record["SIZE"]
        put(data_row, 9, "" if str(size).replace(" ", "") == "0" else size)

        # ORIGINAL PRICE column
        put(data_row, 11, record["ORIGINAL PRICE"], rightAligment)

        # ID column
        id_cell = put(data_row, 3, record["ID"], rightAligment)

        # Look for the same ID in earlier rows; on a match, highlight the ID
        # and take over the size stored in the earlier row.
        # NOTE(review): the range ends at property_index - 2, so the row
        # directly above the new one is never compared - confirm.
        new_id = str(id_cell.value)
        for past_row in range(10, property_index-1, 1):
            if str(sheet.cell(row = past_row, column = 3).value) != new_id:
                continue
            id_cell.fill = fillGreen
            size_cell = put(data_row, 9, sheet.cell(row = past_row, column = 9).value)
            size_cell.fill = fillGreen
            print("Duplicate on row:", past_row, ", new property is on", property_index, "row")

        # AREA, ADDRESS, BEDROOMS, BATHROOMS, RECEPTION, FIRST LISTED columns
        put(data_row, 4, record["AREA"])
        put(data_row, 5, record["ADDRESS"])
        put(data_row, 6, record["BEDROOMS"])
        put(data_row, 7, record["BATHROOMS"])
        put(data_row, 8, record["RECEPTION"])
        put(data_row, 10, record["FIRST LISTED"], rightAligment)

        # FLOOR PLAN LINK column: a non-string value (e.g. a missing value read
        # from CSV) is treated like "no link" instead of raising TypeError
        floorplan_url = record["FLOORPLAN LINK"]
        floor_cell = sheet.cell(row = data_row, column = 15)
        if isinstance(floorplan_url, str) and ".com" in floorplan_url:
            floor_cell.font = HyperlinkBlue
            floor_cell.value = '=HYPERLINK("' + floorplan_url + '","' + "Floor" + '")'
        else:
            floor_cell.value = ""
        floor_cell.alignment = centerAligment

        # CURRENT PRICE column
        put(data_row, 12, record["CURRENT PRICE"], rightAligment)

        # LINK (clickable) and LINK URL (plain) columns
        put(data_row, 16, '=HYPERLINK("' + record["WEB"] + '","' + "Link" + '")').font = HyperlinkBlue
        put(data_row, 18, record["WEB"])

        # NEW column
        put(data_row, 21, record["NEW"])

        # Probability columns <<1>> and <<0>>
        put(data_row, 28, record["1"]).fill = fillDefault
        put(data_row, 29, record["0"]).fill = fillDefault

        property_index = property_index + 1

        # NOTE(review): the row counter is advanced BEFORE the formulas are
        # written, so they go into the row after the data just written (and
        # refer to that next row). Kept as found because the workbook template
        # is not visible here - confirm whether this is intended.
        formula_row = property_index
        r = str(formula_row)

        formula_for_comps = ('=IF(AND(F{r}<>2, F{r}<>4), "",IFERROR(VLOOKUP(C{r},$C$3:$V${end},20,FALSE), "new"))'
                             .format(r = r, end = end_of_table))
        put(formula_row, 27, formula_for_comps).fill = fillGreen

        # NOTE(review): this formula reads column Y, while the "new" formula
        # above is written to column 27 - confirm against the template.
        formula_check_size = ('=IF(AND(Y{r}="new", I{r}=""), "check", IF(I{r}="", Y{r},I{r}))'
                              .format(r = r))
        put(formula_row, 26, formula_check_size).fill = fillGreen

        # Calculation: DISCOUNT column
        put(formula_row, 13, '=IFERROR((L{r}-K{r})/K{r},"")'.format(r = r)).number_format = "0%"

        # Calculation: PSF column
        put(formula_row, 14, '=IFERROR(L{r}/I{r},"")'.format(r = r))

        # Calculation: Type column
        put(formula_row, 24, '=IF(F{r}=0,"Studio",IF(F{r}>=5,"5+",F{r}))'.format(r = r))

    # ---- SAVE DATA TO EXCEL ----
    # NOTE(review): with the usual 'YYYY-MM-DD HH:MM:SS.ffffff' text this slice
    # appears to cut the stamp in the middle of the minutes - confirm intended.
    timeNow = str(datetime.datetime.now()).replace(":","")[:-10].replace(' ', '')
    wb.save("File_After_SALES - " + area_value + "_" + timeNow + ".xlsx")
    print('Finished.')

    return None

 updateExcel(DF, area_value)
 # ------------------------- Update EXCEL file : end ---------------------------

 #------------------------------------------------------------------------------

 """ 

                    !!! Part below is for EXPERIMENTING !!!

 """

 # ::: -------------------------- XGBOOST ---------------------------------- ###
 # ::: Ready to train! :::
 import xgboost as xgb

 # ::: Set-up XGBoost classifier :::
 # Note: this rebinds `classifier`, replacing the model used above.
 classifier = xgb.sklearn.XGBClassifier(nthread = -1, seed = 42)

 # ::: Train the model with parameters :::
 classifier.fit(X_train, y_train)

 # ::: Evaluation :::
 predictions = classifier.predict(X_test)

 #------------------------------------------------------------------------------
 # ::: Handy XGBoost methods

 # `%matplotlib inline` is IPython-only syntax and a SyntaxError in a plain
 # Python file. Request the inline backend only when running under IPython.
 try:
    get_ipython().run_line_magic('matplotlib', 'inline')
 except NameError:
    # Not running under IPython: keep matplotlib's default backend
    pass
 import matplotlib.pyplot as plt

 # Plot Feature Importances >>>
 plt.figure(figsize=(8,5))
 xgb.plot_importance(classifier, ax = plt.gca())

 # Plot Trees that were built by XGBoost
 plt.figure(figsize=(8,5))
 xgb.plot_tree(classifier, ax = plt.gca())

 # Display the figures when not running with the inline backend
 plt.show()

 # Access the characteristics of the model
 print('Number of boosting trees: {}.'.format(classifier.n_estimators))
 print('Max depth of trees: {}.'.format(classifier.max_depth))
 print('Objective function: {}.'.format(classifier.objective))
 # -----------------------------------------------------------------------------