Created
June 12, 2018 09:52
-
-
Save vb100/9021d0e9b0b48cf3cd2a6af3a927e8c6 to your computer and use it in GitHub Desktop.
Empirical distribution function example with Real Estate data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 01 Calculate the ECDF for the Zoopla distribution model
def ecdf(data):
    """Compute the empirical cumulative distribution function (ECDF).

    Parameters
    ----------
    data : array-like
        Measurements. Input of any shape is accepted; it is flattened to
        one dimension first so that ``x`` and ``y`` always line up.

    Returns
    -------
    x : numpy.ndarray
        The measurements sorted in ascending order (``np.sort`` places
        NaN values, if any, at the end; they are not removed).
    y : numpy.ndarray
        The values ``1/n, 2/n, ..., 1`` with ``n`` the number of
        measurements, i.e. ``y[i] == (i + 1) / n``.

    Empty input yields two empty arrays.
    """
    # Flatten before sorting: np.sort works along the last axis, so without
    # this a 2-D input would give an x whose shape disagrees with y.
    x = np.sort(np.asarray(data).ravel())
    # Take n from the sorted array itself so x and y cannot disagree in length.
    n = x.size
    y = np.arange(1, n + 1) / n
    return x, y
# 02 Load the dataset
# Import modules and packages
import numpy as np | |
import pandas as pd | |
# Read the listings table from the CSV file.
ds = pd.read_csv('ecdf_data_1.csv')

# 03 Build the data series for the ECDF
# Name of the column every ECDF below is computed from.
_PRICE_COL = 'Price per bedroom'

# Series over the whole dataset, converted from a pandas Series to a
# numpy.ndarray.
ds_test = ds[_PRICE_COL].values

# Rows belonging to each of the two areas.
ds_il = ds[ds['AREA'] == 'Ilford']
ds_wd = ds[ds['AREA'] == 'West Drayton']

# Each area split further by bedroom count (1 to 5).
ds_il_1b, ds_il_2b, ds_il_3b, ds_il_4b, ds_il_5b = (
    ds_il[ds_il['BEDROOMS'] == beds] for beds in range(1, 6)
)
ds_wd_1b, ds_wd_2b, ds_wd_3b, ds_wd_4b, ds_wd_5b = (
    ds_wd[ds_wd['BEDROOMS'] == beds] for beds in range(1, 6)
)

# x- and y-axis values for every series.
x_test, y_test = ecdf(ds_test)
x_wd, y_wd = ecdf(ds_wd[_PRICE_COL].values)
x_il, y_il = ecdf(ds_il[_PRICE_COL].values)
(
    (x_il_1b, y_il_1b),
    (x_il_2b, y_il_2b),
    (x_il_3b, y_il_3b),
    (x_il_4b, y_il_4b),
    (x_il_5b, y_il_5b),
) = (
    ecdf(subset[_PRICE_COL].values)
    for subset in (ds_il_1b, ds_il_2b, ds_il_3b, ds_il_4b, ds_il_5b)
)
(
    (x_wd_1b, y_wd_1b),
    (x_wd_2b, y_wd_2b),
    (x_wd_3b, y_wd_3b),
    (x_wd_4b, y_wd_4b),
    (x_wd_5b, y_wd_5b),
) = (
    ecdf(subset[_PRICE_COL].values)
    for subset in (ds_wd_1b, ds_wd_2b, ds_wd_3b, ds_wd_4b, ds_wd_5b)
)
# 04 Plot the ECDF
# Import the matplotlib and seaborn modules used for plotting
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import matplotlib.patches as mpatches | |
%matplotlib inline | |
# Generate plot: ECDF of price per bedroom, West Drayton vs. Ilford
sns.set_style('whitegrid')

# Set the default figure size BEFORE the figure is created. The rcParam is
# only read when a new figure is made, so assigning it after the plot calls
# would leave this figure at the previous default size.
plt.rcParams['figure.figsize'] = (11, 7)
plt.figure()

# One colour per area, shared by the markers and the legend patch so the
# legend always matches what is drawn.
wd_color = '#c00000'
il_color = '#1f4e79'
plt.plot(x_wd, y_wd, marker='.', linestyle='none', color=wd_color, alpha=0.5)
plt.plot(x_il, y_il, marker='.', linestyle='none', color=il_color, alpha=0.5)

# Make the margins nice
plt.margins(0.02)

# Label the axes
plt.xlabel('Price per bedroom (£)', fontsize=16, family='Arial')
plt.ylabel('ECDF', fontsize=16)
plt.tick_params(labelsize=14)

# Set axis style. The on/off flag is passed positionally because the `b`
# keyword was renamed to `visible` in Matplotlib 3.5 and later removed.
plt.grid(True, which='major', color='#cccccc', linestyle='--')

# Add legend. The red series is West Drayton and the blue one is Ilford,
# matching the plot calls above.
red_patch = mpatches.Patch(color=wd_color, label='West Drayton')
blue_patch = mpatches.Patch(color=il_color, label='Ilford')
plt.legend(handles=[red_patch, blue_patch], loc='upper left', fontsize=16)

# Display the plot
plt.show()
# Second plot: Ilford ECDFs of price per bedroom, split by bedroom count
sns.set_style('whitegrid')

# Set the default figure size BEFORE the figure is created. The rcParam is
# only read when a new figure is made, so assigning it after the plot calls
# would not resize this figure.
plt.rcParams['figure.figsize'] = (11, 7)
plt.figure()

# One row per series: x values, y values, colour, legend label. Both the
# plot calls and the legend patches are built from this table so colours and
# labels stay paired.
il_series = [
    (x_il_1b, y_il_1b, '#c00000', '1 bedroom properties'),
    (x_il_2b, y_il_2b, '#1f4e79', '2 bedroom properties'),
    (x_il_3b, y_il_3b, '#6e1a18', '3 bedroom properties'),
    (x_il_4b, y_il_4b, '#808080', '4 bedroom properties'),
    (x_il_5b, y_il_5b, '#ff8080', '5 bedroom properties'),
]
for xs, ys, series_color, _series_label in il_series:
    plt.plot(xs, ys, marker='.', linestyle='none', color=series_color,
             alpha=1, markersize=12)

# Make the margins nice
plt.margins(0.02)

# Label the axes
plt.xlabel('Price per bedroom (£)', fontsize=16, family='Arial')
plt.ylabel('ECDF', fontsize=16)
plt.tick_params(labelsize=14)

# Set axis style. The on/off flag is passed positionally because the `b`
# keyword was renamed to `visible` in Matplotlib 3.5 and later removed.
plt.grid(True, which='major', color='#cccccc', linestyle='--')

# Add legend: one patch per series, in the same order as the table.
patch_01, patch_02, patch_03, patch_04, patch_05 = (
    mpatches.Patch(color=series_color, label=series_label)
    for _xs, _ys, series_color, series_label in il_series
)
plt.legend(handles=[patch_01, patch_02, patch_03, patch_04, patch_05],
           loc='lower right', fontsize=16)

# Display the plot
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment