Skip to content

Instantly share code, notes, and snippets.

@vb100
Created June 12, 2018 09:52
Show Gist options
  • Save vb100/9021d0e9b0b48cf3cd2a6af3a927e8c6 to your computer and use it in GitHub Desktop.
Save vb100/9021d0e9b0b48cf3cd2a6af3a927e8c6 to your computer and use it in GitHub Desktop.
Empirical distribution function example with Real Estate data
# 01 Calculate ECDF for Zoopla distribution model
def ecdf(data):
    """Compute the ECDF for a one-dimensional set of measurements.

    Parameters
    ----------
    data : array-like
        The measurements (list, NumPy array, pandas Series, ...).
        Input with more than one dimension is flattened first.

    Returns
    -------
    x : numpy.ndarray
        The measurements sorted in ascending order.
    y : numpy.ndarray
        ECDF value for each entry of ``x``: ``i / n`` for the i-th smallest
        of ``n`` points, so the last value is 1.0.
    """
    # Flatten before sorting so the point count and the sorted values always
    # describe the same elements, whatever the shape of the input.
    x = np.sort(np.ravel(data))
    n = x.size
    if n == 0:
        # Nothing to rank: return two empty arrays instead of dividing by 0.
        return x, np.empty(0, dtype=float)
    # float(n) forces true division even where `/` on integers truncates.
    y = np.arange(1, n + 1) / float(n)
    return x, y
#02 Upload the dataset
#Import modules and packages
import numpy as np
import pandas as pd
# Read the dataset from the CSV file into a DataFrame.
ds = pd.read_csv('ecdf_data_1.csv')

#03 Building-up data series for ECDF
## Test series: the whole 'Price per bedroom' column as a numpy.ndarray
ds_test = ds['Price per bedroom'].values

## Rows per area (Ilford, West Drayton)
ds_il, ds_wd = (ds[ds['AREA'] == area] for area in ('Ilford', 'West Drayton'))

## Rows per area and bedroom count (1 to 5 bedrooms)
ds_il_1b, ds_il_2b, ds_il_3b, ds_il_4b, ds_il_5b = (
    ds_il[ds_il['BEDROOMS'] == beds] for beds in range(1, 6)
)
ds_wd_1b, ds_wd_2b, ds_wd_3b, ds_wd_4b, ds_wd_5b = (
    ds_wd[ds_wd['BEDROOMS'] == beds] for beds in range(1, 6)
)

## Get values for x and y axis: one (x, y) ECDF pair per series.
x_test, y_test = ecdf(ds_test)
(x_wd, y_wd), (x_il, y_il) = (
    ecdf(frame['Price per bedroom'].values) for frame in (ds_wd, ds_il)
)
(
    (x_il_1b, y_il_1b),
    (x_il_2b, y_il_2b),
    (x_il_3b, y_il_3b),
    (x_il_4b, y_il_4b),
    (x_il_5b, y_il_5b),
) = (
    ecdf(frame['Price per bedroom'].values)
    for frame in (ds_il_1b, ds_il_2b, ds_il_3b, ds_il_4b, ds_il_5b)
)
(
    (x_wd_1b, y_wd_1b),
    (x_wd_2b, y_wd_2b),
    (x_wd_3b, y_wd_3b),
    (x_wd_4b, y_wd_4b),
    (x_wd_5b, y_wd_5b),
) = (
    ecdf(frame['Price per bedroom'].values)
    for frame in (ds_wd_1b, ds_wd_2b, ds_wd_3b, ds_wd_4b, ds_wd_5b)
)
#04 Plotting the ECDF
# Import matplotlib module for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
%matplotlib inline
# Generate plot: ECDF of price per bedroom, West Drayton vs. Ilford
sns.set_style('whitegrid')
# Set plot size before the first plt.plot call creates the figure;
# rcParams only apply to figures created after they are changed.
plt.rcParams['figure.figsize'] = (11,7)
# West Drayton is drawn in red (#c00000), Ilford in blue (#1f4e79).
_ = plt.plot(x_wd, y_wd, marker='.', linestyle='none', color='#c00000', alpha=0.5)
_ = plt.plot(x_il, y_il, marker='.', linestyle='none', color='#1f4e79', alpha=0.5)
# Make the margins nice
_ = plt.margins(0.02)
# Label the axes
_ = plt.xlabel('Price per bedroom (£)', fontsize=16, family='Arial')
_ = plt.ylabel('ECDF', fontsize=16)
plt.tick_params(labelsize=14)
# Set axis style. The on/off flag is passed positionally because its keyword
# name differs between Matplotlib releases (`b` was renamed to `visible`).
_ = plt.grid(True, which='major', color='#cccccc', linestyle='--')
# Add legend; each patch label must name the series plotted in that colour.
red_patch = mpatches.Patch(color='#c00000', label='West Drayton')
blue_patch = mpatches.Patch(color='#1f4e79', label='Ilford')
_ = plt.legend(handles=[red_patch, blue_patch], loc='upper left', fontsize=16)
# Display the plot
plt.show()
# The next plot follows below
# Generate plot: Ilford ECDFs of price per bedroom, one series per bedroom count
sns.set_style('whitegrid')
# Set plot size before the first plt.plot call creates the figure;
# rcParams only apply to figures created after they are changed.
plt.rcParams['figure.figsize'] = (11,7)
# One row per series: x values, y values, colour, legend label.
# Keeping colour and label together means markers and legend cannot disagree.
ilford_series = [
    (x_il_1b, y_il_1b, '#c00000', '1 bedroom properties'),
    (x_il_2b, y_il_2b, '#1f4e79', '2 bedroom properties'),
    (x_il_3b, y_il_3b, '#6e1a18', '3 bedroom properties'),
    (x_il_4b, y_il_4b, '#808080', '4 bedroom properties'),
    (x_il_5b, y_il_5b, '#ff8080', '5 bedroom properties'),
]
for xs, ys, colour, _ in ilford_series:
    _ = plt.plot(xs, ys, marker='.', linestyle='none', color=colour, alpha=1, markersize=12)
# Make the margins nice
_ = plt.margins(0.02)
# Label the axes
_ = plt.xlabel('Price per bedroom (£)', fontsize=16, family='Arial')
_ = plt.ylabel('ECDF', fontsize=16)
plt.tick_params(labelsize=14)
# Set axis style. The on/off flag is passed positionally because its keyword
# name differs between Matplotlib releases (`b` was renamed to `visible`).
_ = plt.grid(True, which='major', color='#cccccc', linestyle='--')
# Add legend: one coloured patch per series, in the order they were plotted.
legend_handles = [
    mpatches.Patch(color=colour, label=text)
    for _, _, colour, text in ilford_series
]
_ = plt.legend(handles=legend_handles, loc='lower right', fontsize=16)
# Display the plot
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment