Created
February 23, 2021 12:19
-
-
Save kiko-datasparq/3070ad063dd3b69d240f23181c00126a to your computer and use it in GitHub Desktop.
GaussianMixture_RealEstatePrices
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"import matplotlib.dates as mdates\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from sklearn.linear_model import LinearRegression\n", | |
"import scipy\n", | |
"from scipy.stats import gamma\n", | |
"from scipy.stats import norm\n", | |
"from scipy.stats import lognorm\n", | |
"import pickle" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"DOWNLOAD DATA FROM\n", | |
"\n", | |
"https://www.kaggle.com/hm-land-registry/uk-housing-prices-paid\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Selected columns\n", | |
"columns = [\n", | |
" 'Price', 'Date of Transfer',\n", | |
" 'Property Type', 'Old/New', 'Duration', 'Town/City', 'District',\n", | |
" 'County', 'PPDCategory Type'\n", | |
" ]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Min date: 2012-01-01 00:00\n", | |
"Max date: 2014-12-30 00:00\n" | |
] | |
} | |
], | |
"source": [ | |
"# Read part of the dataset\n", | |
"df = pd.read_csv(\"data/train_price_houses.csv\", usecols=columns)\n", | |
"# Report the covered date range (taken before the column is converted to datetimes)\n", | |
"print(\"Min date: {}\".format(df['Date of Transfer'].min()))\n", | |
"print(\"Max date: {}\".format(df['Date of Transfer'].max()))\n", | |
"df['Date of Transfer'] = pd.to_datetime(df['Date of Transfer'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(2461243, 9)" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.shape" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Adjust price for inflation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df['count_col'] = 0\n", | |
"df['Price_mean'] = df['Price'] \n", | |
"df['Price_std'] = df['Price'] \n", | |
"df_group = df.groupby(by=['Town/City', 'County']).agg({'count_col': 'count', \n", | |
" 'Price_mean' : 'mean', 'Price_std': 'std'})\n", | |
"df_group = df_group.dropna()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_group_date = df.groupby(by='Date of Transfer').agg({'Price_mean': 'mean', 'count_col': 'count'}).reset_index()\n", | |
"df_group_date['Date of Transfer'] = pd.to_datetime(df_group_date['Date of Transfer'])\n", | |
"\n", | |
"# Count-weighted rolling mean over the last 20 dates present in the data (at least 7 required)\n", | |
"df_group_date['Price_x_count'] = df_group_date['Price_mean']*df_group_date['count_col']\n", | |
"df_group_date['Price_sum'] = df_group_date['Price_x_count'].rolling(window=20, min_periods=7).sum()\n", | |
"df_group_date['count_sum'] = df_group_date['count_col'].rolling(window=20, min_periods=7).sum()\n", | |
"df_group_date['Price_mean'] = df_group_date['Price_sum']/df_group_date['count_sum']\n", | |
"\n", | |
"df_group_date = df_group_date.dropna().reset_index()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Time axis in calendar days since the first smoothed date, so that beta is a per-day\n", | |
"# growth rate: the correction factor computed later multiplies beta by elapsed days,\n", | |
"# and fitting against the row index would mix units whenever dates are missing.\n", | |
"elapsed = df_group_date['Date of Transfer'] - df_group_date['Date of Transfer'].iloc[0]\n", | |
"x_train = np.array(elapsed.dt.days, dtype=np.float64)\n", | |
"y_train = np.array(df_group_date['Price_mean'])\n", | |
"\n", | |
"# Fit price(t) = alpha * exp(beta * t); alpha is the fitted price level at t = 0\n", | |
"params, _ = scipy.optimize.curve_fit(lambda t,a,b: a*np.exp(b*t), x_train, y_train, p0=(2e5, 0.01))\n", | |
"alpha, beta = params[0], params[1]\n", | |
"y_pred = alpha*np.exp(beta*x_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.00022242820326361787" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"beta" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"locator = mdates.YearLocator(1)\n", | |
"locator_fmt = mdates.DateFormatter(\"%Y\")\n", | |
"fig, ax = plt.subplots()\n", | |
"ax.plot(df_group_date['Date of Transfer'], df_group_date['Price_mean'])\n", | |
"ax.xaxis.set_major_locator(locator)\n", | |
"ax.xaxis.set_major_formatter(locator_fmt)\n", | |
"plt.yticks([220e3, 240e3, 260e3, 280e3, 300e3], ['220k', '240k', '260k', '280k', '300k'])\n", | |
"plt.xlabel('Year')\n", | |
"plt.ylabel('Mean price (£)')\n", | |
"plt.grid()\n", | |
"plt.tight_layout()\n", | |
"plt.savefig('figures_mixture/prices.png', dpi=600)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"locator = mdates.YearLocator(1)\n", | |
"locator_fmt = mdates.DateFormatter(\"%Y\")\n", | |
"fig, ax = plt.subplots()\n", | |
"ax.plot(df_group_date['Date of Transfer'], df_group_date['Price_mean'])\n", | |
"ax.plot(df_group_date['Date of Transfer'], y_pred, color='r')\n", | |
"ax.xaxis.set_major_locator(locator)\n", | |
"ax.xaxis.set_major_formatter(locator_fmt)\n", | |
"plt.yticks([220e3, 240e3, 260e3, 280e3, 300e3], ['220k', '240k', '260k', '280k', '300k'])\n", | |
"plt.xlabel('Year')\n", | |
"plt.ylabel('Mean price (£)')\n", | |
"plt.grid()\n", | |
"plt.legend(['Mean prices', 'Exponential fit'])\n", | |
"plt.tight_layout()\n", | |
"plt.savefig('figures_mixture/prices_fit.png', dpi=600)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Correction factor\n", | |
"ini_date = df_group_date['Date of Transfer'][0]\n", | |
"df_group_date['factor'] = np.exp(-beta*(df_group_date['Date of Transfer'] - ini_date).dt.days)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Drop a previously merged 'factor' column so this cell can be re-run without the merge\n", | |
"# producing factor_x / factor_y; errors='ignore' makes this a no-op on the first run.\n", | |
"df = df.drop(columns=['factor'], errors='ignore')\n", | |
"# Left join: sales on dates missing from df_group_date get a NaN factor and NaN Price_adj\n", | |
"df = pd.merge(df, df_group_date[['Date of Transfer', 'factor']], on='Date of Transfer', how='left')\n", | |
"df['Price_adj'] = df['Price']*df['factor']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Plot Adjusted Price" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_group_adj = df[df['Date of Transfer'] > ini_date].groupby(by='Date of Transfer').agg({'Price_adj': 'mean', 'count_col': 'count'}).reset_index()\n", | |
"df_group_adj['Date of Transfer'] = pd.to_datetime(df_group_adj['Date of Transfer'])\n", | |
"\n", | |
"# Count-weighted rolling mean over the last 20 dates present in the data (at least 7 required)\n", | |
"df_group_adj['Price_x_count'] = df_group_adj['Price_adj']*df_group_adj['count_col']\n", | |
"df_group_adj['Price_sum'] = df_group_adj['Price_x_count'].rolling(window=20, min_periods=7).sum()\n", | |
"df_group_adj['count_sum'] = df_group_adj['count_col'].rolling(window=20, min_periods=7).sum()\n", | |
"df_group_adj['Price_adj'] = df_group_adj['Price_sum']/df_group_adj['count_sum']\n", | |
"df_group_adj = df_group_adj.dropna().reset_index()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"locator = mdates.YearLocator(1)\n", | |
"locator_fmt = mdates.DateFormatter(\"%Y\")\n", | |
"fig, ax = plt.subplots()\n", | |
"ax.plot(df_group_adj['Date of Transfer'], df_group_adj['Price_adj'])\n", | |
"plt.hlines(alpha, df_group_adj['Date of Transfer'].min(), df_group_adj['Date of Transfer'].max(), color='r')\n", | |
"ax.xaxis.set_major_locator(locator)\n", | |
"ax.xaxis.set_major_formatter(locator_fmt)\n", | |
"plt.yticks([210e3, 220e3, 230e3, 240e3, 250e3, 260e3], ['210k', '220k', '230k', '240k', '250k', '260k'])\n", | |
"plt.xlabel('Year')\n", | |
"plt.ylabel('Mean price (£)')\n", | |
"plt.grid()\n", | |
"plt.legend(['Adjusted prices', 'Mean price'])\n", | |
"plt.tight_layout()\n", | |
"plt.savefig('figures_mixture/prices_adjusted.png', dpi=600)\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Price Distribution" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Split by county: Greater London vs. West Midlands + Greater Manchester\n", | |
"col = 'County'\n", | |
"set1 = ['GREATER LONDON']\n", | |
"set2 = ['WEST MIDLANDS', 'GREATER MANCHESTER']\n", | |
"df_set1 = df[df[col].isin(set1)]\n", | |
"df_set2 = df[df[col].isin(set2)]\n", | |
"df_subset = df[df[col].isin(set1+set2)]\n", | |
"#df_old = df[df['Property Type'] == 'D']\n", | |
"#df_new = df[df['Property Type'] != 'D']\n", | |
"#PPDCategory Type\n", | |
"\n", | |
"# Shared bin edges so the three histograms are directly comparable\n", | |
"price_bins = np.arange(0, 1.2e6, 1e3)\n", | |
"plt.figure()\n", | |
"plt.hist(df_subset['Price_adj'], bins=price_bins, alpha=.5)\n", | |
"plt.hist(df_set1['Price_adj'], bins=price_bins, alpha=.5)\n", | |
"plt.hist(df_set2['Price_adj'], bins=price_bins, alpha=.5)\n", | |
"#plt.hist(np.log(df['Price_adj']), bins=np.arange(8, 16, 0.01), alpha=.5)\n", | |
"#plt.hist(np.log(df_old['Price_adj']), bins=np.arange(8, 16, 0.01), alpha=.5)\n", | |
"#plt.hist(np.log(df_new['Price_adj']), bins=np.arange(8, 16, 0.01), alpha=.5)\n", | |
"#plt.yscale('log')\n", | |
"plt.show()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"plt.figure()\n", | |
"plt.hist(df['Price_adj'], bins=np.arange(0, 1.2e6, 1e3), alpha=1)\n", | |
"plt.xticks([0, 2e5, 4e5, 6e5, 8e5, 1e6, 1.2e6], ['0', '200k', '400k', '600k', '800k', '1M', '1.2M'])\n", | |
"plt.yticks([0, 3e3, 6e3, 9e3, 12e3], ['0', '3k', '6k', '9k', '12k'])\n", | |
"plt.xlabel('Price (£)')\n", | |
"plt.ylabel('Frequency')\n", | |
"plt.grid()\n", | |
"plt.tight_layout()\n", | |
"plt.savefig('figures_mixture/frequencies_price.png', dpi=600)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 720x288 with 2 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# plt.figure()\n", | |
"# plt.hist(df['Price_adj'], bins=np.arange(0, 1e8, 5e5), alpha=1, log=True)\n", | |
"# plt.xticks([0, 2e7, 4e7, 6e7, 8e7, 10e7], ['0', '20M', '40M', '60M', '80M', '100M'])\n", | |
"# plt.yticks([1, 1e2, 1e4, 1e6], ['1', '100', '10k', '1M'])\n", | |
"# plt.grid()\n", | |
"# plt.xlabel('Price (£)')\n", | |
"# plt.ylabel('Frequency')\n", | |
"# plt.show()\n", | |
"\n", | |
"bins = np.arange(0, 1.2e6, 1e3)\n", | |
"x = df['Price_adj']\n", | |
"log_x = np.log(x)\n", | |
"log_bins = np.arange(8, 19, 0.01)\n", | |
"\n", | |
"fig, (ax1, ax2) = plt.subplots(1, 2)\n", | |
"fig.set_figwidth(10)\n", | |
"\n", | |
"# Plot 1\n", | |
"ax1.hist(log_x, bins=log_bins, alpha=1, log=True)\n", | |
"plt.setp(ax1, xticks=np.log(10**np.array([4, 5, 6, 7, 8])), xticklabels=['10k', '100k', '1M', '10M', '100M'])\n", | |
"#plt.yticks([0, 4e3, 8e3, 12e3, 16e3], ['0', '4k', '8k', '12k', '16k'])\n", | |
"ax1.grid()\n", | |
"plt.setp(ax1, xlabel='Price (£)')\n", | |
"plt.setp(ax1, ylabel='Frequency')\n", | |
"\n", | |
"# Plot 2\n", | |
"ax2.hist(log_x, bins=log_bins, alpha=1)\n", | |
"plt.setp(ax2, xticks=np.log(10**np.array([4, 5, 6, 7, 8])), xticklabels=['10k', '100k', '1M', '10M', '100M'])\n", | |
"plt.setp(ax2, yticks=[0, 4e3, 8e3, 12e3, 16e3], yticklabels=['0', '4k', '8k', '12k', '16k'])\n", | |
"ax2.grid()\n", | |
"plt.setp(ax2, xlabel='Price (£)')\n", | |
"plt.setp(ax2, ylabel='Frequency')\n", | |
"\n", | |
"plt.savefig('figures_mixture/frequencies_log_price.png', dpi=600)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"290114.4940007791\n", | |
"114095.06862380102\n", | |
"1142955.227998204\n" | |
] | |
} | |
], | |
"source": [ | |
"print(df_set1['Price_adj'].median())\n", | |
"print(df_set2['Price_adj'].median())\n", | |
"print(df['Price_adj'].quantile(.99))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Data Exploration" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['S' 'D' 'T' 'F' 'O']\n", | |
"['N' 'Y']\n", | |
"['F' 'L']\n", | |
"1150\n", | |
"349\n", | |
"113\n", | |
"['A' 'B']\n" | |
] | |
} | |
], | |
"source": [ | |
"print(df['Property Type'].unique())\n", | |
"print(df['Old/New'].unique())\n", | |
"print(df['Duration'].unique())\n", | |
"print(len(df['Town/City'].unique()))\n", | |
"print(len(df['District'].unique()))\n", | |
"print(len(df['County'].unique()))\n", | |
"print(df['PPDCategory Type'].unique())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>County</th>\n", | |
" <th>Price_adj</th>\n", | |
" <th>count_col</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>BLAENAU GWENT</td>\n", | |
" <td>73269.808797</td>\n", | |
" <td>2016</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>MERTHYR TYDFIL</td>\n", | |
" <td>92139.130742</td>\n", | |
" <td>1817</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>CITY OF KINGSTON UPON HULL</td>\n", | |
" <td>93040.445431</td>\n", | |
" <td>8930</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>STOKE-ON-TRENT</td>\n", | |
" <td>93357.951855</td>\n", | |
" <td>8884</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>CHESHIRE</td>\n", | |
" <td>93582.763717</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>RHONDDA CYNON TAFF</td>\n", | |
" <td>97942.358842</td>\n", | |
" <td>8668</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>NEATH PORT TALBOT</td>\n", | |
" <td>98825.860753</td>\n", | |
" <td>4795</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>BLACKPOOL</td>\n", | |
" <td>102804.562947</td>\n", | |
" <td>5346</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>BLACKBURN WITH DARWEN</td>\n", | |
" <td>104437.944282</td>\n", | |
" <td>4375</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>HARTLEPOOL</td>\n", | |
" <td>106497.267459</td>\n", | |
" <td>3278</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>NORTH EAST LINCOLNSHIRE</td>\n", | |
" <td>107329.134530</td>\n", | |
" <td>6363</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>COUNTY DURHAM</td>\n", | |
" <td>109642.945978</td>\n", | |
" <td>19041</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>CAERPHILLY</td>\n", | |
" <td>111230.417326</td>\n", | |
" <td>6052</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>REDCAR AND CLEVELAND</td>\n", | |
" <td>113482.892914</td>\n", | |
" <td>5042</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>MIDDLESBROUGH</td>\n", | |
" <td>116489.333010</td>\n", | |
" <td>4511</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>NORTH LINCOLNSHIRE</td>\n", | |
" <td>117873.960134</td>\n", | |
" <td>6222</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>TORFAEN</td>\n", | |
" <td>119174.545248</td>\n", | |
" <td>2935</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>DARLINGTON</td>\n", | |
" <td>122229.516852</td>\n", | |
" <td>4389</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>CARMARTHENSHIRE</td>\n", | |
" <td>125318.178312</td>\n", | |
" <td>6601</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>BRIDGEND</td>\n", | |
" <td>126621.160547</td>\n", | |
" <td>5795</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>HALTON</td>\n", | |
" <td>127586.846821</td>\n", | |
" <td>4083</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>MERSEYSIDE</td>\n", | |
" <td>127900.100493</td>\n", | |
" <td>46197</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>DENBIGHSHIRE</td>\n", | |
" <td>128796.602233</td>\n", | |
" <td>3645</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>SOUTH YORKSHIRE</td>\n", | |
" <td>132634.798086</td>\n", | |
" <td>48310</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>TYNE AND WEAR</td>\n", | |
" <td>132836.084261</td>\n", | |
" <td>39882</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>STOCKTON-ON-TEES</td>\n", | |
" <td>133128.442604</td>\n", | |
" <td>7856</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>LANCASHIRE</td>\n", | |
" <td>136263.712622</td>\n", | |
" <td>48494</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>LEICESTER</td>\n", | |
" <td>136328.622265</td>\n", | |
" <td>8937</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>CITY OF NOTTINGHAM</td>\n", | |
" <td>136861.714621</td>\n", | |
" <td>9726</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29</th>\n", | |
" <td>WREXHAM</td>\n", | |
" <td>137288.756408</td>\n", | |
" <td>4211</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>30</th>\n", | |
" <td>WREKIN</td>\n", | |
" <td>138486.625136</td>\n", | |
" <td>6979</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td>NEWPORT</td>\n", | |
" <td>139437.992324</td>\n", | |
" <td>5377</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>32</th>\n", | |
" <td>SWANSEA</td>\n", | |
" <td>139771.051919</td>\n", | |
" <td>8619</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>33</th>\n", | |
" <td>WEST YORKSHIRE</td>\n", | |
" <td>140539.665257</td>\n", | |
" <td>85211</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>34</th>\n", | |
" <td>CITY OF DERBY</td>\n", | |
" <td>141649.405825</td>\n", | |
" <td>10028</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>35</th>\n", | |
" <td>ISLE OF ANGLESEY</td>\n", | |
" <td>143291.146056</td>\n", | |
" <td>2690</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>36</th>\n", | |
" <td>LINCOLNSHIRE</td>\n", | |
" <td>143491.347628</td>\n", | |
" <td>36561</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>37</th>\n", | |
" <td>NOTTINGHAMSHIRE</td>\n", | |
" <td>143506.978333</td>\n", | |
" <td>36573</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>CITY OF PLYMOUTH</td>\n", | |
" <td>144657.571347</td>\n", | |
" <td>11370</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>39</th>\n", | |
" <td>GREATER MANCHESTER</td>\n", | |
" <td>145231.369334</td>\n", | |
" <td>97432</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>40</th>\n", | |
" <td>CONWY</td>\n", | |
" <td>145332.987587</td>\n", | |
" <td>5180</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41</th>\n", | |
" <td>GWYNEDD</td>\n", | |
" <td>147036.995239</td>\n", | |
" <td>4208</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>42</th>\n", | |
" <td>DERBYSHIRE</td>\n", | |
" <td>147294.649761</td>\n", | |
" <td>33630</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>43</th>\n", | |
" <td>EAST RIDING OF YORKSHIRE</td>\n", | |
" <td>148328.992704</td>\n", | |
" <td>16176</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>44</th>\n", | |
" <td>WEST MIDLANDS</td>\n", | |
" <td>149291.498659</td>\n", | |
" <td>88194</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>45</th>\n", | |
" <td>LUTON</td>\n", | |
" <td>149538.754231</td>\n", | |
" <td>6908</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>46</th>\n", | |
" <td>CUMBRIA</td>\n", | |
" <td>151739.284148</td>\n", | |
" <td>21670</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>47</th>\n", | |
" <td>NORTHUMBERLAND</td>\n", | |
" <td>151911.398850</td>\n", | |
" <td>12656</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>48</th>\n", | |
" <td>CITY OF PETERBOROUGH</td>\n", | |
" <td>152278.652859</td>\n", | |
" <td>8150</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>49</th>\n", | |
" <td>PEMBROKESHIRE</td>\n", | |
" <td>152330.039110</td>\n", | |
" <td>4493</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50</th>\n", | |
" <td>POWYS</td>\n", | |
" <td>154791.276240</td>\n", | |
" <td>4342</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>51</th>\n", | |
" <td>FLINTSHIRE</td>\n", | |
" <td>155422.313028</td>\n", | |
" <td>5682</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>52</th>\n", | |
" <td>PORTSMOUTH</td>\n", | |
" <td>156280.267976</td>\n", | |
" <td>9020</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>53</th>\n", | |
" <td>SWINDON</td>\n", | |
" <td>157336.880129</td>\n", | |
" <td>10587</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>54</th>\n", | |
" <td>STAFFORDSHIRE</td>\n", | |
" <td>159243.168849</td>\n", | |
" <td>34041</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>55</th>\n", | |
" <td>TORBAY</td>\n", | |
" <td>162658.155764</td>\n", | |
" <td>7395</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>56</th>\n", | |
" <td>SOUTHAMPTON</td>\n", | |
" <td>164225.810983</td>\n", | |
" <td>9877</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>57</th>\n", | |
" <td>MEDWAY</td>\n", | |
" <td>164857.459768</td>\n", | |
" <td>11829</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>58</th>\n", | |
" <td>WARRINGTON</td>\n", | |
" <td>165903.758442</td>\n", | |
" <td>8687</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>59</th>\n", | |
" <td>CEREDIGION</td>\n", | |
" <td>165913.966997</td>\n", | |
" <td>2371</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>60</th>\n", | |
" <td>NORTHAMPTONSHIRE</td>\n", | |
" <td>167826.572749</td>\n", | |
" <td>35029</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>61</th>\n", | |
" <td>THURROCK</td>\n", | |
" <td>169655.694670</td>\n", | |
" <td>6808</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>62</th>\n", | |
" <td>NORFOLK</td>\n", | |
" <td>171960.048301</td>\n", | |
" <td>45387</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>63</th>\n", | |
" <td>CARDIFF</td>\n", | |
" <td>175319.455507</td>\n", | |
" <td>14751</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>64</th>\n", | |
" <td>ISLE OF WIGHT</td>\n", | |
" <td>177023.087517</td>\n", | |
" <td>8017</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>65</th>\n", | |
" <td>LEICESTERSHIRE</td>\n", | |
" <td>178787.594166</td>\n", | |
" <td>32661</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>66</th>\n", | |
" <td>SHROPSHIRE</td>\n", | |
" <td>180224.994285</td>\n", | |
" <td>12745</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>67</th>\n", | |
" <td>SUFFOLK</td>\n", | |
" <td>186618.696740</td>\n", | |
" <td>37160</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>68</th>\n", | |
" <td>THE VALE OF GLAMORGAN</td>\n", | |
" <td>186984.017697</td>\n", | |
" <td>5470</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>69</th>\n", | |
" <td>CHESHIRE WEST AND CHESTER</td>\n", | |
" <td>187133.783921</td>\n", | |
" <td>14340</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>70</th>\n", | |
" <td>SOMERSET</td>\n", | |
" <td>190042.943176</td>\n", | |
" <td>27498</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>71</th>\n", | |
" <td>WORCESTERSHIRE</td>\n", | |
" <td>190473.349404</td>\n", | |
" <td>26388</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>72</th>\n", | |
" <td>HEREFORDSHIRE</td>\n", | |
" <td>191937.252028</td>\n", | |
" <td>7669</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>73</th>\n", | |
" <td>NORTH YORKSHIRE</td>\n", | |
" <td>192596.686729</td>\n", | |
" <td>28848</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>74</th>\n", | |
" <td>SOUTH GLOUCESTERSHIRE</td>\n", | |
" <td>194889.125019</td>\n", | |
" <td>12794</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75</th>\n", | |
" <td>CORNWALL</td>\n", | |
" <td>199168.369268</td>\n", | |
" <td>27400</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>76</th>\n", | |
" <td>BEDFORD</td>\n", | |
" <td>199342.258983</td>\n", | |
" <td>7971</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>77</th>\n", | |
" <td>YORK</td>\n", | |
" <td>199457.010277</td>\n", | |
" <td>10186</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>78</th>\n", | |
" <td>MONMOUTHSHIRE</td>\n", | |
" <td>200824.646281</td>\n", | |
" <td>3956</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>79</th>\n", | |
" <td>NORTH SOMERSET</td>\n", | |
" <td>201204.299395</td>\n", | |
" <td>11683</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>80</th>\n", | |
" <td>BOURNEMOUTH</td>\n", | |
" <td>202581.476247</td>\n", | |
" <td>10514</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>81</th>\n", | |
" <td>SOUTHEND-ON-SEA</td>\n", | |
" <td>202829.761767</td>\n", | |
" <td>9059</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>82</th>\n", | |
" <td>CHESHIRE EAST</td>\n", | |
" <td>203600.412792</td>\n", | |
" <td>17859</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>83</th>\n", | |
" <td>CITY OF BRISTOL</td>\n", | |
" <td>204345.825712</td>\n", | |
" <td>21591</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>84</th>\n", | |
" <td>WARWICKSHIRE</td>\n", | |
" <td>205537.558500</td>\n", | |
" <td>25625</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>85</th>\n", | |
" <td>SLOUGH</td>\n", | |
" <td>207672.811222</td>\n", | |
" <td>4803</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>86</th>\n", | |
" <td>CENTRAL BEDFORDSHIRE</td>\n", | |
" <td>208143.014559</td>\n", | |
" <td>14946</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>87</th>\n", | |
" <td>MILTON KEYNES</td>\n", | |
" <td>210914.827990</td>\n", | |
" <td>12499</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>88</th>\n", | |
" <td>DEVON</td>\n", | |
" <td>212786.196210</td>\n", | |
" <td>41792</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>89</th>\n", | |
" <td>GLOUCESTERSHIRE</td>\n", | |
" <td>213038.633608</td>\n", | |
" <td>31422</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>90</th>\n", | |
" <td>WILTSHIRE</td>\n", | |
" <td>217053.701071</td>\n", | |
" <td>23671</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>91</th>\n", | |
" <td>KENT</td>\n", | |
" <td>222816.214665</td>\n", | |
" <td>73717</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>92</th>\n", | |
" <td>CAMBRIDGESHIRE</td>\n", | |
" <td>226680.603347</td>\n", | |
" <td>32160</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>93</th>\n", | |
" <td>EAST SUSSEX</td>\n", | |
" <td>229368.771959</td>\n", | |
" <td>31054</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>94</th>\n", | |
" <td>ESSEX</td>\n", | |
" <td>234025.117029</td>\n", | |
" <td>70874</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>95</th>\n", | |
" <td>RUTLAND</td>\n", | |
" <td>235240.240618</td>\n", | |
" <td>1957</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>96</th>\n", | |
" <td>DORSET</td>\n", | |
" <td>239313.188820</td>\n", | |
" <td>22762</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>97</th>\n", | |
" <td>READING</td>\n", | |
" <td>243440.568329</td>\n", | |
" <td>7985</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>98</th>\n", | |
" <td>HAMPSHIRE</td>\n", | |
" <td>250628.699991</td>\n", | |
" <td>67579</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>99</th>\n", | |
" <td>WEST SUSSEX</td>\n", | |
" <td>255318.828667</td>\n", | |
" <td>45916</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>100</th>\n", | |
" <td>BRACKNELL FOREST</td>\n", | |
" <td>264293.307341</td>\n", | |
" <td>6143</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>101</th>\n", | |
" <td>POOLE</td>\n", | |
" <td>265033.774489</td>\n", | |
" <td>8114</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>102</th>\n", | |
" <td>BRIGHTON AND HOVE</td>\n", | |
" <td>276522.260494</td>\n", | |
" <td>14962</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>103</th>\n", | |
" <td>BATH AND NORTH EAST SOMERSET</td>\n", | |
" <td>276606.562034</td>\n", | |
" <td>9021</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>104</th>\n", | |
" <td>WEST BERKSHIRE</td>\n", | |
" <td>281778.408823</td>\n", | |
" <td>7781</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>105</th>\n", | |
" <td>ISLES OF SCILLY</td>\n", | |
" <td>282605.156667</td>\n", | |
" <td>45</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>106</th>\n", | |
" <td>OXFORDSHIRE</td>\n", | |
" <td>295648.926991</td>\n", | |
" <td>30887</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>107</th>\n", | |
" <td>WOKINGHAM</td>\n", | |
" <td>306010.842833</td>\n", | |
" <td>8156</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>108</th>\n", | |
" <td>HERTFORDSHIRE</td>\n", | |
" <td>306911.661768</td>\n", | |
" <td>56536</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>109</th>\n", | |
" <td>BUCKINGHAMSHIRE</td>\n", | |
" <td>333595.258420</td>\n", | |
" <td>26067</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>110</th>\n", | |
" <td>SURREY</td>\n", | |
" <td>386774.070890</td>\n", | |
" <td>61605</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>111</th>\n", | |
" <td>WINDSOR AND MAIDENHEAD</td>\n", | |
" <td>442982.051869</td>\n", | |
" <td>7118</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>112</th>\n", | |
" <td>GREATER LONDON</td>\n", | |
" <td>452225.698396</td>\n", | |
" <td>337924</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" County Price_adj count_col\n", | |
"0 BLAENAU GWENT 73269.808797 2016\n", | |
"1 MERTHYR TYDFIL 92139.130742 1817\n", | |
"2 CITY OF KINGSTON UPON HULL 93040.445431 8930\n", | |
"3 STOKE-ON-TRENT 93357.951855 8884\n", | |
"4 CHESHIRE 93582.763717 1\n", | |
"5 RHONDDA CYNON TAFF 97942.358842 8668\n", | |
"6 NEATH PORT TALBOT 98825.860753 4795\n", | |
"7 BLACKPOOL 102804.562947 5346\n", | |
"8 BLACKBURN WITH DARWEN 104437.944282 4375\n", | |
"9 HARTLEPOOL 106497.267459 3278\n", | |
"10 NORTH EAST LINCOLNSHIRE 107329.134530 6363\n", | |
"11 COUNTY DURHAM 109642.945978 19041\n", | |
"12 CAERPHILLY 111230.417326 6052\n", | |
"13 REDCAR AND CLEVELAND 113482.892914 5042\n", | |
"14 MIDDLESBROUGH 116489.333010 4511\n", | |
"15 NORTH LINCOLNSHIRE 117873.960134 6222\n", | |
"16 TORFAEN 119174.545248 2935\n", | |
"17 DARLINGTON 122229.516852 4389\n", | |
"18 CARMARTHENSHIRE 125318.178312 6601\n", | |
"19 BRIDGEND 126621.160547 5795\n", | |
"20 HALTON 127586.846821 4083\n", | |
"21 MERSEYSIDE 127900.100493 46197\n", | |
"22 DENBIGHSHIRE 128796.602233 3645\n", | |
"23 SOUTH YORKSHIRE 132634.798086 48310\n", | |
"24 TYNE AND WEAR 132836.084261 39882\n", | |
"25 STOCKTON-ON-TEES 133128.442604 7856\n", | |
"26 LANCASHIRE 136263.712622 48494\n", | |
"27 LEICESTER 136328.622265 8937\n", | |
"28 CITY OF NOTTINGHAM 136861.714621 9726\n", | |
"29 WREXHAM 137288.756408 4211\n", | |
"30 WREKIN 138486.625136 6979\n", | |
"31 NEWPORT 139437.992324 5377\n", | |
"32 SWANSEA 139771.051919 8619\n", | |
"33 WEST YORKSHIRE 140539.665257 85211\n", | |
"34 CITY OF DERBY 141649.405825 10028\n", | |
"35 ISLE OF ANGLESEY 143291.146056 2690\n", | |
"36 LINCOLNSHIRE 143491.347628 36561\n", | |
"37 NOTTINGHAMSHIRE 143506.978333 36573\n", | |
"38 CITY OF PLYMOUTH 144657.571347 11370\n", | |
"39 GREATER MANCHESTER 145231.369334 97432\n", | |
"40 CONWY 145332.987587 5180\n", | |
"41 GWYNEDD 147036.995239 4208\n", | |
"42 DERBYSHIRE 147294.649761 33630\n", | |
"43 EAST RIDING OF YORKSHIRE 148328.992704 16176\n", | |
"44 WEST MIDLANDS 149291.498659 88194\n", | |
"45 LUTON 149538.754231 6908\n", | |
"46 CUMBRIA 151739.284148 21670\n", | |
"47 NORTHUMBERLAND 151911.398850 12656\n", | |
"48 CITY OF PETERBOROUGH 152278.652859 8150\n", | |
"49 PEMBROKESHIRE 152330.039110 4493\n", | |
"50 POWYS 154791.276240 4342\n", | |
"51 FLINTSHIRE 155422.313028 5682\n", | |
"52 PORTSMOUTH 156280.267976 9020\n", | |
"53 SWINDON 157336.880129 10587\n", | |
"54 STAFFORDSHIRE 159243.168849 34041\n", | |
"55 TORBAY 162658.155764 7395\n", | |
"56 SOUTHAMPTON 164225.810983 9877\n", | |
"57 MEDWAY 164857.459768 11829\n", | |
"58 WARRINGTON 165903.758442 8687\n", | |
"59 CEREDIGION 165913.966997 2371\n", | |
"60 NORTHAMPTONSHIRE 167826.572749 35029\n", | |
"61 THURROCK 169655.694670 6808\n", | |
"62 NORFOLK 171960.048301 45387\n", | |
"63 CARDIFF 175319.455507 14751\n", | |
"64 ISLE OF WIGHT 177023.087517 8017\n", | |
"65 LEICESTERSHIRE 178787.594166 32661\n", | |
"66 SHROPSHIRE 180224.994285 12745\n", | |
"67 SUFFOLK 186618.696740 37160\n", | |
"68 THE VALE OF GLAMORGAN 186984.017697 5470\n", | |
"69 CHESHIRE WEST AND CHESTER 187133.783921 14340\n", | |
"70 SOMERSET 190042.943176 27498\n", | |
"71 WORCESTERSHIRE 190473.349404 26388\n", | |
"72 HEREFORDSHIRE 191937.252028 7669\n", | |
"73 NORTH YORKSHIRE 192596.686729 28848\n", | |
"74 SOUTH GLOUCESTERSHIRE 194889.125019 12794\n", | |
"75 CORNWALL 199168.369268 27400\n", | |
"76 BEDFORD 199342.258983 7971\n", | |
"77 YORK 199457.010277 10186\n", | |
"78 MONMOUTHSHIRE 200824.646281 3956\n", | |
"79 NORTH SOMERSET 201204.299395 11683\n", | |
"80 BOURNEMOUTH 202581.476247 10514\n", | |
"81 SOUTHEND-ON-SEA 202829.761767 9059\n", | |
"82 CHESHIRE EAST 203600.412792 17859\n", | |
"83 CITY OF BRISTOL 204345.825712 21591\n", | |
"84 WARWICKSHIRE 205537.558500 25625\n", | |
"85 SLOUGH 207672.811222 4803\n", | |
"86 CENTRAL BEDFORDSHIRE 208143.014559 14946\n", | |
"87 MILTON KEYNES 210914.827990 12499\n", | |
"88 DEVON 212786.196210 41792\n", | |
"89 GLOUCESTERSHIRE 213038.633608 31422\n", | |
"90 WILTSHIRE 217053.701071 23671\n", | |
"91 KENT 222816.214665 73717\n", | |
"92 CAMBRIDGESHIRE 226680.603347 32160\n", | |
"93 EAST SUSSEX 229368.771959 31054\n", | |
"94 ESSEX 234025.117029 70874\n", | |
"95 RUTLAND 235240.240618 1957\n", | |
"96 DORSET 239313.188820 22762\n", | |
"97 READING 243440.568329 7985\n", | |
"98 HAMPSHIRE 250628.699991 67579\n", | |
"99 WEST SUSSEX 255318.828667 45916\n", | |
"100 BRACKNELL FOREST 264293.307341 6143\n", | |
"101 POOLE 265033.774489 8114\n", | |
"102 BRIGHTON AND HOVE 276522.260494 14962\n", | |
"103 BATH AND NORTH EAST SOMERSET 276606.562034 9021\n", | |
"104 WEST BERKSHIRE 281778.408823 7781\n", | |
"105 ISLES OF SCILLY 282605.156667 45\n", | |
"106 OXFORDSHIRE 295648.926991 30887\n", | |
"107 WOKINGHAM 306010.842833 8156\n", | |
"108 HERTFORDSHIRE 306911.661768 56536\n", | |
"109 BUCKINGHAMSHIRE 333595.258420 26067\n", | |
"110 SURREY 386774.070890 61605\n", | |
"111 WINDSOR AND MAIDENHEAD 442982.051869 7118\n", | |
"112 GREATER LONDON 452225.698396 337924" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_sum_county = df.groupby('County').agg({'Price_adj': 'mean', 'count_col': 'count'}).sort_values(by='Price_adj').reset_index()\n", | |
"pd.set_option('display.max_rows', None)\n", | |
"df_sum_county" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Greater London" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[<matplotlib.lines.Line2D at 0x1dca6f31188>]" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 2 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"bins = np.arange(0, 1.2e6, 1e3)\n", | |
"df_set1 = df_set1.dropna()\n", | |
"df_set1_noout = df_set1[df_set1['Price_adj'] < df_set1['Price_adj'].quantile(.95)]\n", | |
"y, x = np.histogram(df_set1_noout['Price_adj'], bins=bins, density=True)\n", | |
"alpha, x0, inv_beta = gamma.fit(df_set1_noout['Price_adj'])\n", | |
"\n", | |
"# Plot of fit\n", | |
"fig, ax = plt.subplots()\n", | |
"ax2 = ax.twinx()\n", | |
"ax.plot(bins[1:], y)\n", | |
"ax.plot(bins, gamma.pdf(bins, alpha, loc=x0, scale=inv_beta), color='r')\n", | |
"#plt.ylim([0, 3e-6])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Greater Manchester & Midlands" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(0.0, 4.2e-06)" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"bins = np.arange(0, 1.2e6, 1e3)\n", | |
"df_subset = df_subset.dropna()\n", | |
"df_subset_noout = df_subset[df_subset['Price_adj'] < df_subset['Price_adj'].quantile(.95)]\n", | |
"y, x = np.histogram(df_subset_noout['Price_adj'], bins=bins, density=True)\n", | |
"alpha, x0, inv_beta = gamma.fit(df_subset_noout['Price_adj'])\n", | |
"\n", | |
"# Plot of fit\n", | |
"fig, ax = plt.subplots()\n", | |
"ax.plot(bins[1:], y)\n", | |
"ax.plot(bins, gamma.pdf(bins, alpha, loc=x0, scale=inv_beta), color='r')\n", | |
"plt.ylim([0, 4.2e-6])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## UK" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 720x288 with 2 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"bins = np.arange(0, 1.2e6, 1e3)\n", | |
"fig, (ax1, ax2) = plt.subplots(1, 2)\n", | |
"fig.set_figwidth(10)\n", | |
"\n", | |
"\n", | |
"#=============================\n", | |
"# Plot 1\n", | |
"#=============================\n", | |
"perc = df['Price_adj'].quantile(.95)\n", | |
"df_noout = df[df['Price_adj'] < perc]\n", | |
"#df_noout = df[df['Price_adj'] < 5e5]\n", | |
"y, x = np.histogram(df_noout['Price_adj'], bins=bins, density=True)\n", | |
"alpha, x0, inv_beta = gamma.fit(df_noout['Price_adj'])\n", | |
"\n", | |
"ax1.axis([0, 1.2e6, 0, 7e-6])\n", | |
"ax1.plot(bins[1:], y)\n", | |
"ax1.plot(bins, gamma.pdf(bins, alpha, loc=x0, scale=inv_beta), color='r')\n", | |
"ax1.axvline(perc, color='g', linestyle='dashed')\n", | |
"ax1.grid()\n", | |
"\n", | |
"plt.setp(ax1, xticks=[0, 2e5, 4e5, 6e5, 8e5, 1e6, 1.2e6], \n", | |
" xticklabels=['0', '200k', '400k', '600k', '800k', '1M', '1.2M'])\n", | |
"plt.setp(ax1, xlabel='Price (£)')\n", | |
"plt.setp(ax1, ylabel='pdf')\n", | |
"ax1.legend(['Empirical distribution (p95)', 'Gamma fit', 'p95 threshold'])\n", | |
"\n", | |
"#=============================\n", | |
"# Plot 2\n", | |
"#=============================\n", | |
"perc = df['Price_adj'].quantile(.99)\n", | |
"df_noout = df[df['Price_adj'] < perc]\n", | |
"y, x = np.histogram(df_noout['Price_adj'], bins=bins, density=True)\n", | |
"alpha, x0, inv_beta = gamma.fit(df_noout['Price_adj'])\n", | |
"\n", | |
"\n", | |
"ax2.axis([0, 1.2e6, 0, 7e-6])\n", | |
"ax2.plot(bins[1:], y)\n", | |
"ax2.plot(bins, gamma.pdf(bins, alpha, loc=x0, scale=inv_beta), color='r')\n", | |
"ax2.axvline(perc, color='g', linestyle='dashed')\n", | |
"ax2.grid()\n", | |
"plt.setp(ax2, xticks=[0, 2e5, 4e5, 6e5, 8e5, 1e6, 1.2e6], \n", | |
" xticklabels=['0', '200k', '400k', '600k', '800k', '1M', '1.2M'])\n", | |
"plt.setp(ax2, xlabel='Price(£)')\n", | |
"plt.setp(ax2, ylabel='pdf')\n", | |
"ax2.legend(['Empirical distribution (p99)', 'Gamma fit', 'p99 threshold'])\n", | |
"\n", | |
"plt.tight_layout()\n", | |
"#plt.savefig('figures_mixture/gamma_fit.png', dpi=600)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAD4CAYAAADsKpHdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAU50lEQVR4nO3df5Bd5X3f8ffHUux6kjCIsNVQRCriKJ5i2pHtLdAfzuBgg8BphDseR/rDyCm17DHMNHUzrUg6A2OXCU3HccuMjSPHGomODabBHjRBhCiME/rDxCw1BYF/aMFQpBHSxnJMp7i04G//uM/GB7Errfbe3b139/2aubPnfM+P+9yzuudzz/Ocu0pVIUnS65a6AZKk4WAgSJIAA0GS1BgIkiTAQJAkNauXugHzdfbZZ9f69euXuhmSNFIeeeSRv6iqsZmWjWwgrF+/nomJiaVuhiSNlCTPzrbMLiNJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIQ239jnuXuglaQQwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkScAcAiHJriTHkhzo1L6U5NH2eCbJo62+PskPO8s+29nm7UkeTzKZ5NYkafWzkuxPcrD9XLMAr1OSdApzuULYDWzqFqrqV6tqY1VtBO4GvtxZ/NT0sqr6SKd+G/AhYEN7TO9zB/BAVW0AHmjzkqRFdspAqKoHgeMzLWuf8t8P3HGyfSQ5Bzijqh6qqgJuB65uizcDe9r0nk5dkrSI+h1DeAdwtKoOdmrnJ/lGkj9L8o5WOxc41FnnUKsBrK2qI236eWDtbE+WZHuSiSQTU1NTfTZdktTVbyBs5dVXB0eAn62qtwIfA76Y5Iy57qxdPdRJlu+sqvGqGh8bG5tvmyVJM5h3ICRZDfxj4EvTtap6qaq+16YfAZ4CfgE4DKzrbL6u1QCOti6l6a6lY/NtkzQq/KN1Gkb9XCG8C/hWVf1VV1CSsSSr2vTP0Rs8frp1Cb2Q5JI27nANcE/bbC+wrU1v69QlSYtoLred3gF8DXhzkkNJrm2LtvDaweRfBB5rt6H+AfCRqpoekP4o8PvAJL0rh/ta/Rbg3UkO0guZW+b/ciRJ87X6VCtU1dZZ6h+coXY3vdtQZ1p/Arhwhvr3gMtO1Q5pubC7SMPKbypLkgADQZLUGAjSErL7SMPEQJAkAQaCNPS8itBiMRCkJeKJXsPGQJCWmMGgYWEgSJIAA0GS1BgI0iKye0jDzECQJAEGgrQo1u+4t++rA68utNAMBEkSYCBIkhoDQRoCdgdpGBgI0gLzZK9RYSBIkgADQRoag7gTSeqHgSAtEE/uGjWnDIQku5IcS3KgU7spyeEkj7bHVZ1lNySZTPLtJFd06ptabTLJjk79/CR/3upfSvL6Qb5AaRR5taClMJcrhN3Aphnqn6qqje2xDyDJBcAW4C1tm88kWZVkFfBp4ErgAmBrWxfg37Z9/TzwfeDafl6QJGl+ThkIVfUgcHyO+9sM3FlVL1XVd4FJ4KL2mKyqp6vq/wJ3ApuTBPgl4A/a9nuAq0/vJUjDy0/5GiX9jCFcn+Sx1qW0ptXOBZ7rrHOo1War/wzwl1X18gn1GSXZnmQiycTU1FQfTZcWVj9BMNO2BosWw3wD4TbgTcBG4AjwyUE16GSqamdVjVfV+NjY2GI8pSStGKvns1FVHZ2eTvI54A/b7GHgvM6q61qNWerfA85MsrpdJXTXlyQtonldISQ5pzP7XmD6DqS9wJYkb0hyPrAB+DrwMLCh3VH0enoDz3urqoCvAu9r228D7plPmyRJ/ZnLbad3AF8D3pzkUJJrgd9J8niSx4B3Av8coKqeAO4CngT+CLiuql5pn/6vB+4Hvgnc1dYF+FfAx5JM0htT+PxAX6G0jDiWoIV0yi6jqto6Q3nWk3ZV3QzcPEN9H7BvhvrT9O5CkiQtIb+pLEkCDARp4OzW0agyEC
RJgIEgSWoMBGmA7C7SKDMQJEmAgSBJagwEacTYLaWFYiBIkgADQZLUGAiSJMBAkCQ18/r/ECS9mgO9Wg68QpAkAQaCJKkxEKQ+2V2k5cJAkEaQIaSFYCBIkgADQZLUnDIQkuxKcizJgU7t3yX5VpLHknwlyZmtvj7JD5M82h6f7Wzz9iSPJ5lMcmuStPpZSfYnOdh+rlmA1ylJOoW5XCHsBjadUNsPXFhVfwf4DnBDZ9lTVbWxPT7Sqd8GfAjY0B7T+9wBPFBVG4AH2rwkaZGdMhCq6kHg+Am1P66ql9vsQ8C6k+0jyTnAGVX1UFUVcDtwdVu8GdjTpvd06pJOwoFlDdogxhD+CXBfZ/78JN9I8mdJ3tFq5wKHOuscajWAtVV1pE0/D6yd7YmSbE8ykWRiampqAE2X+uNJWctJX4GQ5LeAl4EvtNIR4Ger6q3Ax4AvJjljrvtrVw91kuU7q2q8qsbHxsb6aLkk6UTz/ltGST4I/DJwWTuRU1UvAS+16UeSPAX8AnCYV3crrWs1gKNJzqmqI61r6dh82yRJmr95XSEk2QT8S+BXqurFTn0syao2/XP0Bo+fbl1CLyS5pN1ddA1wT9tsL7CtTW/r1CVJi+iUVwhJ7gAuBc5Ocgi4kd5dRW8A9re7Rx9qdxT9IvDxJP8P+BHwkaqaHpD+KL07lt5Ib8xhetzhFuCuJNcCzwLvH8grkySdllMGQlVtnaH8+VnWvRu4e5ZlE8CFM9S/B1x2qnZIkhaW31SWJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCNNL889saJANBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIM2bd/houTEQJEmAgSBJagwESRIwx0BIsivJsSQHOrWzkuxPcrD9XNPqSXJrkskkjyV5W2ebbW39g0m2depvT/J42+bWJBnki5QGbdjGD4atPRpNc71C2A1sOqG2A3igqjYAD7R5gCuBDe2xHbgNegEC3AhcDFwE3DgdIm2dD3W2O/G5JEkLbE6BUFUPAsdPKG8G9rTpPcDVnfrt1fMQcGaSc4ArgP1Vdbyqvg/sBza1ZWdU1UNVVcDtnX1JmiOvEtSvfsYQ1lbVkTb9PLC2TZ8LPNdZ71Crnax+aIb6ayTZnmQiycTU1FQfTZcknWggg8rtk30NYl+neJ6dVTVeVeNjY2ML/XSStKL0EwhHW3cP7eexVj8MnNdZb12rnay+boa6JGkR9RMIe4HpO4W2Afd06te0u40uAX7QupbuBy5PsqYNJl8O3N+WvZDkknZ30TWdfUk6DY4jqB+r57JSkjuAS4Gzkxyid7fQLcBdSa4FngXe31bfB1wFTAIvAr8GUFXHk3wCeLit9/Gqmh6o/ii9O5neCNzXHpKkRTSnQKiqrbMsumyGdQu4bpb97AJ2zVCfAC6cS1skvZpXBRoUv6ksSQIMBElSYyBIkgADQZLUGAjSMuMgs+bLQJBOkydcLVcGgiQJMBAkSY2BIEkCDARJUmMgSKdhVAaUR6WdGi4GgiQJMBCkZcurBJ0uA0GSBBgIkqTGQJDmyC4YLXcGgiQJMBCkZc2rGp0OA0GSBPQRCEnenOTRzuOFJL+e5KYkhzv1qzrb3JBkMsm3k1zRqW9qtckkO/p9UZKk0zfvQKiqb1fVxqraCLwdeBH4Slv8qellVbUPIMkFwBbgLcAm4DNJViVZBXwauBK4ANja1pU0AHYbaa5WD2g/lwFPVdWzSWZbZzNwZ1W9BHw3ySRwUVs2WVVPAyS5s6375IDaJvXNk6pWgkGNIWwB7ujMX5/ksSS7kqxptXOB5zrrHGq12eqvkWR7kokkE1NTUwNquiQJBhAISV4P/Arwn1rpNuBNwEbgCPDJfp9jWlXtrKrxqhofGxsb1G4lSQzmCuFK4L9X1VGAqjpaVa9U1Y+Az/HjbqHDwHmd7da12mx1acnZVaSVZBCBsJVOd1GSczrL3gscaNN7gS1J3pDkfG
AD8HXgYWBDkvPb1caWtq6kATHYNBd9DSon+Ung3cCHO+XfSbIRKOCZ6WVV9USSu+gNFr8MXFdVr7T9XA/cD6wCdlXVE/20S5J0+voKhKr638DPnFD7wEnWvxm4eYb6PmBfP22RFoqfrrVS+E1lSRJgIEiSGgNBmsVy6ypabq9Hg2cgSJIAA0GS1BgI0gzsXtFKZCBIkgADQZLUGAjSCrJ+x712h2lWBoLU4clSK5mBIK1ABp9mYiBIJ/BkqZXKQJAkAQaCJKkxECRJgIEgSWoMBKlZaYPJK+316tQMBEkSYCBIkpq+AyHJM0keT/JokolWOyvJ/iQH2881rZ4ktyaZTPJYkrd19rOtrX8wybZ+2yWdDrtPpMFdIbyzqjZW1Xib3wE8UFUbgAfaPMCVwIb22A7cBr0AAW4ELgYuAm6cDhFJC8sw1LSF6jLaDOxp03uAqzv126vnIeDMJOcAVwD7q+p4VX0f2A9sWqC2SWoMA3UNIhAK+OMkjyTZ3mprq+pIm34eWNumzwWe62x7qNVmq79Kku1JJpJMTE1NDaDpkqRpqwewj39YVYeT/HVgf5JvdRdWVSWpATwPVbUT2AkwPj4+kH1Kknr6vkKoqsPt5zHgK/TGAI62riDaz2Nt9cPAeZ3N17XabHVJ0iLpKxCS/GSSn56eBi4HDgB7gek7hbYB97TpvcA17W6jS4AftK6l+4HLk6xpg8mXt5q04OxH9xiop98uo7XAV5JM7+uLVfVHSR4G7kpyLfAs8P62/j7gKmASeBH4NYCqOp7kE8DDbb2PV9XxPtsmSToNqRrNrvjx8fGamJhY6mZoGfDT8Y89c8t7lroJWmBJHul8ReBV/KayJAkwELTCeXUg/ZiBIEkCDARJHV4xrWwGgiQJMBAkSY2BoBXL7hHp1QwESa9hWK5MBoJWJE94s/PYrFwGgiQJMBAkSY2BIEkCDAStQPaRSzMzECTNyOBceQwErSie5KTZGQhaMQwD6eQMBEmzMkRXFgNBkgQYCJKkZt6BkOS8JF9N8mSSJ5L8s1a/KcnhJI+2x1WdbW5IMpnk20mu6NQ3tdpkkh39vSTptez6mD+P3crRzxXCy8C/qKoLgEuA65Jc0JZ9qqo2tsc+gLZsC/AWYBPwmSSrkqwCPg1cCVwAbO3sR9IQMBRWhtXz3bCqjgBH2vT/SvJN4NyTbLIZuLOqXgK+m2QSuKgtm6yqpwGS3NnWfXK+bZM0eNOh8Mwt71nilmihDGQMIcl64K3An7fS9UkeS7IryZpWOxd4rrPZoVabrT7T82xPMpFkYmpqahBNlyQ1fQdCkp8C7gZ+vapeAG4D3gRspHcF8cl+n2NaVe2sqvGqGh8bGxvUbrXM2d0hzU1fgZDkJ+iFwReq6ssAVXW0ql6pqh8Bn+PH3UKHgfM6m69rtdnqkoaQAbt89XOXUYDPA9+sqt/t1M/prPZe4ECb3gtsSfKGJOcDG4CvAw8DG5Kcn+T19Aae9863XZKk+Zn3oDLwD4APAI8nebTVfpPeXUIbgQKeAT4MUFVPJLmL3mDxy8B1VfUKQJLrgfuBVcCuqnqij3ZJQO+TrAOgC8MB5uUpVbXUbZiX8fHxmpiYWOpmaEjZrbE4DITRk+SRqhqfaZnfVJY0bwbv8mIgSOqLobB8GAiSJMBAkDQAXiUsDwaClh1PTkvD4z76DAQtK56UlpbHf7QZCFo2PBkNh/U77vV3MaIMBEkSYCBoGfAT6XDydzJ6DASNLE84w8+wHi0GgkaaJ5vR4O9pNBgIkiTAQNAIshtiNPk7G34GgqRFYygMNwNBI8UTyujzCm949fMf5EgLzhPH8uV/sjN8DARJS2qm0DckloaBoKHl1cHKdeLv3oBYHAaChoohoJl4FbE4hiYQkmwC/gOwCvj9qrpliZukRWAAaL5mC4n1O+41LOYpVbXUbSDJKuA7wLuBQ8DDwNaqenK2bcbHx2tiYmKRWqj5mH5jetLXMOqGxkoKkSSPVNX4jM
uGJBD+HnBTVV3R5m8AqKrfnm0bA2HhzXRC9wQvzd1M75elDqJRCIT3AZuq6p+2+Q8AF1fV9Sestx3Y3mYvBA4sakOXn7OBv1jqRow4j+FgeBz7N9dj+DeramymBUMzhjAXVbUT2AmQZGK2lNPceAz75zEcDI9j/wZxDIflm8qHgfM68+taTZK0SIYlEB4GNiQ5P8nrgS3A3iVukyStKEPRZVRVLye5Hrif3m2nu6rqiVNstnPhW7bseQz75zEcDI9j//o+hkMxqCxJWnrD0mUkSVpiBoIkCRiRQEiyK8mxJAc6tT9N4m1qJzHLcTsryf4kB9vPNa1+U5LfWLrWDo/TPG5JcmuSySSPJXlbq1+a5A+X6jUMo1mO6+4kLyb56U7t3yepJGcvTUuHz2n+m/xgO37v6qx7dau972TPMxKBAOwGNi11I0bQbl573HYAD1TVBuCBNq9X283cj9uVwIb22A7ctkhtHEW7mfl9PAlsBkjyOuCX8LbzE+3m9N7Lj9O7W3PaVuB/nOpJRiIQqupB4PhMy5K8rn3K+DeL3KyhN8tx2wzsadN7gKtP3C7Jh5Lcl+SNC9vC4XSax20zcHv1PAScmeSc7oZJ/m6SbyR50wI2e+id5H18J/CrbfpS4L8CLy9Ss0bCPN7L/xm4KMlPJPkp4OeBR0/1PCMRCCexGvgCcLCq/vVSN2ZErK2qI236eWBtd2G7/feXgaur6oeL3bghNttxOxd4rrPeoVYDIMnfBz4LbK6qpxajoSPoO8BY6/LYSi8gdGoney8X8CfAFfSCY07f6xr1QPg94EBV3bzUDRlF1bvnuHvf8TX0ukDeV1UvLU2rht8Mx202f4veveH/qKr+58K2auR9mV4Xx8X0Pt3qNMzyb/JOesd0C3DHXPYz6oHw34B3JvlrS92QEXJ0ukuj/TzWWfY4sJ7enw7Rq8123E72Z1eOAP8HeOtiNXKEfQn4BLC/qn601I0ZESd7L1NVXwf+NnB2VX1nLjsc9UD4PLAPuCvJUHzregTsBba16W3APZ1l3wA+DOxN8jcWu2FDbrbjthe4pt1tdAnwg85l/F8C7wF+O8mli9fU0VNVzwK/BXxmqdsyQk72Xp62A/jNue5wJAIhyR3A14A3JzmU5NrpZVX1u/ROZP+x3aGgZpbjdgvw7iQHgXe1+b9SVf8F+A3g3pV6299pHrd9wNP07pT5HPDR7r6q6ii9MZlPJ7l4kV7CUDrZ+xigqn7PcZaZzee9DFBV91XVV+f8PP7pCkkSjMgVgiRp4RkIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElS8/8BLhad+nsOUQMAAAAASUVORK5CYII=\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"x = df['Price_adj']\n", | |
"log_x = np.log(x)\n", | |
"log_bins = np.arange(8, 16, 0.01)\n", | |
"plt.hist(log_x, bins=log_bins, alpha=1)\n", | |
"plt.xticks(np.log(10**np.array([3, 4, 5, 6, 7])), ['1k', '10k', '100k', '1M', '10M'])\n", | |
"plt.show()\n", | |
"\n", | |
"# Gamma fit" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 1080x288 with 3 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"plt.rcParams['font.size'] = 12\n", | |
"\n", | |
"df = df.dropna()\n", | |
"x = df['Price_adj']\n", | |
"log_x = np.log(x)\n", | |
"log_bins = np.arange(8, 19, 0.05)\n", | |
"bins = np.exp(log_bins) #np.arange(1e3, 1e8, 1e5)\n", | |
"y, _ = np.histogram(x, bins=bins, density=True)\n", | |
"log_y, _ = np.histogram(log_x, bins=log_bins, density=True)\n", | |
"\n", | |
"# Normal fit - Baseline\n", | |
"mu_singleFit, std_singleFit = norm.fit(log_x)\n", | |
"pdf_single = norm.pdf(log_bins, loc=mu_singleFit, scale=std_singleFit)\n", | |
"\n", | |
"# Plot\n", | |
"norm_y = sum(np.diff(bins)*y)\n", | |
"norm_log_y = sum(np.diff(log_bins)*log_y)\n", | |
"norm_single = sum(np.diff(log_bins)*pdf_single[1:])\n", | |
"\n", | |
"# Dif bins\n", | |
"dif_bins = np.diff(bins)\n", | |
"dif_log_bins = np.diff(log_bins)\n", | |
"#=====================================================================\n", | |
"# PLOTS\n", | |
"#=====================================================================\n", | |
"fig, (ax1, ax2, ax3) = plt.subplots(1, 3)\n", | |
"fig.set_figwidth(15)\n", | |
"\n", | |
"# Plot 1\n", | |
"ax1.semilogy(log_bins[1:], log_y/norm_log_y/dif_bins*dif_log_bins)\n", | |
"ax1.semilogy(log_bins[1:], pdf_single[1:]/norm_single/dif_bins*dif_log_bins)\n", | |
"plt.setp(ax1, ylim=[1e-12, 1e-4])\n", | |
"plt.setp(ax1, xticks=np.log(10**np.array([4, 5, 6, 7, 8])), xticklabels=['10k', '100k', '1M', '10M', '100M'])\n", | |
"ax1.grid()\n", | |
"ax1.legend(['Empirical distribution', 'Log-normal fit'])\n", | |
"plt.setp(ax1, xlabel='Price (£)')\n", | |
"plt.setp(ax1, ylabel='pdf')\n", | |
"\n", | |
"# Plot 2\n", | |
"ax2.plot(log_bins[1:], log_y/norm_log_y/dif_bins*dif_log_bins)\n", | |
"ax2.plot(log_bins[1:], pdf_single[1:]/norm_single/dif_bins*dif_log_bins)\n", | |
"plt.setp(ax2, ylim=[1e-11, 6e-6])\n", | |
"plt.setp(ax2, xticks=np.log(10**np.array([4, 5, 6, 7, 8])), xticklabels=['10k', '100k', '1M', '10M', '100M'])\n", | |
"ax2.grid()\n", | |
"ax2.legend(['Empirical distribution', 'Log-normal fit'])\n", | |
"plt.setp(ax2, xlabel='Price (£)')\n", | |
"plt.setp(ax2, ylabel='pdf')\n", | |
"\n", | |
"# Plot 3\n", | |
"ax3.plot(bins[1:], y/norm_y)\n", | |
"ax3.plot(bins, pdf_single/norm_single/np.exp(log_bins))\n", | |
"plt.setp(ax3, xticks=[0, 2e5, 4e5, 6e5, 8e5, 1e6, 1.2e6], \n", | |
" xticklabels=['0', '200k', '400k', '600k', '800k', '1M', '1.2M'])\n", | |
"ax3.axis([0, 1.2e6, 0, 6e-6])\n", | |
"ax3.grid()\n", | |
"ax3.legend(['Empirical distribution', 'Log-normal fit'])\n", | |
"plt.setp(ax3, xlabel='Price (£)')\n", | |
"plt.setp(ax3, ylabel='pdf')\n", | |
"plt.tight_layout()\n", | |
"plt.savefig('figures_mixture/lognormal_fit.png', dpi=600)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## EM Algorithm" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Log Transformation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"x = df['Price_adj']\n", | |
"### Run the EM algorithm\n", | |
"## Initialize the parameters\n", | |
"KK_vec = np.arange(6, 7)\n", | |
"\n", | |
"# Parameters\n", | |
"w_vec = []\n", | |
"mu_vec = []\n", | |
"std_vec = []\n", | |
"w_ini_vec = []\n", | |
"mu_ini_vec = []\n", | |
"std_ini_vec = []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"===========================================\n", | |
"Mixture with 6 components\n", | |
"[1, -4345968.634045105, inf]\n", | |
"[2, -4561142.752665209, 0.04717548436614283]\n", | |
"[3, -4677573.910375801, 0.02489135606223656]\n", | |
"[4, -4761019.682703069, 0.01752686984900095]\n", | |
"[5, -4828835.309894625, 0.014043889020732737]\n", | |
"[6, -4887251.656693353, 0.011952801063297795]\n", | |
"[7, -4938957.65514636, 0.01046901027773133]\n", | |
"[8, -4985323.820098668, 0.009300532247349636]\n", | |
"[9, -5027157.514330835, 0.008321540376030017]\n", | |
"[10, -5064990.490991956, 0.007469505960259191]\n", | |
"[11, -5099205.211596792, 0.006709814409316905]\n", | |
"[12, -5130101.277128316, 0.006022505962849053]\n", | |
"[13, -5157936.177906341, 0.005396519037450155]\n", | |
"[14, -5182951.3166292375, 0.004826427491733671]\n", | |
"[15, -5205386.634308244, 0.004310019457755051]\n", | |
"[16, -5225485.672313755, 0.0038463483139954276]\n", | |
"[17, -5243493.514417743, 0.003434321422248882]\n", | |
"[18, -5259650.712175641, 0.0030719145894033168]\n", | |
"[19, -5274186.172111279, 0.0027559626189341577]\n", | |
"[20, -5287311.137114098, 0.0024823515511861456]\n", | |
"[21, -5299215.302163264, 0.0022464014708566244]\n", | |
"[22, -5310065.197849173, 0.0020432697681949433]\n", | |
"[23, -5320004.439879939, 0.0018682770180150768]\n", | |
"[24, -5329155.250590265, 0.0017171221854180886]\n", | |
"[25, -5337620.687107154, 0.001585994399590318]\n", | |
"[26, -5345487.138687189, 0.001471606118570984]\n", | |
"[27, -5352826.801465474, 0.0013711750913136048]\n", | |
"[28, -5359699.962458478, 0.0012823779392776903]\n", | |
"[29, -5366157.014123997, 0.0012032916011445708]\n", | |
"[30, -5372240.178371306, 0.0011323328900669297]\n", | |
"[31, -5377984.952031568, 0.0010682018844422373]\n", | |
"[32, -5383421.302217368, 0.001009831830096815]\n", | |
"[33, -5388574.646027783, 0.0009563463715240004]\n", | |
"[34, -5393466.649170534, 0.00090702389779367]\n", | |
"[35, -5398115.875193564, 0.0008612682888849171]\n", | |
"[36, -5402538.312908189, 0.0008185851646916964]\n", | |
"[37, -5406747.805238593, 0.0007785627297662053]\n", | |
"[38, -5410756.398655732, 0.0007408563834318437]\n", | |
"[39, -5414574.628767919, 0.0007051763756104083]\n", | |
"[40, -5418211.7546072975, 0.0006712779057197069]\n", | |
"[41, -5421675.951648219, 0.0006389531709042741]\n", | |
"[42, -5424974.4715629835, 0.0006080249652888372]\n", | |
"[43, -5428113.775090402, 0.0005783415118940171]\n", | |
"[44, -5431099.643094906, 0.0005497722746258005]\n", | |
"[45, -5433937.26986467, 0.0005222045505569517]\n", | |
"[46, -5436631.341883773, 0.0004955406849730401]\n", | |
"[47, -5439186.104670285, 0.0004696957848746249]\n", | |
"[48, -5441605.419764624, 0.00044459583297816127]\n", | |
"[49, -5443892.813550451, 0.0004201761247270126]\n", | |
"[50, -5446051.519271882, 0.0003963799669893727]\n", | |
"[51, -5448084.513357526, 0.0003731575897288264]\n", | |
"[52, -5449994.54695982, 0.0003504652318156886]\n", | |
"[53, -5451784.1734563215, 0.00032826436989465043]\n", | |
"[54, -5453455.772529614, 0.00030652106536059934]\n", | |
"[55, -5455011.571337275, 0.000285205409248958]\n", | |
"[56, -5456453.663198323, 0.00026429104873997846]\n", | |
"[57, -5457784.024152769, 0.00024375478189654152]\n", | |
"[58, -5459004.527694096, 0.000223576209753141]\n", | |
"[59, -5460116.957927387, 0.00020373743673681656]\n", | |
"[60, -5461123.021367006, 0.0001842228119898611]\n", | |
"[61, -5462024.357555112, 0.00016501870535585062]\n", | |
"[62, -5462822.548655455, 0.00014611331289528825]\n", | |
"[63, -5463519.128153566, 0.0001274964874785172]\n", | |
"[64, -5464115.588775476, 0.0001091595908284653]\n", | |
"[65, -5464613.389720304, 9.109536381181689e-05]\n" | |
] | |
} | |
], | |
"source": [ | |
"np.random.seed(1)\n", | |
"log_x = np.log(x)\n", | |
"for KK in KK_vec:\n", | |
" # INITIALISATION\n", | |
" w = 1/KK*np.ones((KK, 1)) #Assign equal weight to each component to start with\n", | |
" mu = np.random.normal(loc=log_x.mean(), scale=log_x.std()/KK, size=KK)#\n", | |
" std = log_x.std()*np.ones(KK)/KK\n", | |
"\n", | |
" # Initial parameters\n", | |
" w_ini = w.copy()\n", | |
" mu_ini = mu.copy()\n", | |
" std_ini = std.copy()\n", | |
" # Parameters\n", | |
" sw = False\n", | |
" QQ = -np.inf\n", | |
" epsilon = 1e-4\n", | |
" max_iter = 100\n", | |
" i = 0\n", | |
" # x = df_noout['Price_adj']\n", | |
" print(\"===========================================\")\n", | |
" print(\"Mixture with {} components\".format(KK))\n", | |
" while((~sw) & (i < max_iter)):\n", | |
" i+=1\n", | |
" ## E step\n", | |
" L = np.zeros([KK, len(x)])\n", | |
" v = np.zeros([KK, len(x)])\n", | |
" for k in range(KK):\n", | |
" L[k, :] = norm.logpdf(log_x, loc=mu[k], scale=std[k])\n", | |
" Lmax = np.amax(L, axis=0)\n", | |
" for k in range(KK):\n", | |
" L[k, :] -= Lmax\n", | |
" denom = (w*np.exp(L)).sum(axis=0)\n", | |
" for k in range(KK):\n", | |
" v[k, :] = w[k]*np.exp(L[k, :])/denom\n", | |
"\n", | |
" ## M step\n", | |
" for k in range(KK):\n", | |
" w[k] = v[k,:].mean()\n", | |
" mu[k] = (v[k,:]*log_x).sum()/v[k, :].sum()\n", | |
" std[k] = np.sqrt((v[k,:]*(log_x-mu[k])**2).sum()/v[k,:].sum())\n", | |
"\n", | |
" ##Check convergence\n", | |
" QQn = 0\n", | |
" for k in range(KK):\n", | |
" QQn += (v[k, :]*(np.log(w[k]) + norm.logpdf(log_x, loc=mu[k], scale=std[k]))).sum()\n", | |
" rel_error = abs(QQn-QQ)/abs(QQn)\n", | |
" if(rel_error < epsilon):\n", | |
" sw=True\n", | |
"\n", | |
" QQ = QQn\n", | |
" print([i, QQ, rel_error])\n", | |
"\n", | |
" ## ASSIGN Results\n", | |
" w_vec.append(w)\n", | |
" mu_vec.append(mu)\n", | |
" std_vec.append(std)\n", | |
" w_ini_vec.append(w_ini)\n", | |
" mu_ini_vec.append(mu_ini)\n", | |
" std_ini_vec.append(std_ini)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### BIC for mixture" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Disabled draft: BIC of each fitted mixture, one value per entry of KK_vec,\n", | |
"# with 3*KK-1 counted as the number of parameters.\n", | |
"# NOTE(review): check the lognorm call before re-enabling. scipy's lognorm needs the\n", | |
"# shape argument `s` (log-scale standard deviation) and takes exp(log-scale mean) as\n", | |
"# `scale`; this draft passes the mean as `loc`, the deviation as `scale` and no `s`.\n", | |
"# n = len(x)\n", | |
"# BIC_vec = []\n", | |
"# for index, KK in enumerate(KK_vec):\n", | |
"# LL = np.zeros(n)\n", | |
"# for k in range(KK):\n", | |
"# LL += w_vec[index][k]*lognorm.pdf(x, loc=mu_vec[index][k], scale=std_vec[index][k])\n", | |
"# LL = np.log(LL).sum()\n", | |
"# BIC_vec.append(-2*LL + (3*KK-1)*np.log(n))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Store and plot results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open('EMfit_LogScale.pickle', 'wb') as f:\n", | |
" pickle.dump([KK_vec, w_vec, mu_vec, std_vec], f)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### No transformation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"x = df['Price_adj']\n", | |
"### Run the EM algorithm\n", | |
"## Initialize the parameters\n", | |
"KK_vec = np.arange(6, 7)\n", | |
"\n", | |
"# Parameters\n", | |
"w_vec = []\n", | |
"mu_vec = []\n", | |
"std_vec = []\n", | |
"w_ini_vec = []\n", | |
"mu_ini_vec = []\n", | |
"std_ini_vec = []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"===========================================\n", | |
"Mixture with 6 components\n", | |
"[1, -36051070.66555333, inf]\n", | |
"[2, -35575849.8257936, 0.013357961709608407]\n", | |
"[3, -35306239.79036588, 0.007636328224941378]\n", | |
"[4, -35168907.97662805, 0.0039049211829124074]\n", | |
"[5, -35100674.77568613, 0.0019439284679844588]\n", | |
"[6, -35062128.22731208, 0.0010993784554134125]\n", | |
"[7, -35035059.4184294, 0.0007726206072436282]\n", | |
"[8, -35011893.844969004, 0.0006616486832437202]\n", | |
"[9, -34989671.84047764, 0.0006351018264097872]\n", | |
"[10, -34967525.91719405, 0.0006333282868232589]\n", | |
"[11, -34945450.67152252, 0.0006317058514721757]\n", | |
"[12, -34923684.17535287, 0.0006232588767083917]\n", | |
"[13, -34902456.603622735, 0.0006081970667913826]\n", | |
"[14, -34881917.07921315, 0.0005888301483815127]\n", | |
"[15, -34862128.14693682, 0.0005676340868499061]\n", | |
"[16, -34843079.18813082, 0.0005467071008032383]\n", | |
"[17, -34824701.54171546, 0.0005277187054523998]\n", | |
"[18, -34806880.9080733, 0.0005119859400566394]\n", | |
"[19, -34789466.99835528, 0.0005005512076067454]\n", | |
"[20, -34772281.60942785, 0.0004942266694047025]\n", | |
"[21, -34755126.53253923, 0.0004935984587065763]\n", | |
"[22, -34737792.65489288, 0.000498991914039055]\n", | |
"[23, -34720071.383203976, 0.0005104042412042289]\n", | |
"[24, -34701768.984793566, 0.0005274197525327848]\n", | |
"[25, -34682723.4932734, 0.0005491348314632848]\n", | |
"[26, -34662822.54579042, 0.0005741294569041849]\n", | |
"[27, -34642019.2960363, 0.0006005207022240826]\n", | |
"[28, -34620342.98380056, 0.000626114889904052]\n", | |
"[29, -34597901.308159865, 0.0006486426861794059]\n", | |
"[30, -34574873.40194717, 0.0006660300948896047]\n", | |
"[31, -34551494.31964694, 0.0006766446071461768]\n", | |
"[32, -34528033.708807595, 0.0006794655912699745]\n", | |
"[33, -34504772.19738355, 0.0006741534559618444]\n", | |
"[34, -34481978.9369646, 0.0006610194983477931]\n", | |
"[35, -34459892.979228675, 0.0006409177692234454]\n", | |
"[36, -34438710.1014483, 0.0006150891748842542]\n", | |
"[37, -34418575.61802432, 0.0005849888632065897]\n", | |
"[38, -34399582.78650502, 0.0005521238916522312]\n", | |
"[39, -34381775.75027318, 0.0005179207834166821]\n", | |
"[40, -34365155.627515234, 0.000483632983889113]\n", | |
"[41, -34349688.34797278, 0.0004502887882348552]\n", | |
"[42, -34335313.057657816, 0.00041867363465785627]\n", | |
"[43, -34321950.226527154, 0.0003893377573962288]\n", | |
"[44, -34309508.90458345, 0.00036262022806294713]\n", | |
"[45, -34297892.82993694, 0.00033868187483433634]\n", | |
"[46, -34287005.28767691, 0.0003175413591440578]\n", | |
"[47, -34276752.75592885, 0.0002991103568374933]\n", | |
"[48, -34267047.4632041, 0.00028322523950079047]\n", | |
"[49, -34257809.02745552, 0.00026967386446614494]\n", | |
"[50, -34248965.361238606, 0.0002582170329421451]\n", | |
"[51, -34240453.018675745, 0.0002486048463850306]\n", | |
"[52, -34232217.137733735, 0.00024058859257851235]\n", | |
"[53, -34224211.10338515, 0.0002339289669643496]\n", | |
"[54, -34216396.0291489, 0.0002284014432611127]\n", | |
"[55, -34208740.129656255, 0.00022379951625306184]\n", | |
"[56, -34201218.03668112, 0.00021993640598018038]\n", | |
"[57, -34193810.09558194, 0.00021664567588327704]\n", | |
"[58, -34186501.667691156, 0.0002137810988040364]\n", | |
"[59, -34179282.45598682, 0.0002112160111503919]\n", | |
"[60, -34172145.86556546, 0.00020884232583576502]\n", | |
"[61, -34165088.40633261, 0.00020656932447850153]\n", | |
"[62, -34158109.14242763, 0.0002043223140918068]\n", | |
"[63, -34151209.1908302, 0.00020204120910841686]\n", | |
"[64, -34144391.27009482, 0.0001996790829114292]\n", | |
"[65, -34137659.29903924, 0.00019720072183659144]\n", | |
"[66, -34131018.044351526, 0.0001945812070147343]\n", | |
"[67, -34124472.81539794, 0.0001918045441754527]\n", | |
"[68, -34118029.20396184, 0.00018886235771644661]\n", | |
"[69, -34111692.86619716, 0.00018575266227726132]\n", | |
"[70, -34105469.34372704, 0.0001824787223245983]\n", | |
"[71, -34099363.92055333, 0.00017904800769701957]\n", | |
"[72, -34093381.51226489, 0.00017547125052063293]\n", | |
"[73, -34087526.58394172, 0.00017176160636806745]\n", | |
"[74, -34081803.09314253, 0.0001679339201493191]\n", | |
"[75, -34076214.4544325, 0.00016400409492382668]\n", | |
"[76, -34070763.52204552, 0.00015998855979415985]\n", | |
"[77, -34065452.58747229, 0.00015590383129640984]\n", | |
"[78, -34060283.3890098, 0.0001517661612925454]\n", | |
"[79, -34055257.13058643, 0.00014759126334297482]\n", | |
"[80, -34050374.507477485, 0.0001433941088628426]\n", | |
"[81, -34045635.73683703, 0.0001391887840510461]\n", | |
"[82, -34041040.59128145, 0.0001349883985848269]\n", | |
"[83, -34036588.43405976, 0.00013080503735899194]\n", | |
"[84, -34032278.25462815, 0.0001266497470243167]\n", | |
"[85, -34028108.70370457, 0.00012253254977772274]\n", | |
"[86, -34024078.127110854, 0.00011846247762133498]\n", | |
"[87, -34020184.59791211, 0.00011444762116259525]\n", | |
"[88, -34016425.94653518, 0.00011049518790834949]\n", | |
"[89, -34012799.78869053, 0.00010661156585684798]\n", | |
"[90, -34009303.551036365, 0.00010280238902621964]\n", | |
"[91, -34005934.49461243, 9.90726022972865e-05]\n" | |
] | |
} | |
], | |
"source": [ | |
"np.random.seed(1)\n", | |
"for KK in KK_vec:\n", | |
" # INITIALISATION\n", | |
" w = 1/KK*np.ones((KK, 1)) #Assign equal weight to each component to start with\n", | |
" mu = np.random.normal(loc=x.mean(), scale=x.std()/KK, size=KK)#\n", | |
" std = x.std()*np.ones(KK)/KK\n", | |
"\n", | |
" # Initial parameters\n", | |
" w_ini = w.copy()\n", | |
" mu_ini = mu.copy()\n", | |
" std_ini = std.copy()\n", | |
" # Parameters\n", | |
" sw = False\n", | |
" QQ = -np.inf\n", | |
" epsilon = 1e-4\n", | |
" max_iter = 100\n", | |
" i = 0\n", | |
" # x = df_noout['Price_adj']\n", | |
" print(\"===========================================\")\n", | |
" print(\"Mixture with {} components\".format(KK))\n", | |
" while((~sw) & (i < max_iter)):\n", | |
" i+=1\n", | |
" ## E step\n", | |
" L = np.zeros([KK, len(x)])\n", | |
" v = np.zeros([KK, len(x)])\n", | |
" for k in range(KK):\n", | |
" L[k, :] = norm.logpdf(x, loc=mu[k], scale=std[k])\n", | |
" Lmax = np.amax(L, axis=0)\n", | |
" for k in range(KK):\n", | |
" L[k, :] -= Lmax\n", | |
" denom = (w*np.exp(L)).sum(axis=0)\n", | |
" for k in range(KK):\n", | |
" v[k, :] = w[k]*np.exp(L[k, :])/denom\n", | |
"\n", | |
" ## M step\n", | |
" for k in range(KK):\n", | |
" w[k] = v[k,:].mean()\n", | |
" mu[k] = (v[k,:]*x).sum()/v[k, :].sum()\n", | |
" std[k] = np.sqrt((v[k,:]*(x-mu[k])**2).sum()/v[k,:].sum())\n", | |
"\n", | |
" ##Check convergence\n", | |
" QQn = 0\n", | |
" for k in range(KK):\n", | |
" QQn += (v[k, :]*(np.log(w[k]) + norm.logpdf(x, loc=mu[k], scale=std[k]))).sum()\n", | |
" rel_error = abs(QQn-QQ)/abs(QQn)\n", | |
" if(rel_error < epsilon):\n", | |
" sw=True\n", | |
"\n", | |
" QQ = QQn\n", | |
" print([i, QQ, rel_error])\n", | |
"\n", | |
" ## ASSIGN Results\n", | |
" w_vec.append(w)\n", | |
" mu_vec.append(mu)\n", | |
" std_vec.append(std)\n", | |
" w_ini_vec.append(w_ini)\n", | |
" mu_ini_vec.append(mu_ini)\n", | |
" std_ini_vec.append(std_ini)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Store results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open('EMfit.pickle', 'wb') as f:\n", | |
" pickle.dump([KK_vec, w_vec, mu_vec, std_vec], f)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## PLOT" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open('EMfit_LogScale.pickle', 'rb') as f:\n", | |
" [KK_vec, w_vec, mu_vec, std_vec] = pickle.load(f)\n", | |
"\n", | |
"KK_log = KK_vec[0]\n", | |
"w_log = w_vec[0]\n", | |
"mu_log = mu_vec[0]\n", | |
"std_log = std_vec[0]\n", | |
"\n", | |
"with open('EMfit.pickle', 'rb') as f:\n", | |
" [KK_vec, w_vec, mu_vec, std_vec] = pickle.load(f)\n", | |
"\n", | |
"KK_lin = KK_vec[0]\n", | |
"w_lin = w_vec[0]\n", | |
"mu_lin = mu_vec[0]\n", | |
"std_lin = std_vec[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": " |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment