Last active
July 18, 2020 16:11
-
-
Save knaaptime/31f7fc7dfe57e7403c9621436756d573 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from libpysal.api import Queen\n", | |
"import pandas as pd\n", | |
"import geopandas as gpd\n", | |
"import matplotlib.pyplot as plt\n", | |
"from sklearn.cluster import AgglomerativeClustering\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"la = gpd.GeoDataFrame.from_file('/Users/knaaptime/projects/libpysal_test/data/la_msa.shp')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'geopandas.geodataframe.GeoDataFrame'>\n", | |
"RangeIndex: 2779 entries, 0 to 2778\n", | |
"Data columns (total 2 columns):\n", | |
"GEOID 2779 non-null object\n", | |
"geometry 2779 non-null object\n", | |
"dtypes: object(2)\n", | |
"memory usage: 43.5+ KB\n" | |
] | |
} | |
], | |
"source": [ | |
"la.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = pd.read_csv('/Users/knaaptime/projects/libpysal_test/data/la_data.csv', converters={'geoid':str})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data= data[data.year==2010]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = data[['geoid', 'pct_bachelor_or_greater', 'pct_black', 'median_household_income']]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"la = la.merge(data, left_on='GEOID', right_on='geoid', how='left')\n", | |
"la = la.dropna(subset=['median_household_income'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>GEOID</th>\n", | |
" <th>geometry</th>\n", | |
" <th>geoid</th>\n", | |
" <th>pct_bachelor_or_greater</th>\n", | |
" <th>pct_black</th>\n", | |
" <th>median_household_income</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>06037101110</td>\n", | |
" <td>POLYGON ((-118.2979312137039 34.26322687612055...</td>\n", | |
" <td>06037101110</td>\n", | |
" <td>18.48</td>\n", | |
" <td>0.76</td>\n", | |
" <td>56139.23</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>06037101122</td>\n", | |
" <td>POLYGON ((-118.2774342104291 34.25990787645111...</td>\n", | |
" <td>06037101122</td>\n", | |
" <td>35.72</td>\n", | |
" <td>0.00</td>\n", | |
" <td>87500.06</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>06037101210</td>\n", | |
" <td>POLYGON ((-118.2859362111909 34.25589987630816...</td>\n", | |
" <td>06037101210</td>\n", | |
" <td>13.63</td>\n", | |
" <td>3.71</td>\n", | |
" <td>42867.84</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>06037101220</td>\n", | |
" <td>POLYGON ((-118.2849922110132 34.25589387633028...</td>\n", | |
" <td>06037101220</td>\n", | |
" <td>16.98</td>\n", | |
" <td>0.72</td>\n", | |
" <td>49666.60</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>06037101300</td>\n", | |
" <td>POLYGON ((-118.2652762074589 34.25238487698516...</td>\n", | |
" <td>06037101300</td>\n", | |
" <td>33.59</td>\n", | |
" <td>0.32</td>\n", | |
" <td>65891.66</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" GEOID geometry \\\n", | |
"0 06037101110 POLYGON ((-118.2979312137039 34.26322687612055... \n", | |
"1 06037101122 POLYGON ((-118.2774342104291 34.25990787645111... \n", | |
"2 06037101210 POLYGON ((-118.2859362111909 34.25589987630816... \n", | |
"3 06037101220 POLYGON ((-118.2849922110132 34.25589387633028... \n", | |
"4 06037101300 POLYGON ((-118.2652762074589 34.25238487698516... \n", | |
"\n", | |
" geoid pct_bachelor_or_greater pct_black median_household_income \n", | |
"0 06037101110 18.48 0.76 56139.23 \n", | |
"1 06037101122 35.72 0.00 87500.06 \n", | |
"2 06037101210 13.63 3.71 42867.84 \n", | |
"3 06037101220 16.98 0.72 49666.60 \n", | |
"4 06037101300 33.59 0.32 65891.66 " | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"la.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"W = Queen.from_dataframe(la)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X = la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']].values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"model = AgglomerativeClustering(linkage='ward', n_clusters=6).fit(la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def lag_expected(X,W, n_clusters):\n", | |
" model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit_predict(W.sparse @ X)\n", | |
" return model\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"lag = lag_expected(la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']].values, W, 6)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0, 2, 2, ..., 0, 0, 0])" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"lag" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"How does this compare to the categorical lag?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from libpysal.api import lag_categorical" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cat_lag = lag_categorical(W, model.labels_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"767" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(lag[lag == cat_lag])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1996" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(lag[lag != cat_lag])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
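"They usually disagree: `lag` and `cat_lag` match for only 767 of the 2,763 observations (about 28%) and differ for the other 1,996. Note that this comparison assumes a given label id means the same cluster in both vectors; `lag` comes from a separate clustering fit, so its label numbering need not line up with the labels behind `cat_lag`, and part of the disagreement may be a labeling artifact." | |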
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The logic is slightly off here: we'd want to make predictions from a model that has already been fit. That is, cell 12 should take a fitted cluster model and simply predict class labels for the new data.