knaaptime · July 18, 2020 16:11 · knaaptime · Jul 18, 2020
diff --git a/cluster_lag.ipynb b/cluster_lag.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from libpysal.api import Queen\n",
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.cluster import AgglomerativeClustering\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "la = gpd.GeoDataFrame.from_file('/Users/knaaptime/projects/libpysal_test/data/la_msa.shp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'geopandas.geodataframe.GeoDataFrame'>\n",
      "RangeIndex: 2779 entries, 0 to 2778\n",
      "Data columns (total 2 columns):\n",
      "GEOID       2779 non-null object\n",
      "geometry    2779 non-null object\n",
      "dtypes: object(2)\n",
      "memory usage: 43.5+ KB\n"
     ]
    }
   ],
   "source": [
    "la.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('/Users/knaaptime/projects/libpysal_test/data/la_data.csv', converters={'geoid':str})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data= data[data.year==2010]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[['geoid', 'pct_bachelor_or_greater', 'pct_black', 'median_household_income']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "la = la.merge(data, left_on='GEOID', right_on='geoid', how='left')\n",
    "la = la.dropna(subset=['median_household_income'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GEOID</th>\n",
       "      <th>geometry</th>\n",
       "      <th>geoid</th>\n",
       "      <th>pct_bachelor_or_greater</th>\n",
       "      <th>pct_black</th>\n",
       "      <th>median_household_income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>06037101110</td>\n",
       "      <td>POLYGON ((-118.2979312137039 34.26322687612055...</td>\n",
       "      <td>06037101110</td>\n",
       "      <td>18.48</td>\n",
       "      <td>0.76</td>\n",
       "      <td>56139.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>06037101122</td>\n",
       "      <td>POLYGON ((-118.2774342104291 34.25990787645111...</td>\n",
       "      <td>06037101122</td>\n",
       "      <td>35.72</td>\n",
       "      <td>0.00</td>\n",
       "      <td>87500.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>06037101210</td>\n",
       "      <td>POLYGON ((-118.2859362111909 34.25589987630816...</td>\n",
       "      <td>06037101210</td>\n",
       "      <td>13.63</td>\n",
       "      <td>3.71</td>\n",
       "      <td>42867.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>06037101220</td>\n",
       "      <td>POLYGON ((-118.2849922110132 34.25589387633028...</td>\n",
       "      <td>06037101220</td>\n",
       "      <td>16.98</td>\n",
       "      <td>0.72</td>\n",
       "      <td>49666.60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>06037101300</td>\n",
       "      <td>POLYGON ((-118.2652762074589 34.25238487698516...</td>\n",
       "      <td>06037101300</td>\n",
       "      <td>33.59</td>\n",
       "      <td>0.32</td>\n",
       "      <td>65891.66</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         GEOID                                           geometry  \\\n",
       "0  06037101110  POLYGON ((-118.2979312137039 34.26322687612055...   \n",
       "1  06037101122  POLYGON ((-118.2774342104291 34.25990787645111...   \n",
       "2  06037101210  POLYGON ((-118.2859362111909 34.25589987630816...   \n",
       "3  06037101220  POLYGON ((-118.2849922110132 34.25589387633028...   \n",
       "4  06037101300  POLYGON ((-118.2652762074589 34.25238487698516...   \n",
       "\n",
       "         geoid  pct_bachelor_or_greater  pct_black  median_household_income  \n",
       "0  06037101110                    18.48       0.76                 56139.23  \n",
       "1  06037101122                    35.72       0.00                 87500.06  \n",
       "2  06037101210                    13.63       3.71                 42867.84  \n",
       "3  06037101220                    16.98       0.72                 49666.60  \n",
       "4  06037101300                    33.59       0.32                 65891.66  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "la.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "W = Queen.from_dataframe(la)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AgglomerativeClustering(linkage='ward', n_clusters=6).fit(la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lag_expected(X,W, n_clusters):\n",
    "    model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit_predict(W.sparse @ X)\n",
    "    return model\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "lag = lag_expected(la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']].values, W, 6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 2, 2, ..., 0, 0, 0])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lag"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How does this compare to the categorical lag?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from libpysal.api import lag_categorical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_lag = lag_categorical(W, model.labels_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "767"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(lag[lag == cat_lag])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1996"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(lag[lag != cat_lag])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "they usually disagree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"from libpysal.api import Queen\n",
	"import pandas as pd\n",
	"import geopandas as gpd\n",
	"import matplotlib.pyplot as plt\n",
	"from sklearn.cluster import AgglomerativeClustering\n",
	"%matplotlib inline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"la = gpd.GeoDataFrame.from_file('/Users/knaaptime/projects/libpysal_test/data/la_msa.shp')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'geopandas.geodataframe.GeoDataFrame'>\n",
	"RangeIndex: 2779 entries, 0 to 2778\n",
	"Data columns (total 2 columns):\n",
	"GEOID 2779 non-null object\n",
	"geometry 2779 non-null object\n",
	"dtypes: object(2)\n",
	"memory usage: 43.5+ KB\n"
	]
	}
	],
	"source": [
	"la.info()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = pd.read_csv('/Users/knaaptime/projects/libpysal_test/data/la_data.csv', converters={'geoid':str})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"data= data[data.year==2010]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = data[['geoid', 'pct_bachelor_or_greater', 'pct_black', 'median_household_income']]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"la = la.merge(data, left_on='GEOID', right_on='geoid', how='left')\n",
	"la = la.dropna(subset=['median_household_income'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>GEOID</th>\n",
	" <th>geometry</th>\n",
	" <th>geoid</th>\n",
	" <th>pct_bachelor_or_greater</th>\n",
	" <th>pct_black</th>\n",
	" <th>median_household_income</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>06037101110</td>\n",
	" <td>POLYGON ((-118.2979312137039 34.26322687612055...</td>\n",
	" <td>06037101110</td>\n",
	" <td>18.48</td>\n",
	" <td>0.76</td>\n",
	" <td>56139.23</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>06037101122</td>\n",
	" <td>POLYGON ((-118.2774342104291 34.25990787645111...</td>\n",
	" <td>06037101122</td>\n",
	" <td>35.72</td>\n",
	" <td>0.00</td>\n",
	" <td>87500.06</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>06037101210</td>\n",
	" <td>POLYGON ((-118.2859362111909 34.25589987630816...</td>\n",
	" <td>06037101210</td>\n",
	" <td>13.63</td>\n",
	" <td>3.71</td>\n",
	" <td>42867.84</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>06037101220</td>\n",
	" <td>POLYGON ((-118.2849922110132 34.25589387633028...</td>\n",
	" <td>06037101220</td>\n",
	" <td>16.98</td>\n",
	" <td>0.72</td>\n",
	" <td>49666.60</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>06037101300</td>\n",
	" <td>POLYGON ((-118.2652762074589 34.25238487698516...</td>\n",
	" <td>06037101300</td>\n",
	" <td>33.59</td>\n",
	" <td>0.32</td>\n",
	" <td>65891.66</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" GEOID geometry \\\n",
	"0 06037101110 POLYGON ((-118.2979312137039 34.26322687612055... \n",
	"1 06037101122 POLYGON ((-118.2774342104291 34.25990787645111... \n",
	"2 06037101210 POLYGON ((-118.2859362111909 34.25589987630816... \n",
	"3 06037101220 POLYGON ((-118.2849922110132 34.25589387633028... \n",
	"4 06037101300 POLYGON ((-118.2652762074589 34.25238487698516... \n",
	"\n",
	" geoid pct_bachelor_or_greater pct_black median_household_income \n",
	"0 06037101110 18.48 0.76 56139.23 \n",
	"1 06037101122 35.72 0.00 87500.06 \n",
	"2 06037101210 13.63 3.71 42867.84 \n",
	"3 06037101220 16.98 0.72 49666.60 \n",
	"4 06037101300 33.59 0.32 65891.66 "
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"la.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"W = Queen.from_dataframe(la)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"X = la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']].values"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"model = AgglomerativeClustering(linkage='ward', n_clusters=6).fit(la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"def lag_expected(X,W, n_clusters):\n",
	" model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters).fit_predict(W.sparse @ X)\n",
	" return model\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"lag = lag_expected(la[['pct_bachelor_or_greater', 'pct_black', 'median_household_income']].values, W, 6)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([0, 2, 2, ..., 0, 0, 0])"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"lag"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"How does this compare to the categorical lag?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [],
	"source": [
	"from libpysal.api import lag_categorical"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [],
	"source": [
	"cat_lag = lag_categorical(W, model.labels_)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"767"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(lag[lag == cat_lag])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1996"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(lag[lag != cat_lag])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"they usually disagree"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [default]",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}