knaaptime · December 16, 2022 23:52
diff --git a/beijing_tianjin_seg.ipynb b/beijing_tianjin_seg.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "79c40a99-ec29-492b-be48-f218c7368620",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import geopandas as gpd                                                                                                 #实现空间匹配\n",
    "from segregation.singlegroup import *\n",
    "from segregation import inference\n",
    "import shapely.wkt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19116c7d-e2a0-4fed-ab45-2701dee2649c",
   "metadata": {},
   "source": [
    "Note: in the cell below you need to set the CRS to EPSG:4326, not EPSG:3857, because the input coordinates are stored as latitude/longitude."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c3ca5c1e-4777-4da5-b8d9-d857a0dc7801",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transform the original data into a GeoDataFrame\n",
    "df = pd.read_csv('beijing_tianjin_seg_v2.csv', sep=r'\\s+', encoding='utf-8')  # raw string: '\\s' is not a valid escape in a normal literal\n",
    "df.columns = ['city', 'county', 'border','highschool_and_below','college','undergraduate_and_above','2499andbelow','2500~3999','4000~7999','8000~19999','20000andabove']\n",
    "df_border = df['border'].str.split(',', expand=True)\n",
    "\n",
    "# Each border string is split into ten numbers, joined here as five \"x y\" vertex pairs of a WKT polygon ring\n",
    "border_combine = \"POLYGON((\" + df_border[0]+\" \"+df_border[1]+ \",\" +df_border[2]+\" \"+df_border[3]+ \",\" +df_border[4]+\" \"+df_border[5]+ \",\" +df_border[6]+\" \"+df_border[7]+\",\"+df_border[8]+\" \"+df_border[9]+ \"))\"\n",
    "border_combine=border_combine.tolist()\n",
    "\n",
    "# Parse the WKT strings in row order so that geometry i belongs to row i of df.\n",
    "# (Building the list with P.insert(i-1, ...) leaves the first polygon at the end,\n",
    "# which shifts every geometry one row away from its attributes.)\n",
    "P = [shapely.wkt.loads(wkt) for wkt in border_combine]\n",
    "p = gpd.GeoSeries(P, index=df.index)\n",
    "df = df.drop(['border'], axis=1)\n",
    "# The input coordinates are latitude/longitude, so the CRS is EPSG:4326 (not 3857)\n",
    "geo_df = gpd.GeoDataFrame(data=df, geometry=p, crs='EPSG:4326')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ae994a08-c660-4fce-9087-b14258719477",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>county</th>\n",
       "      <th>highschool_and_below</th>\n",
       "      <th>college</th>\n",
       "      <th>undergraduate_and_above</th>\n",
       "      <th>2499andbelow</th>\n",
       "      <th>2500~3999</th>\n",
       "      <th>4000~7999</th>\n",
       "      <th>8000~19999</th>\n",
       "      <th>20000andabove</th>\n",
       "      <th>geometry</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>孝感市</td>\n",
       "      <td>云梦县</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>POLYGON ((113.91656 36.01402, 113.91655 36.021...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>安阳市</td>\n",
       "      <td>林州市</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>POLYGON ((105.62528 35.22905, 105.62528 35.236...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>定西市</td>\n",
       "      <td>通渭县</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>POLYGON ((104.82565 28.57691, 104.82565 28.584...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>宜宾市</td>\n",
       "      <td>长宁县</td>\n",
       "      <td>65</td>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>56</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>POLYGON ((107.17934 23.56500, 107.17934 23.573...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>百色市</td>\n",
       "      <td>田东县</td>\n",
       "      <td>227</td>\n",
       "      <td>81</td>\n",
       "      <td>13</td>\n",
       "      <td>201</td>\n",
       "      <td>56</td>\n",
       "      <td>32</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>POLYGON ((113.05431 26.84158, 113.05431 26.849...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  city county  highschool_and_below  college  undergraduate_and_above  \\\n",
       "0  孝感市    云梦县                     9        1                        2   \n",
       "1  安阳市    林州市                     6        0                        0   \n",
       "2  定西市    通渭县                     1        0                        0   \n",
       "3  宜宾市    长宁县                    65       10                        1   \n",
       "4  百色市    田东县                   227       81                       13   \n",
       "\n",
       "   2499andbelow  2500~3999  4000~7999  8000~19999  20000andabove  \\\n",
       "0             9          1          2           0              0   \n",
       "1             5          0          0           0              0   \n",
       "2             0          0          1           0              0   \n",
       "3            56          6          3           0              0   \n",
       "4           201         56         32           4              1   \n",
       "\n",
       "                                            geometry  \n",
       "0  POLYGON ((113.91656 36.01402, 113.91655 36.021...  \n",
       "1  POLYGON ((105.62528 35.22905, 105.62528 35.236...  \n",
       "2  POLYGON ((104.82565 28.57691, 104.82565 28.584...  \n",
       "3  POLYGON ((107.17934 23.56500, 107.17934 23.573...  \n",
       "4  POLYGON ((113.05431 26.84158, 113.05431 26.849...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "geo_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2f7837cc-cb8f-4351-8378-f9686f8a40e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach county ids to the grid cells and derive the group counts used by the segregation indices\n",
    "\n",
    "df_code = pd.read_csv('county_list_merge.csv', sep=r'\\s+', encoding='utf-8')  # read county list\n",
    "df_code.columns = ['city', 'county', 'county_id']\n",
    "# inner merge: rows whose (city, county) pair is missing from the county list are dropped\n",
    "new_geodf = pd.merge(geo_df, df_code, how='inner', on=['city', 'county'])\n",
    "new_geodf['id'] = new_geodf.groupby('county_id', as_index=False).ngroup()  # integer group number per county_id\n",
    "new_geodf['edu_low'] = new_geodf['highschool_and_below']  # collapse the three education groups into two; edu_low is the low-education group\n",
    "new_geodf['income_low'] = new_geodf['2499andbelow'] + new_geodf['2500~3999']  # population in the low-income group\n",
    "# poptotal is the sum of the three education groups.\n",
    "# NOTE(review): the income columns do not always add up to this total (e.g. sample row 1: 6 vs 5),\n",
    "# so confirm that poptotal is the intended denominator for income_low.\n",
    "new_geodf['poptotal'] = new_geodf['highschool_and_below'] + new_geodf['college'] + new_geodf['undergraduate_and_above']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e2102764-b7f0-484a-bcac-20c860491ce7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>county</th>\n",
       "      <th>highschool_and_below</th>\n",
       "      <th>college</th>\n",
       "      <th>undergraduate_and_above</th>\n",
       "      <th>2499andbelow</th>\n",
       "      <th>2500~3999</th>\n",
       "      <th>4000~7999</th>\n",
       "      <th>8000~19999</th>\n",
       "      <th>20000andabove</th>\n",
       "      <th>geometry</th>\n",
       "      <th>county_id</th>\n",
       "      <th>id</th>\n",
       "      <th>edu_low</th>\n",
       "      <th>income_low</th>\n",
       "      <th>poptotal</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>孝感市</td>\n",
       "      <td>云梦县</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>POLYGON ((113.91656 36.01402, 113.91655 36.021...</td>\n",
       "      <td>420923</td>\n",
       "      <td>46</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>安阳市</td>\n",
       "      <td>林州市</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>POLYGON ((105.62528 35.22905, 105.62528 35.236...</td>\n",
       "      <td>410581</td>\n",
       "      <td>38</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>定西市</td>\n",
       "      <td>通渭县</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>POLYGON ((104.82565 28.57691, 104.82565 28.584...</td>\n",
       "      <td>621121</td>\n",
       "      <td>89</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>宜宾市</td>\n",
       "      <td>长宁县</td>\n",
       "      <td>65</td>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>56</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>POLYGON ((107.17934 23.56500, 107.17934 23.573...</td>\n",
       "      <td>511524</td>\n",
       "      <td>74</td>\n",
       "      <td>65</td>\n",
       "      <td>62</td>\n",
       "      <td>76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>百色市</td>\n",
       "      <td>田东县</td>\n",
       "      <td>227</td>\n",
       "      <td>81</td>\n",
       "      <td>13</td>\n",
       "      <td>201</td>\n",
       "      <td>56</td>\n",
       "      <td>32</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>POLYGON ((113.05431 26.84158, 113.05431 26.849...</td>\n",
       "      <td>451022</td>\n",
       "      <td>62</td>\n",
       "      <td>227</td>\n",
       "      <td>257</td>\n",
       "      <td>321</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  city county  highschool_and_below  college  undergraduate_and_above  \\\n",
       "0  孝感市    云梦县                     9        1                        2   \n",
       "1  安阳市    林州市                     6        0                        0   \n",
       "2  定西市    通渭县                     1        0                        0   \n",
       "3  宜宾市    长宁县                    65       10                        1   \n",
       "4  百色市    田东县                   227       81                       13   \n",
       "\n",
       "   2499andbelow  2500~3999  4000~7999  8000~19999  20000andabove  \\\n",
       "0             9          1          2           0              0   \n",
       "1             5          0          0           0              0   \n",
       "2             0          0          1           0              0   \n",
       "3            56          6          3           0              0   \n",
       "4           201         56         32           4              1   \n",
       "\n",
       "                                            geometry  county_id  id  edu_low  \\\n",
       "0  POLYGON ((113.91656 36.01402, 113.91655 36.021...     420923  46        9   \n",
       "1  POLYGON ((105.62528 35.22905, 105.62528 35.236...     410581  38        6   \n",
       "2  POLYGON ((104.82565 28.57691, 104.82565 28.584...     621121  89        1   \n",
       "3  POLYGON ((107.17934 23.56500, 107.17934 23.573...     511524  74       65   \n",
       "4  POLYGON ((113.05431 26.84158, 113.05431 26.849...     451022  62      227   \n",
       "\n",
       "   income_low  poptotal  \n",
       "0          10        12  \n",
       "1           5         6  \n",
       "2           0         1  \n",
       "3          62        76  \n",
       "4         257       321  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_geodf.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f360147-df88-468c-a188-9b33d2a2e582",
   "metadata": {},
   "source": [
    "These are really small polygons, maybe a sample from the original grid? It might be easier to just represent them as points, but since the resulting dataframe has one observation per county, I assume the full set exists somewhere."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8b2f61b-9cfa-43de-be11-f00b5f61dae9",
   "metadata": {},
   "source": [
    "After the merge operations above, we have only one observation (one grid cell) per county. That means our observations are counties and we're computing segregation at the *national* level, not the county level (as the comment in the cell above says). \n",
    "\n",
    "(Again, I assume this is just a simplified dataset and you have more observations in each county, so you *could* compute a county-level index; I just want to make sure we're on the same page.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "da5a0776-fe75-4211-b1ed-57303bfe1140",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_geodf = new_geodf.to_crs(new_geodf.estimate_utm_crs())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "421472d0-caf2-4b00-a9b7-1675e3e9d4e1",
   "metadata": {},
   "source": [
    "Instead of doing the iteration yourself, it's easier to use the `batch_compute_singlegroup` function (or `batch_compute_multigroup` for multigroup indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "93870676-a95e-482e-b0fa-7bcd67b90008",
   "metadata": {},
   "outputs": [],
   "source": [
    "from segregation.batch import batch_compute_multigroup, batch_compute_singlegroup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "01cd1d12-a75d-4cab-a367-d4097ebac0ee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/json": {
       "ascii": false,
       "bar_format": null,
       "colour": null,
       "elapsed": 0.010767221450805664,
       "initial": 0,
       "n": 0,
       "ncols": null,
       "nrows": null,
       "postfix": null,
       "prefix": "",
       "rate": null,
       "total": 27,
       "unit": "it",
       "unit_divisor": 1000,
       "unit_scale": false
      },
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0ceed32030d74238b1d3b2966dde15ff",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/27 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n"
     ]
    }
   ],
   "source": [
    "income_indices = batch_compute_singlegroup(new_geodf, group_pop_var='income_low',total_pop_var='poptotal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fcec5bc5-c791-47ff-99e7-841d01a5f08a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Statistic</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Name</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AbsoluteCentralization</th>\n",
       "      <td>-0.4476</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AbsoluteClustering</th>\n",
       "      <td>0.6437</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AbsoluteConcentration</th>\n",
       "      <td>1.0375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Atkinson</th>\n",
       "      <td>0.0184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BiasCorrectedDissim</th>\n",
       "      <td>0.0623</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BoundarySpatialDissim</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ConProf</th>\n",
       "      <td>0.0877</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CorrelationR</th>\n",
       "      <td>0.0146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Delta</th>\n",
       "      <td>0.8392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DensityCorrectedDissim</th>\n",
       "      <td>0.0618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Dissim</th>\n",
       "      <td>0.0636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DistanceDecayInteraction</th>\n",
       "      <td>0.3357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DistanceDecayIsolation</th>\n",
       "      <td>0.6709</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Entropy</th>\n",
       "      <td>0.0122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Gini</th>\n",
       "      <td>0.0853</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Interaction</th>\n",
       "      <td>0.3314</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Isolation</th>\n",
       "      <td>0.6686</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MinMax</th>\n",
       "      <td>0.1197</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ModifiedDissim</th>\n",
       "      <td>0.0509</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ModifiedGini</th>\n",
       "      <td>0.0635</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PARDissim</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RelativeCentralization</th>\n",
       "      <td>0.0138</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RelativeClustering</th>\n",
       "      <td>-0.0894</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RelativeConcentration</th>\n",
       "      <td>0.1176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SpatialDissim</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SpatialProxProf</th>\n",
       "      <td>1.9730</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SpatialProximity</th>\n",
       "      <td>1.0007</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          Statistic\n",
       "Name                               \n",
       "AbsoluteCentralization      -0.4476\n",
       "AbsoluteClustering           0.6437\n",
       "AbsoluteConcentration        1.0375\n",
       "Atkinson                     0.0184\n",
       "BiasCorrectedDissim          0.0623\n",
       "BoundarySpatialDissim           NaN\n",
       "ConProf                      0.0877\n",
       "CorrelationR                 0.0146\n",
       "Delta                        0.8392\n",
       "DensityCorrectedDissim       0.0618\n",
       "Dissim                       0.0636\n",
       "DistanceDecayInteraction     0.3357\n",
       "DistanceDecayIsolation       0.6709\n",
       "Entropy                      0.0122\n",
       "Gini                         0.0853\n",
       "Interaction                  0.3314\n",
       "Isolation                    0.6686\n",
       "MinMax                       0.1197\n",
       "ModifiedDissim               0.0509\n",
       "ModifiedGini                 0.0635\n",
       "PARDissim                       NaN\n",
       "RelativeCentralization       0.0138\n",
       "RelativeClustering          -0.0894\n",
       "RelativeConcentration        0.1176\n",
       "SpatialDissim                   NaN\n",
       "SpatialProxProf              1.9730\n",
       "SpatialProximity             1.0007"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "income_indices"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d838124-286a-4db9-8afc-2a0a9637d6d4",
   "metadata": {},
   "source": [
    "Now you get back a dataframe of *all* the available segregation indices without having to build it from scratch. Doing the same for education:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9b41d470-b4da-4ba0-84f8-b1823d5ca314",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/json": {
       "ascii": false,
       "bar_format": null,
       "colour": null,
       "elapsed": 0.007842063903808594,
       "initial": 0,
       "n": 0,
       "ncols": null,
       "nrows": null,
       "postfix": null,
       "prefix": "",
       "rate": null,
       "total": 27,
       "unit": "it",
       "unit_divisor": 1000,
       "unit_scale": false
      },
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "707fc9e3bcee4dbda6beff6f96b872c5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/27 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "edu_indices = batch_compute_singlegroup(new_geodf, group_pop_var='edu_low',total_pop_var='poptotal')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae4bcda5-1ba2-4da0-a38b-9f90efc91c1c",
   "metadata": {},
   "source": [
    "Rename the index columns and stick the results together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8d105741-fb1f-4103-afd6-e0af3182fe42",
   "metadata": {},
   "outputs": [],
   "source": [
    "income_indices = income_indices.rename(columns={'Statistic': 'income_index'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "351d6b92-c17b-4a9f-a494-7cbc293ba7af",
   "metadata": {},
   "outputs": [],
   "source": [
    "edu_indices = edu_indices.rename(columns={'Statistic': 'edu_index'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3b67e132-ee92-461b-8907-51bdab1acf13",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = pd.concat([income_indices, edu_indices], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "56e92bc6-23c4-479f-b843-dbf75521d0a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>income_index</th>\n",
       "      <th>edu_index</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AbsoluteCentralization</th>\n",
       "      <td>-0.4476</td>\n",
       "      <td>-0.4743</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AbsoluteClustering</th>\n",
       "      <td>0.6437</td>\n",
       "      <td>0.7629</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AbsoluteConcentration</th>\n",
       "      <td>1.0375</td>\n",
       "      <td>0.8582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Atkinson</th>\n",
       "      <td>0.0184</td>\n",
       "      <td>0.0359</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BiasCorrectedDissim</th>\n",
       "      <td>0.0623</td>\n",
       "      <td>0.1502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BoundarySpatialDissim</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ConProf</th>\n",
       "      <td>0.0877</td>\n",
       "      <td>0.2157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CorrelationR</th>\n",
       "      <td>0.0146</td>\n",
       "      <td>0.0288</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Delta</th>\n",
       "      <td>0.8392</td>\n",
       "      <td>0.8446</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DensityCorrectedDissim</th>\n",
       "      <td>0.0618</td>\n",
       "      <td>0.1505</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Dissim</th>\n",
       "      <td>0.0636</td>\n",
       "      <td>0.1533</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DistanceDecayInteraction</th>\n",
       "      <td>0.3357</td>\n",
       "      <td>0.2550</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DistanceDecayIsolation</th>\n",
       "      <td>0.6709</td>\n",
       "      <td>0.7593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Entropy</th>\n",
       "      <td>0.0122</td>\n",
       "      <td>0.0243</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Gini</th>\n",
       "      <td>0.0853</td>\n",
       "      <td>0.1732</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Interaction</th>\n",
       "      <td>0.3314</td>\n",
       "      <td>0.2500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Isolation</th>\n",
       "      <td>0.6686</td>\n",
       "      <td>0.7500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MinMax</th>\n",
       "      <td>0.1197</td>\n",
       "      <td>0.2658</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ModifiedDissim</th>\n",
       "      <td>0.0509</td>\n",
       "      <td>0.1392</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ModifiedGini</th>\n",
       "      <td>0.0635</td>\n",
       "      <td>0.1496</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PARDissim</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RelativeCentralization</th>\n",
       "      <td>0.0138</td>\n",
       "      <td>-0.0901</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RelativeClustering</th>\n",
       "      <td>-0.0894</td>\n",
       "      <td>0.2845</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RelativeConcentration</th>\n",
       "      <td>0.1176</td>\n",
       "      <td>-0.5766</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SpatialDissim</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SpatialProxProf</th>\n",
       "      <td>1.9730</td>\n",
       "      <td>2.8851</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SpatialProximity</th>\n",
       "      <td>1.0007</td>\n",
       "      <td>1.0117</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          income_index  edu_index\n",
       "Name                                             \n",
       "AbsoluteCentralization         -0.4476    -0.4743\n",
       "AbsoluteClustering              0.6437     0.7629\n",
       "AbsoluteConcentration           1.0375     0.8582\n",
       "Atkinson                        0.0184     0.0359\n",
       "BiasCorrectedDissim             0.0623     0.1502\n",
       "BoundarySpatialDissim              NaN        NaN\n",
       "ConProf                         0.0877     0.2157\n",
       "CorrelationR                    0.0146     0.0288\n",
       "Delta                           0.8392     0.8446\n",
       "DensityCorrectedDissim          0.0618     0.1505\n",
       "Dissim                          0.0636     0.1533\n",
       "DistanceDecayInteraction        0.3357     0.2550\n",
       "DistanceDecayIsolation          0.6709     0.7593\n",
       "Entropy                         0.0122     0.0243\n",
       "Gini                            0.0853     0.1732\n",
       "Interaction                     0.3314     0.2500\n",
       "Isolation                       0.6686     0.7500\n",
       "MinMax                          0.1197     0.2658\n",
       "ModifiedDissim                  0.0509     0.1392\n",
       "ModifiedGini                    0.0635     0.1496\n",
       "PARDissim                          NaN        NaN\n",
       "RelativeCentralization          0.0138    -0.0901\n",
       "RelativeClustering             -0.0894     0.2845\n",
       "RelativeConcentration           0.1176    -0.5766\n",
       "SpatialDissim                      NaN        NaN\n",
       "SpatialProxProf                 1.9730     2.8851\n",
       "SpatialProximity                1.0007     1.0117"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddf0775b-f78e-484e-8dc0-480a6808a140",
   "metadata": {},
   "source": [
    "The spatial dissimilarity indices fail here because they default to a queen contiguity matrix, and none of the observations in this simple dataset touch one another. To get around that, you could pass a different weights object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "93a23349-de17-4825-85b3-7099879d6364",
   "metadata": {},
   "outputs": [],
   "source": [
    "from libpysal.weights import KNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c5bd7f2c-731b-43e7-b779-5f373bcbf589",
   "metadata": {},
   "outputs": [],
   "source": [
    "w = KNN.from_dataframe(new_geodf, k=6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "207710ce-c763-4e95-9090-d8f1d8def3a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "sd = SpatialDissim(new_geodf, group_pop_var='edu_low', w=w, total_pop_var='poptotal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "9f341b43-5f39-45f4-927e-cba454ade2f1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.023636657648990783"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sd.statistic"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff002fea-3774-433d-899b-c3b13839d19c",
   "metadata": {},
   "source": [
    "but a contiguity matrix would work fine here if you were operating on the full grid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5a5e730d-7176-4c76-a5e8-7a86f33500df",
   "metadata": {},
   "outputs": [],
   "source": [
    "modified_dissim = ModifiedDissim(new_geodf, group_pop_var='edu_low',total_pop_var='poptotal')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3a2f17a-c7f1-4afb-8b07-5d568a137bf6",
   "metadata": {},
   "source": [
    "In the previous version of this code, the modified dissimilarity index could fail because an issue in the synthetic population generator could create observations with a group population that exceeds the total. Now that's fixed\n",
    "\n",
    "(when you got a result in the previous version, it was still valid, but the old implementation could easily produce invalid data and fail out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f550b753-325a-407c-86ae-7690bc1512d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.13858042904893253"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modified_dissim.statistic"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "011e1071-c6e4-49f6-99ea-cad69f929d75",
   "metadata": {},
   "source": [
    "## Inference"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02f0c094-c3a1-406d-889a-012fef34dd33",
   "metadata": {},
   "source": [
    "A traditional dissimilarity index works fine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a0285263-9ceb-4e93-b555-af6756467009",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/json": {
       "ascii": false,
       "bar_format": null,
       "colour": null,
       "elapsed": 0.008296012878417969,
       "initial": 0,
       "n": 0,
       "ncols": null,
       "nrows": null,
       "postfix": null,
       "prefix": "",
       "rate": null,
       "total": 500,
       "unit": "it",
       "unit_divisor": 1000,
       "unit_scale": false
      },
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "64fe62d6328a438b9ef7d1fb01ee463e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/500 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "inf_example = inference.SingleValueTest(\n",
    "    Dissim(new_geodf, group_pop_var=\"edu_low\", total_pop_var=\"poptotal\"),\n",
    "    null_approach=\"bootstrap\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03719454-1670-4756-b124-fb5d2a341f6c",
   "metadata": {},
   "source": [
    "But note that the inference methods here aren't really compatible with the \"modified\" indices defined in [carrington and troske](https://www.jstor.org/stable/1392486#metadata_info_tab_contents) because those are already based on comparing an observed value to a null distriubtion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "22dc5b71-e30e-4a82-b0cf-38020a9de23f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[0;31mInit signature:\u001b[0m\n",
       "\u001b[0minference\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSingleValueTest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mseg_class\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0miterations_under_null\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m500\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mnull_approach\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'systematic'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mtwo_tailed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;31mDocstring:\u001b[0m     \n",
       "Statistical inference for a single segregation measure.\n",
       "\n",
       "Parameters\n",
       "----------\n",
       "seg_class : segregation.singlegroup or segregation.multigroup object\n",
       "    fitted segregation index class\n",
       "iterations_under_null : int\n",
       "    number of iterations under null hyphothesis\n",
       "null_approach : str\n",
       "    Which counterfactual approach to use when generating null hypothesis distribution. One of the following:.\n",
       "\n",
       "    * ``bootstrap``:\n",
       "    Generate bootstrap replications of the units with replacement of the same size of the\n",
       "    original data to create a distribution of the segregation index. Then the `null_value` argument\n",
       "    is tested against this distribution. The null_value may be 0, or may be estimated empirically using\n",
       "    the `simulate_null` function.\n",
       "\n",
       "    * ``systematic``:\n",
       "    assumes that every group has the same probability with restricted conditional probabilities\n",
       "    p_0_j = p_1_j = p_j = n_j/n (multinomial distribution).\n",
       "\n",
       "    * ``evenness``:\n",
       "    Generate a distribution of segregation indices under the assumption of evenness, which\n",
       "    assumes that each spatial unit has the same global probability of drawing elements from the\n",
       "    minority group of the fixed total unit population (binomial distribution). Then test the observed\n",
       "    segregation index against this distribution\n",
       "\n",
       "    * ``person_permutation``:\n",
       "    Generate a distribution of segregation indices under the assumption of individual-level randomization,\n",
       "    which randomly allocates individuals into units keeping the total population of each\n",
       "    equal to the original.Then test the observed segregation index against this distribution\n",
       "\n",
       "    * ``geographic_permutation``:\n",
       "    Generate a distribution of segregation indices under the assumption of geographit unit-level randomization,\n",
       "    which randomly allocates the units over space keeping the original values. Then test the observed segregation\n",
       "    index against this distribution\n",
       "\n",
       "    * ``systematic_permutation``:\n",
       "    Generate a distribution of segregation indices under the assumption of systemic randomization,\n",
       "    then randomly allocate units over space. Then test the observed segregation index against this distribution\n",
       "\n",
       "    * ``even_permutation``:\n",
       "    Generate a distribution of segregation indices under the assumption of evenness, then randomly allocating\n",
       "    the units over space. Then test the observed segregation index against this distribution\n",
       "\n",
       "two_tailed : boolean\n",
       "    If True, p_value is two-tailed. Otherwise, it is right one-tailed. The one-tailed p_value attribute\n",
       "    might not be appropriate for some measures, as the two-tailed. Therefore, it is better to rely on the\n",
       "    est_sim attribute.\n",
       "n_jobs: int, optional\n",
       "    number of cores to use for estimation. If -1 all available cpus will be used\n",
       "backend: str, optional\n",
       "    which backend to use with joblib. Options include \"loky\", \"multiprocessing\", or \"threading\"\n",
       "index_kwargs : dict, optional\n",
       "    additional keyword arguments passed to the index class\n",
       "\n",
       "Attributes\n",
       "----------\n",
       "p_value : float\n",
       "    Pseudo One or Two-Tailed p-value estimated from the simulations\n",
       "est_sim : numpy array\n",
       "   Estimates of the segregation measure under the null hypothesis\n",
       "statistic : float\n",
       "    The value of the segregation index being tested\n",
       "\n",
       "Notes\n",
       "-----\n",
       "1) The different approaches for the null hypothesis affect directly the results of the inference depending on the\n",
       "combination of the index type of seg_class and the null_approach chosen. Therefore, the user needs to be aware of\n",
       "how these approaches are affecting the data generation process of the simulations in order to draw meaningful\n",
       "conclusions. For example, the Modified Dissimilarity (ModifiedDissim) and  Modified Gini (ModifiedGiniSeg) indexes,\n",
       "rely exactly on the distance between evenness through sampling which, therefore, the \"evenness\" value for null\n",
       "approach would not be the most appropriate for these indexes.\n",
       "\n",
       "Examples\n",
       "--------\n",
       "Several examples can be found here https://github.com/pysal/segregation/blob/master/notebooks/inference_wrappers_example.ipynb.\n",
       "\u001b[0;31mFile:\u001b[0m           ~/Dropbox/projects/segregation/segregation/inference/inference_wrappers.py\n",
       "\u001b[0;31mType:\u001b[0m           type\n",
       "\u001b[0;31mSubclasses:\u001b[0m     \n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "inference.SingleValueTest?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5c54a35-b971-460e-9a48-734973f46847",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:segregation]",
   "language": "python",
   "name": "conda-env-segregation-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }