Skip to content

Instantly share code, notes, and snippets.

@knaaptime
Created December 16, 2022 23:52
Show Gist options
  • Save knaaptime/325115c493557725ef241b44d5c5c0a4 to your computer and use it in GitHub Desktop.
Save knaaptime/325115c493557725ef241b44d5c5c0a4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "79c40a99-ec29-492b-be48-f218c7368620",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import geopandas as gpd #实现空间匹配\n",
"from segregation.singlegroup import *\n",
"from segregation import inference\n",
"import shapely.wkt"
]
},
{
"cell_type": "markdown",
"id": "19116c7d-e2a0-4fed-ab45-2701dee2649c",
"metadata": {},
"source": [
"Note in the cell below you need to set the CRS to 4326, not 3857 because the input coordinates are stored in latitude/longitude"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c3ca5c1e-4777-4da5-b8d9-d857a0dc7801",
"metadata": {},
"outputs": [],
"source": [
"# Transform the original data into geodf\n",
"df = pd.read_csv('beijing_tianjin_seg_v2.csv', sep='\\s+', encoding='utf-8') \n",
"df.columns = ['city', 'county', 'border','highschool_and_below','college','undergraduate_and_above','2499andbelow','2500~3999','4000~7999','8000~19999','20000andabove'] \n",
"df_border = df['border'].str.split(',', expand=True)\n",
"\n",
"border_combine = \"POLYGON((\" + df_border[0]+\" \"+df_border[1]+ \",\" +df_border[2]+\" \"+df_border[3]+ \",\" +df_border[4]+\" \"+df_border[5]+ \",\" +df_border[6]+\" \"+df_border[7]+\",\"+df_border[8]+\" \"+df_border[9]+ \"))\"\n",
"border_combine=border_combine.tolist()\n",
"\n",
"P=[]\n",
"for i in range(len(border_combine)):\n",
" P.insert(i-1,shapely.wkt.loads(border_combine[i]))\n",
"p=gpd.GeoSeries(P)\n",
"df=df.drop(['border'], axis=1)\n",
"geo_df = gpd.GeoDataFrame(data=df,geometry=p)\n",
"geo_df.crs=4326 # geo_df (note the change here)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ae994a08-c660-4fce-9087-b14258719477",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>city</th>\n",
" <th>county</th>\n",
" <th>highschool_and_below</th>\n",
" <th>college</th>\n",
" <th>undergraduate_and_above</th>\n",
" <th>2499andbelow</th>\n",
" <th>2500~3999</th>\n",
" <th>4000~7999</th>\n",
" <th>8000~19999</th>\n",
" <th>20000andabove</th>\n",
" <th>geometry</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>孝感市</td>\n",
" <td>云梦县</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((113.91656 36.01402, 113.91655 36.021...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>安阳市</td>\n",
" <td>林州市</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((105.62528 35.22905, 105.62528 35.236...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>定西市</td>\n",
" <td>通渭县</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((104.82565 28.57691, 104.82565 28.584...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>宜宾市</td>\n",
" <td>长宁县</td>\n",
" <td>65</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>56</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((107.17934 23.56500, 107.17934 23.573...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>百色市</td>\n",
" <td>田东县</td>\n",
" <td>227</td>\n",
" <td>81</td>\n",
" <td>13</td>\n",
" <td>201</td>\n",
" <td>56</td>\n",
" <td>32</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>POLYGON ((113.05431 26.84158, 113.05431 26.849...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" city county highschool_and_below college undergraduate_and_above \\\n",
"0 孝感市 云梦县 9 1 2 \n",
"1 安阳市 林州市 6 0 0 \n",
"2 定西市 通渭县 1 0 0 \n",
"3 宜宾市 长宁县 65 10 1 \n",
"4 百色市 田东县 227 81 13 \n",
"\n",
" 2499andbelow 2500~3999 4000~7999 8000~19999 20000andabove \\\n",
"0 9 1 2 0 0 \n",
"1 5 0 0 0 0 \n",
"2 0 0 1 0 0 \n",
"3 56 6 3 0 0 \n",
"4 201 56 32 4 1 \n",
"\n",
" geometry \n",
"0 POLYGON ((113.91656 36.01402, 113.91655 36.021... \n",
"1 POLYGON ((105.62528 35.22905, 105.62528 35.236... \n",
"2 POLYGON ((104.82565 28.57691, 104.82565 28.584... \n",
"3 POLYGON ((107.17934 23.56500, 107.17934 23.573... \n",
"4 POLYGON ((113.05431 26.84158, 113.05431 26.849... "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"geo_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2f7837cc-cb8f-4351-8378-f9686f8a40e3",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" Cauculate segregation index in county level\n",
"\"\"\"\n",
"\n",
"df_code = pd.read_csv('county_list_merge.csv', sep='\\s+', encoding='utf-8') #read county list \n",
"df_code.columns =['city', 'county', 'county_id'] \n",
"new_geodf = pd.merge(geo_df, df_code, how='inner', on=['city', 'county']) # merge geodataframe with county list \n",
"new_geodf['id'] = new_geodf.groupby(u'county_id', as_index=False).ngroup() \n",
"new_geodf['edu_low']= new_geodf['highschool_and_below'] # change three groups into two groups, edu_low refers to the low education level group\n",
"new_geodf['income_low']=new_geodf['2499andbelow'] + new_geodf['2500~3999'] #calculate the number of population in low income group\n",
"new_geodf['poptotal']= new_geodf['highschool_and_below']+new_geodf['college']+new_geodf['undergraduate_and_above'] #calculate the number"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e2102764-b7f0-484a-bcac-20c860491ce7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>city</th>\n",
" <th>county</th>\n",
" <th>highschool_and_below</th>\n",
" <th>college</th>\n",
" <th>undergraduate_and_above</th>\n",
" <th>2499andbelow</th>\n",
" <th>2500~3999</th>\n",
" <th>4000~7999</th>\n",
" <th>8000~19999</th>\n",
" <th>20000andabove</th>\n",
" <th>geometry</th>\n",
" <th>county_id</th>\n",
" <th>id</th>\n",
" <th>edu_low</th>\n",
" <th>income_low</th>\n",
" <th>poptotal</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>孝感市</td>\n",
" <td>云梦县</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((113.91656 36.01402, 113.91655 36.021...</td>\n",
" <td>420923</td>\n",
" <td>46</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>安阳市</td>\n",
" <td>林州市</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((105.62528 35.22905, 105.62528 35.236...</td>\n",
" <td>410581</td>\n",
" <td>38</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>定西市</td>\n",
" <td>通渭县</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((104.82565 28.57691, 104.82565 28.584...</td>\n",
" <td>621121</td>\n",
" <td>89</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>宜宾市</td>\n",
" <td>长宁县</td>\n",
" <td>65</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>56</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>POLYGON ((107.17934 23.56500, 107.17934 23.573...</td>\n",
" <td>511524</td>\n",
" <td>74</td>\n",
" <td>65</td>\n",
" <td>62</td>\n",
" <td>76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>百色市</td>\n",
" <td>田东县</td>\n",
" <td>227</td>\n",
" <td>81</td>\n",
" <td>13</td>\n",
" <td>201</td>\n",
" <td>56</td>\n",
" <td>32</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>POLYGON ((113.05431 26.84158, 113.05431 26.849...</td>\n",
" <td>451022</td>\n",
" <td>62</td>\n",
" <td>227</td>\n",
" <td>257</td>\n",
" <td>321</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" city county highschool_and_below college undergraduate_and_above \\\n",
"0 孝感市 云梦县 9 1 2 \n",
"1 安阳市 林州市 6 0 0 \n",
"2 定西市 通渭县 1 0 0 \n",
"3 宜宾市 长宁县 65 10 1 \n",
"4 百色市 田东县 227 81 13 \n",
"\n",
" 2499andbelow 2500~3999 4000~7999 8000~19999 20000andabove \\\n",
"0 9 1 2 0 0 \n",
"1 5 0 0 0 0 \n",
"2 0 0 1 0 0 \n",
"3 56 6 3 0 0 \n",
"4 201 56 32 4 1 \n",
"\n",
" geometry county_id id edu_low \\\n",
"0 POLYGON ((113.91656 36.01402, 113.91655 36.021... 420923 46 9 \n",
"1 POLYGON ((105.62528 35.22905, 105.62528 35.236... 410581 38 6 \n",
"2 POLYGON ((104.82565 28.57691, 104.82565 28.584... 621121 89 1 \n",
"3 POLYGON ((107.17934 23.56500, 107.17934 23.573... 511524 74 65 \n",
"4 POLYGON ((113.05431 26.84158, 113.05431 26.849... 451022 62 227 \n",
"\n",
" income_low poptotal \n",
"0 10 12 \n",
"1 5 6 \n",
"2 0 1 \n",
"3 62 76 \n",
"4 257 321 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_geodf.head()"
]
},
{
"cell_type": "markdown",
"id": "3f360147-df88-468c-a188-9b33d2a2e582",
"metadata": {},
"source": [
"These are really small polygons, maybe a sample from the original grid? It might be easier to just represent them as points, but since the resulting dataframe has one observation per county, i assume the full set exists somewhere."
]
},
{
"cell_type": "markdown",
"id": "c8b2f61b-9cfa-43de-be11-f00b5f61dae9",
"metadata": {},
"source": [
"After the merge operations above, we have only one observation (one grid cell) per county. That means our observations are counties and we're computing segregation at the *national* level, not the county level (as the comment in the cell above says). \n",
"\n",
"(again, i assume this is just a simplified dataset and you have more observations in each county, so you *could* compute a county-level index, I just want to make sure we're on the same page)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "da5a0776-fe75-4211-b1ed-57303bfe1140",
"metadata": {},
"outputs": [],
"source": [
"new_geodf = new_geodf.to_crs(new_geodf.estimate_utm_crs())"
]
},
{
"cell_type": "markdown",
"id": "421472d0-caf2-4b00-a9b7-1675e3e9d4e1",
"metadata": {},
"source": [
"Instead of doing the iteration yourself, it's easier to use the `batch_compute` function"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "93870676-a95e-482e-b0fa-7bcd67b90008",
"metadata": {},
"outputs": [],
"source": [
"from segregation.batch import batch_compute_multigroup, batch_compute_singlegroup"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "01cd1d12-a75d-4cab-a367-d4097ebac0ee",
"metadata": {},
"outputs": [
{
"data": {
"application/json": {
"ascii": false,
"bar_format": null,
"colour": null,
"elapsed": 0.010767221450805664,
"initial": 0,
"n": 0,
"ncols": null,
"nrows": null,
"postfix": null,
"prefix": "",
"rate": null,
"total": 27,
"unit": "it",
"unit_divisor": 1000,
"unit_scale": false
},
"application/vnd.jupyter.widget-view+json": {
"model_id": "0ceed32030d74238b1d3b2966dde15ff",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/27 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n"
]
}
],
"source": [
"income_indices = batch_compute_singlegroup(new_geodf, group_pop_var='income_low',total_pop_var='poptotal')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fcec5bc5-c791-47ff-99e7-841d01a5f08a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Statistic</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Name</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>AbsoluteCentralization</th>\n",
" <td>-0.4476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AbsoluteClustering</th>\n",
" <td>0.6437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AbsoluteConcentration</th>\n",
" <td>1.0375</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Atkinson</th>\n",
" <td>0.0184</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BiasCorrectedDissim</th>\n",
" <td>0.0623</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BoundarySpatialDissim</th>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ConProf</th>\n",
" <td>0.0877</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CorrelationR</th>\n",
" <td>0.0146</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Delta</th>\n",
" <td>0.8392</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DensityCorrectedDissim</th>\n",
" <td>0.0618</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dissim</th>\n",
" <td>0.0636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DistanceDecayInteraction</th>\n",
" <td>0.3357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DistanceDecayIsolation</th>\n",
" <td>0.6709</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Entropy</th>\n",
" <td>0.0122</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Gini</th>\n",
" <td>0.0853</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Interaction</th>\n",
" <td>0.3314</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Isolation</th>\n",
" <td>0.6686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MinMax</th>\n",
" <td>0.1197</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ModifiedDissim</th>\n",
" <td>0.0509</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ModifiedGini</th>\n",
" <td>0.0635</td>\n",
" </tr>\n",
" <tr>\n",
" <th>PARDissim</th>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RelativeCentralization</th>\n",
" <td>0.0138</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RelativeClustering</th>\n",
" <td>-0.0894</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RelativeConcentration</th>\n",
" <td>0.1176</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SpatialDissim</th>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SpatialProxProf</th>\n",
" <td>1.9730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SpatialProximity</th>\n",
" <td>1.0007</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Statistic\n",
"Name \n",
"AbsoluteCentralization -0.4476\n",
"AbsoluteClustering 0.6437\n",
"AbsoluteConcentration 1.0375\n",
"Atkinson 0.0184\n",
"BiasCorrectedDissim 0.0623\n",
"BoundarySpatialDissim NaN\n",
"ConProf 0.0877\n",
"CorrelationR 0.0146\n",
"Delta 0.8392\n",
"DensityCorrectedDissim 0.0618\n",
"Dissim 0.0636\n",
"DistanceDecayInteraction 0.3357\n",
"DistanceDecayIsolation 0.6709\n",
"Entropy 0.0122\n",
"Gini 0.0853\n",
"Interaction 0.3314\n",
"Isolation 0.6686\n",
"MinMax 0.1197\n",
"ModifiedDissim 0.0509\n",
"ModifiedGini 0.0635\n",
"PARDissim NaN\n",
"RelativeCentralization 0.0138\n",
"RelativeClustering -0.0894\n",
"RelativeConcentration 0.1176\n",
"SpatialDissim NaN\n",
"SpatialProxProf 1.9730\n",
"SpatialProximity 1.0007"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"income_indices"
]
},
{
"cell_type": "markdown",
"id": "4d838124-286a-4db9-8afc-2a0a9637d6d4",
"metadata": {},
"source": [
"Now you get back a dataframe of *all* the available segregation indices without having to build it from scratch. Doing the same for education:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9b41d470-b4da-4ba0-84f8-b1823d5ca314",
"metadata": {},
"outputs": [
{
"data": {
"application/json": {
"ascii": false,
"bar_format": null,
"colour": null,
"elapsed": 0.007842063903808594,
"initial": 0,
"n": 0,
"ncols": null,
"nrows": null,
"postfix": null,
"prefix": "",
"rate": null,
"total": 27,
"unit": "it",
"unit_divisor": 1000,
"unit_scale": false
},
"application/vnd.jupyter.widget-view+json": {
"model_id": "707fc9e3bcee4dbda6beff6f96b872c5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/27 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"edu_indices = batch_compute_singlegroup(new_geodf, group_pop_var='edu_low',total_pop_var='poptotal')"
]
},
{
"cell_type": "markdown",
"id": "ae4bcda5-1ba2-4da0-a38b-9f90efc91c1c",
"metadata": {},
"source": [
"Rename the index columns and stick the results together"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8d105741-fb1f-4103-afd6-e0af3182fe42",
"metadata": {},
"outputs": [],
"source": [
"income_indices = income_indices.rename(columns={'Statistic': 'income_index'})"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "351d6b92-c17b-4a9f-a494-7cbc293ba7af",
"metadata": {},
"outputs": [],
"source": [
"edu_indices = edu_indices.rename(columns={'Statistic': 'edu_index'})"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "3b67e132-ee92-461b-8907-51bdab1acf13",
"metadata": {},
"outputs": [],
"source": [
"results = pd.concat([income_indices, edu_indices], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "56e92bc6-23c4-479f-b843-dbf75521d0a4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>income_index</th>\n",
" <th>edu_index</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Name</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>AbsoluteCentralization</th>\n",
" <td>-0.4476</td>\n",
" <td>-0.4743</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AbsoluteClustering</th>\n",
" <td>0.6437</td>\n",
" <td>0.7629</td>\n",
" </tr>\n",
" <tr>\n",
" <th>AbsoluteConcentration</th>\n",
" <td>1.0375</td>\n",
" <td>0.8582</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Atkinson</th>\n",
" <td>0.0184</td>\n",
" <td>0.0359</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BiasCorrectedDissim</th>\n",
" <td>0.0623</td>\n",
" <td>0.1502</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BoundarySpatialDissim</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ConProf</th>\n",
" <td>0.0877</td>\n",
" <td>0.2157</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CorrelationR</th>\n",
" <td>0.0146</td>\n",
" <td>0.0288</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Delta</th>\n",
" <td>0.8392</td>\n",
" <td>0.8446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DensityCorrectedDissim</th>\n",
" <td>0.0618</td>\n",
" <td>0.1505</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Dissim</th>\n",
" <td>0.0636</td>\n",
" <td>0.1533</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DistanceDecayInteraction</th>\n",
" <td>0.3357</td>\n",
" <td>0.2550</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DistanceDecayIsolation</th>\n",
" <td>0.6709</td>\n",
" <td>0.7593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Entropy</th>\n",
" <td>0.0122</td>\n",
" <td>0.0243</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Gini</th>\n",
" <td>0.0853</td>\n",
" <td>0.1732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Interaction</th>\n",
" <td>0.3314</td>\n",
" <td>0.2500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Isolation</th>\n",
" <td>0.6686</td>\n",
" <td>0.7500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MinMax</th>\n",
" <td>0.1197</td>\n",
" <td>0.2658</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ModifiedDissim</th>\n",
" <td>0.0509</td>\n",
" <td>0.1392</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ModifiedGini</th>\n",
" <td>0.0635</td>\n",
" <td>0.1496</td>\n",
" </tr>\n",
" <tr>\n",
" <th>PARDissim</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RelativeCentralization</th>\n",
" <td>0.0138</td>\n",
" <td>-0.0901</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RelativeClustering</th>\n",
" <td>-0.0894</td>\n",
" <td>0.2845</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RelativeConcentration</th>\n",
" <td>0.1176</td>\n",
" <td>-0.5766</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SpatialDissim</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SpatialProxProf</th>\n",
" <td>1.9730</td>\n",
" <td>2.8851</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SpatialProximity</th>\n",
" <td>1.0007</td>\n",
" <td>1.0117</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" income_index edu_index\n",
"Name \n",
"AbsoluteCentralization -0.4476 -0.4743\n",
"AbsoluteClustering 0.6437 0.7629\n",
"AbsoluteConcentration 1.0375 0.8582\n",
"Atkinson 0.0184 0.0359\n",
"BiasCorrectedDissim 0.0623 0.1502\n",
"BoundarySpatialDissim NaN NaN\n",
"ConProf 0.0877 0.2157\n",
"CorrelationR 0.0146 0.0288\n",
"Delta 0.8392 0.8446\n",
"DensityCorrectedDissim 0.0618 0.1505\n",
"Dissim 0.0636 0.1533\n",
"DistanceDecayInteraction 0.3357 0.2550\n",
"DistanceDecayIsolation 0.6709 0.7593\n",
"Entropy 0.0122 0.0243\n",
"Gini 0.0853 0.1732\n",
"Interaction 0.3314 0.2500\n",
"Isolation 0.6686 0.7500\n",
"MinMax 0.1197 0.2658\n",
"ModifiedDissim 0.0509 0.1392\n",
"ModifiedGini 0.0635 0.1496\n",
"PARDissim NaN NaN\n",
"RelativeCentralization 0.0138 -0.0901\n",
"RelativeClustering -0.0894 0.2845\n",
"RelativeConcentration 0.1176 -0.5766\n",
"SpatialDissim NaN NaN\n",
"SpatialProxProf 1.9730 2.8851\n",
"SpatialProximity 1.0007 1.0117"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results"
]
},
{
"cell_type": "markdown",
"id": "ddf0775b-f78e-484e-8dc0-480a6808a140",
"metadata": {},
"source": [
"The spatial dissimilarity indices fail here because they default to a queen contiguity matrix, and none of the observations in this simple dataset touch one another. To get around that, you could pass a different weights object"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "93a23349-de17-4825-85b3-7099879d6364",
"metadata": {},
"outputs": [],
"source": [
"from libpysal.weights import KNN"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c5bd7f2c-731b-43e7-b779-5f373bcbf589",
"metadata": {},
"outputs": [],
"source": [
"w = KNN.from_dataframe(new_geodf, k=6)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "207710ce-c763-4e95-9090-d8f1d8def3a5",
"metadata": {},
"outputs": [],
"source": [
"sd = SpatialDissim(new_geodf, group_pop_var='edu_low', w=w, total_pop_var='poptotal')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "9f341b43-5f39-45f4-927e-cba454ade2f1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-0.023636657648990783"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sd.statistic"
]
},
{
"cell_type": "markdown",
"id": "ff002fea-3774-433d-899b-c3b13839d19c",
"metadata": {},
"source": [
"but a contiguity matrix would work fine here if you were operating on the full grid"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5a5e730d-7176-4c76-a5e8-7a86f33500df",
"metadata": {},
"outputs": [],
"source": [
"modified_dissim = ModifiedDissim(new_geodf, group_pop_var='edu_low',total_pop_var='poptotal')"
]
},
{
"cell_type": "markdown",
"id": "b3a2f17a-c7f1-4afb-8b07-5d568a137bf6",
"metadata": {},
"source": [
"In the previous version of this code, the modified dissimilarity index could fail because an issue in the synthetic population generator could create observations with a group population that exceeds the total. Now that's fixed\n",
"\n",
"(when you got a result in the previous version, it was still valid, but the old implementation could easily produce invalid data and fail out)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "f550b753-325a-407c-86ae-7690bc1512d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.13858042904893253"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modified_dissim.statistic"
]
},
{
"cell_type": "markdown",
"id": "011e1071-c6e4-49f6-99ea-cad69f929d75",
"metadata": {},
"source": [
"## Inference"
]
},
{
"cell_type": "markdown",
"id": "02f0c094-c3a1-406d-889a-012fef34dd33",
"metadata": {},
"source": [
"A traditional dissimilarity index works fine"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "a0285263-9ceb-4e93-b555-af6756467009",
"metadata": {},
"outputs": [
{
"data": {
"application/json": {
"ascii": false,
"bar_format": null,
"colour": null,
"elapsed": 0.008296012878417969,
"initial": 0,
"n": 0,
"ncols": null,
"nrows": null,
"postfix": null,
"prefix": "",
"rate": null,
"total": 500,
"unit": "it",
"unit_divisor": 1000,
"unit_scale": false
},
"application/vnd.jupyter.widget-view+json": {
"model_id": "64fe62d6328a438b9ef7d1fb01ee463e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/500 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"inf_example = inference.SingleValueTest(\n",
" Dissim(new_geodf, group_pop_var=\"edu_low\", total_pop_var=\"poptotal\"),\n",
" null_approach=\"bootstrap\",\n",
")"
]
},
{
"cell_type": "markdown",
"id": "03719454-1670-4756-b124-fb5d2a341f6c",
"metadata": {},
"source": [
"But note that the inference methods here aren't really compatible with the \"modified\" indices defined in [carrington and troske](https://www.jstor.org/stable/1392486#metadata_info_tab_contents) because those are already based on comparing an observed value to a null distriubtion"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "22dc5b71-e30e-4a82-b0cf-38020a9de23f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mInit signature:\u001b[0m\n",
"\u001b[0minference\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSingleValueTest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mseg_class\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0miterations_under_null\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m500\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mnull_approach\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'systematic'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mtwo_tailed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m \n",
"Statistical inference for a single segregation measure.\n",
"\n",
"Parameters\n",
"----------\n",
"seg_class : segregation.singlegroup or segregation.multigroup object\n",
" fitted segregation index class\n",
"iterations_under_null : int\n",
" number of iterations under null hyphothesis\n",
"null_approach : str\n",
" Which counterfactual approach to use when generating null hypothesis distribution. One of the following:.\n",
"\n",
" * ``bootstrap``:\n",
" Generate bootstrap replications of the units with replacement of the same size of the\n",
" original data to create a distribution of the segregation index. Then the `null_value` argument\n",
" is tested against this distribution. The null_value may be 0, or may be estimated empirically using\n",
" the `simulate_null` function.\n",
"\n",
" * ``systematic``:\n",
" assumes that every group has the same probability with restricted conditional probabilities\n",
" p_0_j = p_1_j = p_j = n_j/n (multinomial distribution).\n",
"\n",
" * ``evenness``:\n",
" Generate a distribution of segregation indices under the assumption of evenness, which\n",
" assumes that each spatial unit has the same global probability of drawing elements from the\n",
" minority group of the fixed total unit population (binomial distribution). Then test the observed\n",
" segregation index against this distribution\n",
"\n",
" * ``person_permutation``:\n",
" Generate a distribution of segregation indices under the assumption of individual-level randomization,\n",
" which randomly allocates individuals into units keeping the total population of each\n",
" equal to the original.Then test the observed segregation index against this distribution\n",
"\n",
" * ``geographic_permutation``:\n",
" Generate a distribution of segregation indices under the assumption of geographit unit-level randomization,\n",
" which randomly allocates the units over space keeping the original values. Then test the observed segregation\n",
" index against this distribution\n",
"\n",
" * ``systematic_permutation``:\n",
" Generate a distribution of segregation indices under the assumption of systemic randomization,\n",
" then randomly allocate units over space. Then test the observed segregation index against this distribution\n",
"\n",
" * ``even_permutation``:\n",
" Generate a distribution of segregation indices under the assumption of evenness, then randomly allocating\n",
" the units over space. Then test the observed segregation index against this distribution\n",
"\n",
"two_tailed : boolean\n",
" If True, p_value is two-tailed. Otherwise, it is right one-tailed. The one-tailed p_value attribute\n",
" might not be appropriate for some measures, as the two-tailed. Therefore, it is better to rely on the\n",
" est_sim attribute.\n",
"n_jobs: int, optional\n",
" number of cores to use for estimation. If -1 all available cpus will be used\n",
"backend: str, optional\n",
" which backend to use with joblib. Options include \"loky\", \"multiprocessing\", or \"threading\"\n",
"index_kwargs : dict, optional\n",
" additional keyword arguments passed to the index class\n",
"\n",
"Attributes\n",
"----------\n",
"p_value : float\n",
" Pseudo One or Two-Tailed p-value estimated from the simulations\n",
"est_sim : numpy array\n",
" Estimates of the segregation measure under the null hypothesis\n",
"statistic : float\n",
" The value of the segregation index being tested\n",
"\n",
"Notes\n",
"-----\n",
"1) The different approaches for the null hypothesis affect directly the results of the inference depending on the\n",
"combination of the index type of seg_class and the null_approach chosen. Therefore, the user needs to be aware of\n",
"how these approaches are affecting the data generation process of the simulations in order to draw meaningful\n",
"conclusions. For example, the Modified Dissimilarity (ModifiedDissim) and Modified Gini (ModifiedGiniSeg) indexes,\n",
"rely exactly on the distance between evenness through sampling which, therefore, the \"evenness\" value for null\n",
"approach would not be the most appropriate for these indexes.\n",
"\n",
"Examples\n",
"--------\n",
"Several examples can be found here https://github.com/pysal/segregation/blob/master/notebooks/inference_wrappers_example.ipynb.\n",
"\u001b[0;31mFile:\u001b[0m ~/Dropbox/projects/segregation/segregation/inference/inference_wrappers.py\n",
"\u001b[0;31mType:\u001b[0m type\n",
"\u001b[0;31mSubclasses:\u001b[0m \n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"inference.SingleValueTest?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5c54a35-b971-460e-9a48-734973f46847",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:segregation]",
"language": "python",
"name": "conda-env-segregation-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment