-
-
Save mtigas/834285 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# usat_import | |
# Fork by Mike Tigas | |
# Based on work by Ryan Nagle, Chris Groskopf and Brian Boyer | |
# | |
# Updated to allow import of multiple states into the same database. | |
# Unlike the original script, this *only* generates super-wide tables. | |
# | |
# Shapes are stored in WGS84 (EPSG:4326) for portability. | |
# | |
# This is excessive, I know. | |
# ===== | |
# | |
# How-to: | |
# | |
# 0: Requires Python, PostgreSQL+PostGIS, and a GIS stack. | |
# Build your own with Homebrew on OSX (what I do): | |
# http://blog.apps.chicagotribune.com/2010/02/17/quick-install-pythonpostgis-geo-stack-on-snow-leopard/ | |
# | |
# Also requires usat_table.py helper script, which generates the create statement | |
# for those extra-wide tables. | |
# | |
# 1: Get data. | |
# | |
# Join IRE (it's cheap!), and download the data for your state: | |
# http://www.ire.org/getcensus/ | |
# | |
# Download the state, county, and tract shapefiles for your state, from the US Census: | |
# http://www.census.gov/cgi-bin/geo/shapefiles2010/main | |
# | |
# 3: Unzip all those files in the same directory. | |
# | |
# 4: Make sure the settings below (DATABASE_* and CENSUS_*_TABLE) are what you want. | |
# | |
# 5: Run this. | |
# | |
# cd into the directory your data is in. | |
# | |
# /path/to/import_usat.sh (state_abbr) (state_number) | |
# where `state_number` is the middle bit in the shapefile file names: "46" if you got tl_2010_46_state10 | |
# i.e.: /path/to/import_usat.sh NJ 34 | |
# | |
# DB info | |
DATABASE_NAME=census2010geo | |
DATABASE_USER=mtigas | |
DATABASE_HOST="127.0.0.1" | |
# Where to create the extra-huge tables | |
CENSUS_STATE_TABLE="census2010_state" | |
CENSUS_COUNTY_TABLE="census2010_county" | |
CENSUS_TRACT_TABLE="census2010_censustract" | |
# ===== | |
#dropdb census2010geo | |
#createdb census2010geo -T template_postgis | |
# ===== | |
#STATE="NJ" | |
#STATE_NUM="34" | |
#STATE="TX" | |
#STATE_NUM="48" | |
#STATE="SD" | |
#STATE_NUM="46" | |
STATE="$1" | |
STATE_NUM="$2" | |
BINDIR="`dirname $0`" | |
# ===== | |
# Do our target state/county/tract tables exist? | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select 111 from ${CENSUS_STATE_TABLE}" > /dev/null 2>&1 | |
CREATE_STATE_TABLE=$? | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select 111 from $CENSUS_COUNTY_TABLE" > /dev/null 2>&1 | |
CREATE_COUNTY_TABLE=$? | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select 111 from $CENSUS_TRACT_TABLE" > /dev/null 2>&1 | |
CREATE_TRACT_TABLE=$? | |
# ===== | |
# If we have leftover temporary tables (from a previously-failed import), drop 'em. | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table if exists ${CENSUS_STATE_TABLE}_tempdata; drop table if exists ${CENSUS_STATE_TABLE}_tempgeo;"> /dev/null 2>&1 | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table if exists ${CENSUS_COUNTY_TABLE}_tempdata; drop table if exists ${CENSUS_COUNTY_TABLE}_tempgeo;"> /dev/null 2>&1 | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table if exists ${CENSUS_TRACT_TABLE}_tempdata; drop table if exists ${CENSUS_TRACT_TABLE}_tempgeo;"> /dev/null 2>&1 | |
# ===== | |
# State-level table | |
ogr2ogr -f PostgreSQL PG:"host=$DATABASE_HOST user=$DATABASE_USER dbname=$DATABASE_NAME" tl_2010_${STATE_NUM}_state10/tl_2010_${STATE_NUM}_state10.shp -a_srs EPSG:4326 -s_srs EPSG:4326 -nlt multipolygon -nln ${CENSUS_STATE_TABLE}_tempgeo | |
python $BINDIR/usat_table.py ${CENSUS_STATE_TABLE}_tempdata $STATE/State$STATE.csv | psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "copy ${CENSUS_STATE_TABLE}_tempdata from '$PWD/$STATE/State$STATE.csv' delimiters ',' CSV HEADER;" | |
if [ "$CREATE_STATE_TABLE" = "1" ]; then | |
echo "===== CREATING STATE TABLE WITH $STATE =====" | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select * into $CENSUS_STATE_TABLE from ${CENSUS_STATE_TABLE}_tempdata inner join ${CENSUS_STATE_TABLE}_tempgeo on ${CENSUS_STATE_TABLE}_tempdata.state = cast(${CENSUS_STATE_TABLE}_tempgeo.statefp10 as int);" | |
CREATE_STATE_TABLE=0 | |
else | |
echo "===== IMPORTING $STATE INTO STATE TABLE =====" | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "INSERT INTO $CENSUS_STATE_TABLE SELECT * FROM ${CENSUS_STATE_TABLE}_tempdata inner join ${CENSUS_STATE_TABLE}_tempgeo on ${CENSUS_STATE_TABLE}_tempdata.state = cast(${CENSUS_STATE_TABLE}_tempgeo.statefp10 as int);" | |
fi | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table ${CENSUS_STATE_TABLE}_tempdata; drop table ${CENSUS_STATE_TABLE}_tempgeo;" | |
# ===== | |
# County-level table | |
ogr2ogr -f PostgreSQL PG:"host=$DATABASE_HOST user=$DATABASE_USER dbname=$DATABASE_NAME" tl_2010_${STATE_NUM}_county10/tl_2010_${STATE_NUM}_county10.shp -a_srs EPSG:4326 -s_srs EPSG:4326 -nlt multipolygon -nln ${CENSUS_COUNTY_TABLE}_tempgeo | |
python $BINDIR/usat_table.py ${CENSUS_COUNTY_TABLE}_tempdata $STATE/County$STATE.csv | psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "copy ${CENSUS_COUNTY_TABLE}_tempdata from '$PWD/$STATE/County$STATE.csv' delimiters ',' CSV HEADER;" | |
if [ "$CREATE_COUNTY_TABLE" = "1" ]; then | |
echo "===== CREATING COUNTY TABLE WITH $STATE =====" | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select * into $CENSUS_COUNTY_TABLE from ${CENSUS_COUNTY_TABLE}_tempdata inner join ${CENSUS_COUNTY_TABLE}_tempgeo on ${CENSUS_COUNTY_TABLE}_tempdata.state = cast(${CENSUS_COUNTY_TABLE}_tempgeo.statefp10 as int) and ${CENSUS_COUNTY_TABLE}_tempdata.county = cast(${CENSUS_COUNTY_TABLE}_tempgeo.countyfp10 as int);" | |
else | |
echo "===== IMPORTING $STATE INTO COUNTY TABLE =====" | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "INSERT INTO $CENSUS_COUNTY_TABLE SELECT * FROM ${CENSUS_COUNTY_TABLE}_tempdata inner join ${CENSUS_COUNTY_TABLE}_tempgeo on ${CENSUS_COUNTY_TABLE}_tempdata.state = cast(${CENSUS_COUNTY_TABLE}_tempgeo.statefp10 as int) and ${CENSUS_COUNTY_TABLE}_tempdata.county = cast(${CENSUS_COUNTY_TABLE}_tempgeo.countyfp10 as int);" | |
fi | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table ${CENSUS_COUNTY_TABLE}_tempdata; drop table ${CENSUS_COUNTY_TABLE}_tempgeo;" | |
# ===== | |
# Census Tract-level table | |
ogr2ogr -f PostgreSQL PG:"host=$DATABASE_HOST user=$DATABASE_USER dbname=$DATABASE_NAME" tl_2010_${STATE_NUM}_tract10/tl_2010_${STATE_NUM}_tract10.shp -a_srs EPSG:4326 -s_srs EPSG:4326 -nlt multipolygon -nln ${CENSUS_TRACT_TABLE}_tempgeo | |
python $BINDIR/usat_table.py ${CENSUS_TRACT_TABLE}_tempdata $STATE/Tract$STATE.csv | psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "copy ${CENSUS_TRACT_TABLE}_tempdata from '$PWD/$STATE/Tract$STATE.csv' delimiters ',' CSV HEADER;" | |
if [ "$CREATE_TRACT_TABLE" = "1" ]; then | |
echo "===== CREATING TRACT TABLE WITH $STATE =====" | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "select * into $CENSUS_TRACT_TABLE from ${CENSUS_TRACT_TABLE}_tempdata inner join ${CENSUS_TRACT_TABLE}_tempgeo on ${CENSUS_TRACT_TABLE}_tempdata.state = cast(${CENSUS_TRACT_TABLE}_tempgeo.statefp10 as int) and ${CENSUS_TRACT_TABLE}_tempdata.county = cast(${CENSUS_TRACT_TABLE}_tempgeo.countyfp10 as int) and ${CENSUS_TRACT_TABLE}_tempdata.tract = cast(${CENSUS_TRACT_TABLE}_tempgeo.tractce10 as int);" | |
else | |
echo "===== IMPORTING $STATE INTO TRACT TABLE =====" | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "INSERT INTO $CENSUS_TRACT_TABLE SELECT * FROM ${CENSUS_TRACT_TABLE}_tempdata inner join ${CENSUS_TRACT_TABLE}_tempgeo on ${CENSUS_TRACT_TABLE}_tempdata.state = cast(${CENSUS_TRACT_TABLE}_tempgeo.statefp10 as int) and ${CENSUS_TRACT_TABLE}_tempdata.county = cast(${CENSUS_TRACT_TABLE}_tempgeo.countyfp10 as int) and ${CENSUS_TRACT_TABLE}_tempdata.tract = cast(${CENSUS_TRACT_TABLE}_tempgeo.tractce10 as int);" | |
fi | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "drop table ${CENSUS_TRACT_TABLE}_tempdata; drop table ${CENSUS_TRACT_TABLE}_tempgeo;" | |
# ===== | |
psql -d $DATABASE_NAME -h $DATABASE_HOST -U $DATABASE_USER -c "VACUUM" | |
exit | |
# tests | |
# SELECT name FROM "census2010_state" WHERE ST_Contains("census2010_state"."wkb_geometry", ST_GeomFromEWKB(E'\\001\\001\\000\\000 \\346\\020\\000\\000\\000\\000\\000\\000\\200fY\\3005\\201\\377\\310\\363\\014@@'::bytea)); | |
# explain SELECT name FROM "census2010_state" WHERE ST_Contains("census2010_state"."wkb_geometry", ST_GeomFromEWKB(E'\\001\\001\\000\\000 \\346\\020\\000\\000\\000\\000\\000\\000\\200fY\\3005\\201\\377\\310\\363\\014@@'::bytea)); | |
# | |
# SELECT name, _2000pop, p0010001 as _2010pop, pctchgtotpop2010_2000 from "census2010_state" WHERE "name" = 'Texas'; | |
# EXPLAIN SELECT name, _2000pop, p0010001 as _2010pop, pctchgtotpop2010_2000 from "census2010_state" WHERE "name" = 'Texas'; | |
# | |
# SELECT s.name, t._2000pop, t.p0010001 as _2010pop, t.pctchgtotpop2010_2000 from "census2010_state" s, "census2010_censustract" t WHERE t."state" = '34' AND t."state"=s."state"; | |
# EXPLAIN SELECT s.name, t._2000pop, t.p0010001 as _2010pop, t.pctchgtotpop2010_2000 from "census2010_state" s, "census2010_censustract" t WHERE t."state" = '34' AND t."state"=s."state"; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create_table.py | |
# | |
# Python helper for PosegreSQL importer of pre-processed census files | |
# provided to IRE members by Paul Overberg and Anthony DeBarros from USA Today. | |
# | |
# See usat_import for usage | |
# | |
import csv | |
import sys | |
table_name = sys.argv[1] | |
file_name = sys.argv[2] | |
r = csv.reader(open(file_name,'rb')) | |
headers = r.next() | |
values = r.next() | |
s = "create table %s (\n" % (table_name) | |
for i,val in enumerate(headers): | |
col = headers[i] | |
value = values[i] | |
if value == '': #blank, skip it! | |
col += " numeric" | |
else: | |
try: | |
float(value) | |
col += " numeric" | |
except ValueError: | |
col += " varchar(50)" | |
if len(headers) - 1 != i: | |
col += ",\n" | |
s += col | |
s += "\n);" | |
print s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment