These are my notes for taking the Microsoft US Building Footprints and splitting them into more manageable chunks based on US Census Tracts.
All of this happened on an m5.xlarge in AWS and used up about 300GB of EBS over the course of a few hours.
-
Make a filesystem on the EBS volume and mount it:
sudo mkfs.xfs /dev/nvme1n1 mount /dev/nvme1n1 /mnt
-
Make the directory to download into
mkdir -p /mnt/source/buildings
-
Run the `download_buildings.sh` script to grab the data to the EBS volume.
-
Download the US Census Tract shapefiles
mkdir -p /mnt/source/tracts wget --recursive --continue --accept=*.zip \ --no-parent --cut-dirs=3 --no-host-directories \ --directory-prefix=/mnt/source/tracts \ ftp://ftp2.census.gov/geo/tiger/TIGER2017/TRACT/
-
Unzip the downloaded stuff
sudo apt-get install -y unzip cd /mnt/source/tracts; mv TRACTS/*.zip .; for i in *.zip; do unzip $i; done cd /mnt/source/buildings; for i in *.zip; do unzip $i; done
-
Dump the Features from these downloaded files into 1 million line chunks
cat *.json | grep '"Feature",' | split -l 1000000 --additional-suffix .json
-
Add missing commas from where one file ended and was concatenated with another file
for i in x*.json; do echo $i; sed -i 's/}}[^,]/}},/' $i; done
-
Add prefix and suffix for a GeoJSON
FeatureCollection
so ogr2ogr
will read it properly.# Make it a FeatureCollection for i in x*.json; do sed -i '1s/^/{"type":"FeatureCollection","features":[/' $i; done # Chop off the comma on the last line for i in x*.json; do truncate $i --size=-3; done # Add the chars that close the FeatureCollection for i in x*.json; do sed -i -e '$a]}' $i; done
-
Install and setup PostGIS
sudo apt-get update sudo apt-get install -y postgresql postgis gdal-bin # Stop postgres to move the datadir sudo /etc/init.d/postgresql stop # Copy over the datadir sudo cp -pr /var/lib/postgresql/ /mnt/postgresql/ # Modify the config file to change data_directory to '/mnt/postgresql/9.5/main' sudo vi /etc/postgresql/9.5/main/postgresql.conf # Start postgres sudo /etc/init.d/postgresql start # Create user sudo -u postgres createuser ubuntu -s # Create database createdb # Install postgis extension psql -c "CREATE EXTENSION postgis;"
-
Import the buildings into PostGIS
for i in x*.json; do echo $i; ogr2ogr -f "PostgreSQL" PG:"" $i -nln buildings -append; done
-
Import the tracts into PostGIS
cd /mnt/source/tracts shp2pgsql -p tl_2017_01_tract.shp tracts | psql for i in *.shp; do shp2pgsql -a $i tracts | psql; done psql -c "create index tracts_geo_idx on tracts using gist (geom);" psql -c "select UpdateGeometrySRID('tracts', 'geom', 4326);"
-
Add a column to the buildings table for the tract ID and give it an index
psql -c "alter table buildings add tract_geoid varchar(12);" psql -c "create index buildings_tract_idx on buildings(tract_geoid);"
-
Add the tract IDs to the buildings
# ST_Within is faster, so get the buildings that are completely contained in a tract first psql -c "update buildings b set tract_geoid=t.geoid from tracts t where st_within(b.wkb_geometry, t.geom);" # ... then go back and pick one of the tracts where they overlap psql -c "update buildings b set tract_geoid=t.geoid from tracts t where b.tract_geoid is null and st_intersects(b.wkb_geometry, t.geom);"
-
Output the buildings (now with a tract ID) as CSV with WKT geometry
mkdir -p /mnt/output ogr2ogr -f CSV /mnt/output/output.csv -lco GEOMETRY=AS_WKT "PG:" -sql "select tract_geoid, ogc_fid as id, wkb_geometry from buildings order by tract_geoid"
-
Split up the CSV into per-tract GeoJSON feature collections using Python
sudo apt-get install -y python-pip pip install shapely mkdir -p /mnt/output/bytract python split_buildings_csv.py
-
Post the tracts to S3
cd /mnt/output/bytract mkdir -p /mnt/output/bystate for i in *; do tar -zcf /mnt/output/bystate/$i.tar.gz $i; done sudo apt-get -y install awscli aws s3 sync --acl="public-read" /mnt/output/bystate/ s3://data.openstreetmap.us/bingbuildings/bystate/ aws s3 sync --acl="public-read" --content-type="application/json" /mnt/output/bytract/ s3://data.openstreetmap.us/bingbuildings/bytract/
-
Build Tippecanoe to generate an mbtiles of the dataset
curl -L https://github.com/mapbox/tippecanoe/archive/master.tar.gz | tar -xz cd tippecanoe-master/ sudo apt-get install -y build-essential libsqlite3-dev zlib1g-dev make -j sudo make install
-
Generate the mbtiles of the dataset
mkdir -p /mnt/tmp (find /mnt/output/bytract -type f -name '*.geojson' -exec cat {} \;) | \ tippecanoe \ --no-line-simplification \ --buffer=0 \ --read-parallel \ --temporary-directory=/mnt/tmp \ --base-zoom=12 \ --maximum-zoom=12 \ --minimum-zoom=12 \ -o /mnt/output/bingbuildings.mbtiles