Created
August 23, 2020 17:00
-
-
Save ses4j/22ccd5350eb28697d25c962d32879431 to your computer and use it in GitHub Desktop.
Load eBird data export into PostgreSQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
This script will create a database called ebirddb with a table called ebird. | |
It will import the standard eBird export format file, as specified by `ebird_export_path` below into | |
the new table. Then it will create some indexes and geo points. Customize to suit. | |
*/ | |
-- Set the desired tablespace, if you want, to put it on a drive with room. Skip this if desired. | |
SET default_tablespace = d_space; | |
\set ON_ERROR_STOP on | |
SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE pg_stat_activity.datname = 'ebird' AND pid <> pg_backend_pid(); | |
DROP DATABASE if exists "ebirddb"; | |
CREATE DATABASE "ebirddb" WITH ENCODING 'UTF8' TEMPLATE=template0; | |
\c "ebirddb" | |
DO $$ | |
DECLARE ebird_export_path CONSTANT VARCHAR := 'D:\ebird-db\ebd_US-DC_prv_relJul-2020.txt'; | |
BEGIN | |
CREATE TABLE "ebird" | |
( | |
GLOBAL_UNIQUE_IDENTIFIER char(50), -- always 45-47 characters needed (so far) | |
CATEGORY varchar(20), -- Probably 10 would be safe | |
COMMON_NAME varchar(70), -- Some hybrids have really long names | |
SUBSPECIES_COMMON_NAME varchar(70), -- '' | |
OBSERVATION_COUNT_STR varchar(8), -- Someone saw 1.3 million Auklets. | |
BREEDING_BIRD_ATLAS_CODE varchar(2), | |
BREEDING_BIRD_ATLAS_CATEGORY varchar(2), | |
COUNTRY varchar(50), -- long enough for "Saint Helena, Ascension and Tristan da Cunha" | |
COUNTRY_CODE char(2), -- alpha-2 codes | |
STATE varchar(50), -- no idea if this is long enough? U.S. Virgin Islands may be almost 30 | |
STATE_CODE varchar(30), | |
COUNTY varchar(50), -- no idea if this is long enough? U.S. Virgin Islands may be almost 30 | |
COUNTY_CODE varchar(30), | |
ATLAS_BLOCK varchar(20), -- i think max 10 | |
LOCALITY text, -- unstructured/potentially long | |
LOCALITY_ID char(10), -- maximum observed so far is 8 | |
LOCALITY_TYPE char(2), -- short codes | |
LATITUDE double precision, -- Is this the appropriate level of precision? | |
LONGITUDE double precision, -- '' | |
OBSERVATION_DATE date, -- Do I need to specify YMD somehow? | |
TIME_OBSERVATIONS_STARTED time, -- How do I make this a time? | |
OBSERVER_ID char(12), -- max of 9 in the data I've seen so far | |
SAMPLING_EVENT_IDENTIFIER char(12), -- Probably want to index on this. | |
PROTOCOL_CODE varchar(5), | |
PROJECT_CODE varchar(20), -- Needs to be at least 10 for sure. | |
DURATION_MINUTES int, -- bigint? | |
EFFORT_DISTANCE_KM real, -- precision? | |
EFFORT_AREA_HA real, -- precision? | |
NUMBER_OBSERVERS int, -- just a small int | |
ALL_SPECIES_REPORTED int, -- Seems to always be 1 or 0. Maybe I could make this Boolean? | |
GROUP_IDENTIFIER varchar(10), -- Appears to be max of 7 or 8 | |
HAS_MEDIA boolean, | |
APPROVED boolean, -- Can be Boolean? | |
REVIEWED boolean, -- Can be Boolean? | |
REASON text, -- May need to be longer if data set includes unvetted data | |
TRIP_COMMENTS text, -- Comments are long, unstructured, | |
SPECIES_COMMENTS text | |
); | |
EXECUTE( 'COPY "ebird" ' || | |
'FROM PROGRAM ''cut -f 1,4,5,7,9,10,11,13,14,15,16,17,18,22,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38,39,40,41,42,43,44,45,46 ' || ebird_export_path || '''' || | |
' WITH (FORMAT CSV, HEADER, QUOTE E''\5'', ENCODING ''UTF8'', DELIMITER E''\t'')'); | |
END $$; | |
CREATE EXTENSION if not exists postgis; | |
alter table "ebird" add column observation_count int; | |
UPDATE "ebird" SET observation_count = observation_count_str::int where observation_count_str != 'X' and observation_count_str is not null; | |
ALTER TABLE "ebird" | |
ADD COLUMN IF NOT EXISTS geog geography; | |
UPDATE "ebird" | |
SET geog = st_SetSRID(ST_MakePoint(longitude, LATITUDE), 4326)::geography | |
where true; | |
CREATE INDEX ebird_geog_idx ON "ebird" USING GIST (geog); | |
VACUUM ANALYZE "ebird" (geog); | |
ALTER TABLE "ebird" | |
ADD COLUMN IF NOT EXISTS observation_doy smallint; | |
UPDATE "ebird" | |
SET observation_doy = extract(doy from OBSERVATION_DATE) | |
where true; | |
create index ebird_locality_doy_idx ON "ebird" (locality_id asc, observation_doy asc); | |
create index ebird_common_name_idx ON "ebird" (common_name asc); | |
create index ebird_breeding_bird_atlas_category_idx ON "ebird" (breeding_bird_atlas_category) WHERE breeding_bird_atlas_category is not NULL; | |
CREATE INDEX ON "ebird" (sampling_event_identifier); | |
VACUUM ANALYZE "ebird"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment