Skip to content

Instantly share code, notes, and snippets.

@nyimbi
Last active October 5, 2023 08:33
Show Gist options
  • Save nyimbi/53dd4a1eb9e2c7ab995970ea456e1fe7 to your computer and use it in GitHub Desktop.
Save nyimbi/53dd4a1eb9e2c7ab995970ea456e1fe7 to your computer and use it in GitHub Desktop.
Geonames DBML + Loader
Table geoname {
id serial [pk, note: 'Unique identifier for each geoname']
name varchar(200) [note: 'Local name of the place or location']
asciiname varchar(200) [note: 'ASCII version of the name, suitable for URL or systems that dont support unicode']
alternatenames text [note: 'Alternative names or variations of the location name, possibly in different languages or scripts']
latitude float [note: 'Latitude coordinate of the location']
longitude float [note: 'Longitude coordinate of the location']
fclass char(1) [note: 'Feature class, represents general type/category of the location e.g. P for populated place, A for administrative division']
fcode varchar(10) [note: 'Feature code, more specific than feature class, indicating the exact type of feature']
country varchar(2) [note: 'ISO-3166 2-letter country code']
cc2 varchar(120) [note: 'Alternative country codes if the location is near a border']
admin1 varchar(20) [note: 'Primary administrative division, e.g., state in the USA, oblast in Russia']
admin2 varchar(80) [note: 'Secondary administrative division, e.g., county in the USA']
admin3 varchar(20) [note: 'Tertiary administrative division, specific to each country']
admin4 varchar(20) [note: 'Quaternary administrative division, specific to each country']
population int [note: 'Population of the location if applicable']
elevation int [note: 'Elevation above sea level in meters']
gtopo30 int [note: 'Digital elevation model, indicates the average elevation of 30"x30" area in meters']
timezone varchar(40) [note: 'The timezone in which the location lies, based on the IANA Time Zone Database']
moddate date [note: 'The last date when the record was modified or updated']
}
Table alternatename {
alternatenameId int [pk, note: 'Unique identifier for each alternate name entry']
id int [ref: > geoname.id, note: 'Reference to the geoname table; denotes which location this alternate name pertains to']
isoLanguage varchar(7) [note: 'ISO language code denoting the language of this alternate name, e.g., "en" for English']
alternateName varchar(200) [note: 'The alternate name itself in the specified language']
isPreferredName boolean [note: 'Indicates if this is the preferred name in the associated language']
isShortName boolean [note: 'Indicates if this name is a short version or abbreviation']
isColloquial boolean [note: 'Indicates if this name is colloquial or informal']
isHistoric boolean [note: 'Indicates if this name is historic and no longer widely in use']
name_from varchar(500) [note: 'Used for transliterations; the script or system from which the name was derived']
name_to varchar(500) [note: 'Used for transliterations; the script or system to which the name was translated']
}
Table country {
iso_alpha2 char(2) [pk, unique, not null, note: '2-letter ISO 3166-1 alpha code e.g., "US" for the United States']
iso_alpha3 char(3) [unique, not null, note: '3-letter ISO 3166-1 alpha code e.g., "USA" for the United States']
iso_numeric integer [note: 'ISO 3166-1 numeric code e.g., 840 for the United States']
fips_code varchar(3) [note: 'Federal Information Processing Standard code, used by the US government']
name varchar(200) [note: 'Full name of the country']
capital varchar(200) [note: 'Capital city of the country']
areainsqkm float [note: 'Total area of the country in square kilometers']
population integer [note: 'Estimated population of the country']
continent varchar(2) [note: 'Abbreviation of the continent the country is located in']
tld varchar(10) [note: 'Top Level Domain for the country e.g., ".us" for the United States']
currencycode varchar(3) [note: 'ISO code of the country’s currency e.g., "USD" for US Dollar']
currencyname varchar(20) [note: 'Full name of the country’s currency e.g., "Dollar" for US Dollar']
phone varchar(20) [note: 'Country dialing code e.g., "+1" for the United States']
postalcode varchar(100) [note: 'Template or format of postal codes in the country']
postalcoderegex varchar(200) [note: 'Regular expression pattern to validate postal codes']
languages varchar(200) [note: 'Commonly spoken languages in the country, represented as ISO codes']
id int [ref: > geoname.id, note: 'Reference to geoname table; linking country data with geographical name data']
neighbors varchar(50) [note: 'Neighboring countries, usually represented as ISO codes']
equivfipscode varchar(3) [note: 'Equivalent FIPS code in cases where it might differ from the primary FIPS code']
flag text [note: 'Field to store a link or representation of the country’s flag']
Note: 'Country Data needs expansion'
}
Table admin1codes {
code varchar(11) [pk, note: 'Primary identifier, typically a combination of country code and admin1 code e.g., "US.AL" for Alabama, United States']
countrycode char(3) [ref: > country.iso_alpha3, note: '3-letter ISO 3166-1 alpha code of the country e.g., "USA" for the United States']
admin1_code varchar(10) [note: 'Unique identifier within a country for this first-level administrative division. E.g., "AL" for Alabama']
name varchar(200) [note: 'Local name of the administrative division in the official language']
alt_name_english varchar(200) [note: 'Alternative name or translation of the division in English']
id int [ref: > geoname.id, note: 'Reference to geoname table; linking administrative division data with geographical name data']
}
Table admin2codes {
code varchar(50) [pk, note: 'Primary identifier, typically a combination of country code, admin1 code, and an additional code representing the second-level administrative division e.g., "US.AL.001"']
countrycode char(3) [ref: > country.iso_alpha3, note: '3-letter ISO 3166-1 alpha code of the country this division belongs to e.g., "USA" for the United States']
admin1_code varchar(11) [ref: > admin1codes.code, note: 'Reference to the first-level administrative division. E.g., "US.AL" for Alabama in the United States']
name varchar(200) [note: 'Local name of the second-level administrative division in the official language']
alt_name_english varchar(200) [note: 'Alternative name or translation of the division in English']
id int [ref: > geoname.id, note: 'Reference to geoname table; linking second-level administrative division data with geographical name data']
}
Table featurecodes {
code varchar(10) [pk, note: 'Primary identifier for the feature code, typically a combination of class and fcode']
class varchar(10) [note: 'Class identifier that categorizes the type of geographical feature e.g., "P" for populated place, "T" for mountain']
fcode varchar(10) [pk, note: 'Specific code within a class that describes the feature in more detail. E.g., within class "P", an fcode might specify city, village, etc.']
label varchar(100) [note: 'Short label or name for the feature code']
description varchar(1000) [note: 'Detailed description of what the feature code represents']
}
Table languagecodes {
iso_639_3 varchar(10) [pk, note: 'ISO 639-3 code is a three-letter code that represents a specific language uniquely. It offers a comprehensive set of languages.']
iso_639_2 varchar(10) [note: 'ISO 639-2 code is a three-letter code, which could be either "bibliographic" or "terminological", representing a set of similar languages.']
iso_639_1 varchar(10) [note: 'ISO 639-1 code is a two-letter code. It represents major languages but is not as exhaustive as ISO 639-3.']
name varchar(1000) [note: 'The descriptive name of the language in English.']
}
#!/usr/bin/env python3
import subprocess
import os
import tempfile
def run_command(command):
print(f"+: {command}")
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for line in iter(process.stdout.readline, b''):
print(line.decode().strip())
def create_temp_file_and_run_sql(sql):
with tempfile.NamedTemporaryFile(prefix='tempsql', mode='w+', delete=False) as f:
f.write(sql)
f.flush()
run_command(f"psql geo -f {f.name}")
os.unlink(f.name)
def download(directory):
curr_dir = os.getcwd()
os.makedirs(directory, exist_ok=True)
os.chdir(directory)
urls = [
"http://download.geonames.org/export/dump/admin1CodesASCII.txt",
"http://download.geonames.org/export/dump/admin2Codes.txt",
"http://download.geonames.org/export/dump/allCountries.zip",
"http://download.geonames.org/export/dump/alternateNamesV2.zip",
"http://download.geonames.org/export/dump/countryInfo.txt",
"http://download.geonames.org/export/dump/iso-languagecodes.txt",
"http://download.geonames.org/export/dump/featureCodes_en.txt"
]
for url in urls:
filename = os.path.basename(url)
unzip = filename.endswith('.zip')
run_command(f"curl -s -o {filename} {url}")
if unzip:
run_command(f"unzip {filename}")
os.remove(filename)
os.chdir(curr_dir)
import os
def patch(directory):
curr_dir = os.getcwd()
os.chdir(directory)
# Country Info
with open('countryInfo.txt', 'r') as f:
countryinfo = f.read()
new_lines = [line
for line in countryinfo.splitlines() if not line.startswith('#')]
with open('countryInfo_patched.txt', 'w+') as f:
f.write('\n'.join(new_lines))
# Feature Codes
with open('featureCodes_en.txt', 'r') as f:
featurecodes = f.read()
new_lines = []
for line in featurecodes.splitlines():
if line.startswith('null'):
continue
fcode, label, description = line.split("\t")
code, fclass = fcode.split('.')
new_lines.append("\t".join([code, fclass, fcode, label, description]))
with open('featureCodes_en_patched.txt', 'w+') as f:
f.write('\n'.join(new_lines))
# Language Codes
with open('iso-languagecodes.txt', 'r') as f:
languagecodes = f.read()
new_lines = []
first_line = languagecodes.splitlines()[0]
for line in languagecodes.splitlines():
if line == first_line:
continue
if line.startswith("\t"):
continue
new_lines.append(line)
# new_lines = [line for line in languagecodes.splitlines() if ( (line != first_line) or (not line.startswith('\t') ) )]
with open('iso-languagecodes_patched.txt', 'w+') as f:
f.write('\n'.join(new_lines))
# Admin 1 Codes
with open('admin1CodesASCII.txt', 'r') as f:
admin1codes = f.read()
new_lines = []
for line in admin1codes.splitlines():
code, name, alt_name_english, geonameid = line.split("\t")
countrycode, admin1_code = code.split('.')
new_lines.append("\t".join([code, countrycode, admin1_code, name, alt_name_english, geonameid]))
with open('admin1CodesASCII_patched.txt', 'w+') as f:
f.write('\n'.join(new_lines))
# Admin 2 Codes
with open('admin2Codes.txt', 'r') as f:
admin2codes = f.read()
new_lines = []
for line in admin2codes.splitlines():
code, name, alt_name_english, geonameid = line.split("\t")
countrycode, admin1_code, _ = code.split('.')
new_lines.append("\t".join([code, countrycode, admin1_code, name, alt_name_english, geonameid]))
with open('admin2Codes_patched.txt', 'w+') as f:
f.write('\n'.join(new_lines))
os.chdir(curr_dir)
# Example usage
# patch("/path/to/directory")
def patchx(directory):
curr_dir = os.getcwd()
os.chdir(directory)
files_to_patch = [
('countryInfo.txt', 'countryInfo_patched.txt'),
('featureCodes_en.txt', 'featureCodes_en_patched.txt'),
('iso-languagecodes.txt', 'iso-languagecodes_patched.txt'),
('admin1CodesASCII.txt', 'admin1CodesASCII_patched.txt'),
('admin2Codes.txt', 'admin2Codes_patched.txt'),
]
for orig, patched in files_to_patch:
with open(orig, 'r') as infile, open(patched, 'w+') as outfile:
new_lines = []
for line in infile:
# Skip lines that start with '#' or 'null'
if line.startswith('#') or line.startswith(''):
continue
new_lines.append(line.strip())
outfile.write('\n'.join(new_lines))
os.chdir(curr_dir)
def setup_database():
run_command("dropdb geo")
run_command("createdb geo")
def setup_tables():
setup_queries = """
CREATE EXTENSION postgis;
CREATE TABLE geoname (
id int,
name varchar(200),
asciiname varchar(200),
alternatenames varchar(20000),
latitude float,
longitude float,
fclass char(1),
fcode varchar(10),
country varchar(2),
cc2 varchar(500),
admin1 varchar(20),
admin2 varchar(80),
admin3 varchar(20),
admin4 varchar(20),
population bigint,
elevation int,
gtopo30 int,
timezone varchar(40),
moddate date
);
ALTER TABLE ONLY geoname ADD CONSTRAINT pk_geonameid PRIMARY KEY (id);
CREATE INDEX index_geoname_on_name ON geoname USING btree (name);
CREATE TABLE alternatename (
alternatenameid int,
geonameid int,
isoLanguage varchar(7),
alternatename varchar(200),
ispreferredname boolean,
isshortname boolean,
iscolloquial boolean,
ishistoric boolean,
name_from varchar(500),
name_to varchar(500)
);
ALTER TABLE ONLY alternatename ADD CONSTRAINT pk_alternatenameid PRIMARY KEY (alternatenameid);
CREATE TABLE admin1codes (
code varchar(11),
countrycode char(3),
admin1_code varchar(10),
name varchar(200),
alt_name_english varchar(200),
geonameid int
);
ALTER TABLE ONLY admin1codes ADD CONSTRAINT pk_admin1id PRIMARY KEY (geonameid);
CREATE TABLE admin2codes (
code varchar(50),
countrycode char(3),
admin1_code varchar(11),
name varchar(200),
alt_name_english varchar(200),
geonameid int
);
ALTER TABLE ONLY admin2codes ADD CONSTRAINT pk_admin2id PRIMARY KEY (geonameid);
CREATE TABLE country (
iso_alpha2 char(2),
iso_alpha3 char(3),
iso_numeric integer,
fips_code varchar(3),
name varchar(200),
capital varchar(200),
areainsqkm double precision,
population integer,
continent varchar(2),
tld varchar(10),
currencycode varchar(3),
currencyname varchar(20),
phone varchar(20),
postalcode varchar(100),
postalcoderegex varchar(200),
languages varchar(200),
geonameid int,
neighbors varchar(50),
equivfipscode varchar(3)
);
ALTER TABLE ONLY country ADD CONSTRAINT pk_iso_alpha2 PRIMARY KEY (iso_alpha2);
CREATE TABLE featurecodes (
code varchar(10),
class varchar(10),
fcode varchar (10),
label varchar(100),
description varchar(1000)
);
ALTER TABLE ONLY featurecodes ADD CONSTRAINT pk_fcode PRIMARY KEY (fcode);
CREATE TABLE languagecodes (
iso_639_3 varchar(10),
iso_639_2 varchar(10),
iso_639_1 varchar(10),
name varchar(1000)
);
ALTER TABLE ONLY languagecodes ADD CONSTRAINT pk_languageid PRIMARY KEY (iso_639_3);
ALTER TABLE ONLY countryinfo ADD CONSTRAINT fk_geonameid FOREIGN KEY (geonameid) REFERENCES geoname(geonameid);
ALTER TABLE ONLY alternatename ADD CONSTRAINT fk_geonameid FOREIGN KEY (geonameid) REFERENCES geoname(geonameid);
"""
create_temp_file_and_run_sql(setup_queries)
def populate(directory):
curr_dir = os.getcwd()
directory = os.path.join(os.getcwd(), directory)
queries = [
f"\"copy geoname(id,name,asciiname,alternatenames,latitude,longitude,fclass,fcode,country,cc2,admin1,admin2,admin3,admin4,population,elevation,gtopo30,timezone,moddate) from '{directory}/allCountries.txt' null as '';\"",
f"\"copy alternatename(alternatenameid,geonameid,isolanguage,alternatename,ispreferredname,isshortname,iscolloquial,ishistoric,name_from, name_to) from '{directory}/alternateNamesV2.txt' null as '';\"",
f"\"copy countryinfo(iso_alpha2,iso_alpha3,iso_numeric,fips_code,name,capital,areainsqkm,population,continent,tld,currencycode,currencyname,phone,postalcode,postalcoderegex,languages,geonameid,neighbors,equivfipscode) from '{directory}/countryInfo_patched.txt' null as '';\"",
f"\"copy featurecodes(code, class, fcode, label, description) from '{directory}/featureCodes_en_patched.txt' null as '';\"",
f"\"copy languagecodes(iso_639_3, iso_639_2, iso_639_1, name) from '{directory}/iso-languagecodes_patched.txt' null as '';\"",
f"\"copy admin1codes(code, countrycode, admin1_code, name, alt_name_english, geonameid) from '{directory}/admin1CodesASCII_patched.txt' null as '';\"",
f"\"copy admin2codes(code, countrycode, admin1_code, name, alt_name_english, geonameid) from '{directory}/admin2Codes_patched.txt' null as '';\"",
]
for populate_sql in queries:
run_command(f"psql geo -c {populate_sql}")
# Returns to the previous directory
os.chdir(curr_dir)
def create_geometry():
geometry = """
SELECT AddGeometryColumn ('public','geoname','geometry',4326,'POINT',2);
UPDATE geoname SET geometry = ST_PointFromText('POINT(' || longitude || ' ' || latitude || ')', 4326);
CREATE INDEX idx_geoname_geometry ON public.geoname USING gist(geometry);
"""
create_temp_file_and_run_sql(geometry)
if __name__ == "__main__":
# download('tmp_geonames')
patch('tmp_geonames')
setup_database()
setup_tables()
populate('tmp_geonames')
create_geometry()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment