Last active
August 18, 2019 19:22
-
-
Save paulgoetze/3fea5dfb2b757a46aec25d5bcfd1359d to your computer and use it in GitHub Desktop.
Compiling an extensive cities list from free geonames.org data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This module provides a function to compile city data using the free data
# provided at http://download.geonames.org/export/dump.
# It uses the following files to compile a list of 4.4 million cities world-wide,
# including their name, state (administrative level 1), country, time zone,
# latitude, and longitude:
#
# * allCountries.txt (included in allCountries.zip)
# * countryInfo.txt
# * admin1CodesASCII.txt
#
defmodule Cities do
  @moduledoc """
  Namespace for tools that build a city list from geonames.org dump files.
  """

  defmodule Data do
    @moduledoc """
    Provides functions for compiling a raw data file.
    """

    # Index (0-based) of the feature-class column in a geonames location row.
    @feature_class_index 6

    # geonames feature class "P" marks populated places (city, village, ...).
    @populated_place "P"

    # Output path used when the caller does not pass one.
    @default_output "cities.txt"

    @doc """
    Compiles the needed raw city data from the given files.

    Reads `location_file` line by line, keeps only rows whose feature class is
    `"P"`, replaces the admin1 code and the country code with the display names
    found in `states_file` and `countries_file`, and writes one CSV line per
    city to `output_file`:

        "city","state","country","timezone",latitude,longitude

    Codes without a matching entry in the lookup files are written as an
    empty quoted field. Double quotes inside a field are escaped by doubling
    them, so the output stays parseable.

    ## Parameters

      * `location_file` - tab-separated geonames dump (e.g. `allCountries.txt`)
      * `countries_file` - tab-separated country list (e.g. `countryInfo.txt`)
      * `states_file` - tab-separated admin1 list (e.g. `admin1CodesASCII.txt`)
      * `output_file` - path of the file to write, defaults to `"cities.txt"`

    Returns the `File.Stream` the lines were written into. Raises if a file
    cannot be opened or a populated-place row does not have 19 columns.
    """
    @spec compile(Path.t(), Path.t(), Path.t(), Path.t()) :: File.Stream.t()
    def compile(location_file, countries_file, states_file, output_file \\ @default_output) do
      # Both lookup tables are small and are loaded fully before streaming the
      # (very large) location file.
      states = compile_states(states_file)
      countries = compile_countries(countries_file)

      location_file
      |> File.stream!()
      # Split each row once; the filter and the attribute extraction share it.
      |> Stream.map(&String.split(&1, "\t"))
      |> Stream.filter(&city?/1)
      |> Stream.map(&to_attrs/1)
      |> Stream.map(&replace_state(&1, states))
      |> Stream.map(&replace_country(&1, countries))
      |> Stream.map(&to_line/1)
      |> Enum.into(File.stream!(output_file, [:write]))
    end

    # Returns true when the row's feature class is "P". Rows that are too
    # short to have that column yield nil from Enum.at/2 and are rejected.
    defp city?(fields) do
      Enum.at(fields, @feature_class_index) == @populated_place
    end

    # Picks the columns of interest out of a 19-column geonames row.
    # The last column still carries the line's trailing newline; it is unused.
    defp to_attrs(fields) do
      [
        # integer id of record in geonames database
        _geonameid,
        # name of geographical point (utf8)
        name,
        # name of geographical point in plain ascii characters
        _asciiname,
        # comma separated alternate names
        _alternatenames,
        # latitude in decimal degrees (wgs84)
        latitude,
        # longitude in decimal degrees (wgs84)
        longitude,
        # see http://www.geonames.org/export/codes.html
        _feature_class,
        # see http://www.geonames.org/export/codes.html
        _feature_code,
        # ISO-3166 2-letter country code
        country_code,
        # alternate country codes, comma separated
        _cc2,
        # admin1 code; display names are in the admin1 codes file
        state,
        # code for the second administrative division
        _admin2_code,
        # code for the third administrative division
        _admin3_code,
        # code for the fourth administrative division
        _admin4_code,
        _population,
        # in meters
        _elevation,
        # digital elevation model
        _dem,
        # the iana timezone id
        timezone,
        # date of last modification in yyyy-MM-dd format
        _modification_date
      ] = fields

      %{
        city: name,
        state: state,
        country: country_code,
        timezone: timezone,
        latitude: latitude,
        longitude: longitude
      }
    end

    # Swaps the admin1 code for its display name; the lookup key has the form
    # "<country code>.<admin1 code>". Must run before replace_country/2, while
    # attrs.country still holds the country code. Unknown keys become nil.
    defp replace_state(attrs, state_names) do
      %{attrs | state: state_names["#{attrs.country}.#{attrs.state}"]}
    end

    # Builds a map of "<country code>.<admin1 code>" => name from the first
    # two columns of the states file.
    defp compile_states(file) do
      file
      |> lookup_rows()
      |> Map.new(fn [key, value | _] -> {key, value} end)
    end

    # Swaps the country code for the country name. Unknown codes become nil.
    defp replace_country(attrs, country_names) do
      %{attrs | country: country_names[attrs.country]}
    end

    # Builds a map of country code (column 1) => country name (column 5).
    defp compile_countries(file) do
      file
      |> lookup_rows()
      |> Map.new(fn [key, _, _, _, value | _] -> {key, value} end)
    end

    # Streams the data rows of a tab-separated lookup file as lists of columns,
    # skipping "#" comment lines and blank lines. The trailing newline is
    # removed so that it can never end up inside a looked-up name.
    defp lookup_rows(file) do
      file
      |> File.stream!()
      |> Stream.map(&String.trim_trailing(&1, "\n"))
      |> Stream.reject(&(&1 == "" or String.starts_with?(&1, "#")))
      |> Stream.map(&String.split(&1, "\t"))
    end

    # Renders one city as a CSV line terminated by a newline. Text columns are
    # quoted; latitude and longitude are written as they appear in the input.
    defp to_line(attrs) do
      line =
        Enum.join(
          [
            quote_field(attrs.city),
            quote_field(attrs.state),
            quote_field(attrs.country),
            quote_field(attrs.timezone),
            attrs.latitude,
            attrs.longitude
          ],
          ","
        )

      "#{line}\n"
    end

    # Wraps a value in double quotes, doubling any embedded double quote.
    # nil (a failed lookup) is rendered as an empty quoted field.
    defp quote_field(value) do
      escaped = value |> to_string() |> String.replace("\"", "\"\"")
      "\"#{escaped}\""
    end
  end
end
# Example usage: compile the city list from the three geonames.org dump files
# located in the current working directory.
locations_dump = "allCountries.txt"
country_info = "countryInfo.txt"
admin1_codes = "admin1CodesASCII.txt"

Cities.Data.compile(locations_dump, country_info, admin1_codes)
@dgrl you would need to install Elixir. Please have a look here for further details: https://elixir-lang.org/install.html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi
What program do I need to run this script?
Regards