Skip to content

Instantly share code, notes, and snippets.

@zemd
Created April 8, 2015 19:51
Show Gist options
  • Save zemd/12727187f308919f964a to your computer and use it in GitHub Desktop.
Save zemd/12727187f308919f964a to your computer and use it in GitHub Desktop.
Parsing data from geonames.org into MongoDB
<?php
namespace Opesho\CommonBundle\Command;
use MongoClient;
use MongoDate;
use MongoDB;
use SplStack;
use Symfony\Bundle\FrameworkBundle\Command\ContainerAwareCommand;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
class ParseGeonamesCommand extends ContainerAwareCommand
{
/** @var MongoDB $db Database handle selected in execute(). */
protected $db;
/** @var OutputInterface Console output used by the debug()/info()/warn() helpers. */
protected $output;
/** @var \MongoCollection The "countries" collection of $db. */
protected $countries;
/** @var \MongoCollection The "states" collection of $db. */
protected $states;
/** @var \MongoCollection The "cities" collection of $db. */
protected $cities;
/** @var SplStack Stack of microtime() marks pushed by timeStart() and popped by timeEnd(). */
protected $time;
/** @var array Geoname ids collected by proceedCountries(); consulted by saveCountryNames(). */
protected $cacheCountries = array();
/** @var array Geoname ids collected by proceedStates(); consulted by saveStatesNames(). */
protected $cacheStates = array();
/** @var array Geoname ids collected by proceedCities(); consulted by saveCitiesNames() and assignCityPreparedName(). */
protected $cacheCities = array();
/** @var array Values of the "skip" option; consulted by isSkipped(). */
protected $skip = array();
/**
 * Declares the command name, description, input definition and help text.
 *
 * Input definition:
 *  - argument "folder" (required): folder with the geonames files;
 *  - option "skip" (repeatable, value required): steps to leave out;
 *  - option "refresh" (flag).
 *
 * @see Command
 */
protected function configure()
{
    $folderArgument = new InputArgument('folder', InputArgument::REQUIRED, "Folder with geonames files");
    $skipOption = new InputOption(
        "skip",
        null,
        InputOption::VALUE_REQUIRED | InputOption::VALUE_IS_ARRAY,
        "You can skip each step of population data"
    );
    $refreshOption = new InputOption("refresh", null, InputOption::VALUE_NONE, "Refresh alternative names");

    $helpText = <<<EOT
The <info>opesho:geonames:parse</info> command populates data for countries and cities:
<info>php app/console opesho:geonames:parse</info>
EOT;

    $this->setName('opesho:geonames:parse');
    $this->setDescription('Populating countries');
    $this->setDefinition(array($folderArgument, $skipOption, $refreshOption));
    $this->setHelp($helpText);
}
/**
 * Tells whether a step was disabled through the "skip" option.
 *
 * @param string $key Step identifier, e.g. "cities" or "states_ADM1".
 *
 * @return bool True when $key is one of the values stored in $this->skip.
 */
protected function isSkipped($key)
{
    // Step keys and option values are plain strings, so compare strictly to
    // avoid PHP's loose numeric-string matching; also tolerate a skip list
    // that has not been initialised to an array.
    return is_array($this->skip) && in_array((string) $key, $this->skip, true);
}
/**
 * Runs the whole import: countries, states (ADM1 and ADM2), cities, and
 * finally the "preferred name" post-processing of each collection.
 *
 * The "folder" argument must contain allCountries.txt, alternateNames.txt
 * and countryInfo.txt. Individual steps can be disabled with the "skip"
 * option (see isSkipped()).
 *
 * @param InputInterface  $input  Console input (argument "folder", option "skip").
 * @param OutputInterface $output Console output used for progress messages.
 *
 * @throws \InvalidArgumentException When "folder" is not an existing directory.
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $this->output = $output;

    // realpath() returns false for a missing path; without this check the
    // file names below would silently resolve to "/allCountries.txt" etc.
    $folder = realpath($input->getArgument('folder'));
    if ($folder === false || !is_dir($folder)) {
        throw new \InvalidArgumentException(sprintf(
            'Folder "%s" does not exist or is not a directory.',
            $input->getArgument('folder')
        ));
    }

    $this->skip = $input->getOption('skip');
    $allCountries = $folder . DIRECTORY_SEPARATOR . 'allCountries.txt';
    $alternativeNames = $folder . DIRECTORY_SEPARATOR . 'alternateNames.txt';
    $countryInfo = $folder . DIRECTORY_SEPARATOR . 'countryInfo.txt';

    $m = new MongoClient($this->getContainer()->getParameter('mongodb_server'));
    $this->db = $m->custom_db; // <---- REPLACE WITH YOUR DATABASE NAME
    $this->countries = $this->db->countries;
    $this->states = $this->db->states;
    $this->cities = $this->db->cities;

    // COUNTRIES ~193
    $this->timeStart();
    $this->info("Inserting countries...");
    $this->populate($allCountries, "A", "PCLI", array($this, 'proceedCountries'));
    $this->timeEnd();

    $this->timeStart();
    $this->info("Updating countries' names...");
    if ($this->isSkipped('countries_names')) {
        $this->warn("Updating countries' names is skipped");
    } else {
        $this->populateNames($alternativeNames, array($this, 'saveCountryNames'));
    }
    $this->timeEnd();

    $this->timeStart();
    $this->info("Updating countries' info...");
    if ($this->isSkipped("countries_info")) {
        $this->warn("Updating countries' info is skipped");
    } else {
        $this->populateCountryInfo($countryInfo, array($this, 'saveCountryInfo'));
    }
    $this->timeEnd();

    // The country id cache is no longer needed; keep it an array so that a
    // later in_array()/count() on it cannot fail.
    $this->cacheCountries = array();

    // STATES - REGIONS ~153 562
    // Only ADM1 and ADM2 are imported; ADM3-ADM5 are intentionally left out.
    foreach (array('ADM1', 'ADM2') as $level) {
        $this->timeStart();
        $this->info("Inserting states {$level}...");
        $this->populate($allCountries, "A", $level, array($this, "proceedStates"));
        $this->timeEnd();
    }

    $this->timeStart();
    $this->info("Updating states' names...");
    if ($this->isSkipped("states_names")) {
        $this->warn("Updating states' names is skipped");
    } else {
        $this->populateNames($alternativeNames, array($this, 'saveStatesNames'));
    }
    $this->timeEnd();

    // CITIES
    // Each pass: feature code, progress message, minimum population (false = no limit).
    $cityPasses = array(
        array('PPLC', "Inserting capitals", false),
        array('PPLA', "Inserting first-order administrative division", false),
        array('PPLA2', "Inserting second-order administrative division", false),
        array('PPLA3', "Inserting third-order administrative division", false),
        array('PPLA4', "Inserting fourth-order administrative division", false),
        array('PPL', "Inserting populated places (population >= 100000)", 100000),
    );
    foreach ($cityPasses as $pass) {
        list($featureCode, $message, $populationLimit) = $pass;
        $this->timeStart();
        $this->info($message);
        $this->populate($allCountries, "P", $featureCode, array($this, 'proceedCities'), $populationLimit);
        $this->timeEnd();
    }

    $this->timeStart();
    $this->info("Updating cities' names...");
    // Honour "cities_names" the same way the countries/states steps honour
    // their *_names keys (the prepared-name step below already checks it).
    if ($this->isSkipped("cities_names")) {
        $this->warn("Updating cities' names is skipped");
    } else {
        $this->populateNames($alternativeNames, array($this, 'saveCitiesNames'));
    }
    $this->timeEnd();

    // ADDITIONAL
    if (!$this->isSkipped('countries_names')) {
        $this->timeStart();
        $this->info("Updating countries' prepared names...");
        $this->assignCountryPreparedName();
        $this->timeEnd();
    }
    if (!$this->isSkipped("states_names")) {
        $this->timeStart();
        $this->info("Updating states' prepared names...");
        $this->assignStatePreparedName();
        $this->timeEnd();
    }
    if (!$this->isSkipped("cities_names")) {
        $this->timeStart();
        $this->info("Updating cities' prepared names...");
        $this->assignCityPreparedName();
        $this->timeEnd();
    }

    $this->cacheStates = array();
    $this->cacheCities = array();
    echo " ";
}
/**
 * Upserts one populated-place row of allCountries.txt into the "cities"
 * collection and remembers its geoname id in $this->cacheCities.
 *
 * Nothing is done when the "cities" step, or the "cities_<featureCode>"
 * step, is skipped. The parameters are the 19 tab-separated columns of a
 * GeoNames row, in file order.
 *
 * @param string $geonameid        GeoNames id (stored as int).
 * @param string $name             Place name.
 * @param string $asciiName        ASCII place name.
 * @param string $alternateNames   Unused here.
 * @param string $latitude         Stored as float.
 * @param string $longitude        Stored as float.
 * @param string $featureClass     Unused here.
 * @param string $featureCode      Feature code; "PPLC" gets order 0, everything else 500.
 * @param string $countryCode      Country code used to look up the country document.
 * @param string $cc2              Unused here.
 * @param string $admin1Code       Used to look up the ADM1 state document.
 * @param string $admin2Code       Stored in "states".
 * @param string $admin3Code       Stored in "states".
 * @param string $admin4Code       Stored in "states".
 * @param string $population       Stored as int.
 * @param string $elevation        Unused here.
 * @param string $dem              Unused here.
 * @param string $timezone         Unused here.
 * @param string $modificationDate Parsed as a UTC date for "modified_at".
 */
protected function proceedCities(
    $geonameid,
    $name,
    $asciiName,
    $alternateNames,
    $latitude,
    $longitude,
    $featureClass,
    $featureCode,
    $countryCode,
    $cc2,
    $admin1Code,
    $admin2Code,
    $admin3Code,
    $admin4Code,
    $population,
    $elevation,
    $dem,
    $timezone,
    $modificationDate
)
{
    if ($this->isSkipped("cities") || $this->isSkipped("cities_{$featureCode}")) {
        return;
    }

    $geonameid = intval($geonameid);
    $date = new \DateTime($modificationDate, new \DateTimeZone('UTC'));
    $this->cacheCities[] = $geonameid;

    $country = $this->countries->findOne(array('country_code' => $countryCode), array());

    // State documents are written by proceedStates() with the key
    // "admin1_code" (there is no "region_code" field), and both ADM1 and
    // ADM2 rows carry an admin1 code, so restrict the match to the ADM1 row.
    $state = $this->states->findOne(
        array(
            'country_code' => $countryCode,
            'admin1_code' => $admin1Code,
            'feature_code' => 'ADM1'
        ),
        array()
    );

    $this->cities->update(
        array(
            'geonameid' => $geonameid
        ),
        array(
            '$set' => array(
                'geonameid' => $geonameid,
                'name' => $name,
                'ascii_name' => $asciiName,
                'latitude' => floatval($latitude),
                'longitude' => floatval($longitude),
                'order' => $featureCode == 'PPLC' ? 0 : 500,
                'modified_at' => new MongoDate($date->getTimestamp()),
                'country_code' => $countryCode,
                // findOne() yields null when nothing matches; store a null
                // reference instead of a DBRef pointing at a null id.
                'country' => isset($country['_id']) ? \MongoDBRef::create('countries', $country['_id']) : null,
                'states' => [$admin1Code, $admin2Code, $admin3Code, $admin4Code],
                'state' => isset($state['_id']) ? \MongoDBRef::create('states', $state['_id']) : null,
                'feature_code' => $featureCode,
                'population' => intval($population)
            )
        ),
        array(
            'upsert' => true
        )
    );
}
/**
 * Upserts one administrative-division row of allCountries.txt into the
 * "states" collection and records its geoname id in $this->cacheStates.
 *
 * When the "states" step or the "states_<featureCode>" step is skipped, a
 * warning is emitted (at most once per message) and nothing is stored.
 * The parameters are the 19 tab-separated columns of a GeoNames row, in
 * file order; only the id, names, country code, admin codes and feature
 * code are used.
 */
protected function proceedStates(
    $geonameid,
    $name,
    $asciiName,
    $alternateNames,
    $latitude,
    $longitude,
    $featureClass,
    $featureCode,
    $countryCode,
    $cc2,
    $admin1Code,
    $admin2Code,
    $admin3Code,
    $admin4Code,
    $population,
    $elevation,
    $dem,
    $timezone,
    $modificationDate
) {
    // Skip keys, checked in this order, mapped to the warning they trigger.
    $skipNotices = [
        "states" => "Skipping states...",
        "states_{$featureCode}" => "Skipping states {$featureCode}",
    ];
    foreach ($skipNotices as $skipKey => $notice) {
        if ($this->isSkipped($skipKey)) {
            $this->warn($notice, 1);
            return;
        }
    }

    $stateId = intval($geonameid);
    $this->cacheStates[] = $stateId;

    $selector = ['geonameid' => $stateId];
    $fields = [
        'geonameid' => $stateId,
        'name' => $name,
        'ascii_name' => $asciiName,
        'country_code' => $countryCode,
        'admin1_code' => $admin1Code,
        'admin2_code' => $admin2Code,
        'admin3_code' => $admin3Code,
        'admin4_code' => $admin4Code,
        'feature_code' => $featureCode,
    ];

    $this->states->update($selector, ['$set' => $fields], ['upsert' => true]);
}
/**
 * Pushes the current microtime onto the timing stack, creating the stack on
 * first use. Every call is meant to be matched by a later timeEnd().
 */
private function timeStart()
{
    $this->time = $this->time ?: new SplStack();
    $this->time->push(microtime(true));
}
/**
 * Pops the most recent start mark pushed by timeStart() and prints the
 * elapsed time, in minutes, as a debug message.
 */
private function timeEnd()
{
    $finishedAt = microtime(true);
    $startedAt = $this->time->pop();
    $minutes = ($finishedAt - $startedAt) / 60;
    $this->debug("Operation executed in: <info>{$minutes}</info> mins.");
}
/**
 * Writes a line prefixed with a "[DEBUG]" tag to the console output.
 *
 * @param string $message Text to print; may contain console formatting tags.
 */
private function debug($message)
{
    $line = '<comment>[DEBUG]</comment> ' . $message;
    $this->output->writeln($line);
}
/**
 * Writes a line prefixed with an "[INFO]" tag to the console output.
 *
 * @param string $message Text to print; may contain console formatting tags.
 */
private function info($message)
{
    $line = '<info>[INFO]</info> ' . $message;
    $this->output->writeln($line);
}
/**
 * Writes a line prefixed with a "[WARN]" tag to the console output,
 * optionally limiting how often the same message is printed.
 *
 * @param string $message Text to print.
 * @param int    $times   Maximum number of times this exact message may be
 *                        printed; 0 (the default) means no limit.
 */
private function warn($message, $times = 0)
{
    // Per-message print counter, kept for the lifetime of the process.
    static $printed = [];

    $count = isset($printed[$message]) ? $printed[$message] : 0;

    // ">=" rather than "===": the counter also grows on unlimited calls, so
    // it may already be past $times when a limited call arrives.
    if ($times > 0 && $count >= $times) {
        return;
    }

    $printed[$message] = $count + 1;
    $this->output->writeln("<error>[WARN]</error> {$message}");
}
/**
 * Stores the ASCII name, the language list and the derived locale of one
 * countryInfo.txt row on the matching country document.
 *
 * The parameters are the columns of countryInfo.txt in file order; only
 * $countryName, $languages and $geonameid are used.
 *
 * @param array|string $languages Language tags of the country, either
 *                                already split into an array or as the raw
 *                                comma-separated column.
 * @param string       $geonameid GeoNames id of the country (matched as int).
 */
protected function saveCountryInfo(
    $iso,
    $iso3,
    $isoNumeric,
    $fips,
    $countryName,
    $capital,
    $area,
    $population,
    $continent,
    $tld,
    $currencyCode,
    $currencyName,
    $phone,
    $postalCodeFormat,
    $postCodeRegex,
    $languages,
    $geonameid,
    $neighbors,
    $equivalentFipsCode
) {
    if (!is_array($languages)) {
        $languages = explode(',', (string) $languages);
    }
    // Drop empty entries: an empty column explodes to [""], which used to
    // produce a locale of false or "" depending on the PHP version.
    $languages = array_values(array_filter(array_map('trim', $languages), 'strlen'));

    // The locale is the language part of the first tag ("en-US" -> "en").
    // Taking the part before "-" instead of the first two bytes keeps
    // three-letter codes intact, so they can still match the lower-cased
    // language codes stored with the alternative names.
    $locale = null;
    if (!empty($languages)) {
        $parts = explode('-', $languages[0]);
        $locale = strtolower($parts[0]);
    }

    $upd = array(
        '$set' => array(
            'ascii_name' => $countryName,
            'locale' => $locale,
            'languages' => $languages
        )
    );
    $this->countries->update(array('geonameid' => intval($geonameid)), $upd);
}
/**
 * Prints a one-character progress spinner (optionally preceded by a status
 * text) directly to standard output and returns the cursor to the start of
 * the line with carriage returns.
 *
 * The spinner frame and the previously printed line are kept in static
 * variables between calls.
 *
 * @param string $text Optional status text shown in front of the spinner.
 */
private function tick($text = "")
{
    static $frame = '/';
    static $previous = "";

    // The first character is repeated at the end so that "next character"
    // wraps around.
    $frames = '\\|/-\\';
    $frame = substr($frames, strpos($frames, $frame) + 1, 1);

    // Nothing but a spinner has been shown so far and no text is requested:
    // print the bare spinner.
    if (empty($text) && empty(trim($previous))) {
        echo "{$frame}\r";
        return;
    }

    // Pad to the length of the previous line so leftovers are overwritten.
    $rendered = str_pad("{$text} {$frame}", strlen($previous));
    echo $rendered . str_repeat("\r", strlen($rendered));
    $previous = $rendered;
}
/**
 * Prints the peak memory usage (in MB) as a debug message whenever it has
 * grown by more than 0.1 MB since the last reported value.
 */
private function memory()
{
    static $reportedPeak = 0.0;

    $peakMb = memory_get_peak_usage() / 1024 / 1024;
    if ($peakMb - $reportedPeak <= 0.1) {
        return;
    }

    $this->debug("Memory used: <info>{$peakMb}</info> MB");
    $reportedPeak = $peakMb;
}
/**
 * Reads a countryInfo.txt file and invokes $callback once per data row.
 *
 * Lines starting with "#" and blank lines are ignored. Each row is split on
 * tabs into 19 columns (missing trailing columns become empty strings) and
 * passed to the callback as 19 positional arguments; the languages column
 * (16th) is split on "," into an array first.
 *
 * @param string   $file     Path of the countryInfo.txt file.
 * @param callable $callback Receives the 19 columns in file order.
 *
 * @throws \RuntimeException When the file cannot be opened for reading.
 */
protected function populateCountryInfo($file, $callback)
{
    $columns = 19;
    $languagesColumn = 15;

    $handle = is_readable($file) ? fopen($file, 'r') : false;
    if ($handle === false) {
        throw new \RuntimeException("Unable to open country info file: {$file}");
    }

    try {
        // Compare against false so that a line that evaluates to false
        // (such as "0") does not end the loop early.
        while (($line = fgets($handle)) !== false) {
            // Strip the line terminator so it does not leak into the last column.
            $line = rtrim($line, "\r\n");
            if ($line === '' || $line[0] === '#') {
                continue;
            }

            $data = array_slice(array_pad(explode("\t", $line), $columns, ''), 0, $columns);
            $this->tick();

            $data[$languagesColumn] = explode(',', $data[$languagesColumn]);
            call_user_func_array($callback, $data);

            $this->memory();
        }
    } finally {
        // Release the handle even when the callback throws.
        fclose($handle);
    }
}
/**
 * Sets "preferred_name" on every country that has usable alternative names.
 *
 * CAUTION (note kept from the original author): memory leaks were observed
 * in this method.
 */
protected function assignCountryPreparedName()
{
    foreach ($this->countries->find() as $country) {
        // Not every country document is guaranteed to carry these keys
        // (e.g. when the names or info steps were skipped on an earlier run).
        $locale = isset($country['locale']) ? $country['locale'] : null;
        $alternatives = isset($country['alternative_names']) ? $country['alternative_names'] : array();

        $preferredName = $this->choosePreferredName($alternatives, $locale);
        if ($preferredName === null) {
            continue;
        }

        $this->tick();
        $this->countries->update(
            array('geonameid' => $country['geonameid']),
            array('$set' => array('preferred_name' => $preferredName))
        );
        $this->memory();
    }
}

/**
 * Sets "preferred_name" on every state that has usable alternative names,
 * using the locale of the state's country.
 *
 * CAUTION (note kept from the original author): memory leaks were observed
 * in this method.
 */
protected function assignStatePreparedName()
{
    $locales = array(); // country_code => locale, filled on demand
    $counter = 0;

    foreach ($this->states->find() as $state) {
        if (empty($state['alternative_names'])) {
            continue;
        }

        $countryCode = $state['country_code'];
        if (!array_key_exists($countryCode, $locales)) {
            // One lookup per country instead of one per state.
            $country = $this->countries->findOne(array('country_code' => $countryCode), array('locale'));
            $locales[$countryCode] = isset($country['locale']) ? $country['locale'] : null;
        }

        $preferredName = $this->choosePreferredName($state['alternative_names'], $locales[$countryCode]);
        if ($preferredName === null) {
            continue;
        }

        $counter += 1;
        $this->tick("{$counter} states' names updated...");
        $this->states->update(
            array('geonameid' => $state['geonameid']),
            array('$set' => array('preferred_name' => $preferredName))
        );
        $this->memory();
    }
}

/**
 * Sets "preferred_name" on every city imported during this run (ids in
 * $this->cacheCities) that has usable alternative names, using the locale
 * of the city's country.
 *
 * CAUTION (note kept from the original author): memory leaks were observed
 * in this method.
 */
protected function assignCityPreparedName()
{
    $locales = array(); // country_code => locale, filled on demand
    $cities = $this->cities->find(array('geonameid' => array('$in' => $this->cacheCities)));

    foreach ($cities as $city) {
        if (empty($city['alternative_names'])) {
            continue;
        }

        $countryCode = $city['country_code'];
        if (!array_key_exists($countryCode, $locales)) {
            // One lookup per country instead of one per city.
            $country = $this->countries->findOne(array('country_code' => $countryCode), array('locale'));
            $locales[$countryCode] = isset($country['locale']) ? $country['locale'] : null;
        }

        $preferredName = $this->choosePreferredName($city['alternative_names'], $locales[$countryCode]);
        if ($preferredName === null) {
            continue;
        }

        $this->tick();
        $this->cities->update(
            array('geonameid' => $city['geonameid']),
            array('$set' => array('preferred_name' => $preferredName))
        );
        $this->memory();
    }
}

/**
 * Picks the best display name out of a document's alternative names.
 *
 * Only names in the country's locale or in English are considered, in this
 * order of preference:
 *   1. locale + is_preferred_name
 *   2. locale + !is_preferred_name
 *   3. 'en'   + is_preferred_name
 *   4. 'en'   + !is_preferred_name
 *
 * @param array|null  $alternatives Entries with "name", "locale" and "is_preferred_name".
 * @param string|null $locale       Locale of the owning country, if known.
 *
 * @return string|null The chosen name, or null when no entry qualifies.
 */
private function choosePreferredName($alternatives, $locale)
{
    if (empty($alternatives) || !is_array($alternatives)) {
        return null;
    }

    $candidates = array();
    foreach ($alternatives as $alternative) {
        $this->tick();

        $isLocal = $locale !== null && $alternative['locale'] == $locale;
        if (!$isLocal && $alternative['locale'] !== 'en') {
            continue;
        }

        $candidates[] = array(
            'name' => $alternative['name'],
            'locale' => $alternative['locale'],
            'is_preferred_name' => $alternative['is_preferred_name']
        );

        // A preferred name in the country's own locale cannot be beaten;
        // stop collecting.
        if ($isLocal && $alternative['is_preferred_name']) {
            break;
        }
    }

    if (empty($candidates)) {
        return null;
    }

    $this->sortPreferred($candidates, $locale);

    return $candidates[0]['name'];
}

/**
 * Sorts alternative-name entries in place, best candidate first.
 *
 * Entries in $locale come before all others; within the same group,
 * entries flagged is_preferred_name come first; remaining ties are ordered
 * by name, descending, so the result is deterministic.
 *
 * @param array       $arr    Entries with "name", "locale" and "is_preferred_name"; sorted by reference.
 * @param string|null $locale Locale that takes precedence.
 */
private function sortPreferred(array &$arr, $locale)
{
    usort($arr, function ($a, $b) use ($locale) {
        // Lower rank wins: 0 = locale + preferred ... 3 = other + not preferred.
        $rankA = ($a['locale'] == $locale ? 0 : 2) + ($a['is_preferred_name'] ? 0 : 1);
        $rankB = ($b['locale'] == $locale ? 0 : 2) + ($b['is_preferred_name'] ? 0 : 1);

        if ($rankA !== $rankB) {
            return $rankA < $rankB ? -1 : 1;
        }

        // Always return an int: usort() needs a total order.
        return strcmp((string) $b['name'], (string) $a['name']);
    });
}
/**
 * Streams an alternateNames.txt file and hands every usable name to
 * $callback.
 *
 * Rows are skipped when they are flagged historic or colloquial, carry no
 * language code, or use one of the pseudo language codes (link, abbr,
 * fr_1793, faac, icao, iata, post). The callback is invoked with
 * ($geonameid, $id, $isocode, $name, $isPreferredName, $isShortName) and
 * must return true when it stored the name; those are counted and the total
 * is reported at the end.
 *
 * @param string   $source   Path of the alternateNames.txt file.
 * @param callable $callback One of the save*Names() methods.
 *
 * @throws \RuntimeException When the file cannot be opened for reading.
 */
protected function populateNames($source, $callback)
{
    $ignoredCodes = array('link', 'abbr', 'fr_1793', 'faac', 'icao', 'iata', 'post');

    $handle = is_readable($source) ? fopen($source, 'r') : false;
    if ($handle === false) {
        throw new \RuntimeException("Unable to open alternate names file: {$source}");
    }

    $counter = 0;
    try {
        while (($line = fgets($handle)) !== false) {
            // Strip the line terminator: otherwise the last column reads
            // "1\n", which does not equal '1' on PHP < 8 and lets historic
            // names through.
            $line = rtrim($line, "\r\n");
            if ($line === '') {
                continue;
            }

            // Pad so that rows with missing trailing flags do not raise
            // undefined-offset notices.
            $data = array_pad(explode("\t", $line), 8, '');
            list(
                $id,
                $geonameid,
                $isocode,
                $name,
                $isPreferredName,
                $isShortName,
                $isColloquial,
                $isHistoric
            ) = $data;

            if ($isHistoric === '1'
                || $isColloquial === '1'
                || empty($isocode)
                || in_array($isocode, $ignoredCodes, true)
            ) {
                continue;
            }

            $populated = call_user_func(
                $callback,
                $geonameid,
                $id,
                $isocode,
                $name,
                $isPreferredName,
                $isShortName
            );
            if ($populated === true) {
                $counter += 1;
            }

            $this->tick("Populated {$counter} entities");
            $this->memory();
        }
    } finally {
        // Release the handle even when the callback throws.
        fclose($handle);
    }

    $this->info("Populated {$counter} entities' names");
}
/**
 * @var array Lookup tables built by isCachedId(): bucket name =>
 *            array('size' => int, 'ids' => array<int, int>).
 */
private $idIndexes = array();

/**
 * Adds one alternative name to the city with the given geoname id,
 * provided that id was imported during this run ($this->cacheCities).
 *
 * @param string $geonameid       GeoNames id of the owning city.
 * @param string $id              Alternate-name id (unused).
 * @param string $isocode         Language code of the name; stored lower-cased as "locale".
 * @param string $name            The alternative name.
 * @param string $isPreferredName '1' when flagged as preferred.
 * @param string $isShortName     '1' when flagged as short.
 *
 * @return bool|null True when an update was issued, null when the id is unknown.
 */
protected function saveCitiesNames(
    $geonameid,
    $id,
    $isocode,
    $name,
    $isPreferredName,
    $isShortName
)
{
    return $this->storeAlternativeName(
        $this->cities,
        'cities',
        $this->cacheCities,
        $geonameid,
        $isocode,
        $name,
        $isPreferredName,
        $isShortName
    );
}

/**
 * Adds one alternative name to the state with the given geoname id,
 * provided that id was imported during this run ($this->cacheStates).
 *
 * Parameters and return value are the same as for saveCitiesNames().
 *
 * @return bool|null True when an update was issued, null when the id is unknown.
 */
protected function saveStatesNames(
    $geonameid,
    $id,
    $isocode,
    $name,
    $isPreferredName,
    $isShortName
)
{
    return $this->storeAlternativeName(
        $this->states,
        'states',
        $this->cacheStates,
        $geonameid,
        $isocode,
        $name,
        $isPreferredName,
        $isShortName
    );
}

/**
 * Adds one alternative name to the country with the given geoname id,
 * provided that id was seen during this run ($this->cacheCountries).
 *
 * Parameters and return value are the same as for saveCitiesNames().
 *
 * @return bool|null True when an update was issued, null when the id is unknown.
 */
protected function saveCountryNames(
    $geonameid,
    $id,
    $isocode,
    $name,
    $isPreferredName,
    $isShortName
) {
    return $this->storeAlternativeName(
        $this->countries,
        'countries',
        $this->cacheCountries,
        $geonameid,
        $isocode,
        $name,
        $isPreferredName,
        $isShortName
    );
}

/**
 * Shared implementation of the save*Names() callbacks: appends the name to
 * the document's "alternative_names" set when the id is a known one.
 *
 * @param \MongoCollection $collection Collection holding the documents.
 * @param string           $bucket     Name under which the id lookup table is kept.
 * @param array|null       $knownIds   Geoname ids imported during this run.
 *
 * @return bool|null True when an update was issued, null when the id is unknown.
 */
private function storeAlternativeName(
    $collection,
    $bucket,
    $knownIds,
    $geonameid,
    $isocode,
    $name,
    $isPreferredName,
    $isShortName
) {
    $geonameid = intval($geonameid);
    if (!$this->isCachedId($bucket, $knownIds, $geonameid)) {
        return null;
    }

    $upd = array(
        '$addToSet' => array(
            'alternative_names' => array(
                'locale' => strtolower($isocode),
                'name' => $name,
                'is_preferred_name' => $isPreferredName == '1',
                'is_short_name' => $isShortName == '1'
            )
        )
    );
    $collection->update(array('geonameid' => $geonameid), $upd);

    return true;
}

/**
 * Tells whether $geonameid is one of $knownIds.
 *
 * This runs once per line of the alternate-names file, so a linear
 * in_array() over the id list is far too slow; a flipped copy of the list
 * is kept per bucket and rebuilt only when the list's size changes (the id
 * caches are append-only while a run is in progress).
 *
 * @param string     $bucket    Name of the lookup table.
 * @param array|null $knownIds  List of integer ids; null or empty means "none known".
 * @param int        $geonameid Id to look for.
 *
 * @return bool
 */
private function isCachedId($bucket, $knownIds, $geonameid)
{
    if (empty($knownIds) || !is_array($knownIds)) {
        return false;
    }

    $size = count($knownIds);
    if (!isset($this->idIndexes[$bucket]) || $this->idIndexes[$bucket]['size'] !== $size) {
        $this->idIndexes[$bucket] = array('size' => $size, 'ids' => array_flip($knownIds));
    }

    return isset($this->idIndexes[$bucket]['ids'][$geonameid]);
}
/**
 * Records the geoname id of one country row of allCountries.txt in
 * $this->cacheCountries and, unless the "countries" step is skipped,
 * upserts the country into the "countries" collection.
 *
 * The id is cached even when the step is skipped. The parameters are the
 * 19 tab-separated columns of a GeoNames row, in file order; only the id,
 * names, country code and modification date are used.
 */
protected function proceedCountries(
    $geonameid,
    $name,
    $asciiName,
    $alternateNames,
    $latitude,
    $longitude,
    $featureClass,
    $featureCode,
    $countryCode,
    $cc2,
    $admin1Code,
    $admin2Code,
    $admin3Code,
    $admin4Code,
    $population,
    $elevation,
    $dem,
    $timezone,
    $modificationDate
) {
    $countryId = intval($geonameid);
    $this->cacheCountries[] = $countryId;

    if (!$this->isSkipped('countries')) {
        $modifiedAt = new \DateTime($modificationDate, new \DateTimeZone('UTC'));

        $selector = ['geonameid' => $countryId];
        $fields = [
            'geonameid' => $countryId,
            'name' => $name,
            'ascii_name' => $asciiName,
            'country_code' => $countryCode,
            'modified_at' => new MongoDate($modifiedAt->getTimestamp()),
            'states_included' => true,
            'active' => true,
        ];

        $this->countries->update($selector, ['$set' => $fields], ['upsert' => true]);
    }
}
/**
 * Streams an allCountries.txt file and invokes $callback for every row
 * whose feature class and feature code match the given values.
 *
 * Each matching row is passed to the callback as 19 positional arguments,
 * one per tab-separated column in file order (missing trailing columns
 * become empty strings).
 *
 * @param string    $file             Path of the allCountries.txt file.
 * @param string    $paramFatureClass Feature class to keep (7th column), e.g. "A" or "P".
 * @param string    $paramFeatureCode Feature code to keep (8th column), e.g. "ADM1".
 * @param callable  $callback         One of the proceed*() methods.
 * @param int|false $populationLimit  When not false, rows whose population
 *                                    (15th column) is below this value are skipped.
 *
 * @throws \RuntimeException When the file cannot be opened for reading.
 */
protected function populate($file, $paramFatureClass, $paramFeatureCode, $callback, $populationLimit = false)
{
    $columns = 19;

    $handle = is_readable($file) ? fopen($file, 'r') : false;
    if ($handle === false) {
        throw new \RuntimeException("Unable to open geonames file: {$file}");
    }

    try {
        // Compare against false so that a line that evaluates to false
        // (such as "0") does not end the loop early.
        while (($line = fgets($handle)) !== false) {
            // Strip the line terminator so it does not end up in the last
            // column (the modification date).
            $data = explode("\t", rtrim($line, "\r\n"));

            // Blank or truncated lines have no feature columns to test.
            if (!isset($data[7])) {
                continue;
            }
            if ($data[6] !== (string) $paramFatureClass || $data[7] !== (string) $paramFeatureCode) {
                continue;
            }

            $data = array_slice(array_pad($data, $columns, ''), 0, $columns);
            if ($populationLimit !== false && intval($data[14]) < $populationLimit) {
                continue;
            }

            $this->tick();
            call_user_func_array($callback, $data);
            $this->memory();
        }
    } finally {
        // Release the handle even when the callback throws.
        fclose($handle);
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment