Skip to content

Instantly share code, notes, and snippets.

@bobrik
Last active August 27, 2018 10:23
Show Gist options
  • Save bobrik/6184751 to your computer and use it in GitHub Desktop.
Save bobrik/6184751 to your computer and use it in GitHub Desktop.
Geonames duplicate finder
<?php
// download some country file (RU.zip for example), unzip it and run:
// php -d memory_limit=512M -f checker.php ~/RU.txt RU
// summary will appear in stderr, actual duplicates in stdout
// GeoNames duplicate finder.
//
// Reads a tab-separated GeoNames country dump, keeps populated places of the
// requested country, and reports places that share both the same name and the
// same administrative code path.
//
// Arguments:
//   $argv[1]  path to the unzipped country file (e.g. RU.txt)
//   $argv[2]  country code to keep (e.g. RU)
//
// Output: progress/summary lines on stderr, one line per duplicate group on stdout.
// Exit status: 1 on missing arguments or an unreadable file.
//
// Column indexes used below follow the GeoNames dump layout (see its readme):
// 0 = id, 1 = name, 6 = feature class, 8 = country code, 10..13 = admin1..admin4.
if (count($argv) < 3) {
    fwrite(STDERR, "please specify path to file with cities and country code\n");
    exit(1);
}

fwrite(STDERR, "processing file\n");

$fd = fopen($argv[1], "r");
if ($fd === false) {
    // Without this check the read loop would run on an invalid handle.
    fwrite(STDERR, "cannot open file ".$argv[1]."\n");
    exit(1);
}

$cities = array();

// Compare against false explicitly: relying on the truthiness of the line
// would stop at the first blank line or at a line that is just "0".
while (($line = fgets($fd)) !== false) {
    $line = rtrim($line, "\r\n");
    if ($line === "") {
        // A stray blank line must not truncate the rest of the file.
        continue;
    }

    $fields = explode("\t", $line);

    // The highest index read below is 13; skip short/malformed rows
    // instead of raising undefined-offset notices.
    if (count($fields) < 14) {
        continue;
    }

    // only take cities
    if ($fields[6] !== "P") {
        continue;
    }

    if ($fields[8] !== $argv[2]) {
        continue;
    }

    // Build the administrative path: country code followed by every
    // non-empty admin level, e.g. "RU.48.123".
    $code = $fields[8];
    for ($i = 0; $i < 4; $i++) {
        $admin = $fields[10 + $i];
        // Strict comparison on purpose: with "!=" the numeric strings "0" and
        // "000" would also compare equal to the "00" placeholder.
        if ($admin !== "" && $admin !== "00") {
            $code .= ".".$admin;
        }
    }

    // Keyed by id, so a repeated id keeps only its last row.
    $cities[$fields[0]] = array($fields[1], $code);
}

// Everything needed is in memory now; release the handle before reporting.
fclose($fd);

fwrite(STDERR, "processing complete, found ".count($cities)." cities. checking for duplicates\n");

// Group city labels by "name:code"; any group with more than one member is a duplicate set.
$duplicates = array();
foreach ($cities as $id => $city) {
    $index = $city[0].":".$city[1]; // name:code
    $duplicates[$index][] = $city[0]." (".$id.")";
}

// Number of redundant entries: for each group, every member beyond the first.
$extra = 0;
foreach ($duplicates as $index => $found) {
    $total = count($found);
    if ($total > 1) {
        echo 'found '.$total.' cities by key '.$index.": ".implode(", ", $found)."\n";
        $extra += $total - 1;
    }
}

fwrite(STDERR, "all done, ".$extra." extra cities found\n");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment