Last active
August 27, 2018 10:23
-
-
Save bobrik/6184751 to your computer and use it in GitHub Desktop.
Geonames duplicate finder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Geonames duplicate finder.
//
// Download some country file (RU.zip for example), unzip it and run:
//   php -d memory_limit=512M -f checker.php ~/RU.txt RU
// The summary appears in stderr, the actual duplicates in stdout.
//
// Two rows are reported as duplicates when they have the same name
// (column 1) and the same code path built from the country code (column 8)
// plus the non-empty, non-"00" values of columns 10..13.
// NOTE(review): the column meanings (6 = feature class, 8 = country code,
// 10..13 = admin1..admin4 codes) are assumed from the Geonames dump layout.

if (count($argv) < 3) {
    fwrite(STDERR, "please specify path to file with cities and country code\n");
    exit(1);
}

fwrite(STDERR, "processing file\n");

$fd = fopen($argv[1], "r");
if ($fd === false) {
    // Without this check every fgets() below would fail on a non-resource.
    fwrite(STDERR, "unable to open file ".$argv[1]."\n");
    exit(1);
}

// id => array(name, code path)
$cities = array();

// Compare with false explicitly: relying on the truthiness of the line would
// stop reading at the first blank line or at a line that is just "0".
while (($line = fgets($fd)) !== false) {
    $line = rtrim($line, "\r\n");

    if ($line === "") {
        continue;
    }

    $line = explode("\t", $line);

    // Columns 0..13 are read below; skip rows that are too short.
    if (count($line) < 14) {
        continue;
    }

    // only take cities
    if ($line[6] !== "P") {
        continue;
    }

    // only take rows of the requested country
    if ($line[8] !== $argv[2]) {
        continue;
    }

    // Build the code path, e.g. "RU.48.123". Strict comparison matters here:
    // with a loose one, numeric strings such as "0" and "00" are equal.
    $code = $line[8];

    for ($i = 0; $i < 4; $i++) {
        $part = $line[10 + $i];

        if ($part !== "" && $part !== "00") {
            $code .= ".".$part;
        }
    }

    $cities[$line[0]] = array($line[1], $code);
}

// The file is fully read at this point, no need to keep the handle open.
fclose($fd);

fwrite(STDERR, "processing complete, found ".count($cities)." cities. checking for duplicates\n");

// "name:code" => list of "name (id)" labels sharing that key
$duplicates = array();

foreach ($cities as $id => $city) {
    $index = $city[0].":".$city[1]; // name:code

    if (!isset($duplicates[$index])) {
        $duplicates[$index] = array();
    }

    $duplicates[$index][] = $city[0]." (".$id.")";
}

// Number of rows beyond the first one in every duplicated group.
$extra = 0;

foreach ($duplicates as $index => $found) {
    $total = count($found);

    if ($total > 1) {
        echo 'found '.$total.' cities by key '.$index.": ".implode(", ", $found)."\n";
        $extra += $total - 1;
    }
}

fwrite(STDERR, "all done, ".$extra." extra cities found\n");
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment