Skip to content

Instantly share code, notes, and snippets.

@siffash
Last active February 19, 2018 20:52
Show Gist options
  • Save siffash/d3de209983cd571ee3f429af083249f0 to your computer and use it in GitHub Desktop.
Save siffash/d3de209983cd571ee3f429af083249f0 to your computer and use it in GitHub Desktop.
Extract all unique languages from OSM .bz2 files + all unique prefixes & suffixes
<?php
/*****************
*** README ***
Purpose:
- collects every unique language code used in OSM "name:<lang>" tags, together with
  usage counts and a few example values, plus every unique value of the
  "name:prefix" / "name:suffix" / "name:postfix" tags
Prerequisites:
- PHP / Apache x64
- the following PHP extensions installed & enabled: intl, bz2
Caveats:
- multi-stream bz2 files are not fully processed: PHP's bz2 extension treats the end of the
  first stream (EOS) as EOF, so the script reports such a file as skipped
- the script skips certain names, see $names_to_skip
- the script skips the names which contain numbers only, see is_numeric
- the script treats "en_1" and "en1" as "en", see preg_replace
- the script does not print the languages which are used in OSM 10 times or fewer, see if($freq > 10)
How To Use:
1) download bz2 files containing OSM XML data (preferably from download.geofabrik.de)
2) put them into the same directory as this script
3) run the script
FYI:
- if you want to see detailed progress (every 1 GB processed), uncomment the commented part of the code
*****************/

// Show every error, warning and notice directly in the generated page.
error_reporting(E_ALL);
ini_set('display_errors', '1');

// Lift the memory and execution-time limits for this run.
ini_set('memory_limit', '-1');
ini_set('max_execution_time', '0');
set_time_limit(0);
// Sub-keys seen after "name:" that are not language codes (tag qualifiers, typos,
// import artefacts, ...). Any key found in this list is ignored by the language counter.
$names_to_skip = [
    'prefix', 'suffix', 'postfix', 'botanical', 'left', 'right', 'url', 'abbreviation', 'source', 'gbif_equivalent_tag',
    'simple', 'individualid', 'locality', 'alt_name', 'vernacular', 'bridge', 'old', 'historic', 'map-bms', 'language',
    'movibus', 'operator', 'actual', 'forward', 'backward', 'comment', 'vulgar', 'spatial', 'wikipedia', 'tunnel',
    'disused', 'int', 'level', 'generic', 'variants', 'informal', 'village', 'type', 'pronunciation', 'former',
    'short', 'maybe', 'alternative', 'demolished', 'was', 'review', 'house', 'transliteration', 'fonetic', 'end',
    'street', 'abbr', 'desc', 'full', 'acronym', 'source_ref', 'brand', 'int_name', 'room', 'local',
    'short_name', 'old_name', 'acryonym', 'international', 'religion', 'possible', 'color', 'amenity', 'place', 'hydropower_project',
    'date', 'link', 'etymology', 'railway', 'start_date', 'address', 'alt_spelling', 'statcan_rbuid', '1875-1895', 'proposed',
    'absent', 'original', 'census_type_code', 'census_division_type', 'census_division', 'conrail', 'note', 'fullname', 'honorary', 'end_date',
    'cycleway', 'second', 'third', 'structure', 'genitive', 'adjective', 'carnaval', 'highway', 'signed', 'loc',
    't4t35.fr', 'cadastre', 'güterweg', 'viaduct', 'sign', 'official', 'transisère', 'diff', 'technical', 'parent',
    'tag', 'historical', 'myth', 'history', 'dialect', 'signposted', 'practitioner', 'griffithsvaluation', 'dbag', 'long',
    'express-', 'census1901', 'azafady', 'future', 'website', 'lanes', 'notes', 'wikidata', 'description', 'species',
    'maps4bw', 'esr', 'alter', 'alt', 'cobbles',
];
// One (decimal) gigabyte in bytes; used for size reporting.
$gb = 1000 * 1000 * 1000;

// Next threshold of decompressed bytes at which the optional progress line is printed.
$bytes_unzipped_max = $gb;

// Byte counters: compressed size of all files, compressed bytes of the files already
// processed, and decompressed bytes read so far.
$bytes_zipped_total = $bytes_zipped = $bytes_unzipped = 0;

// Single-element array holding the unfinished line carried from one read chunk to the next.
$decompr = [''];

// Result accumulators: language => frequency, language => example values,
// prefix/suffix value => 0 or the list of languages it was tagged with.
$langs = $examples = $pref_suf = [];

// 1-based index of the file currently being processed, and the extension to look for.
$file_num = 0;
$file_ext = '.osm.bz2';
// --- Environment check -------------------------------------------------------------
// Stop with a readable message when a required extension is missing, instead of the
// fatal "Call to undefined function" the calls below would otherwise produce.
echo 'Checking Intl extension... ';
if(!function_exists('transliterator_transliterate'))
    exit('the intl extension is not available.');
echo transliterator_transliterate('Any-Lower', 'THIS TEXT SHOULD BE CONVERTED TO LOWERCASE');
if(!function_exists('bzopen'))
    exit('<br/>Error: the bz2 extension is not available.');

// --- File discovery ----------------------------------------------------------------
echo '<br/>Searching for ' . $file_ext . ' files... ';
$files = glob('*' . $file_ext);
// glob() returns false on error; treat that like "nothing found" so count() stays valid.
if($files === false)
    $files = [];
if(count($files) === 0)
    exit('no files have been found.');

// Sum of the compressed sizes; the main loop uses it for the ETA estimate.
// filesize() returns false on failure, which the cast turns into 0.
foreach($files as $file)
    $bytes_zipped_total += (int) filesize($file);
echo count($files) . ' files have been found, total size: ' . round($bytes_zipped_total / $gb, 2) . ' GB';
flush();
// Hash set of the names to skip: isset() on it is a constant-time lookup, whereas
// in_array() would scan the whole list for every matching line.
$skip_lookup = array_flip($names_to_skip);
// Number of decompressed bytes requested from bzread() per call.
$chunk_size = 8192;

// Main loop: decompress every file chunk by chunk, split the data into lines and collect
// language codes (with examples) and prefix/suffix values from the "name:*" tags.
foreach($files as $file)
{
    $file_num++;
    echo '<br/>Processing the file ' . $file_num . '/' . count($files) . ': <b>' . $file . '</b>, ETA: ';
    // ETA = elapsed time scaled by (compressed bytes still to do / compressed bytes done).
    if($bytes_zipped > 0)
        echo round( (microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) * ($bytes_zipped_total - $bytes_zipped) / $bytes_zipped ) . ' seconds';
    else
        echo 'n/a';
    flush();

    $bz = bzopen($file, 'r');
    if($bz === false)
    {
        echo '<br/>Error: The file ' . $file . ' could not be opened. The file will be skipped.';
        continue;
    }

    // Every file starts with an empty carry-over, so an unfinished last line of the
    // previous file can never be glued onto the first line of this one.
    $decompr = [''];

    while(true)
    {
        $buffer = bzread($bz, $chunk_size);
        // Check the error state after every read, including one that returned no data;
        // otherwise a failing read would just look like the end of the file.
        if(bzerrno($bz) !== 0)
            exit('<p/>Error: Compression problem ' . bzerrno($bz));
        // Compare explicitly: a chunk consisting of the single character "0" is falsy
        // but is still valid data.
        if($buffer === false || $buffer === '')
            break;
        if(ftell($bz) < 0)
            exit('<p/>Error: The file pointer is at negative position. Are you running x32 version of Apache/PHP?');
        // Count what was actually returned; bzread() may deliver fewer bytes than requested.
        $bytes_unzipped += strlen($buffer);
        /*
        if($bytes_unzipped > $bytes_unzipped_max)
        {
        echo '<br/><nobr>Current progress: ' . ($bytes_unzipped_max / $gb) . ' GB of decompressed data; file pointer is at ' . ftell($bz) . '; '
        . count($langs) . ' languages and ' . count($pref_suf) . ' prefixes & suffixes have been found so far; '
        . 'script execution time: ' . round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) . ' seconds</nobr>';
        flush();
        $bytes_unzipped_max += $gb;
        }
        */

        // Prepend the unfinished line left over from the previous chunk, split into
        // lines, and keep the (possibly unfinished) last piece for the next chunk.
        $decompr = explode("\n", $decompr[0] . $buffer);
        $partial_line = array_pop($decompr);

        foreach($decompr as $line)
        {
            // Both patterns below require the literal "name:", so a cheap substring
            // test lets the vast majority of lines skip the regex engine entirely.
            if(strpos($line, 'name:') === false)
                continue;

            // Language tags: k="...name:<lang>" v="<example>"
            if(preg_match('/name:([^:"]+)" v="([^"]+)/', $line, $matches))
            {
                // Strip one trailing digit (optionally preceded by "_") so that "en_1" and
                // "en1" are counted as "en", then lower-case the code.
                $lang = transliterator_transliterate( 'Any-Lower', preg_replace('/(?<=[^0-9])_?[0-9]$/', '', $matches[1]) );
                $example = $matches[2];
                if(is_string($lang) && !isset($skip_lookup[$lang]) && !is_numeric( str_replace(['-', '_'], '', $lang) ))
                {
                    if(isset($langs[$lang]))
                    {
                        $langs[$lang]++;
                        // Keep at most six example values per language.
                        if(count($examples[$lang]) < 6)
                            $examples[$lang][] = $example;
                    }
                    else
                    {
                        $langs[$lang] = 1;
                        $examples[$lang] = [$example];
                    }
                }
            }

            // Prefix/suffix tags: k="...name:prefix[:<lang>]" v="<value>" (also suffix, postfix)
            if(preg_match('/name:(?:prefix|suffix|postfix):?([^"]*)" v="([^"]+)/', $line, $matches))
            {
                $pr_sf = trim( transliterator_transliterate('Any-Lower', $matches[2]), "()" );
                $pr_sf_lang = $matches[1];
                // A value is stored as 0 while no language is known for it, and as a list
                // of distinct languages once at least one has been seen.
                if(!array_key_exists($pr_sf, $pref_suf))
                    $pref_suf[$pr_sf] = $pr_sf_lang === '' ? 0 : [$pr_sf_lang];
                elseif($pr_sf_lang !== '')
                {
                    if($pref_suf[$pr_sf] === 0)
                        $pref_suf[$pr_sf] = [$pr_sf_lang];
                    elseif(!in_array($pr_sf_lang, $pref_suf[$pr_sf], true))
                        $pref_suf[$pr_sf][] = $pr_sf_lang;
                }
            }
        }

        $decompr = [$partial_line];
    }

    // Heuristic kept from the original script: a read that stops at exactly 900000
    // decompressed bytes is taken to mean that only the first stream of a multi-stream
    // file was decoded. Such a file is not counted as processed for the ETA.
    if(ftell($bz) == 900000)
        echo '<br/>Error: The file ' . $file . ' is multi-stream. PHP\'s bz2 extension does not support multi-stream files. The file will be skipped.';
    else
        $bytes_zipped += filesize($file);
    bzclose($bz);
}
// Final summary: wall-clock time since the request started, total decompressed volume,
// and how many distinct languages and prefix/suffix values were collected.
$elapsed_seconds = round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']);
$unzipped_gb = round($bytes_unzipped / $gb, 2);
echo '<p/>Script execution time: ', $elapsed_seconds, ' seconds. ',
    'Total decompressed size of all files: ', $unzipped_gb, ' GB. <b>',
    count($langs), ' languages</b> and <b>', count($pref_suf), ' prefixes & suffixes</b> have been found.';
// Language table: one row per language used more than 10 times, most frequent first.
if(count($langs) > 0)
{
    // The values come straight from the input files, so escape them before placing them
    // in HTML. double_encode = false keeps entities already present in the XML source
    // (e.g. &amp;, &apos;) from being escaped a second time.
    $escape_html = function($text)
    {
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8', false);
    };

    arsort($langs);
    echo '<p/><table border="1" width="100%" cellspacing="0" cellpadding="5" style="border-collapse: collapse">'
        . '<tr><th>Language</th><th>Frequency</th><th>Examples</th></tr>';
    foreach($langs as $lang => $freq)
    {
        // $langs is sorted by descending frequency, so no later row can qualify either.
        if($freq <= 10)
            break;
        echo '<tr><td>' . $escape_html($lang) . '</td><td>' . $freq . '</td><td>'
            . implode(', ', array_map($escape_html, $examples[$lang])) . '</td></tr>';
    }
    echo '</table>';
}
// Prefix/suffix list: every collected value in key order; values that were tagged with
// one or more languages show those languages in a tooltip (title attribute).
if(count($pref_suf) > 0)
{
    // The values come straight from the input files, so escape them before placing them
    // in HTML text or attributes. double_encode = false keeps entities already present
    // in the XML source from being escaped a second time.
    $escape_html = function($text)
    {
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8', false);
    };

    ksort($pref_suf);
    $pref_suf_output = [];
    echo '<p/><b>Prefixes & suffixes:</b> ';
    foreach($pref_suf as $pr_sf => $pr_sf_lang)
    {
        // 0 means the value was never seen with a language qualifier: no tooltip.
        $title_attr = $pr_sf_lang === 0 ? '' : ' title="' . $escape_html(implode(', ', $pr_sf_lang)) . '"';
        $pref_suf_output[] = '<span' . $title_attr . '>\'' . $escape_html($pr_sf) . '\'</span>';
    }
    echo implode(', ', $pref_suf_output);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment