Last active
February 19, 2018 20:52
-
-
Save siffash/d3de209983cd571ee3f429af083249f0 to your computer and use it in GitHub Desktop.
Extract all unique languages from OSM .bz2 files + all unique prefixes & suffixes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/*****************
 *** README ***

 Prerequisites:
 - PHP / Apache x64
 - the following PHP extensions installed & enabled: intl, bz2

 Caveats:
 - the script cannot read multi-stream bz2 files completely, because PHP's bz2 extension treats EOS (end of stream)
   as EOF; such a file is reported as skipped, but the lines read before the end of its first stream have
   already been counted at that point
 - the script skips certain names, see $names_to_skip
 - the script skips the names which are numeric once '-' and '_' are removed, see is_numeric
 - the script treats "en_1" and "en1" as "en", see preg_replace
 - the script does not print the languages which are used in OSM 10 times or fewer, see if($freq > 10)

 How To Use:
 1) download bz2 files containing OSM XML data (preferably from download.geofabrik.de)
 2) put them into the same directory you put this script
 3) run the script

 FYI:
 - if you want to see detailed progress (every 1 GB processed), uncomment the commented part of the code
*****************/
// Runtime configuration: report and display every error, and lift the memory
// and execution-time limits.
ini_set('error_reporting', E_ALL);
ini_set('display_errors', '1');
ini_set('memory_limit', '-1');
ini_set('max_execution_time', 0);
set_time_limit(0);

// Names that can follow "name:" in a key but must not be counted as languages.
// Entries are compared with the lower-cased name exactly as written here.
// NOTE(review): 'acryonym' and 'fonetic' look like misspellings; presumably they
// mirror keys that occur in the data - confirm before "correcting" them.
$names_to_skip = [
    'prefix', 'suffix', 'postfix', 'botanical', 'left', 'right', 'url', 'abbreviation', 'source',
    'gbif_equivalent_tag', 'simple', 'individualid', 'locality', 'alt_name', 'vernacular', 'bridge', 'old',
    'historic', 'map-bms', 'language', 'movibus', 'operator', 'actual', 'forward', 'backward', 'comment',
    'vulgar', 'spatial', 'wikipedia', 'tunnel', 'disused', 'int', 'level', 'generic', 'variants', 'informal',
    'village', 'type', 'pronunciation', 'former', 'short', 'maybe', 'alternative', 'demolished', 'was',
    'review', 'house', 'transliteration', 'fonetic', 'end', 'street', 'abbr', 'desc', 'full', 'acronym',
    'source_ref', 'brand', 'int_name', 'room', 'local', 'short_name', 'old_name', 'acryonym', 'international',
    'religion', 'possible', 'color', 'amenity', 'place', 'hydropower_project', 'date', 'link', 'etymology',
    'railway', 'start_date', 'address', 'alt_spelling', 'statcan_rbuid', '1875-1895', 'proposed', 'absent',
    'original', 'census_type_code', 'census_division_type', 'census_division', 'conrail', 'note', 'fullname',
    'honorary', 'end_date', 'cycleway', 'second', 'third', 'structure', 'genitive', 'adjective', 'carnaval',
    'highway', 'signed', 'loc', 't4t35.fr', 'cadastre', 'güterweg', 'viaduct', 'sign', 'official',
    'transisère', 'diff', 'technical', 'parent', 'tag', 'historical', 'myth', 'history', 'dialect',
    'signposted', 'practitioner', 'griffithsvaluation', 'dbag', 'long', 'express-', 'census1901', 'azafady',
    'future', 'website', 'lanes', 'notes', 'wikidata', 'description', 'species', 'maps4bw', 'esr', 'alter',
    'alt', 'cobbles',
];

$gb = 1000000000;            // bytes per (decimal) gigabyte, used when reporting sizes
$bytes_unzipped_max = $gb;   // next threshold for the optional (commented-out) progress report
$bytes_zipped_total = 0;     // compressed size of all files found
$bytes_zipped = 0;           // compressed size of the files finished so far (drives the ETA)
$bytes_unzipped = 0;         // running count of decompressed bytes
$decompr = [''];             // element 0 holds the unfinished line carried from one read to the next
$langs = [];                 // language => number of occurrences
$examples = [];              // language => up to six example values
$pref_suf = [];              // prefix/suffix => 0 (no language given) or list of language codes
$file_num = 0;               // 1-based index of the file being processed
$file_ext = '.osm.bz2';      // only files with this extension are picked up
// Verify the required extensions up front so that a missing one produces a
// readable message instead of an "undefined function" fatal error later on.
echo 'Checking Intl extension... ';
if (!function_exists('transliterator_transliterate'))
    exit('the intl extension is not available.');
// Visual self-test: the browser should show this sentence in lower case.
echo transliterator_transliterate('Any-Lower', 'THIS TEXT SHOULD BE CONVERTED TO LOWERCASE');
if (!function_exists('bzopen'))
    exit('<br/>Error: the bz2 extension is not available.');

echo '<br/>Searching for ' . $file_ext . ' files... ';
// glob() looks in the current working directory and may return false on error;
// treat that the same as "nothing found".
$files = glob('*' . $file_ext);
if ($files === false || count($files) == 0)
    exit('no files have been found.');

// Sum the compressed sizes; the total is the basis for the ETA shown per file.
foreach ($files as $file)
{
    $size = filesize($file);
    if ($size !== false)
        $bytes_zipped_total += $size;
}
echo count($files) . ' files have been found, total size: ' . round($bytes_zipped_total / $gb, 2) . ' GB';
flush();
// Decompress every file in chunks, split the text into lines and collect
//  - languages:          keys of the form name:<lang>" v="<value>
//  - prefixes/suffixes:  keys of the form name:prefix|suffix|postfix[:<lang>]" v="<value>
// Results accumulate in $langs, $examples and $pref_suf across all files.

// Flip the skip list once so that each candidate costs one hash lookup instead
// of a scan over the whole list.
$names_to_skip_lookup = array_flip($names_to_skip);

foreach ($files as $file)
{
    $file_num++;
    echo '<br/>Processing the file ' . $file_num . '/' . count($files) . ': <b>' . $file . '</b>, ETA: ';
    // Extrapolate from the compressed bytes finished so far; there is nothing
    // to extrapolate from until the first file has been completed.
    if ($bytes_zipped > 0)
        echo round( (microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) * ($bytes_zipped_total - $bytes_zipped) / $bytes_zipped ) . ' seconds';
    else
        echo 'n/a';
    flush();

    $bz = bzopen($file, 'r');
    if ($bz === false)
    {
        echo '<br/>Error: The file ' . $file . ' could not be opened. The file will be skipped.';
        continue;
    }

    // Start every file with an empty carry-over, so that an unterminated last
    // line of one file is never glued onto the first line of the next one.
    // (A final line without a trailing newline is therefore not scanned.)
    $pending_line = '';

    while (true)
    {
        $buffer = bzread($bz, 8192);
        // A failed read must stop the run; testing the buffer for truthiness
        // would hide the failure and would also stop on a chunk equal to "0".
        if ($buffer === false || bzerrno($bz) !== 0)
            exit('<p/>Error: Compression problem ' . bzerrno($bz));
        if ($buffer === '')
            break; // end of the stream
        if (ftell($bz) < 0)
            exit('<p/>Error: The file pointer is at negative position. Are you running x32 version of Apache/PHP?');

        // Count what was actually returned; a read may deliver fewer than 8192 bytes.
        $bytes_unzipped += strlen($buffer);
        /*
        if($bytes_unzipped > $bytes_unzipped_max)
        {
        echo '<br/><nobr>Current progress: ' . ($bytes_unzipped_max / $gb) . ' GB of decompressed data; file pointer is at ' . ftell($bz) . '; '
        . count($langs) . ' languages and ' . count($pref_suf) . ' prefixes & suffixes have been found so far; '
        . 'script execution time: ' . round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) . ' seconds</nobr>';
        flush();
        $bytes_unzipped_max += $gb;
        }
        */

        $lines = explode("\n", $pending_line . $buffer);
        // The last piece is an unfinished line (or ''): keep it for the next read.
        $pending_line = array_pop($lines);

        foreach ($lines as $line)
        {
            // --- languages ---
            if (preg_match('/name:([^:"]+)" v="([^"]+)/', $line, $matches) === 1)
            {
                // Drop one trailing digit (optionally preceded by "_") so that
                // "en_1" and "en1" are counted as "en", then lower-case the result.
                $lang = transliterator_transliterate( 'Any-Lower', preg_replace('/(?<=[^0-9])_?[0-9]$/', '', $matches[1]) );
                $example = $matches[2];
                if (is_string($lang)
                    && !isset($names_to_skip_lookup[$lang])
                    && !is_numeric( str_replace(['-', '_'], '', $lang) ))
                {
                    if (isset($langs[$lang]))
                    {
                        $langs[$lang]++;
                        // Keep at most six examples per language.
                        if (count($examples[$lang]) < 6)
                            $examples[$lang][] = $example;
                    }
                    else
                    {
                        $langs[$lang] = 1;
                        $examples[$lang] = [$example];
                    }
                }
            }

            // --- prefixes & suffixes ---
            if (preg_match('/name:(?:prefix|suffix|postfix):?([^"]*)" v="([^"]+)/', $line, $matches) === 1)
            {
                $pr_sf = trim( (string) transliterator_transliterate('Any-Lower', $matches[2]), "()" );
                $pr_sf_lang = $matches[1];
                // Each entry is 0 while no language is known for it, otherwise
                // the list of distinct language codes seen with it.
                if (!array_key_exists($pr_sf, $pref_suf))
                    $pref_suf[$pr_sf] = $pr_sf_lang === '' ? 0 : [$pr_sf_lang];
                elseif ($pr_sf_lang !== '')
                {
                    if ($pref_suf[$pr_sf] === 0)
                        $pref_suf[$pr_sf] = [$pr_sf_lang];
                    elseif (!in_array($pr_sf_lang, $pref_suf[$pr_sf], true))
                        $pref_suf[$pr_sf][] = $pr_sf_lang;
                }
            }
        }
    }

    // A position of exactly 900000 after reading is taken as the sign of a
    // multi-stream file, of which only the first stream could be read.
    // NOTE(review): heuristic kept as found - confirm it matches the block size of the input files.
    if (ftell($bz) == 900000)
        echo '<br/>Error: The file ' . $file . ' is multi-stream. PHP\'s bz2 extension does not support multi-stream files. The file will be skipped.';
    else
        $bytes_zipped += filesize($file);
    bzclose($bz);
}
// Print the run summary followed by the table of languages.

// Text collected from the files is copied verbatim from the raw input, so it is
// presumably already entity-encoded XML; escape it with double_encode switched
// off so that stray markup from malformed input cannot break the page while
// existing entities are left as they are.
$to_html = static function ($text) {
    return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
};

echo '<p/>Script execution time: ' . round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) . ' seconds. '
    . 'Total decompressed size of all files: ' . round($bytes_unzipped / $gb, 2) . ' GB. <b>'
    . count($langs) . ' languages</b> and <b>' . count($pref_suf) . ' prefixes & suffixes</b> have been found.';

if (count($langs) > 0)
{
    // Most frequent languages first.
    arsort($langs);
    echo '<p/><table border="1" width="100%" cellspacing="0" cellpadding="5" style="border-collapse: collapse">'
        . '<tr><th>Language</th><th>Frequency</th><th>Examples</th></tr>';
    foreach ($langs as $lang => $freq)
    {
        // Only languages seen more than 10 times are listed. The array is
        // sorted by descending frequency, so no later entry can qualify either.
        if ($freq <= 10)
            break;
        echo '<tr><td>' . $to_html($lang) . '</td><td>' . $freq . '</td><td>'
            . implode(', ', array_map($to_html, $examples[$lang])) . '</td></tr>';
    }
    echo '</table>';
}
// Print every prefix/suffix in key order; the languages it was seen with (if
// any) are shown as the tooltip of its <span>.
if (count($pref_suf) > 0)
{
    // The text comes verbatim from the raw input and is presumably already
    // entity-encoded XML; escape it with double_encode switched off so that
    // malformed input cannot break out of the element or the title attribute
    // while existing entities stay untouched.
    $to_html = static function ($text) {
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
    };

    ksort($pref_suf);
    $pref_suf_output = [];
    echo '<p/><b>Prefixes & suffixes:</b> ';
    foreach ($pref_suf as $pr_sf => $pr_sf_lang)
    {
        // 0 marks an entry for which no language was ever given.
        $title = $pr_sf_lang === 0 ? '' : ' title="' . $to_html(implode(', ', $pr_sf_lang)) . '"';
        $pref_suf_output[] = '<span' . $title . '>\'' . $to_html($pr_sf) . '\'</span>';
    }
    echo implode(', ', $pref_suf_output);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment