siffash · February 19, 2018 20:52
diff --git a/extract-languages-from-osm-bz2.php b/extract-languages-from-osm-bz2.php
 <?php

 /*****************
 	*** README ***
 	
 	Prerequisites:
 		- PHP / Apache x64
 		- the following PHP extensions installed & enabled: intl, bz2
 		
 	Caveats:
 		- the script skips multi-stream bz2 files because PHP's bz2 extension does not support them, treating EOS as EOF
 		- the script skips certain names, see $names_to_skip
 		- the script skips the names which contain numbers only, see is_numeric
 		- the script treats "en_1" and "en1" as "en", see preg_replace
 		- the script does not print the languages which are used in OSM 10 times or fewer, see if($freq > 10)
 		
 	How To Use:
 		1) download bz2 files containing OSM XML data (preferably from download.geofabrik.de)
 		2) put them into the same directory as this script
 		3) run the script
 		
 	FYI:
 		- if you want to see detailed progress (every 1 GB processed), uncomment the commented part of the code
 		
 *****************/

 // Report every error and lift the memory / execution-time limits for this run.
 ini_set('error_reporting', E_ALL);
 ini_set('display_errors', '1');
 ini_set('memory_limit', '-1');
 ini_set('max_execution_time', 0);
 set_time_limit(0);

 // "name:<x>" suffixes that are not counted as languages.
 $names_to_skip = [
 	'prefix', 'suffix', 'postfix', 'botanical', 'left', 'right', 'url', 'abbreviation',
 	'source', 'gbif_equivalent_tag', 'simple', 'individualid', 'locality', 'alt_name', 'vernacular', 'bridge',
 	'old', 'historic', 'map-bms', 'language', 'movibus', 'operator', 'actual', 'forward',
 	'backward', 'comment', 'vulgar', 'spatial', 'wikipedia', 'tunnel', 'disused', 'int',
 	'level', 'generic', 'variants', 'informal', 'village', 'type', 'pronunciation', 'former',
 	'short', 'maybe', 'alternative', 'demolished', 'was', 'review', 'house', 'transliteration',
 	'fonetic', 'end', 'street', 'abbr', 'desc', 'full', 'acronym', 'source_ref',
 	'brand', 'int_name', 'room', 'local', 'short_name', 'old_name', 'acryonym', 'international',
 	'religion', 'possible', 'color', 'amenity', 'place', 'hydropower_project', 'date', 'link',
 	'etymology', 'railway', 'start_date', 'address', 'alt_spelling', 'statcan_rbuid', '1875-1895', 'proposed',
 	'absent', 'original', 'census_type_code', 'census_division_type', 'census_division', 'conrail', 'note', 'fullname',
 	'honorary', 'end_date', 'cycleway', 'second', 'third', 'structure', 'genitive', 'adjective',
 	'carnaval', 'highway', 'signed', 'loc', 't4t35.fr', 'cadastre', 'güterweg', 'viaduct',
 	'sign', 'official', 'transisère', 'diff', 'technical', 'parent', 'tag', 'historical',
 	'myth', 'history', 'dialect', 'signposted', 'practitioner', 'griffithsvaluation', 'dbag', 'long',
 	'express-', 'census1901', 'azafady', 'future', 'website', 'lanes', 'notes', 'wikidata',
 	'description', 'species', 'maps4bw', 'esr', 'alter', 'alt', 'cobbles',
 ];

 $gb = 1000000000;           // bytes per (decimal) gigabyte, used when printing sizes
 $bytes_unzipped_max = $gb;  // next threshold for the optional (commented-out) progress report
 $bytes_zipped_total = 0;    // summed size of all archives that were found
 $bytes_zipped = 0;          // summed size of the archives processed so far (drives the ETA)
 $bytes_unzipped = 0;        // running total of decompressed data
 $decompr = [''];            // element 0 carries the unfinished last line from one read to the next
 $langs = [];                // language code => number of occurrences
 $examples = [];             // language code => a few sample values
 $pref_suf = [];             // prefix/suffix text => 0 (no language seen) or list of language codes
 $file_num = 0;              // 1-based index of the archive being processed
 $file_ext = '.osm.bz2';     // archives are located by this file name ending

 // Smoke-test the Intl extension: stop with a readable message when it is missing
 // instead of dying on an undefined function.
 echo 'Checking Intl extension... ';
 if(!function_exists('transliterator_transliterate'))
 	exit('the Intl extension is not loaded.');
 echo transliterator_transliterate('Any-Lower', 'THIS TEXT SHOULD BE CONVERTED TO LOWERCASE');

 echo '<br/>Searching for ' . $file_ext . ' files... ';
 $files = glob('*' . $file_ext);
 // glob() reports an error as false; treat that like "nothing found" so count() and foreach stay valid.
 if($files === false)
 	$files = [];
 if(count($files) === 0)
 	exit('no files have been found.');

 // The total compressed size is the basis for the ETA printed per file.
 foreach($files as $file)
 	$bytes_zipped_total += filesize($file);
 echo count($files) . ' files have been found, total size: ' . round($bytes_zipped_total / $gb, 2) . ' GB';
 flush();

 // Decompress every archive in 8 KB reads, split the data into lines and collect
 // language codes (with sample values) and prefix/suffix texts from "name:*" tags.
 foreach($files as $file)
 {
 	$file_num++;
 	// The file name is only used for display below, so escape it once for HTML.
 	$file_html = htmlspecialchars($file, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
 	echo '<br/>Processing the file ' . $file_num . '/' . count($files) . ': <b>' . $file_html . '</b>, ETA: ';
 	// ETA = elapsed time scaled by (compressed bytes remaining / compressed bytes finished).
 	if($bytes_zipped > 0)
 		echo round( (microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) * ($bytes_zipped_total - $bytes_zipped) / $bytes_zipped ) . ' seconds';
 	else
 		echo 'n/a';
 	flush();

 	$bz = bzopen($file, 'r');
 	if($bz === false)
 	{
 		echo '<br/>Error: The file ' . $file_html . ' could not be opened. The file will be skipped.';
 		continue;
 	}

 	// Start each file with an empty carry-over so an unterminated last line of the
 	// previous file is not glued onto the first line of this one.
 	$decompr = [''];

 	// Compare strictly: a chunk that happens to be "0" is data, not end of file.
 	while(($buffer = bzread($bz, 8192)) !== false && $buffer !== '')
 	{
 		if(bzerrno($bz) !== 0)
 			exit('<p/>Error: Compression problem ' . bzerrno($bz));
 		if(ftell($bz) < 0)
 			exit('<p/>Error: The file pointer is at negative position. Are you running x32 version of Apache/PHP?');
 		// Count what was actually returned; a read may deliver fewer than 8192 bytes.
 		$bytes_unzipped += strlen($buffer);
 		/*
 		if($bytes_unzipped > $bytes_unzipped_max)
 		{
 			echo '<br/><nobr>Current progress: ' . ($bytes_unzipped_max / $gb) . ' GB of decompressed data; file pointer is at ' . ftell($bz) . '; '
 				. count($langs) . ' languages and ' . count($pref_suf) . ' prefixes & suffixes have been found so far; '
 				. 'script execution time: ' . round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) . ' seconds</nobr>';
 			flush();
 			$bytes_unzipped_max += $gb;
 		}
 		*/
 		// Everything before the last "\n" is complete; the remainder waits for the next read.
 		$lines = explode("\n", $decompr[0] . $buffer);
 		$decompr = [array_pop($lines)];
 		foreach($lines as $line)
 		{
 			// name:<lang>" v="<value> -- a localized name tag.
 			if(preg_match('/name:([^:"]+)" v="([^"]+)/', $line, $matches) === 1)
 			{
 				// Lower-case the code and drop one trailing digit ("en_1", "en1" => "en").
 				$lang = transliterator_transliterate( 'Any-Lower', preg_replace('/(?<=[^0-9])_?[0-9]$/', '', $matches[1]) );
 				$example = $matches[2];
 				// A failed transliteration yields false and must not be counted as a language.
 				if($lang !== false and !in_array($lang, $names_to_skip, true) and !is_numeric( str_replace(['-', '_'], '', $lang) ))
 				{
 					if(array_key_exists($lang, $langs))
 					{
 						$langs[$lang]++;
 						// Keep only the first few sample values per language.
 						if(count($examples[$lang]) < 6)
 							$examples[$lang][] = $example;
 					}
 					else
 					{
 						$langs[$lang] = 1;
 						$examples[$lang] = [$example];
 					}
 				}
 			}

 			// name:prefix / name:suffix / name:postfix, optionally followed by ":<lang>".
 			if(preg_match('/name:(?:prefix|suffix|postfix):?([^"]*)" v="([^"]+)/', $line, $matches) === 1)
 			{
 				$pr_sf = transliterator_transliterate('Any-Lower', $matches[2]);
 				if($pr_sf !== false)
 				{
 					$pr_sf = trim($pr_sf, "()");
 					$pr_sf_lang = $matches[1];
 					// 0 marks "seen without a language"; otherwise a list of distinct language codes.
 					if(!array_key_exists($pr_sf, $pref_suf))
 						$pref_suf[$pr_sf] = $pr_sf_lang === '' ? 0 : [$pr_sf_lang];
 					elseif($pr_sf_lang !== '')
 					{
 						if($pref_suf[$pr_sf] === 0)
 							$pref_suf[$pr_sf] = [$pr_sf_lang];
 						elseif(!in_array($pr_sf_lang, $pref_suf[$pr_sf], true))
 							$pref_suf[$pr_sf][] = $pr_sf_lang;
 					}
 				}
 			}
 		}
 	}

 	// bzread() returns false on failure; report it like the in-loop check does.
 	if($buffer === false)
 		exit('<p/>Error: Compression problem ' . bzerrno($bz));

 	// A pointer resting at exactly 900000 is taken as the sign of a multi-stream archive.
 	if(ftell($bz) === 900000)
 		echo '<br/>Error: The file ' . $file_html . ' is multi-stream. PHP\'s bz2 extension does not support multi-stream files. The file will be skipped.';
 	else
 		$bytes_zipped += filesize($file);

 	bzclose($bz);
 }

 // Escapes OSM-derived text for HTML. Double-encoding is switched off because the
 // values are taken verbatim from XML and may already contain entities such as &amp;.
 $html = function($text)
 {
 	return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
 };

 // Summary line: run time, decompressed volume and the number of findings.
 echo '<p/>Script execution time: ' . round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) . ' seconds. '
 	. 'Total decompressed size of all files: ' . round($bytes_unzipped / $gb, 2) . ' GB. <b>'
 	. count($langs) . ' languages</b> and <b>' . count($pref_suf) . ' prefixes & suffixes</b> have been found.';

 // Language table, most frequent first; languages seen 10 times or fewer are left out.
 if(count($langs) > 0)
 {
 	arsort($langs);
 	echo '<p/><table border="1" width="100%" cellspacing="0" cellpadding="5" style="border-collapse: collapse">'
 		. '<tr><th>Language</th><th>Frequency</th><th>Examples</th></tr>';
 	foreach($langs as $lang => $freq)
 	{
 		if($freq > 10)
 			echo '<tr><td>' . $html($lang) . '</td><td>' . $freq . '</td><td>' . implode(', ', array_map($html, $examples[$lang])) . '</td></tr>';
 	}
 	echo '</table>';
 }

 // Prefix/suffix list sorted by text; the languages (if any) go into the tooltip.
 if(count($pref_suf) > 0)
 {
 	ksort($pref_suf);
 	$pref_suf_output = [];
 	echo '<p/><b>Prefixes & suffixes:</b> ';
 	foreach($pref_suf as $pr_sf => $pr_sf_lang)
 	{
 		$title = $pr_sf_lang === 0 ? '' : ' title="' . $html(implode(', ', $pr_sf_lang)) . '"';
 		$pref_suf_output[] = '<span' . $title . '>\'' . $html($pr_sf) . '\'</span>';
 	}
 	echo implode(', ', $pref_suf_output);
 }
	<?php

	/*****************
		*** README ***

		Prerequisites:
			- PHP / Apache x64
			- the following PHP extensions installed & enabled: intl, bz2

		Caveats:
			- the script skips multi-stream bz2 files because PHP's bz2 extension does not support them, treating EOS as EOF
			- the script skips certain names, see $names_to_skip
			- the script skips the names which contain numbers only, see is_numeric
			- the script treats "en_1" and "en1" as "en", see preg_replace
			- the script does not print the languages which are used in OSM 10 times or fewer, see if($freq > 10)

		How To Use:
			1) download bz2 files containing OSM XML data (preferably from download.geofabrik.de)
			2) put them into the same directory as this script
			3) run the script

		FYI:
			- if you want to see detailed progress (every 1 GB processed), uncomment the commented part of the code

	*****************/

	// Report every error and lift the memory / execution-time limits for this run.
	ini_set('error_reporting', E_ALL);
	ini_set('display_errors', '1');
	ini_set('memory_limit', '-1');
	ini_set('max_execution_time', 0);
	set_time_limit(0);

	// "name:<x>" suffixes that are not counted as languages.
	$names_to_skip = [
		'prefix', 'suffix', 'postfix', 'botanical', 'left', 'right', 'url', 'abbreviation',
		'source', 'gbif_equivalent_tag', 'simple', 'individualid', 'locality', 'alt_name', 'vernacular', 'bridge',
		'old', 'historic', 'map-bms', 'language', 'movibus', 'operator', 'actual', 'forward',
		'backward', 'comment', 'vulgar', 'spatial', 'wikipedia', 'tunnel', 'disused', 'int',
		'level', 'generic', 'variants', 'informal', 'village', 'type', 'pronunciation', 'former',
		'short', 'maybe', 'alternative', 'demolished', 'was', 'review', 'house', 'transliteration',
		'fonetic', 'end', 'street', 'abbr', 'desc', 'full', 'acronym', 'source_ref',
		'brand', 'int_name', 'room', 'local', 'short_name', 'old_name', 'acryonym', 'international',
		'religion', 'possible', 'color', 'amenity', 'place', 'hydropower_project', 'date', 'link',
		'etymology', 'railway', 'start_date', 'address', 'alt_spelling', 'statcan_rbuid', '1875-1895', 'proposed',
		'absent', 'original', 'census_type_code', 'census_division_type', 'census_division', 'conrail', 'note', 'fullname',
		'honorary', 'end_date', 'cycleway', 'second', 'third', 'structure', 'genitive', 'adjective',
		'carnaval', 'highway', 'signed', 'loc', 't4t35.fr', 'cadastre', 'güterweg', 'viaduct',
		'sign', 'official', 'transisère', 'diff', 'technical', 'parent', 'tag', 'historical',
		'myth', 'history', 'dialect', 'signposted', 'practitioner', 'griffithsvaluation', 'dbag', 'long',
		'express-', 'census1901', 'azafady', 'future', 'website', 'lanes', 'notes', 'wikidata',
		'description', 'species', 'maps4bw', 'esr', 'alter', 'alt', 'cobbles',
	];

	$gb = 1000000000;           // bytes per (decimal) gigabyte, used when printing sizes
	$bytes_unzipped_max = $gb;  // next threshold for the optional (commented-out) progress report
	$bytes_zipped_total = 0;    // summed size of all archives that were found
	$bytes_zipped = 0;          // summed size of the archives processed so far (drives the ETA)
	$bytes_unzipped = 0;        // running total of decompressed data
	$decompr = [''];            // element 0 carries the unfinished last line from one read to the next
	$langs = [];                // language code => number of occurrences
	$examples = [];             // language code => a few sample values
	$pref_suf = [];             // prefix/suffix text => 0 (no language seen) or list of language codes
	$file_num = 0;              // 1-based index of the archive being processed
	$file_ext = '.osm.bz2';     // archives are located by this file name ending

	// Smoke-test the Intl extension: stop with a readable message when it is missing
	// instead of dying on an undefined function.
	echo 'Checking Intl extension... ';
	if(!function_exists('transliterator_transliterate'))
		exit('the Intl extension is not loaded.');
	echo transliterator_transliterate('Any-Lower', 'THIS TEXT SHOULD BE CONVERTED TO LOWERCASE');

	echo '<br/>Searching for ' . $file_ext . ' files... ';
	$files = glob('*' . $file_ext);
	// glob() reports an error as false; treat that like "nothing found" so count() and foreach stay valid.
	if($files === false)
		$files = [];
	if(count($files) === 0)
		exit('no files have been found.');

	// The total compressed size is the basis for the ETA printed per file.
	foreach($files as $file)
		$bytes_zipped_total += filesize($file);
	echo count($files) . ' files have been found, total size: ' . round($bytes_zipped_total / $gb, 2) . ' GB';
	flush();

	// Decompress every archive in 8 KB reads, split the data into lines and collect
	// language codes (with sample values) and prefix/suffix texts from "name:*" tags.
	foreach($files as $file)
	{
		$file_num++;
		// The file name is only used for display below, so escape it once for HTML.
		$file_html = htmlspecialchars($file, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
		echo '<br/>Processing the file ' . $file_num . '/' . count($files) . ': <b>' . $file_html . '</b>, ETA: ';
		// ETA = elapsed time scaled by (compressed bytes remaining / compressed bytes finished).
		if($bytes_zipped > 0)
			echo round( (microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) * ($bytes_zipped_total - $bytes_zipped) / $bytes_zipped ) . ' seconds';
		else
			echo 'n/a';
		flush();

		$bz = bzopen($file, 'r');
		if($bz === false)
		{
			echo '<br/>Error: The file ' . $file_html . ' could not be opened. The file will be skipped.';
			continue;
		}

		// Start each file with an empty carry-over so an unterminated last line of the
		// previous file is not glued onto the first line of this one.
		$decompr = [''];

		// Compare strictly: a chunk that happens to be "0" is data, not end of file.
		while(($buffer = bzread($bz, 8192)) !== false && $buffer !== '')
		{
			if(bzerrno($bz) !== 0)
				exit('<p/>Error: Compression problem ' . bzerrno($bz));
			if(ftell($bz) < 0)
				exit('<p/>Error: The file pointer is at negative position. Are you running x32 version of Apache/PHP?');
			// Count what was actually returned; a read may deliver fewer than 8192 bytes.
			$bytes_unzipped += strlen($buffer);
			/*
			if($bytes_unzipped > $bytes_unzipped_max)
			{
				echo '<br/><nobr>Current progress: ' . ($bytes_unzipped_max / $gb) . ' GB of decompressed data; file pointer is at ' . ftell($bz) . '; '
					. count($langs) . ' languages and ' . count($pref_suf) . ' prefixes & suffixes have been found so far; '
					. 'script execution time: ' . round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) . ' seconds</nobr>';
				flush();
				$bytes_unzipped_max += $gb;
			}
			*/
			// Everything before the last "\n" is complete; the remainder waits for the next read.
			$lines = explode("\n", $decompr[0] . $buffer);
			$decompr = [array_pop($lines)];
			foreach($lines as $line)
			{
				// name:<lang>" v="<value> -- a localized name tag.
				if(preg_match('/name:([^:"]+)" v="([^"]+)/', $line, $matches) === 1)
				{
					// Lower-case the code and drop one trailing digit ("en_1", "en1" => "en").
					$lang = transliterator_transliterate( 'Any-Lower', preg_replace('/(?<=[^0-9])_?[0-9]$/', '', $matches[1]) );
					$example = $matches[2];
					// A failed transliteration yields false and must not be counted as a language.
					if($lang !== false and !in_array($lang, $names_to_skip, true) and !is_numeric( str_replace(['-', '_'], '', $lang) ))
					{
						if(array_key_exists($lang, $langs))
						{
							$langs[$lang]++;
							// Keep only the first few sample values per language.
							if(count($examples[$lang]) < 6)
								$examples[$lang][] = $example;
						}
						else
						{
							$langs[$lang] = 1;
							$examples[$lang] = [$example];
						}
					}
				}

				// name:prefix / name:suffix / name:postfix, optionally followed by ":<lang>".
				// The alternation needs plain "|": an escaped "\|" would match a literal pipe character.
				if(preg_match('/name:(?:prefix|suffix|postfix):?([^"]*)" v="([^"]+)/', $line, $matches) === 1)
				{
					$pr_sf = transliterator_transliterate('Any-Lower', $matches[2]);
					if($pr_sf !== false)
					{
						$pr_sf = trim($pr_sf, "()");
						$pr_sf_lang = $matches[1];
						// 0 marks "seen without a language"; otherwise a list of distinct language codes.
						if(!array_key_exists($pr_sf, $pref_suf))
							$pref_suf[$pr_sf] = $pr_sf_lang === '' ? 0 : [$pr_sf_lang];
						elseif($pr_sf_lang !== '')
						{
							if($pref_suf[$pr_sf] === 0)
								$pref_suf[$pr_sf] = [$pr_sf_lang];
							elseif(!in_array($pr_sf_lang, $pref_suf[$pr_sf], true))
								$pref_suf[$pr_sf][] = $pr_sf_lang;
						}
					}
				}
			}
		}

		// bzread() returns false on failure; report it like the in-loop check does.
		if($buffer === false)
			exit('<p/>Error: Compression problem ' . bzerrno($bz));

		// A pointer resting at exactly 900000 is taken as the sign of a multi-stream archive.
		if(ftell($bz) === 900000)
			echo '<br/>Error: The file ' . $file_html . ' is multi-stream. PHP\'s bz2 extension does not support multi-stream files. The file will be skipped.';
		else
			$bytes_zipped += filesize($file);

		bzclose($bz);
	}

	// Escapes OSM-derived text for HTML. Double-encoding is switched off because the
	// values are taken verbatim from XML and may already contain entities such as &amp;.
	$html = function($text)
	{
		return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
	};

	// Summary line: run time, decompressed volume and the number of findings.
	echo '<p/>Script execution time: ' . round(microtime(true) - $_SERVER['REQUEST_TIME_FLOAT']) . ' seconds. '
		. 'Total decompressed size of all files: ' . round($bytes_unzipped / $gb, 2) . ' GB. <b>'
		. count($langs) . ' languages</b> and <b>' . count($pref_suf) . ' prefixes & suffixes</b> have been found.';

	// Language table, most frequent first; languages seen 10 times or fewer are left out.
	if(count($langs) > 0)
	{
		arsort($langs);
		echo '<p/><table border="1" width="100%" cellspacing="0" cellpadding="5" style="border-collapse: collapse">'
			. '<tr><th>Language</th><th>Frequency</th><th>Examples</th></tr>';
		foreach($langs as $lang => $freq)
		{
			if($freq > 10)
				echo '<tr><td>' . $html($lang) . '</td><td>' . $freq . '</td><td>' . implode(', ', array_map($html, $examples[$lang])) . '</td></tr>';
		}
		echo '</table>';
	}

	// Prefix/suffix list sorted by text; the languages (if any) go into the tooltip.
	if(count($pref_suf) > 0)
	{
		ksort($pref_suf);
		$pref_suf_output = [];
		echo '<p/><b>Prefixes & suffixes:</b> ';
		foreach($pref_suf as $pr_sf => $pr_sf_lang)
		{
			$title = $pr_sf_lang === 0 ? '' : ' title="' . $html(implode(', ', $pr_sf_lang)) . '"';
			$pref_suf_output[] = '<span' . $title . '>\'' . $html($pr_sf) . '\'</span>';
		}
		echo implode(', ', $pref_suf_output);
	}