-
-
Save abhishekbhardwaj/a2fb5fc87cb3241f69ba1292102888e7 to your computer and use it in GitHub Desktop.
Generate a PHP-compatible regular expression to match emoji from the most recent Unicode data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Uses the data from unicode.org's emoji-data.txt to build a PHP compatible Regular Expression
 * along the lines of:
 *     (?:\p{Emoji_Modifier_Base}\p{Emoji_Modifier}?|\p{Emoji_Presentation}|\p{Emoji}\x{FE0F}?)
 *
 * Every \p{...} property in that template is replaced by an explicit alternation of the
 * code points and code point ranges that the data file lists for that property.
 *
 * To use: php build-hashtag-regexp.php <emoji-data.txt>
 * Output will be the generated regular expression.
 *
 * Exits with status 1 (and prints nothing to STDOUT) when the data file is missing,
 * unreadable, or does not define every property used by the template.
 *
 * Get a current copy of emoji-data.txt from http://www.unicode.org/Public/emoji/latest/emoji-data.txt
 */
if (!isset($argv[1]) || !file_exists($argv[1])) {
    // Diagnostics go to STDERR so that STDOUT only ever carries the generated expression.
    fwrite(STDERR, "Usage: build-hashtag-regexp.php <emoji-data.txt>\n");
    fwrite(STDERR, "\n");
    fwrite(STDERR, "Prints the generated PHP compatible Regular Expression to STDOUT.\n");
    fwrite(STDERR, "\n");
    fwrite(STDERR, "Grab emoji-data.txt from http://www.unicode.org/Public/emoji/latest/emoji-data.txt\n");
    exit(1);
}

$emojiFilename = $argv[1];
$emojiFile = file($emojiFilename);
if ($emojiFile === false) {
    // file_exists() passed but the file could not be read (e.g. permissions, directory).
    fwrite(STDERR, "Error: unable to read " . $emojiFilename . "\n");
    exit(1);
}

// Map of property name => list of regexp fragments (single code points or ranges).
$emojiClasses = array();
foreach ($emojiFile as $line) {
    // Everything from the first '#' onwards is a comment.
    $pos = strpos($line, '#');
    if ($pos !== false) {
        $line = substr($line, 0, $pos);
    }
    $line = trim($line);
    if ($line === '') {
        continue;
    }

    // A data line has exactly two fields: "<code point or range> ; <property>".
    $fields = explode(';', $line);
    if (count($fields) !== 2) {
        continue;
    }
    $range = strtoupper(trim($fields[0]));
    $class = trim($fields[1]);

    // Accept only "XXXX" or "XXXX..YYYY" in hex; anything else would corrupt the output pattern.
    if (!preg_match('/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?$/', $range, $match)) {
        continue;
    }

    if (!isset($emojiClasses[$class])) {
        $emojiClasses[$class] = array();
    }
    if (isset($match[2])) {
        $emojiClasses[$class][] = '[\\x{' . $match[1] . '}-\\x{' . $match[2] . '}]';
    } else {
        $emojiClasses[$class][] = '\\x{' . $match[1] . '}';
    }
}

// Template expression; each \p{Name} placeholder is expanded from the parsed data below.
$emojiRegexp = '(?:\\p{Emoji_Modifier_Base}\\p{Emoji_Modifier}?|\\p{Emoji_Presentation}|\\p{Emoji}\\x{FE0F}?)';
foreach ($emojiClasses as $class => $components) {
    $emojiRegexp = str_replace('\\p{' . $class . '}', '(?:' . implode('|', $components) . ')', $emojiRegexp);
}

// The replacements only ever contain \x{...}, so any remaining \p{ is a placeholder
// whose property was absent from the data file. Refuse to emit a half-expanded pattern.
if (strpos($emojiRegexp, '\\p{') !== false) {
    fwrite(STDERR, "Error: " . $emojiFilename . " does not define every property used by the expression template.\n");
    exit(1);
}

print $emojiRegexp;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment