Skip to content

Instantly share code, notes, and snippets.

@abhishekbhardwaj
Forked from ravisorg/test-hashtag-regexp.php
Created December 13, 2018 05:50
Show Gist options
  • Save abhishekbhardwaj/97fc32e8040e9d5ded354c939dbe3000 to your computer and use it in GitHub Desktop.
Save abhishekbhardwaj/97fc32e8040e9d5ded354c939dbe3000 to your computer and use it in GitHub Desktop.
pnut.io unicode hashtag parser
<?php
// You can find the test data file at https://www.ravis.org/hashtag-test.zip
// You're gonna want to have your console output supporting UTF8 before running this, or you're
// gonna see a bunch of ? in the output...
// For curiosity's sake, post number 693,847 is an emoji hashtag: #(heart)
// Emoji are spread throughout the Unicode codespace and can contain modifiers. This regexp
// was automatically generated from http://ftp.unicode.org/Public/emoji/1.0/emoji-data.txt
// and is the equivalent (because PHP doesn't support these character classes) of:
// (?:\p{Emoji_Modifier_Base}\p{Emoji_Modifier}?|\p{Emoji_Presentation}|\p{Emoji}\x{FE0F})
// Generated using https://gist.github.com/ravisorg/23edafbfcbd45de9875adec5310fca76
// Pattern fragment (no delimiters, no modifiers) matching exactly one emoji. It is meant to be
// embedded in a larger pattern that is compiled with the /u modifier. The string must not contain
// any literal line breaks: a newline inside a \x{...} escape or a [...] class changes the pattern.
$emojiRegexp = '(?:'.
    // Alternative 1: a character that accepts a skin tone modifier, optionally followed by one
    // (U+1F3FB - U+1F3FF).
    '(?:\x{261D}|\x{26F9}|[\x{270A}-\x{270B}]|[\x{270C}-\x{270D}]|\x{1F385}|[\x{1F3C2}-\x{1F3C4}]|\x{1F3C7}|\x{1F3CA}|[\x{1F3CB}-\x{1F3CC}]|[\x{1F442}-\x{1F443}]|[\x{1F446}-\x{1F450}]|[\x{1F466}-\x{1F469}]|\x{1F46E}|[\x{1F470}-\x{1F478}]|\x{1F47C}|[\x{1F481}-\x{1F483}]|[\x{1F485}-\x{1F487}]|\x{1F4AA}|[\x{1F574}-\x{1F575}]|\x{1F57A}|\x{1F590}|[\x{1F595}-\x{1F596}]|[\x{1F645}-\x{1F647}]|[\x{1F64B}-\x{1F64F}]|\x{1F6A3}|[\x{1F6B4}-\x{1F6B6}]|\x{1F6C0}|\x{1F6CC}|\x{1F918}|[\x{1F919}-\x{1F91C}]|\x{1F91E}|\x{1F91F}|\x{1F926}|\x{1F930}|[\x{1F931}-\x{1F932}]|[\x{1F933}-\x{1F939}]|[\x{1F93D}-\x{1F93E}]|[\x{1F9D1}-\x{1F9DD}])(?:[\x{1F3FB}-\x{1F3FF}])?|'.
    // Alternative 2: a character that is an emoji on its own.
    '(?:[\x{231A}-\x{231B}]|[\x{23E9}-\x{23EC}]|\x{23F0}|\x{23F3}|[\x{25FD}-\x{25FE}]|[\x{2614}-\x{2615}]|[\x{2648}-\x{2653}]|\x{267F}|\x{2693}|\x{26A1}|[\x{26AA}-\x{26AB}]|[\x{26BD}-\x{26BE}]|[\x{26C4}-\x{26C5}]|\x{26CE}|\x{26D4}|\x{26EA}|[\x{26F2}-\x{26F3}]|\x{26F5}|\x{26FA}|\x{26FD}|\x{2705}|[\x{270A}-\x{270B}]|\x{2728}|\x{274C}|\x{274E}|[\x{2753}-\x{2755}]|\x{2757}|[\x{2795}-\x{2797}]|\x{27B0}|\x{27BF}|[\x{2B1B}-\x{2B1C}]|\x{2B50}|\x{2B55}|\x{1F004}|\x{1F0CF}|\x{1F18E}|[\x{1F191}-\x{1F19A}]|[\x{1F1E6}-\x{1F1FF}]|\x{1F201}|\x{1F21A}|\x{1F22F}|[\x{1F232}-\x{1F236}]|[\x{1F238}-\x{1F23A}]|[\x{1F250}-\x{1F251}]|[\x{1F300}-\x{1F320}]|[\x{1F32D}-\x{1F32F}]|[\x{1F330}-\x{1F335}]|[\x{1F337}-\x{1F37C}]|[\x{1F37E}-\x{1F37F}]|[\x{1F380}-\x{1F393}]|[\x{1F3A0}-\x{1F3C4}]|\x{1F3C5}|[\x{1F3C6}-\x{1F3CA}]|[\x{1F3CF}-\x{1F3D3}]|[\x{1F3E0}-\x{1F3F0}]|\x{1F3F4}|[\x{1F3F8}-\x{1F3FF}]|[\x{1F400}-\x{1F43E}]|\x{1F440}|[\x{1F442}-\x{1F4F7}]|\x{1F4F8}|[\x{1F4F9}-\x{1F4FC}]|\x{1F4FF}|[\x{1F500}-\x{1F53D}]|[\x{1F54B}-\x{1F54E}]|[\x{1F550}-\x{1F567}]|\x{1F57A}|[\x{1F595}-\x{1F596}]|\x{1F5A4}|[\x{1F5FB}-\x{1F5FF}]|\x{1F600}|[\x{1F601}-\x{1F610}]|\x{1F611}|[\x{1F612}-\x{1F614}]|\x{1F615}|\x{1F616}|\x{1F617}|\x{1F618}|\x{1F619}|\x{1F61A}|\x{1F61B}|[\x{1F61C}-\x{1F61E}]|\x{1F61F}|[\x{1F620}-\x{1F625}]|[\x{1F626}-\x{1F627}]|[\x{1F628}-\x{1F62B}]|\x{1F62C}|\x{1F62D}|[\x{1F62E}-\x{1F62F}]|[\x{1F630}-\x{1F633}]|\x{1F634}|[\x{1F635}-\x{1F640}]|[\x{1F641}-\x{1F642}]|[\x{1F643}-\x{1F644}]|[\x{1F645}-\x{1F64F}]|[\x{1F680}-\x{1F6C5}]|\x{1F6CC}|\x{1F6D0}|[\x{1F6D1}-\x{1F6D2}]|[\x{1F6EB}-\x{1F6EC}]|[\x{1F6F4}-\x{1F6F6}]|[\x{1F6F7}-\x{1F6F8}]|[\x{1F910}-\x{1F918}]|[\x{1F919}-\x{1F91E}]|\x{1F91F}|[\x{1F920}-\x{1F927}]|[\x{1F928}-\x{1F92F}]|\x{1F930}|[\x{1F931}-\x{1F932}]|[\x{1F933}-\x{1F93A}]|[\x{1F93C}-\x{1F93E}]|[\x{1F940}-\x{1F945}]|[\x{1F947}-\x{1F94B}]|\x{1F94C}|[\x{1F950}-\x{1F95E}]|[\x{1F95F}-\x{1F96B}]|[\x{1F980}-\x{1F984}]|[\x{1F985}-\x{1F991}]|[\x{1F992}-\x{1F997}]|\x{1F9C0}|[\x{1F9D0}-\x{1F9E6}])|'.
    // Alternative 3: a character that only counts as an emoji when it is immediately followed by
    // the variation selector U+FE0F. This list includes "#", "*" and the ASCII digits, so the
    // selector is required here, not optional.
    '(?:\x{0023}|\x{002A}|[\x{0030}-\x{0039}]|\x{00A9}|\x{00AE}|\x{203C}|\x{2049}|\x{2122}|\x{2139}|[\x{2194}-\x{2199}]|[\x{21A9}-\x{21AA}]|[\x{231A}-\x{231B}]|\x{2328}|\x{23CF}|[\x{23E9}-\x{23F3}]|[\x{23F8}-\x{23FA}]|\x{24C2}|[\x{25AA}-\x{25AB}]|\x{25B6}|\x{25C0}|[\x{25FB}-\x{25FE}]|[\x{2600}-\x{2604}]|\x{260E}|\x{2611}|[\x{2614}-\x{2615}]|\x{2618}|\x{261D}|\x{2620}|[\x{2622}-\x{2623}]|\x{2626}|\x{262A}|[\x{262E}-\x{262F}]|[\x{2638}-\x{263A}]|\x{2640}|\x{2642}|[\x{2648}-\x{2653}]|\x{2660}|\x{2663}|[\x{2665}-\x{2666}]|\x{2668}|\x{267B}|\x{267F}|[\x{2692}-\x{2697}]|\x{2699}|[\x{269B}-\x{269C}]|[\x{26A0}-\x{26A1}]|[\x{26AA}-\x{26AB}]|[\x{26B0}-\x{26B1}]|[\x{26BD}-\x{26BE}]|[\x{26C4}-\x{26C5}]|\x{26C8}|\x{26CE}|\x{26CF}|\x{26D1}|[\x{26D3}-\x{26D4}]|[\x{26E9}-\x{26EA}]|[\x{26F0}-\x{26F5}]|[\x{26F7}-\x{26FA}]|\x{26FD}|\x{2702}|\x{2705}|[\x{2708}-\x{2709}]|[\x{270A}-\x{270B}]|[\x{270C}-\x{270D}]|\x{270F}|\x{2712}|\x{2714}|\x{2716}|\x{271D}|\x{2721}|\x{2728}|[\x{2733}-\x{2734}]|\x{2744}|\x{2747}|\x{274C}|\x{274E}|[\x{2753}-\x{2755}]|\x{2757}|[\x{2763}-\x{2764}]|[\x{2795}-\x{2797}]|\x{27A1}|\x{27B0}|\x{27BF}|[\x{2934}-\x{2935}]|[\x{2B05}-\x{2B07}]|[\x{2B1B}-\x{2B1C}]|\x{2B50}|\x{2B55}|\x{3030}|\x{303D}|\x{3297}|\x{3299}|\x{1F004}|\x{1F0CF}|[\x{1F170}-\x{1F171}]|\x{1F17E}|\x{1F17F}|\x{1F18E}|[\x{1F191}-\x{1F19A}]|[\x{1F1E6}-\x{1F1FF}]|[\x{1F201}-\x{1F202}]|\x{1F21A}|\x{1F22F}|[\x{1F232}-\x{1F23A}]|[\x{1F250}-\x{1F251}]|[\x{1F300}-\x{1F320}]|\x{1F321}|[\x{1F324}-\x{1F32C}]|[\x{1F32D}-\x{1F32F}]|[\x{1F330}-\x{1F335}]|\x{1F336}|[\x{1F337}-\x{1F37C}]|\x{1F37D}|[\x{1F37E}-\x{1F37F}]|[\x{1F380}-\x{1F393}]|[\x{1F396}-\x{1F397}]|[\x{1F399}-\x{1F39B}]|[\x{1F39E}-\x{1F39F}]|[\x{1F3A0}-\x{1F3C4}]|\x{1F3C5}|[\x{1F3C6}-\x{1F3CA}]|[\x{1F3CB}-\x{1F3CE}]|[\x{1F3CF}-\x{1F3D3}]|[\x{1F3D4}-\x{1F3DF}]|[\x{1F3E0}-\x{1F3F0}]|[\x{1F3F3}-\x{1F3F5}]|\x{1F3F7}|[\x{1F3F8}-\x{1F3FF}]|[\x{1F400}-\x{1F43E}]|\x{1F43F}|\x{1F440}|\x{1F441}|[\x{1F442}-\x{1F4F7}]|\x{1F4F8}|[\x{1F4F9}-\x{1F4FC}]|\x{1F4FD}|\x{1F4FF}|[\x{1F500}-\x{1F53D}]|[\x{1F549}-\x{1F54A}]|[\x{1F54B}-\x{1F54E}]|[\x{1F550}-\x{1F567}]|[\x{1F56F}-\x{1F570}]|[\x{1F573}-\x{1F579}]|\x{1F57A}|\x{1F587}|[\x{1F58A}-\x{1F58D}]|\x{1F590}|[\x{1F595}-\x{1F596}]|\x{1F5A4}|\x{1F5A5}|\x{1F5A8}|[\x{1F5B1}-\x{1F5B2}]|\x{1F5BC}|[\x{1F5C2}-\x{1F5C4}]|[\x{1F5D1}-\x{1F5D3}]|[\x{1F5DC}-\x{1F5DE}]|\x{1F5E1}|\x{1F5E3}|\x{1F5E8}|\x{1F5EF}|\x{1F5F3}|\x{1F5FA}|[\x{1F5FB}-\x{1F5FF}]|\x{1F600}|[\x{1F601}-\x{1F610}]|\x{1F611}|[\x{1F612}-\x{1F614}]|\x{1F615}|\x{1F616}|\x{1F617}|\x{1F618}|\x{1F619}|\x{1F61A}|\x{1F61B}|[\x{1F61C}-\x{1F61E}]|\x{1F61F}|[\x{1F620}-\x{1F625}]|[\x{1F626}-\x{1F627}]|[\x{1F628}-\x{1F62B}]|\x{1F62C}|\x{1F62D}|[\x{1F62E}-\x{1F62F}]|[\x{1F630}-\x{1F633}]|\x{1F634}|[\x{1F635}-\x{1F640}]|[\x{1F641}-\x{1F642}]|[\x{1F643}-\x{1F644}]|[\x{1F645}-\x{1F64F}]|[\x{1F680}-\x{1F6C5}]|[\x{1F6CB}-\x{1F6CF}]|\x{1F6D0}|[\x{1F6D1}-\x{1F6D2}]|[\x{1F6E0}-\x{1F6E5}]|\x{1F6E9}|[\x{1F6EB}-\x{1F6EC}]|\x{1F6F0}|\x{1F6F3}|[\x{1F6F4}-\x{1F6F6}]|[\x{1F6F7}-\x{1F6F8}]|[\x{1F910}-\x{1F918}]|[\x{1F919}-\x{1F91E}]|\x{1F91F}|[\x{1F920}-\x{1F927}]|[\x{1F928}-\x{1F92F}]|\x{1F930}|[\x{1F931}-\x{1F932}]|[\x{1F933}-\x{1F93A}]|[\x{1F93C}-\x{1F93E}]|[\x{1F940}-\x{1F945}]|[\x{1F947}-\x{1F94B}]|\x{1F94C}|[\x{1F950}-\x{1F95E}]|[\x{1F95F}-\x{1F96B}]|[\x{1F980}-\x{1F984}]|[\x{1F985}-\x{1F991}]|[\x{1F992}-\x{1F997}]|\x{1F9C0}|[\x{1F9D0}-\x{1F9E6}])\x{FE0F}'.
')';
// Complete pattern for finding hashtags in UTF-8 text. Capture group 1 holds the hashtag body
// without its leading pound sign. A tag made up of digits only does not match, because at least
// one letter, mark, emoji or underscore is required.
$hashtagRegexp = '/'.
    // Start with a pound sign: ASCII "#" or the fullwidth number sign U+FF03. The fullwidth form
    // is written as an escape so the class cannot collapse into two identical ASCII characters.
    '[#\x{FF03}]'.
    // Capture the entire hashtag
    '('.
        // Defines a group of (possible) digits and (required) valid non digit characters
        '(?:'.
            // We can optionally start with one or more numbers, so long as the number is...
            '\d*'.
            // Followed by one or more valid hashtag characters that aren't numbers...
            '(?:'.
                '\p{L}|'. // Any language letter
                '\p{M}|'. // Any language letter modifier
                $emojiRegexp.'|'. // Any valid emoji
                '_'. // Underscore
            ')+'.
        // Repeat this group at least once (we need at least one character for a hashtag)
        ')+'.
        // Which can (optionally) be followed by more numbers
        '\d*'.
    // end the actual hashtag capture group
    ')'.
    // use unicode modifiers / unicode strings
    '/u';
// Load test data scraped from ADN. Each CSV row is read as: post ID, post text, then any number
// of expected hashtags.
$csvFile = fopen('hashtag-test.csv','rb');
if ($csvFile === false) {
    // Without a file handle there is nothing to test; stop instead of looping on a bad handle.
    print "Unable to open hashtag-test.csv\n";
    exit(1);
}
$passed = 0;
$failed = 0;
$testNumber = 0;
// fgetcsv() returns false at end of file and on a read error; either one ends the loop.
while (($hashtags = fgetcsv($csvFile)) !== false) {
    // Ignore blank lines. fgetcsv() reports them as an array holding a single null, which is
    // not "empty" as far as a plain truthiness check is concerned.
    if ($hashtags === array(null)) {
        continue;
    }
    // Post ID is first column, separate it from the hashtags.
    $postId = array_shift($hashtags);
    // Text of the post is the second column, separate it from the hashtags. The cast keeps a
    // row with no text column from handing null to the regexp functions.
    $text = (string) array_shift($hashtags);
    // This is a cheat for now - remove URLs that contain # from the text so we don't get hung up on
    // something that is apparently already handled in the pnut code.
    $textBefore = $text;
    $text = preg_replace('%\b(https?://|ftp://|mailto:)?[a-z0-9\-\.]+\.[a-z]{2,}/[^\s]*#[^\s]+%i','',$text);
    // Convert test hashtags to lower case
    $hashtags = array_map('mb_strtolower', $hashtags);
    // Run our own hashtag detection on the text to see if it matches what ADN said it should be.
    $pnutHashtags = array();
    if (preg_match_all($hashtagRegexp,$text,$temp)) {
        foreach ($temp[1] as $htag) {
            $pnutHashtags[] = mb_strtolower($htag);
        }
    }
    // Sort them so we can reliably compare them both. SORT_STRING keeps numeric-looking tags
    // (e.g. "1e3") from being ordered as numbers.
    sort($pnutHashtags, SORT_STRING);
    sort($hashtags, SORT_STRING);
    // If we failed, dump to output (should perhaps be stderr). The comparison is strict so two
    // different tags that happen to be numerically equal are not treated as a match.
    if ($hashtags !== $pnutHashtags) {
        print "Failed test number $testNumber\n";
        print " ADN Post: ".number_format($postId)."\n";
        print " Text: ".$text."\n";
        if ($text != $textBefore) {
            print " Orig Text: ".$textBefore."\n";
        }
        print " ADN Hashtags: ".implode(', ',$hashtags)."\n";
        print "PNUT Hashtags: ".implode(', ',$pnutHashtags)."\n\n";
        $failed++;
    }
    else {
        $passed++;
    }
    // Increment, rinse, wipe, repeat.
    $testNumber++;
}
// Clean up
fclose($csvFile);
// Report. Guard the percentages so an empty test file does not divide by zero.
$passedPercent = $testNumber > 0 ? ($passed/$testNumber)*100 : 0;
$failedPercent = $testNumber > 0 ? ($failed/$testNumber)*100 : 0;
print "Ran ".number_format($testNumber)." tests.\n";
print "Passed ".number_format($passed)." (".number_format($passedPercent,2)."%)\n";
print "Failed ".number_format($failed)." (".number_format($failedPercent,2)."%)\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment