-
-
Save abhishekbhardwaj/97fc32e8040e9d5ded354c939dbe3000 to your computer and use it in GitHub Desktop.
pnut.io unicode hashtag parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// You can find the test data file at https://www.ravis.org/hashtag-test.zip | |
// You're gonna want to have your console output supporting UTF8 before running this, or you're | |
// gonna see a bunch of ? in the output... | |
// For curiosity's sake, post number 693,847 is an emoji hashtag: #(heart) | |
// Emoji are spread throughout the Unicode codespace and can contain modifiers. This regexp | |
// was automatically generated from http://ftp.unicode.org/Public/emoji/1.0/emoji-data.txt | |
// and is the equivalent (because PHP doesn't support these character classes) of:
// (?:\p{Emoji_Modifier_Base}\p{Emoji_Modifier}?|\p{Emoji_Presentation}|\p{Emoji}\x{FE0F})
// Note: in the generated pattern the trailing \x{FE0F} (variation selector) is required, not optional.
// Generated using https://gist.github.com/ravisorg/23edafbfcbd45de9875adec5310fca76 | |
// PCRE fragment (no delimiters, meant to be embedded in a /u pattern) that matches one emoji.
// It is a single non-capturing group with three alternatives, tried in this order:
//   1. a modifier-base code point, optionally followed by a skin-tone modifier U+1F3FB..U+1F3FF
//   2. a code point with default emoji presentation
//   3. any other emoji code point, which MUST be followed by the variation selector U+FE0F
//      (this keeps a bare "#", "*" or ASCII digit from counting as an emoji)
// The literal is assembled from short chunks so that no physical line break can end up inside
// a \x{...} escape or a character class; the concatenation contains no whitespace at all.
$emojiRegexp = '(?:'.
    // --- 1. modifier bases + optional modifier ---
    '(?:\x{261D}|\x{26F9}|[\x{270A}-\x{270B}]|[\x{270C}-\x{270D}]|\x{1F385}|[\x{1F3C2}-\x{1F3C4}]|\x{1F3C7}|\x{1F3CA}|[\x{1F3CB}-\x{1F3CC}]|'.
    '[\x{1F442}-\x{1F443}]|[\x{1F446}-\x{1F450}]|[\x{1F466}-\x{1F469}]|\x{1F46E}|[\x{1F470}-\x{1F478}]|\x{1F47C}|[\x{1F481}-\x{1F483}]|[\x{1F485}-\x{1F487}]|\x{1F4AA}|'.
    '[\x{1F574}-\x{1F575}]|\x{1F57A}|\x{1F590}|[\x{1F595}-\x{1F596}]|[\x{1F645}-\x{1F647}]|[\x{1F64B}-\x{1F64F}]|\x{1F6A3}|[\x{1F6B4}-\x{1F6B6}]|\x{1F6C0}|\x{1F6CC}|'.
    '\x{1F918}|[\x{1F919}-\x{1F91C}]|\x{1F91E}|\x{1F91F}|\x{1F926}|\x{1F930}|[\x{1F931}-\x{1F932}]|[\x{1F933}-\x{1F939}]|[\x{1F93D}-\x{1F93E}]|[\x{1F9D1}-\x{1F9DD}])'.
    '(?:[\x{1F3FB}-\x{1F3FF}])?|'.
    // --- 2. default emoji presentation ---
    '(?:[\x{231A}-\x{231B}]|[\x{23E9}-\x{23EC}]|\x{23F0}|\x{23F3}|[\x{25FD}-\x{25FE}]|[\x{2614}-\x{2615}]|[\x{2648}-\x{2653}]|\x{267F}|\x{2693}|\x{26A1}|'.
    '[\x{26AA}-\x{26AB}]|[\x{26BD}-\x{26BE}]|[\x{26C4}-\x{26C5}]|\x{26CE}|\x{26D4}|\x{26EA}|[\x{26F2}-\x{26F3}]|\x{26F5}|\x{26FA}|\x{26FD}|'.
    '\x{2705}|[\x{270A}-\x{270B}]|\x{2728}|\x{274C}|\x{274E}|[\x{2753}-\x{2755}]|\x{2757}|[\x{2795}-\x{2797}]|\x{27B0}|\x{27BF}|[\x{2B1B}-\x{2B1C}]|\x{2B50}|\x{2B55}|'.
    '\x{1F004}|\x{1F0CF}|\x{1F18E}|[\x{1F191}-\x{1F19A}]|[\x{1F1E6}-\x{1F1FF}]|\x{1F201}|\x{1F21A}|\x{1F22F}|[\x{1F232}-\x{1F236}]|[\x{1F238}-\x{1F23A}]|[\x{1F250}-\x{1F251}]|'.
    '[\x{1F300}-\x{1F320}]|[\x{1F32D}-\x{1F32F}]|[\x{1F330}-\x{1F335}]|[\x{1F337}-\x{1F37C}]|[\x{1F37E}-\x{1F37F}]|[\x{1F380}-\x{1F393}]|[\x{1F3A0}-\x{1F3C4}]|\x{1F3C5}|[\x{1F3C6}-\x{1F3CA}]|[\x{1F3CF}-\x{1F3D3}]|[\x{1F3E0}-\x{1F3F0}]|\x{1F3F4}|[\x{1F3F8}-\x{1F3FF}]|'.
    '[\x{1F400}-\x{1F43E}]|\x{1F440}|[\x{1F442}-\x{1F4F7}]|\x{1F4F8}|[\x{1F4F9}-\x{1F4FC}]|\x{1F4FF}|[\x{1F500}-\x{1F53D}]|[\x{1F54B}-\x{1F54E}]|[\x{1F550}-\x{1F567}]|\x{1F57A}|[\x{1F595}-\x{1F596}]|\x{1F5A4}|[\x{1F5FB}-\x{1F5FF}]|'.
    '\x{1F600}|[\x{1F601}-\x{1F610}]|\x{1F611}|[\x{1F612}-\x{1F614}]|\x{1F615}|\x{1F616}|\x{1F617}|\x{1F618}|\x{1F619}|\x{1F61A}|\x{1F61B}|[\x{1F61C}-\x{1F61E}]|\x{1F61F}|'.
    '[\x{1F620}-\x{1F625}]|[\x{1F626}-\x{1F627}]|[\x{1F628}-\x{1F62B}]|\x{1F62C}|\x{1F62D}|[\x{1F62E}-\x{1F62F}]|[\x{1F630}-\x{1F633}]|\x{1F634}|[\x{1F635}-\x{1F640}]|[\x{1F641}-\x{1F642}]|[\x{1F643}-\x{1F644}]|[\x{1F645}-\x{1F64F}]|'.
    '[\x{1F680}-\x{1F6C5}]|\x{1F6CC}|\x{1F6D0}|[\x{1F6D1}-\x{1F6D2}]|[\x{1F6EB}-\x{1F6EC}]|[\x{1F6F4}-\x{1F6F6}]|[\x{1F6F7}-\x{1F6F8}]|'.
    '[\x{1F910}-\x{1F918}]|[\x{1F919}-\x{1F91E}]|\x{1F91F}|[\x{1F920}-\x{1F927}]|[\x{1F928}-\x{1F92F}]|\x{1F930}|[\x{1F931}-\x{1F932}]|[\x{1F933}-\x{1F93A}]|[\x{1F93C}-\x{1F93E}]|[\x{1F940}-\x{1F945}]|[\x{1F947}-\x{1F94B}]|\x{1F94C}|'.
    '[\x{1F950}-\x{1F95E}]|[\x{1F95F}-\x{1F96B}]|[\x{1F980}-\x{1F984}]|[\x{1F985}-\x{1F991}]|[\x{1F992}-\x{1F997}]|\x{1F9C0}|[\x{1F9D0}-\x{1F9E6}])|'.
    // --- 3. any emoji code point, followed by the mandatory variation selector ---
    '(?:\x{0023}|\x{002A}|[\x{0030}-\x{0039}]|\x{00A9}|\x{00AE}|\x{203C}|\x{2049}|\x{2122}|\x{2139}|[\x{2194}-\x{2199}]|[\x{21A9}-\x{21AA}]|'.
    '[\x{231A}-\x{231B}]|\x{2328}|\x{23CF}|[\x{23E9}-\x{23F3}]|[\x{23F8}-\x{23FA}]|\x{24C2}|[\x{25AA}-\x{25AB}]|\x{25B6}|\x{25C0}|[\x{25FB}-\x{25FE}]|'.
    '[\x{2600}-\x{2604}]|\x{260E}|\x{2611}|[\x{2614}-\x{2615}]|\x{2618}|\x{261D}|\x{2620}|[\x{2622}-\x{2623}]|\x{2626}|\x{262A}|[\x{262E}-\x{262F}]|[\x{2638}-\x{263A}]|\x{2640}|\x{2642}|[\x{2648}-\x{2653}]|'.
    '\x{2660}|\x{2663}|[\x{2665}-\x{2666}]|\x{2668}|\x{267B}|\x{267F}|[\x{2692}-\x{2697}]|\x{2699}|[\x{269B}-\x{269C}]|[\x{26A0}-\x{26A1}]|[\x{26AA}-\x{26AB}]|[\x{26B0}-\x{26B1}]|[\x{26BD}-\x{26BE}]|[\x{26C4}-\x{26C5}]|'.
    '\x{26C8}|\x{26CE}|\x{26CF}|\x{26D1}|[\x{26D3}-\x{26D4}]|[\x{26E9}-\x{26EA}]|[\x{26F0}-\x{26F5}]|[\x{26F7}-\x{26FA}]|\x{26FD}|'.
    '\x{2702}|\x{2705}|[\x{2708}-\x{2709}]|[\x{270A}-\x{270B}]|[\x{270C}-\x{270D}]|\x{270F}|\x{2712}|\x{2714}|\x{2716}|\x{271D}|\x{2721}|\x{2728}|[\x{2733}-\x{2734}]|\x{2744}|\x{2747}|\x{274C}|\x{274E}|'.
    '[\x{2753}-\x{2755}]|\x{2757}|[\x{2763}-\x{2764}]|[\x{2795}-\x{2797}]|\x{27A1}|\x{27B0}|\x{27BF}|[\x{2934}-\x{2935}]|[\x{2B05}-\x{2B07}]|[\x{2B1B}-\x{2B1C}]|\x{2B50}|\x{2B55}|\x{3030}|\x{303D}|\x{3297}|\x{3299}|'.
    '\x{1F004}|\x{1F0CF}|[\x{1F170}-\x{1F171}]|\x{1F17E}|\x{1F17F}|\x{1F18E}|[\x{1F191}-\x{1F19A}]|[\x{1F1E6}-\x{1F1FF}]|[\x{1F201}-\x{1F202}]|\x{1F21A}|\x{1F22F}|[\x{1F232}-\x{1F23A}]|[\x{1F250}-\x{1F251}]|'.
    '[\x{1F300}-\x{1F320}]|\x{1F321}|[\x{1F324}-\x{1F32C}]|[\x{1F32D}-\x{1F32F}]|[\x{1F330}-\x{1F335}]|\x{1F336}|[\x{1F337}-\x{1F37C}]|\x{1F37D}|[\x{1F37E}-\x{1F37F}]|[\x{1F380}-\x{1F393}]|[\x{1F396}-\x{1F397}]|[\x{1F399}-\x{1F39B}]|[\x{1F39E}-\x{1F39F}]|'.
    '[\x{1F3A0}-\x{1F3C4}]|\x{1F3C5}|[\x{1F3C6}-\x{1F3CA}]|[\x{1F3CB}-\x{1F3CE}]|[\x{1F3CF}-\x{1F3D3}]|[\x{1F3D4}-\x{1F3DF}]|[\x{1F3E0}-\x{1F3F0}]|[\x{1F3F3}-\x{1F3F5}]|\x{1F3F7}|[\x{1F3F8}-\x{1F3FF}]|'.
    '[\x{1F400}-\x{1F43E}]|\x{1F43F}|\x{1F440}|\x{1F441}|[\x{1F442}-\x{1F4F7}]|\x{1F4F8}|[\x{1F4F9}-\x{1F4FC}]|\x{1F4FD}|\x{1F4FF}|'.
    '[\x{1F500}-\x{1F53D}]|[\x{1F549}-\x{1F54A}]|[\x{1F54B}-\x{1F54E}]|[\x{1F550}-\x{1F567}]|[\x{1F56F}-\x{1F570}]|[\x{1F573}-\x{1F579}]|\x{1F57A}|\x{1F587}|[\x{1F58A}-\x{1F58D}]|\x{1F590}|[\x{1F595}-\x{1F596}]|'.
    '\x{1F5A4}|\x{1F5A5}|\x{1F5A8}|[\x{1F5B1}-\x{1F5B2}]|\x{1F5BC}|[\x{1F5C2}-\x{1F5C4}]|[\x{1F5D1}-\x{1F5D3}]|[\x{1F5DC}-\x{1F5DE}]|\x{1F5E1}|\x{1F5E3}|\x{1F5E8}|\x{1F5EF}|\x{1F5F3}|\x{1F5FA}|[\x{1F5FB}-\x{1F5FF}]|'.
    '\x{1F600}|[\x{1F601}-\x{1F610}]|\x{1F611}|[\x{1F612}-\x{1F614}]|\x{1F615}|\x{1F616}|\x{1F617}|\x{1F618}|\x{1F619}|\x{1F61A}|\x{1F61B}|[\x{1F61C}-\x{1F61E}]|\x{1F61F}|'.
    '[\x{1F620}-\x{1F625}]|[\x{1F626}-\x{1F627}]|[\x{1F628}-\x{1F62B}]|\x{1F62C}|\x{1F62D}|[\x{1F62E}-\x{1F62F}]|[\x{1F630}-\x{1F633}]|\x{1F634}|[\x{1F635}-\x{1F640}]|[\x{1F641}-\x{1F642}]|[\x{1F643}-\x{1F644}]|[\x{1F645}-\x{1F64F}]|'.
    '[\x{1F680}-\x{1F6C5}]|[\x{1F6CB}-\x{1F6CF}]|\x{1F6D0}|[\x{1F6D1}-\x{1F6D2}]|[\x{1F6E0}-\x{1F6E5}]|\x{1F6E9}|[\x{1F6EB}-\x{1F6EC}]|\x{1F6F0}|\x{1F6F3}|[\x{1F6F4}-\x{1F6F6}]|[\x{1F6F7}-\x{1F6F8}]|'.
    '[\x{1F910}-\x{1F918}]|[\x{1F919}-\x{1F91E}]|\x{1F91F}|[\x{1F920}-\x{1F927}]|[\x{1F928}-\x{1F92F}]|\x{1F930}|[\x{1F931}-\x{1F932}]|[\x{1F933}-\x{1F93A}]|[\x{1F93C}-\x{1F93E}]|[\x{1F940}-\x{1F945}]|[\x{1F947}-\x{1F94B}]|\x{1F94C}|'.
    '[\x{1F950}-\x{1F95E}]|[\x{1F95F}-\x{1F96B}]|[\x{1F980}-\x{1F984}]|[\x{1F985}-\x{1F991}]|[\x{1F992}-\x{1F997}]|\x{1F9C0}|[\x{1F9D0}-\x{1F9E6}])'.
    '\x{FE0F}'.
    ')';
// Complete PCRE pattern that finds hashtags in post text. Capture group 1 holds the tag
// itself, without the leading marker. A tag may contain digits but must contain at least
// one non-digit character (letter, combining mark, emoji or underscore).
$hashtagRegexp = '/'.
    // Start with a pound sign: ASCII "#" or its fullwidth variant U+FF03. The variant is
    // written as an escape so it cannot be folded back to a plain "#" by text normalisation.
    '[#\x{FF03}]'.
    // Capture the entire hashtag
    '('.
        // Defines a group of (possible) digits and (required) valid non digit characters
        '(?:'.
            // We can optionally start with one or more numbers, so long as the number is...
            '\d*'.
            // Followed by one or more valid hashtag characters that aren't numbers...
            '(?:'.
                '\p{L}|'.           // Any language letter
                '\p{M}|'.           // Any language letter modifier
                $emojiRegexp.'|'.   // Any valid emoji
                '_'.                // Underscore
            ')+'.
        // Repeat this group at least once (we need at least one character for a hashtag)
        ')+'.
        // Which can (optionally) be followed by more numbers
        '\d*'.
    // end the actual hashtag capture group
    ')'.
    // use unicode modifiers / unicode strings
    '/u';
// Load test data scraped from ADN. Stop immediately if the file cannot be opened: every
// later call on a false handle would fail, and the report would be meaningless.
$csvPath = 'hashtag-test.csv';
$csvFile = fopen($csvPath, 'rb');
if ($csvFile === false) {
    throw new RuntimeException("Unable to open test data file: $csvPath");
}

$passed = 0;
$failed = 0;
$testNumber = 0;

// fgetcsv() returns false at end of file (or on a read error), which ends the run.
while (($row = fgetcsv($csvFile)) !== false) {
    // Ignore blank lines. fgetcsv() reports a blank line as array(null), which is a
    // non-empty (truthy) array, so it has to be compared explicitly.
    if ($row === array(null)) {
        continue;
    }

    // Post ID is the first column, separate it from the hashtags.
    $postId = array_shift($row);
    // Text of the post is the second column; a row without one is treated as empty text.
    $text = (string) array_shift($row);
    // Everything left over is a hashtag ADN found in the text; compare in lower case.
    $hashtags = array_map('mb_strtolower', $row);

    // This is a cheat for now - remove URLs that contain # from the text so we don't get hung up on
    // something that is apparently already handled in the pnut code.
    $textBefore = $text;
    $stripped = preg_replace('%\b(https?://|ftp://|mailto:)?[a-z0-9\-\.]+\.[a-z]{2,}/[^\s]*#[^\s]+%i', '', $text);
    // preg_replace() returns null on a PCRE error; keep the untouched text in that case.
    if ($stripped !== null) {
        $text = $stripped;
    }

    // Run our own hashtag detection on the text to see if it matches what ADN said it should be.
    $pnutHashtags = array();
    if (preg_match_all($hashtagRegexp, $text, $matches)) {
        $pnutHashtags = array_map('mb_strtolower', $matches[1]);
    }

    // Sort both lists as plain strings so they can be compared reliably. The default sort
    // flags compare numeric-looking tags (e.g. "1e3" and "10e2") by numeric value.
    sort($pnutHashtags, SORT_STRING);
    sort($hashtags, SORT_STRING);

    // If we failed, dump to output (should perhaps be stderr). The comparison is strict so
    // that two different numeric-looking tags are never considered equal.
    if ($hashtags !== $pnutHashtags) {
        print "Failed test number $testNumber\n";
        print " ADN Post: ".number_format($postId)."\n";
        print " Text: ".$text."\n";
        if ($text !== $textBefore) {
            print " Orig Text: ".$textBefore."\n";
        }
        print " ADN Hashtags: ".implode(', ', $hashtags)."\n";
        print "PNUT Hashtags: ".implode(', ', $pnutHashtags)."\n\n";
        $failed++;
    } else {
        $passed++;
    }

    // Increment, rinse, wipe, repeat.
    $testNumber++;
}

// Clean up
fclose($csvFile);

// Report. With no rows read there is nothing to divide by, so both percentages are 0.
$passedPercent = $testNumber > 0 ? ($passed / $testNumber) * 100 : 0;
$failedPercent = $testNumber > 0 ? ($failed / $testNumber) * 100 : 0;
print "Ran ".number_format($testNumber)." tests.\n";
print "Passed ".number_format($passed)." (".number_format($passedPercent, 2)."%)\n";
print "Failed ".number_format($failed)." (".number_format($failedPercent, 2)."%)\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment