Created
December 5, 2010 09:37
-
-
Save imme-emosol/728973 to your computer and use it in GitHub Desktop.
uri checkers testbed in php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/*
 * Note: You'll need PHP 5.3 or newer to run this script (it relies on closures)!
 */
/*
 * Patterns originate from http://mathiasbynens.be/demo/url-regex
 *
 * Note: None of the original patterns had the S modifier. It was added to speed up the tests:
 * when a pattern is used repeatedly, /S can improve performance :)
 */
/*
 * URL checkers under test, keyed by display name.
 *
 * Each entry is either
 *  - a PCRE pattern string, which the scoring loop runs through preg_match(), or
 *  - a closure taking the subject string and returning array( <matched text> )
 *    on success or false on failure, mimicking preg_match()'s $matches.
 */
$patterns = array(
    'spoon' => '/(((http|ftp|https):\/{2})+(([0-9a-z_-]+\.)+(aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx|cy|cz|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mn|mn|mo|mp|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|nom|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ra|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw|arpa)(:[0-9]+)?((\/([~0-9a-zA-Z\#\+\%@\.\/_-]+))?(\?[0-9a-zA-Z\+\%@\/&\[\];=_-]+)?)?))\b/imuS',
    'krijnhoetmer' => '_(^|[\s.:;?\-\]<\(])(https?://[-\w;/?:@&=+$\|\_.!~*\|\'()\[\]%#,☺]+[\w/#](\(\))?)(?=$|[\s\',\|\(\).:;?\-\[\]>\)])_iS',
    'gruber' => '#\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))#iS',
    'gruber revised' => "#(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))#iS",
    'cowboy' => '~(?:\b[a-z\d.-]+://[^<>\s]+|\b(?:(?:(?:[^\s!@#$%^&*()_=+[\]{}\|;:\'",.<>/?]+)\.)+(?:ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|coop|com|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xn--0zwm56d|xn--11b5bs3a9aj6g|xn--80akhbyknj4f|xn--9t4b11yi5a|xn--deba0ad|xn--g6w251d|xn--hgbk6aj7f53bba|xn--hlcj6aya9esc7a|xn--jxalpdlp|xn--kgbechtv|xn--zckzah|ye|yt|yu|za|zm|zw)|(?:(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])\.){3}(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]))(?:[;/][^#?<>\s]*)?(?:\?[^#<>\s]*)?(?:#[^<>\s]*)?(?!\w))~iS',
    // jeffrey friedl
    'jeffrey friedl' => '@\b((ftp|https?)://[-\w]+(\.\w[-\w]*)+|(?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+(?: com\b|edu\b|biz\b|gov\b|in(?:t|fo)\b|mil\b|net\b|org\b|[a-z][a-z]\b))(\:\d+)?(/[^.!,?;"\'<>()\[\]{}\s\x7F-\xFF]*(?:[.!,?]+[^.!,?;"\'<>()\[\]{}\s\x7F-\xFF]+)*)?@iS',
    'mattfarina' => '/^([a-z][a-z0-9\*\-\.]*):\/\/(?:(?:(?:[\w\.\-\+!$&\'\(\)*\+,;=]|%[0-9a-f]{2})+:)*(?:[\w\.\-\+%!$&\'\(\)*\+,;=]|%[0-9a-f]{2})+@)?(?:(?:[a-z0-9\-\.]|%[0-9a-f]{2})+|(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\]))(?::[0-9]+)?(?:[\/|\?](?:[\w#!:\.\?\+=&@!$\'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})*)?$/xiS',
    'stephenhay' => '@^[httprads:]*\/\/[^$.?#].*$@iS',
    'stephenhay revised' => '@^[hftps]*:\/\/[^/$.?#].[^\s]*$@iS',
    // hint: JavaScript writes code points as \uXXXX. PCRE reads at most two hex
    // digits after a bare \x, so four-digit code points must be written as
    // \x{XXXX} and the pattern needs the u modifier (UTF-8 mode).
    'scottgonzales' => '#([a-z]([a-z]|\d|\+|-|\.)*):(\/\/(((([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:)*@)?((\[(|(v[\da-f]{1,}\.(([a-z]|\d|-|\.|_|~)|[!\$&\'\(\)\*\+,;=]|:)+))\])|((\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5]))|(([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=])*)(:\d*)?)(\/(([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)*)*|(\/((([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)*)*)?)|((([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)*)*)|((([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)){0})(\?((([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)|[\x{E000}-\x{F8FF}]|\/|\?)*)?(\#((([a-z]|\d|-|\.|_|~|[\x{00A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)|\/|\?)*)?#iuS',
    'rodneyrehm' => '#((https?://|ftp://|www\.|[^\s:=]+@www\.).*?[a-z_\/0-9\-\#=&])(?=(\.|,|;|\?|\!)?("|\'|«|»|\[|\s|\r|\n|$))#iS',
    'rodneyrehm revised' => '#(([a-z]+://|www\.|[^\s:=]+@www\.)([^/].*?[a-z0-9].*?)([a-z_\/0-9\-\#=&]|))(?=[\.,;\?\!]?(["\'«»\[\s\r\n]|$))#iS',
    'imme_emosol' => '@(https?|ftp|torrent|image|irc)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS',
    'imme_emosol ht-&f-tp(s)' => '@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS',
    // PHP's built-in validator: the filtered URL on success, false otherwise.
    'filter_var' => function( $subject )
    {
        $valid = filter_var( $subject, FILTER_VALIDATE_URL );
        if( !$valid )
            return false;
        return array( $valid );
    },
    // parse_url() is a parser, not a validator: the "match" is the URL
    // re-assembled from whatever components it recognised.
    'parse_url' => function( $subject )
    {
        $parts = parse_url( $subject );
        if( !$parts )
            return false;

        // isset() rather than empty(): a component that is literally "0"
        // (host, path, query, ...) is falsy but must still be re-emitted.
        $url = '';
        if( isset( $parts['scheme'] ) )
            $url .= $parts['scheme'] . '://';
        if( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if( isset( $parts['host'] ) )
            $url .= $parts['host'];
        if( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        if( isset( $parts['path'] ) )
            $url .= $parts['path'];
        if( isset( $parts['query'] ) )
            $url .= '?' . $parts['query'];
        if( isset( $parts['fragment'] ) )
            $url .= '#' . $parts['fragment'];

        return array( $url );
    },
);
/*
 * Home page / profile URL per pattern author, keyed by pattern name.
 * The regex table further down links each pattern name to its entry here.
 */
$authors = array(
    'spoon'          => 'http://www.spoon-library.com/',
    'krijnhoetmer'   => 'http://twitter.com/krijnhoetmer',
    'gruber'         => 'http://twitter.com/gruber',
    'gruber revised' => 'http://twitter.com/gruber',
    'cowboy'         => 'http://twitter.com/cowboy',
    'jeffrey friedl' => 'http://regex.info/blog/',
    'mattfarina'     => 'http://twitter.com/mattfarina',
    'stephenhay'     => 'http://twitter.com/stephenhay',
    'scottgonzales'  => 'http://twitter.com/scottgonzales',
    'rodneyrehm'     => 'http://twitter.com/rodneyrehm',
    'imme_emosol'    => 'http://twitter.com/imme_emosol',
    'filter_var'     => 'http://php.net/filter_var',
    'parse_url'      => 'http://php.net/parse_url',
);
/*
 * Test subjects, grouped by test type:
 *  - 'positive': valid URLs; a checker should match the whole string.
 *  - 'negative': strings that are not URLs; a checker should match nothing.
 *  - 'fulltext': expected URL (array key) => text it should be found in.
 */
$tests = array(
    'positive' => array(
        'http://foo.com/blah_blah',
        'http://foo.com/blah_blah/',
        'http://foo.com/blah_blah_(wikipedia)',
        'http://foo.com/blah_blah_(wikipedia)_(again)',
        'http://www.example.com/wpstyle/?p=364',
        'https://www.example.com/foo/?bar=baz&inga=42&quux',
        'http://✪df.ws/123',
        // URLs carrying userinfo (user and optional password before the host)
        'http://userid:password@example.com:8080',
        'http://userid:password@example.com:8080/',
        'http://userid@example.com',
        'http://userid@example.com/',
        'http://userid@example.com:8080',
        'http://userid@example.com:8080/',
        'http://userid:password@example.com',
        'http://userid:password@example.com/',
        'http://192.168.1.1/',
        'http://192.168.1.1:8080/',
        'http://➡.ws/䨹',
        'http://⌘.ws',
        'http://⌘.ws/',
        'http://foo.com/blah_(wikipedia)#cite-1',
        'http://foo.com/blah_(wikipedia)_blah#cite-1',
        'http://foo.com/unicode_(✪)_in_parens',
        'http://foo.com/(something)?after=parens',
        'http://☺.damowmow.com/',
        'http://code.example.com/events/#&product=browser',
        'http://j.mp',
        'ftp://foo.bar/baz',
        'torrent://foo.bar/baz',
        'image://foo.bar:993',
        'irc://foo.bar:6667',
    ),
    'negative' => array(
        'rdar://1234',
        'http://',
        'http://.',
        'http://..',
        'http://../',
        'http://?',
        'http://??',
        'http://??/',
        'http://#',
        'http://##',
        'http://##/',
        'http://foo.bar?q=Spaces should be encoded',
        '//',
        '//a',
        '///a',
        '///',
        'http:///a',
        'foo.com',
        'http://-a.b.co',
        'http://a.b-.co',
    ),
    'fulltext' => array(
        'http://example.com' => 'The brown http://example.com, jumped over the fox',
        'http://example.com?la#mc' => 'http://example.com?la#mc. nice site',
        'http://example.com?la#mc-' => '"http://example.com?la#mc-" looks bad',
        'www.example.com?la#mc-' => 'www.example.com?la#mc-" is off',
        'ftp://foo.example.com?la#mc-' => 'ftp://foo.example.com?la#mc-" is oldschool',
        'www.example.org' => 'yeah, looks nice www.example.org?',
        'http://✪df.ws/123' => 'what about IDN? http://✪df.ws/123',
        'irc://foo:6667' => 'do you speak IRC? irc://foo:6667',
        // and so on... too lazy to come up with more tests
    ),
);
// Section headings shown in the results table, one per test group.
$testNames = array(
    'positive' => 'URL verification (on URLs)',
    'negative' => 'URL verification (on non URLs, "passed" being good, "failed" being "false positive")',
    'fulltext' => 'URL scanning in natural text',
);
/*
 * Scoring callbacks, one per test group.
 *
 * Each callback receives
 *   $_subject - the array key of the test (for 'fulltext': the expected URL),
 *   $subject  - the string that was checked,
 *   $matches  - the checker's result (preg_match()-style array, or false/empty),
 * and returns a result code that the report uses as an index into $styles:
 *   0 = failed, 1 = passed, 2 = maybe (something matched, but not the expected text).
 *
 * Comparisons are strict: with ==, PHP compares numeric-looking strings as
 * numbers ('1e3' == '1000'), which could score a wrong match as "passed".
 */
$testAssertions = array(
    // A valid URL must be matched in full.
    'positive' => function( $_subject, $subject, $matches )
    {
        if( !$matches )
            return 0; // matched nothing
        if( $matches[0] === $subject )
            return 1; // matched correctly
        return 2;     // matched something
    },
    // A non-URL must not be matched at all.
    'negative' => function( $_subject, $subject, $matches )
    {
        return $matches ? 0 : 1;
    },
    // The match found in the text must equal the expected URL (the test's key).
    'fulltext' => function( $_subject, $subject, $matches )
    {
        if( !$matches )
            return 0; // matched nothing
        // Array keys that look like integers are stored as int; cast before comparing.
        if( $matches[0] === (string) $_subject )
            return 1;
        return 2;     // matched something else
    },
);
/*
 * Run every checker against every subject of every test group.
 *
 * $results[ pattern name ][ test group ][ subject ] holds the result code
 * returned by the group's assertion callback.
 * $maybes[ "pattern#group#subject" ] holds the text that was matched whenever
 * the result code is 2 ("maybe"); the report shows it as a tooltip.
 */
$results = array();
$maybes = array();
foreach( $tests as $_test => $test )
{
    $assertion = $testAssertions[ $_test ];
    foreach( $test as $_subject => $subject )
    {
        foreach( $patterns as $_pattern => $pattern )
        {
            // Start clean for each checker so a previous checker's matches
            // can never be scored for this one.
            $matches = array();
            if( is_string( $pattern ) )
            {
                // preg_match() yields 0 for "no match" and false on error;
                // both count as "matched nothing".
                if( !preg_match( $pattern, $subject, $matches ) )
                    $matches = array();
            }
            else
            {
                $matches = $pattern( $subject );
            }

            $outcome = $assertion( $_subject, $subject, $matches );
            // Nested keys are created on assignment; no pre-initialisation needed.
            $results[ $_pattern ][ $_test ][ $subject ] = $outcome;
            if( $outcome == 2 )
                $maybes[ $_pattern .'#'. $_test .'#'. $subject ] = $matches[0];
        }
    }
}
?><!DOCTYPE html> | |
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head> | |
<meta charset="utf-8" /> | |
<meta http-equiv="content-type" content="text/html; charset=utf-8" /> | |
<title>URL identification</title> | |
<style type="text/css"> | |
th, td { padding: 3px; } | |
th { text-align: left; font-weight: bold; background-color: #EEE; } | |
td { text-align: center; background-color: #EFEFEF; } | |
.passed { color: #00B530; background-color: #50F78B; } | |
.failed { color: #FF0546; background-color: #FFB9CA; } | |
.maybe { color: #D29331; background-color: #FCE484; } | |
.results tbody tr:first-child th { background-color: #DDD; } | |
.regex { } | |
.regex th, .regex td { text-align:left; } | |
.regex .number { text-align: right; } | |
.regex .code { overflow:auto; white-space:nowrap; } | |
:target th, :target td { background-color: lightblue; } | |
</style> | |
</head> | |
<body> | |
<h1>URL verification and identification</h1> | |
<table class="results"> | |
<thead> | |
<tr> | |
<th>Subject</th> | |
<?php | |
// One header cell per checker, linking to its row in the regex table below.
foreach( $patterns as $_pattern => $pattern )
{
    $anchor = urlencode( $_pattern );
    $label = htmlspecialchars( $_pattern );
    echo "\t\t\t\t\t\t" . '<th><a href="#' . $anchor . '">' . $label . '</a></th>' . "\n";
}
?> | |
</tr> | |
</thead> | |
<?php | |
/*
 * Results table body: one heading row per test group, then one row per
 * subject with one cell per checker.
 */
$patternsCount = count( $patterns );
// Result code (0, 1, 2) => CSS class, also used as the cell text.
$styles = array(
    'failed',
    'passed',
    'maybe',
);
echo '<tbody>';
foreach( $tests as $_test => $test )
{
    // Group heading spanning the subject column plus every checker column.
    echo '<tr><th colspan="' .
        ( $patternsCount + 1 ) .
        '">' .
        htmlspecialchars( $testNames[ $_test ] ) .
        '</th></tr>';
    foreach( $test as $subject )
    {
        echo '<tr><th>' . htmlspecialchars( $subject ) . '</th>';
        foreach( $patterns as $_pattern => $pattern )
        {
            $_maybe = $_pattern .'#'. $_test .'#'. $subject;
            $result = $results[ $_pattern ][ $_test ][ $subject ];
            $state = $styles[ $result ];
            // For "maybe" results, show what was actually matched as a tooltip.
            $title = '';
            if( !empty( $maybes[ $_maybe ] ) )
                $title = ' title="' . htmlspecialchars( $maybes[ $_maybe ] ) . '"';
            echo '<td class="' . $state . '"' . $title . '>' . $state . '</td>';
        }
        echo '</tr>';
    }
}
// The closing </tbody> is not echoed here: the template markup that follows
// this PHP block already contains it, and echoing it too produced a stray end tag.
?> | |
</tbody> | |
</table> | |
<h2>The Regular Expressions</h2> | |
<table class="regex"> | |
<thead> | |
<tr>
<th>Name</th>
<th>Characters</th>
<th>Expression</th>
</tr>
</thead>
<tbody> | |
<?php | |
/*
 * Regex table: one row per checker with its name (linked to the author where
 * known), the pattern length in characters and the pattern itself.
 */
foreach( $patterns as $_pattern => $pattern )
{
    // Look the author up by the full pattern name first; otherwise fall back
    // to the part before the first space ("stephenhay revised" -> "stephenhay").
    $author_key = $_pattern;
    if( !isset( $authors[ $author_key ] ) )
    {
        $space = strpos( $_pattern, ' ' );
        if( $space !== false )
            $author_key = substr( $_pattern, 0, $space );
    }

    // Only wrap the name in a link when an author URL exists; an empty href
    // would link to the current page.
    $label = htmlspecialchars( $_pattern );
    if( isset( $authors[ $author_key ] ) )
    {
        $label = '<a href="' .
            htmlspecialchars( $authors[ $author_key ] ) .
            '">' .
            $label .
            '</a>';
    }

    // The row id is the target of the links in the results table header.
    echo '<tr id="' . urlencode( $_pattern ) . '">' .
        '<th>' . $label . '</th>';
    if( is_string( $pattern ) )
    {
        echo '<td class="number">' .
            mb_strlen( $pattern , 'UTF-8' ) .
            '</td>' .
            '<td><div class="code">' .
            htmlspecialchars( $pattern ) .
            '</div></td>';
    }
    else
    {
        echo '<td colspan="2">PHP core function, not a Regular expression!</td>';
    }
    echo '</tr>';
}
?> | |
</tbody> | |
</table> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment