Created
November 18, 2010 07:59
-
-
Save lifthrasiir/704754 to your computer and use it in GitHub Desktop.
a drop-in replacement for ereg* functions in PHP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// ere2pcre -- a drop-in replacement for ereg* functions | |
// written by Kang Seonghoon <[email protected]>. | |
// | |
// this library is dedicated to the public domain. for jurisdictions that
// do not recognize the public domain, the CC0 1.0 Universal Public Domain
// Dedication applies.
// returns $c in a form that PCRE reads as a literal: characters that are
// special to PCRE (including "-" and the "/" delimiter) get a backslash
// prefix, everything else is returned unchanged. a null byte is rejected
// with E_USER_ERROR.
function _ere2pcre_escape($c) {
    if ($c == "\0") {
        trigger_error('ere2pcre: a literal null byte in the regex', E_USER_ERROR);
        return;
    }
    $special = (strpos('\^$.[]|()?*+{}-/', $c) !== false);
    return ($special ? "\\".$c : $c);
}
// parses one bracket expression of the ERE $s, where $i is the position of
// its opening "[". returns array($cls, $ii): $cls is the equivalent PCRE
// character class (brackets included) and $ii is the position of the closing
// "]". triggers E_USER_ERROR on malformed or unsupported input.
function _ere2pcre_bracket($s, $i) {
    // POSIX class names (colons included) and the text emitted for them
    // inside the PCRE character class.
    static $cclsmap = array(
        ':alnum:' => '[:alnum:]',
        ':alpha:' => '[:alpha:]',
        ':blank:' => '[:blank:]',
        ':cntrl:' => '[:cntrl:]',
        ':digit:' => '\d',
        ':graph:' => '[:graph:]',
        ':lower:' => '[:lower:]',
        ':print:' => '[:print:]',
        ':punct:' => '[:punct:]',
        ':space:' => '\013\s', // should include VT
        ':upper:' => '[:upper:]',
        ':xdigit:' => '[:xdigit:]',
    );
    $l = strlen($s);
    ++$i; // skip the opening "["
    $cls = '';
    if ($i < $l && $s[$i] == '^') {
        $cls .= '^';
        ++$i;
    }
    if ($i >= $l) {
        trigger_error('ere2pcre: "[" does not have a matching "]"',
                      E_USER_ERROR);
    }
    // do-while: the first member is consumed unconditionally, so a leading
    // "]" is taken as a literal character.
    $start = true;
    do {
        if ($s[$i] == '[' &&
            $i + 1 < $l && strpos('.=:', $s[$i+1]) !== false) {
            // [:class:], [.coll.] or [=equiv=]; only [:class:] is supported.
            $ii = strpos($s, ']', $i);
            if ($ii === false) {
                trigger_error('ere2pcre: "[" does not have a matching '.
                              '"]"', E_USER_ERROR);
            }
            $ccls = substr($s, $i + 1, $ii - ($i + 1));
            if (!isset($cclsmap[$ccls])) {
                trigger_error('ere2pcre: an invalid or unsupported '.
                              'character class ['.$ccls.']',
                              E_USER_ERROR);
            }
            $cls .= $cclsmap[$ccls];
            $i = $ii + 1;
        } else {
            $a = $s[$i++];
            // "-" is a literal only as the first or the last member.
            if ($a === '-' && !$start && !($i < $l && $s[$i] == ']')) {
                trigger_error('ere2pcre: "-" is invalid for the start '.
                              'character in the brackets',
                              E_USER_ERROR);
            }
            if ($i < $l && $s[$i] === '-') {
                ++$i;
                if ($i >= $l) {
                    // "[a-" at the end of input: stop here instead of
                    // reading past the string; reported after the loop.
                    break;
                }
                $b = $s[$i];
                if ($b == ']') {
                    // trailing "-" is a literal. $i must stay on the "]"
                    // so that the caller steps over it exactly once.
                    $cls .= _ere2pcre_escape($a).'\-';
                    break;
                }
                ++$i;
                if (ord($a) > ord($b)) {
                    trigger_error('ere2pcre: an invalid character '.
                                  'range "'.$a.'-'.$b.'"',
                                  E_USER_ERROR);
                }
                $cls .= _ere2pcre_escape($a).'-'._ere2pcre_escape($b);
            } else {
                $cls .= _ere2pcre_escape($a);
            }
        }
        $start = false;
    } while ($i < $l && $s[$i] != ']');
    if ($i >= $l) {
        trigger_error('ere2pcre: "[" does not have a matching "]"',
                      E_USER_ERROR);
    }
    return array('['.$cls.']', $i);
}

// recursively converts ERE into PCRE, starting at the position $i.
//
// returns array($pcre, $ii): $pcre is the converted fragment (without
// delimiters) and $ii is the position where parsing stopped, which is either
// strlen($s) or the position of a ")" that closes the enclosing group.
// triggers E_USER_ERROR on any invalid input.
function _ere2pcre($s, $i) {
    $r = array('');   // one entry per "|"-separated branch
    $rr = 0;          // index of the branch being built
    $l = strlen($s);
    while ($i < $l) {
        // atom
        $c = $s[$i];
        if ($c == '(') {
            if ($i + 1 < $l && $s[$i+1] == ')') { // special case
                $r[$rr] .= '()';
                ++$i;
            } else {
                list($t, $ii) = _ere2pcre($s, $i + 1);
                if ($ii >= $l || $s[$ii] != ')') {
                    trigger_error('ere2pcre: "(" does not have a matching ")"',
                                  E_USER_ERROR);
                }
                $r[$rr] .= '('.$t.')';
                $i = $ii;
            }
        } elseif ($c == '[') {
            list($t, $i) = _ere2pcre_bracket($s, $i);
            $r[$rr] .= $t;
        } elseif ($c == ')') {
            break; // the caller decides whether this ")" is matched
        } elseif ($c == '*' || $c == '+' || $c == '?') {
            trigger_error('ere2pcre: unescaped metacharacter "'.$c.'"',
                          E_USER_ERROR);
        } elseif ($c == '{') {
            // "{" is an ordinary character unless a digit follows, in which
            // case it starts a bound, and a bound cannot stand as an atom.
            if ($i + 1 < $l && strpos('0123456789', $s[$i+1]) !== false) {
                trigger_error('ere2pcre: unescaped metacharacter "'.$c.'"',
                              E_USER_ERROR);
            }
            $r[$rr] .= '\{';
        } elseif ($c == '.') {
            $r[$rr] .= $c;
        } elseif ($c == '^' || $c == '$') {
            // anchors take no repetition, so skip the piece handling below.
            $r[$rr] .= $c;
            ++$i;
            continue;
        } elseif ($c == '|') {
            if ($r[$rr] === '') {
                trigger_error('ere2pcre: empty branch', E_USER_ERROR);
            }
            $r[] = '';
            ++$rr;
            ++$i;
            continue;
        } elseif ($c == "\\") {
            if (++$i >= $l) {
                trigger_error('ere2pcre: an invalid escape sequence at the end',
                              E_USER_ERROR);
            }
            $r[$rr] .= _ere2pcre_escape($s[$i]);
        } else { // including ] and } which are allowed as a literal character
            $r[$rr] .= _ere2pcre_escape($c);
        }
        ++$i;
        if ($i >= $l) break;

        // piece after the atom (only ONE of them is possible)
        $c = $s[$i];
        if ($c == '*' || $c == '+' || $c == '?') {
            $r[$rr] .= $c;
            ++$i;
        } elseif ($c == '{' &&
                  $i + 1 < $l && strpos('0123456789', $s[$i+1]) !== false) {
            // "{" opens a bound only when a digit follows; otherwise it is
            // left for the next iteration, which emits it as a literal.
            $ii = strpos($s, '}', $i);
            if ($ii === false) {
                trigger_error('ere2pcre: "{" does not have a matching "}"',
                              E_USER_ERROR);
            }
            $bound = substr($s, $i + 1, $ii - ($i + 1));
            // each count is limited to 0..255.
            if (!preg_match('/^([0-9]|[1-9][0-9]|1[0-9][0-9]|
                                2[0-4][0-9]|25[0-5])
                               (,([0-9]|[1-9][0-9]|1[0-9][0-9]|
                                  2[0-4][0-9]|25[0-5])?)?$/x',
                            $bound, $m)) {
                trigger_error('ere2pcre: an invalid bound', E_USER_ERROR);
            }
            if (isset($m[3])) {          // {min,max}
                if ($m[1] > $m[3]) {
                    trigger_error('ere2pcre: an invalid bound', E_USER_ERROR);
                }
                $r[$rr] .= '{'.$m[1].','.$m[3].'}';
            } elseif (isset($m[2])) {    // {min,}
                $r[$rr] .= '{'.$m[1].',}';
            } else {                     // {count}
                $r[$rr] .= '{'.$m[1].'}';
            }
            $i = $ii + 1;
        }
    }
    if ($r[$rr] === '') {
        trigger_error('ere2pcre: empty regular expression or branch',
                      E_USER_ERROR);
    }
    return array(implode('|', $r), $i);
}
// converts the ERE $s into a delimited PCRE pattern and returns it. the
// pattern carries the "m" modifier, plus "i" when $ignorecase is true.
// results are memoized per (modifier set, $s) for the lifetime of the
// process. triggers error on any invalid input.
//
// NOTE(review): "m" makes ^ and $ match at line boundaries -- confirm that
// this is the matching behavior expected from the original ereg* calls.
function ere2pcre($s, $ignorecase) {
    static $memo = array('m' => array(), 'mi' => array());

    $flags = ($ignorecase ? 'mi' : 'm');
    if (isset($memo[$flags][$s])) {
        return $memo[$flags][$s];
    }

    $parsed = _ere2pcre($s, 0);
    // the parser only stops early at a ")" that has no opening "(".
    if ($parsed[1] != strlen($s)) {
        trigger_error('ere2pcre: unescaped metacharacter ")"', E_USER_ERROR);
    }

    $pcre = '/'.$parsed[0].'/'.$flags;
    $memo[$flags][$s] = $pcre;
    return $pcre;
}
////////////////////////////////////////////////////////////////////////////// | |
// drop-in replacements | |
// drop-in replacement for ereg.
//
// $r is the ERE, $s the subject; when $m is passed it receives the matches
// produced by preg_match. returns false when there is no match (or when
// preg_match fails). on a match it returns 1 if $m was not passed, and the
// length of the matched text otherwise -- but never less than 1, because
// ereg reports 1 for an empty match so that success stays truthy.
function myereg($r, $s, &$m = null) {
    $r = ere2pcre($r, false);
    if (func_num_args() > 2) { // fix the result
        if (!preg_match($r, $s, $m)) {
            return false;
        }
        return max(1, strlen($m[0]));
    }
    return (preg_match($r, $s) ? 1 : false);
}
// drop-in replacement for eregi (case-insensitive ereg).
//
// $r is the ERE, $s the subject; when $m is passed it receives the matches
// produced by preg_match. returns false when there is no match (or when
// preg_match fails). on a match it returns 1 if $m was not passed, and the
// length of the matched text otherwise -- but never less than 1, because
// eregi reports 1 for an empty match so that success stays truthy.
function myeregi($r, $s, &$m = null) {
    $r = ere2pcre($r, true);
    if (func_num_args() > 2) { // fix the result
        if (!preg_match($r, $s, $m)) {
            return false;
        }
        return max(1, strlen($m[0]));
    }
    return (preg_match($r, $s) ? 1 : false);
}
// drop-in replacement for ereg_replace: replaces the matches of the ERE $r
// in $s with $t and returns whatever preg_replace returns.
//
// NOTE(review): $t is handed to preg_replace untouched, so preg-style
// references in it (such as $1) are interpreted -- confirm callers never
// rely on them being literal.
function myereg_replace($r, $t, $s) {
    $pattern = ere2pcre($r, false);
    return preg_replace($pattern, $t, $s);
}
// drop-in replacement for eregi_replace: replaces the case-insensitive
// matches of the ERE $r in $s with $t and returns whatever preg_replace
// returns.
//
// NOTE(review): $t is handed to preg_replace untouched, so preg-style
// references in it (such as $1) are interpreted -- confirm callers never
// rely on them being literal.
function myeregi_replace($r, $t, $s) {
    $pattern = ere2pcre($r, true);
    return preg_replace($pattern, $t, $s);
}
// drop-in replacement for split: splits $s by the ERE $r into at most $l
// pieces. a limit equal to 0 is passed to preg_split as 1; any other value
// (including the default -1) is passed through unchanged.
function mysplit($r, $s, $l=-1) {
    $pattern = ere2pcre($r, false);
    if ($l == 0) {
        $limit = 1;
    } else {
        $limit = $l;
    }
    return preg_split($pattern, $s, $limit);
}
// drop-in replacement for spliti: splits $s by the case-insensitive ERE $r
// into at most $l pieces. a limit equal to 0 is passed to preg_split as 1;
// any other value (including the default -1) is passed through unchanged.
function myspliti($r, $s, $l=-1) {
    $pattern = ere2pcre($r, true);
    if ($l == 0) {
        $limit = 1;
    } else {
        $limit = $l;
    }
    return preg_split($pattern, $s, $limit);
}
////////////////////////////////////////////////////////////////////////////// | |
// tests | |
// prints a set of sample EREs next to their case-sensitive PCRE conversion:
// first a list of short patterns in two 40-character columns, then one long
// URL-matching pattern followed by its conversion on separate lines.
function ere2pcre_test() {
    $samples = array(
        'mearie.org',
        'mearie\.org',
        'mearie[.,]org',
        '[a-z]+[.,][a-z]+',
        '^[a-z]+[.,][a-z]+$',
        '^[a-z]+[.,][a-z]{3,}$',
        'a|b|(c|d)|e',
        'a|b|()|c',
        '[[:alnum:][:punct:]]',
        '[]-z]',
        '[[a]]',
        '[---]',
        '[a\z]',
        '[^^]',
        '^$^$^$^$',
        '\([^>]*\"?[^)]*\)',
    );
    foreach ($samples as $ere) {
        $pcre = ere2pcre($ere, false);
        printf("%-40s%-40s\n", $ere, $pcre);
    }

    // the long pattern is assembled from pieces only to keep lines short.
    $pieces = array(
        "^(http(s?):\/\/|ftp:\/\/)*([[:alpha:]][-[:alnum:]]*[[:alnum:]])",
        "(\.[[:alpha:]][-[:alnum:]]*[[:alpha:]])+(/[[:alpha:]][-[:alnum:]]*",
        "[[:alnum:]])*(\/?)(/[[:alpha:]][-[:alnum:]]*\.[[:alpha:]]{3,5})?",
        "(\?([[:alnum:]][-_%[:alnum:]]*=[-_%[:alnum:]]+)(&([[:alnum:]]",
        "[-_%[:alnum:]]*=[-_%[:alnum:]]+))*)?$",
    );
    $url = implode('', $pieces);
    printf("%s\n%s\n", $url, ere2pcre($url, false));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment