Created
November 4, 2012 16:18
-
-
Save iloris/4012464 to your computer and use it in GitHub Desktop.
Validate an email with php function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
#
# RFC 822/2822/5322 Email Parser
#
# By Cal Henderson <[email protected]>
#
# This code is dual licensed:
# CC Attribution-ShareAlike 2.5 - http://creativecommons.org/licenses/by-sa/2.5/
# GPLv3 - http://www.gnu.org/copyleft/gpl.html
#
# $Revision$
#
##################################################################################
#
# Validates an email address against the RFC 822/2822/5322 addr-spec grammar
# (built up below as one large regular expression), then applies length
# limits, domain-literal (IP address) rules and a few checks on dotted
# domain labels.
#
# $email   - the address to check.
# $options - optional array of named flags; missing keys use the defaults:
#              'allow_comments'  (default true)  collapse nested comments
#                                before parsing, and strip comments from the
#                                local part and domain before the length,
#                                IP and label checks.
#              'public_internet' (default true)  reject single-label domains
#                                (e.g. user@localhost) and domains whose last
#                                label is all numeric.
#
# Returns 1 when the address is accepted, 0 otherwise.
#
function is_valid_email_address($email, $options=array()){

    #
    # you can pass a few different named options as a second argument,
    # but the defaults are usually a good choice.
    #

    $defaults = array(
        'allow_comments'  => true,
        'public_internet' => true, # turn this off for 'strict' mode
    );

    $opts = array();
    foreach ($defaults as $k => $v){
        $opts[$k] = isset($options[$k]) ? $options[$k] : $v;
    }
    $options = $opts;


    ####################################################################################
    #
    # NO-WS-CTL       =       %d1-8 /         ; US-ASCII control characters
    #                         %d11 /          ;  that do not include the
    #                         %d12 /          ;  carriage return, line feed,
    #                         %d14-31 /       ;  and white space characters
    #                         %d127
    # ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z
    # DIGIT          =  %x30-39

    $no_ws_ctl = "[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f]";
    $alpha     = "[\\x41-\\x5a\\x61-\\x7a]";
    $digit     = "[\\x30-\\x39]";
    $cr        = "\\x0d";
    $lf        = "\\x0a";
    $crlf      = "(?:$cr$lf)";


    ####################################################################################
    #
    # obs-char        =       %d0-9 / %d11 /          ; %d0-127 except CR and
    #                         %d12 / %d14-127         ;  LF
    # obs-text        =       *LF *CR *(obs-char *LF *CR)
    # text            =       %d1-9 /         ; Characters excluding CR and LF
    #                         %d11 /
    #                         %d12 /
    #                         %d14-127 /
    #                         obs-text
    # obs-qp          =       "\" (%d0-127)
    # quoted-pair     =       ("\" text) / obs-qp

    $obs_char = "[\\x00-\\x09\\x0b\\x0c\\x0e-\\x7f]";

    #
    # there's an issue with the definition of 'text', since 'obs_text' can
    # be blank and that allows qp's with no character after the slash. we're
    # treating that as bad, so 'text' is defined here to require at least one
    # (non-CRLF) character instead of using the literal grammar.
    #

    $text        = "(?:$lf*$cr*$obs_char$lf*$cr*)";
    $obs_qp      = "(?:\\x5c[\\x00-\\x7f])";
    $quoted_pair = "(?:\\x5c$text|$obs_qp)";


    ####################################################################################
    #
    # obs-FWS         =       1*WSP *(CRLF 1*WSP)
    # FWS             =       ([*WSP CRLF] 1*WSP) /   ; Folding white space
    #                         obs-FWS
    # ctext           =       NO-WS-CTL /     ; Non white space controls
    #                         %d33-39 /       ; The rest of the US-ASCII
    #                         %d42-91 /       ;  characters not including "(",
    #                         %d93-126        ;  ")", or "\"
    # ccontent        =       ctext / quoted-pair / comment
    # comment         =       "(" *([FWS] ccontent) [FWS] ")"
    # CFWS            =       *([FWS] comment) (([FWS] comment) / FWS)
    #
    # note: we translate ccontent only partially to avoid an infinite loop.
    # instead, we'll recursively strip *nested* comments before processing
    # the input. that will leave 'plain old comments' to be matched during
    # the main parse.
    #

    $wsp      = "[\\x20\\x09]";
    $obs_fws  = "(?:$wsp+(?:$crlf$wsp+)*)";
    $fws      = "(?:(?:(?:$wsp*$crlf)?$wsp+)|$obs_fws)";
    $ctext    = "(?:$no_ws_ctl|[\\x21-\\x27\\x2A-\\x5b\\x5d-\\x7e])";
    $ccontent = "(?:$ctext|$quoted_pair)";
    $comment  = "(?:\\x28(?:$fws?$ccontent)*$fws?\\x29)";
    $cfws     = "(?:(?:$fws?$comment)*(?:$fws?$comment|$fws))";

    #
    # these are the rules for removing *nested* comments. we'll just detect
    # an outer comment and replace it with an empty comment, and recurse until
    # we stop.
    #

    $outer_ccontent_dull = "(?:$fws?$ctext|$quoted_pair)";
    $outer_ccontent_nest = "(?:$fws?$comment)";
    $outer_comment       = "(?:\\x28$outer_ccontent_dull*(?:$outer_ccontent_nest$outer_ccontent_dull*)+$fws?\\x29)";


    ####################################################################################
    #
    # atext           =       ALPHA / DIGIT / ; Any character except controls,
    #                         "!" / "#" /     ;  SP, and specials.
    #                         "$" / "%" /     ;  Used for atoms
    #                         "&" / "'" /
    #                         "*" / "+" /
    #                         "-" / "/" /
    #                         "=" / "?" /
    #                         "^" / "_" /
    #                         "`" / "{" /
    #                         "|" / "}" /
    #                         "~"
    # atom            =       [CFWS] 1*atext [CFWS]

    $atext = "(?:$alpha|$digit|[\\x21\\x23-\\x27\\x2a\\x2b\\x2d\\x2f\\x3d\\x3f\\x5e\\x5f\\x60\\x7b-\\x7e])";
    $atom  = "(?:$cfws?(?:$atext)+$cfws?)";


    ####################################################################################
    #
    # qtext           =       NO-WS-CTL /     ; Non white space controls
    #                         %d33 /          ; The rest of the US-ASCII
    #                         %d35-91 /       ;  characters not including "\"
    #                         %d93-126        ;  or the quote character
    # qcontent        =       qtext / quoted-pair
    # quoted-string   =       [CFWS]
    #                         DQUOTE *([FWS] qcontent) [FWS] DQUOTE
    #                         [CFWS]
    # word            =       atom / quoted-string

    $qtext    = "(?:$no_ws_ctl|[\\x21\\x23-\\x5b\\x5d-\\x7e])";
    $qcontent = "(?:$qtext|$quoted_pair)";

    #
    # the grammar's '*' is a '+' here to require that quoted strings are not empty
    #

    $quoted_string = "(?:$cfws?\\x22(?:$fws?$qcontent)+$fws?\\x22$cfws?)";
    $word          = "(?:$atom|$quoted_string)";


    ####################################################################################
    #
    # obs-local-part  =       word *("." word)
    # obs-domain      =       atom *("." atom)

    $obs_local_part = "(?:$word(?:\\x2e$word)*)";
    $obs_domain     = "(?:$atom(?:\\x2e$atom)*)";


    ####################################################################################
    #
    # dot-atom-text   =       1*atext *("." 1*atext)
    # dot-atom        =       [CFWS] dot-atom-text [CFWS]

    $dot_atom_text = "(?:$atext+(?:\\x2e$atext+)*)";
    $dot_atom      = "(?:$cfws?$dot_atom_text$cfws?)";


    ####################################################################################
    #
    # domain-literal  =       [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
    # dcontent        =       dtext / quoted-pair
    # dtext           =       NO-WS-CTL /     ; Non white space controls
    #
    #                         %d33-90 /       ; The rest of the US-ASCII
    #                         %d94-126        ;  characters not including "[",
    #                                         ;  "]", or "\"

    $dtext          = "(?:$no_ws_ctl|[\\x21-\\x5a\\x5e-\\x7e])";
    $dcontent       = "(?:$dtext|$quoted_pair)";
    $domain_literal = "(?:$cfws?\\x5b(?:$fws?$dcontent)*$fws?\\x5d$cfws?)";


    ####################################################################################
    #
    # local-part      =       dot-atom / quoted-string / obs-local-part
    # domain          =       dot-atom / domain-literal / obs-domain
    # addr-spec       =       local-part "@" domain
    #
    # the capturing groups here are the only ones in the pattern, so they
    # number 1-4 (local part + its three alternatives) and 5-8 (domain + its
    # three alternatives).
    #

    $local_part = "(($dot_atom)|($quoted_string)|($obs_local_part))";
    $domain     = "(($dot_atom)|($domain_literal)|($obs_domain))";
    $addr_spec  = "$local_part\\x40$domain";


    #
    # this was previously 256 based on RFC3696, but dominic's errata was accepted.
    #

    if (strlen($email) > 254) return 0;


    #
    # we need to strip nested comments first - we replace them with a simple comment
    #

    if ($options['allow_comments']){
        $email = email_strip_comments($outer_comment, $email, "(x)");
    }


    #
    # now match what's left. the 'D' modifier makes '$' match only at the very
    # end of the subject; without it a single trailing newline (which the
    # grammar does not allow) would be silently accepted.
    #

    if (!preg_match("!^$addr_spec$!D", $email, $m)){
        return 0;
    }

    # trailing groups that did not participate are absent from $m, hence isset()
    $bits = array(
        'local'          => isset($m[1]) ? $m[1] : '',
        'local-atom'     => isset($m[2]) ? $m[2] : '',
        'local-quoted'   => isset($m[3]) ? $m[3] : '',
        'local-obs'      => isset($m[4]) ? $m[4] : '',
        'domain'         => isset($m[5]) ? $m[5] : '',
        'domain-atom'    => isset($m[6]) ? $m[6] : '',
        'domain-literal' => isset($m[7]) ? $m[7] : '',
        'domain-obs'     => isset($m[8]) ? $m[8] : '',
    );


    #
    # we need to now strip comments from $bits[local] and $bits[domain],
    # since we know they're in the right place and we want them out of the
    # way for checking IPs, label sizes, etc
    #

    if ($options['allow_comments']){
        $bits['local']  = email_strip_comments($comment, $bits['local']);
        $bits['domain'] = email_strip_comments($comment, $bits['domain']);
    }


    #
    # length limits on segments
    #

    if (strlen($bits['local']) > 64) return 0;
    if (strlen($bits['domain']) > 255) return 0;


    if (strlen($bits['domain-literal'])){

        #
        # restrictions on domain-literals from RFC2821 section 4.1.3
        #
        # RFC4291 changed the meaning of :: in IPv6 addresses - it can mean one or
        # more zero groups (updated from 2 or more).
        #

        $Snum                 = "(\d{1,3})";
        $IPv4_address_literal = "$Snum\.$Snum\.$Snum\.$Snum";

        $IPv6_hex       = "(?:[0-9a-fA-F]{1,4})";

        $IPv6_full      = "IPv6\:$IPv6_hex(?:\:$IPv6_hex){7}";

        $IPv6_comp_part = "(?:$IPv6_hex(?:\:$IPv6_hex){0,7})?";
        $IPv6_comp      = "IPv6\:($IPv6_comp_part\:\:$IPv6_comp_part)";

        $IPv6v4_full    = "IPv6\:$IPv6_hex(?:\:$IPv6_hex){5}\:$IPv4_address_literal";

        $IPv6v4_comp_part = "$IPv6_hex(?:\:$IPv6_hex){0,5}";
        $IPv6v4_comp      = "IPv6\:((?:$IPv6v4_comp_part)?\:\:(?:$IPv6v4_comp_part\:)?)$IPv4_address_literal";

        $literal = $bits['domain'];

        if (preg_match("!^\[$IPv4_address_literal\]$!D", $literal, $m)){

            #
            # IPv4 is simple - four octets, each 0-255
            #

            for ($i=1; $i<=4; $i++){
                if (intval($m[$i]) > 255) return 0;
            }

        }elseif (preg_match("!^\[$IPv6_full\]$!D", $literal)){

            # full 8-group IPv6 - the pattern match is the whole check

        }elseif (preg_match("!^\[$IPv6_comp\]$!D", $literal, $m)){

            #
            # compressed IPv6 - count the explicit groups either side of '::'
            #

            list($a, $b) = explode('::', $m[1]);
            $folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
            $groups = explode(':', $folded);
            if (count($groups) > 7) return 0;

        }elseif (preg_match("!^\[$IPv6v4_full\]$!D", $literal, $m)){

            #
            # 6 hex groups + dotted quad. the hex groups don't capture, so the
            # octets are captures 1-4
            #

            for ($i=1; $i<=4; $i++){
                if (intval($m[$i]) > 255) return 0;
            }

        }elseif (preg_match("!^\[$IPv6v4_comp\]$!D", $literal, $m)){

            #
            # compressed IPv6 + dotted quad. capture 1 is the hex part
            # (including '::'), captures 2-5 are the IPv4 octets
            #

            list($a, $b) = explode('::', $m[1]);
            $b = (string) substr($b, 0, -1); # remove the trailing colon before the IPv4 address
            $folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
            $groups = explode(':', $folded);
            if (count($groups) > 5) return 0;

            # the octets need the same 0-255 range check as the other IPv4 forms
            for ($i=2; $i<=5; $i++){
                if (intval($m[$i]) > 255) return 0;
            }

        }else{

            # not a recognised address literal
            return 0;
        }

    }else{

        #
        # the domain is either dot-atom or obs-domain - either way, it's
        # made up of simple labels and we split on dots
        #

        $labels = explode('.', $bits['domain']);


        #
        # this is allowed by both dot-atom and obs-domain, but is un-routeable on the
        # public internet, so we'll fail it (e.g. user@localhost)
        #

        if ($options['public_internet']){
            if (count($labels) == 1) return 0;
        }


        #
        # checks on each label
        #

        foreach ($labels as $label){
            if (strlen($label) > 63) return 0;
            if (substr($label, 0, 1) == '-') return 0;
            if (substr($label, -1) == '-') return 0;
        }


        #
        # last label can't be all numeric
        #

        if ($options['public_internet']){
            if (preg_match('!^[0-9]+$!D', array_pop($labels))) return 0;
        }
    }

    return 1;
}
################################################################################## | |
#
# Repeatedly replaces every match of $comment in $email with $replace until
# a pass no longer changes the length of the string.
#
# $comment - regular expression body (no delimiters); it is wrapped in '!'
#            delimiters here, so it must not contain a literal '!'.
# $email   - the string to process.
# $replace - replacement text for each match (default: remove the match).
#
# Returns the string from the last pass that still changed its length. If
# preg_replace() reports an error (it returns null), the string as it stood
# before the failing pass is returned, so callers always get a string back.
#
function email_strip_comments($comment, $email, $replace=''){

    while (1){

        $new = preg_replace("!$comment!", $replace, $email);

        # null means PCRE failed (bad pattern, backtrack limit, ...) - stop
        # here rather than feeding null back into strlen()/preg_replace()
        if ($new === null){
            return $email;
        }

        # termination is judged on length, not content
        if (strlen($new) == strlen($email)){
            return $email;
        }

        $email = $new;
    }
}
################################################################################## | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment