Created
January 15, 2016 23:24
-
-
Save qntm/013a796b17314266b2b1 to your computer and use it in GitHub Desktop.
RegEx match open tags except XHTML self-contained tags
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
"RegEx match open tags except XHTML self-contained tags"
<http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags>
The W3C grammar for an XHTML open tag is given by production "STag". This
production is not recursive and is in fact strictly regular. The relevant
portions of the XML grammar are as follows (<http://www.w3.org/TR/xml11/>):
NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
No constraints
NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
No constraints
Name ::= NameStartChar (NameChar)*
No constraints
S ::= (#x20 | #x9 | #xD | #xA)+
No constraints
Eq ::= S? '=' S?
No constraints
EntityRef ::= '&' Name ';'
Well-formedness constraint: Entity Declared
Validity constraint: Entity Declared
Well-formedness constraint: Parsed Entity
Well-formedness constraint: No Recursion
CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
Well-formedness constraint: Legal Character
Reference ::= EntityRef | CharRef
No constraints
AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
No constraints
Attribute ::= Name Eq AttValue
Validity constraint: Attribute Value Type
Well-formedness constraint: No External Entity References
Well-formedness constraint: No < in Attribute Values
STag ::= '<' Name (S Attribute)* S? '>'
Well-formedness constraint: Unique Att Spec
Amazingly, the well-formedness and validity constraints *can* still be checked
using regular expressions. However, this requires far more information and
yields a very bloated, unpleasant regular expression.
If we plan to extract that information and check it separately, or we don't
care at all, then the desired regex can be constructed as follows:
'''
# Build the STag regex bottom-up: one Python string of regex source text per
# grammar production. Raw strings keep the \uXXXX / \UXXXXXXXX escapes intact
# so that the `re` module, not the Python parser, interprets them.

# NameStartChar: character class for the first character of a Name.
NameStartChar = (
    "["
    + ":A-Z_a-z"
    + r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF"
    + r"\u0370-\u037D\u037F-\u1FFF\u200C-\u200D"
    + r"\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF"
    + r"\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF"
    + "]"
)

# NameChar: everything in NameStartChar plus "-", ".", digits, U+00B7 and the
# two extra ranges U+0300-U+036F and U+203F-U+2040. The leading "\-" is an
# escaped literal hyphen, not a range operator.
NameChar = (
    r"[\-.0-9:A-Z_a-z\u00B7"
    r"\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0300-\u036F"
    r"\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u203F-\u2040"
    r"\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF"
    r"\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF]"
)

# Name ::= NameStartChar (NameChar)*
Name = NameStartChar + NameChar + "*"

# Reference ::= EntityRef | CharRef, folded into a single alternation that
# shares the leading "&" and the trailing ";".
Reference = "&(" + Name + "|#[0-9]+|#x[0-9a-fA-F]+);"

# AttValue, split by quote style: the body excludes "<", "&" and the
# delimiting quote character, but allows complete References.
DqAttValue = '"([^<&"]|' + Reference + ')*"'
SqAttValue = "'([^<&']|" + Reference + ")*'"
AttValue = "(" + DqAttValue + "|" + SqAttValue + ")"

# S is one-or-more whitespace characters; Sstar stands in for "S?" (zero or
# more), Splus for "S" itself.
Sstar = r"[\u0020\u0009\u000D\u000A]*"
Splus = r"[\u0020\u0009\u000D\u000A]+"

# Attribute ::= Name Eq AttValue, with Eq ::= S? '=' S? expanded in place.
Attribute = Name + Sstar + "=" + Sstar + AttValue

# STag ::= '<' Name (S Attribute)* S? '>'
STag = "<" + Name + "(" + Splus + Attribute + ")*" + Sstar + ">"

print(STag)
# Unit tests
#
# Each section compiles one pattern defined above and checks it with
# fullmatch(), so a pattern only passes when it consumes the whole string.
#
# NOTE(review): several literals containing "&" had evidently been
# HTML-entity-decoded or stripped in transit (e.g. "&amp;" had become "&",
# which can never match Reference). They are restored below; the two
# "missing ';'" rejection cases are a reconstruction of the intended test
# and should be confirmed against the original.
import re

# NameStartChar: one character that may start a Name.
prog = re.compile(NameStartChar)
assert prog.fullmatch(":")
assert prog.fullmatch("A")
assert prog.fullmatch("\u00F8")
assert prog.fullmatch("\U000EFFFF")
assert not prog.fullmatch("-")
assert not prog.fullmatch("\\")

# NameChar: additionally accepts "-" and digits.
prog = re.compile(NameChar)
assert prog.fullmatch(":")
assert prog.fullmatch("A")
assert prog.fullmatch("\u00F8")
assert prog.fullmatch("\U000EFFFF")
assert prog.fullmatch("-")
assert prog.fullmatch("8")
assert not prog.fullmatch("\\")

# Name: must not start with "-" or a digit.
prog = re.compile(Name)
assert prog.fullmatch("a-b-c:d-e-f0")
assert prog.fullmatch("div")
assert prog.fullmatch("xmlns:root")
assert prog.fullmatch(":anything")
assert not prog.fullmatch("-xml")
assert not prog.fullmatch("0abc")

# Reference: entity or character reference, "&" ... ";" both required.
prog = re.compile(Reference)
assert prog.fullmatch("&amp;")
assert prog.fullmatch("&#1234;")
assert prog.fullmatch("&#xABCDEF;")
assert prog.fullmatch("&xABCDEF;")
assert not prog.fullmatch("&xABCDEF")
assert not prog.fullmatch("xABCDEF;")
assert prog.fullmatch("&ABCDEF;")
assert not prog.fullmatch("&-;")

# Double-quoted attribute value.
prog = re.compile(DqAttValue)
assert prog.fullmatch('"abc def"')
assert prog.fullmatch('"abc&nbsp;def"')
assert not prog.fullmatch('"abc def')
assert not prog.fullmatch('"abc&nbsp def"')  # reference missing its ";"
assert not prog.fullmatch('abc def"')
assert prog.fullmatch('"\'"')
assert not prog.fullmatch('"<"')
assert not prog.fullmatch('"&"')
assert not prog.fullmatch('"""')

# Single-quoted attribute value.
prog = re.compile(SqAttValue)
assert prog.fullmatch("'abc def'")
assert prog.fullmatch("'abc&nbsp;def'")
assert not prog.fullmatch("'abc def")
assert not prog.fullmatch("'abc&nbsp def'")  # reference missing its ";"
assert not prog.fullmatch("abc def'")
assert prog.fullmatch("'\"'")
assert not prog.fullmatch("'<'")
assert not prog.fullmatch("'&'")
assert not prog.fullmatch("'''")

# AttValue: either quote style, but opening and closing quotes must agree.
prog = re.compile(AttValue)
assert prog.fullmatch("''")
assert prog.fullmatch("'aaa'")
assert prog.fullmatch('"aaa"')
assert not prog.fullmatch("'aaa\"")
assert not prog.fullmatch("\"aaa'")

# Sstar: zero or more whitespace characters.
prog = re.compile(Sstar)
assert prog.fullmatch(" \t\r\n")
assert prog.fullmatch("")
assert not prog.fullmatch("a")

# Splus: one or more whitespace characters.
prog = re.compile(Splus)
assert prog.fullmatch(" \t\r\n")
assert not prog.fullmatch("")
assert not prog.fullmatch("a")

# Attribute: optional whitespace on either side of "=".
prog = re.compile(Attribute)
assert prog.fullmatch("a=''")
assert prog.fullmatch("a = ' '")
assert prog.fullmatch('a = "abc"')

# STag: open tags match; self-closing ("/>") tags must not.
prog = re.compile(STag)
assert prog.fullmatch("<p>")
assert prog.fullmatch('<a href=\"foo\">')
assert not prog.fullmatch("<br />")
assert not prog.fullmatch('<hr class="foo" />')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment