Last active
August 29, 2015 14:01
-
-
Save yv84/0cdbe77b29743b8ac8f1 to your computer and use it in GitHub Desktop.
mastering_regular_expressions_third_edition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def f(text, pattern):
    """Print every match of ``pattern`` in ``text``, then the last match's groups.

    Args:
        text: String to scan.
        pattern: Regular expression, as accepted by ``re.finditer``.

    Each match is reported as ``Found "<substring>" at <start>:<end>``.
    If at least one match was found, ``groups()`` of the final match is
    printed afterwards.  Nothing is printed when there is no match.

    Returns:
        None.
    """
    # Imported locally: the first calls to f() are executed before the
    # module-level ``import re`` statement further down the file is reached.
    import re

    match = None
    for match in re.finditer(pattern, text):
        s, e = match.span()
        print('Found "%s" at %d:%d' % (text[s:e], s, e))
    # ``match`` still holds the last match here (or None if there was none).
    if match is not None:
        print(match.groups())
def r(text, pattern, repl):
    """Return ``text`` with every match of ``pattern`` replaced by ``repl``.

    Args:
        text: Subject string.
        pattern: Regular expression source (or compiled pattern).
        repl: Replacement, as accepted by ``Pattern.sub`` (may use ``\\g<name>``).

    Returns:
        The substituted string.
    """
    # Imported locally: r() is called before the module-level ``import re``
    # statement further down the file is reached.
    import re

    return re.compile(pattern).sub(repl, text)
# lookahead
# The pattern is a raw string so \w, \s and \d reach the regex engine verbatim.
f('First Last <first.last@example.com>',
  r"""(?x)
  # A name is made up of letters, and may include "."
  # for title abbreviations and middle initials.
  ( (?P<name>
      ([\w.,]+\s+)*[\w.,]+
    )
    \s+
  ) # name is no longer optional
  # LOOKAHEAD
  # Email addresses are wrapped in angle brackets, but only
  # if thet are both present or neither is.
  (?= (<.*>$)          # remainder wrapped in angle brackets
      |
      ([^<].*[^>]$)    # remainder *not* wrapped in angle brackets
  )
  <? # optional opening angle bracket
  # The address itself: username@domain.tld
  (?P<email>
    [\w\d.+-]+         # username
    @
    ([\w\d.]+\.)+      # domain name prefix
    (com|org|edu|ru)   # limit the allowed top-level domains
  )
  >? # optional closing angle bracket
  """)

# negative lookahead
f('first.last@example.com',
  r"""(?x)
  ^
  # An address: username@domain.tld
  # Ignore noreply addresses
  (?!noreply@.*$)
  [\w\d.+-]+           # username
  @
  ([\w\d.]+\.)+        # domain name prefix
  (com|org|edu|ru)     #limit the allowed top-level domains
  $
  """)

# positive lookbehind
f(r"""This text includes two Twitter handles.
One for @ThePSF, and one for the author, @doughellmann.
""",
  r"""(?x)
  # A twitter handle: @username
  (?<=@)
  ([\w\d_]+) # username
  """)
def amp(text):
    """Prepare plain ``text`` for embedding in HTML.

    ``&``, ``<`` and ``>`` are replaced by their HTML entities (``&`` first,
    so the entities produced for ``<`` and ``>`` are not escaped again), and
    every empty or whitespace-only line is replaced by ``<p>``.

    Args:
        text: Plain text to convert.

    Returns:
        The converted string.
    """
    escaped = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
    # (?m) makes ^ and $ match at every line, so this hits blank lines only.
    return re.sub(r"(?m)^\s*$", "<p>", escaped)
def mail(text):
    """Wrap each e-mail address found in ``text`` in a ``mailto:`` HTML link.

    Matching is case-insensitive and only recognises host names ending in
    ``.com``, ``.edu``, ``.info`` or ``.ru``.

    Args:
        text: Text that may contain e-mail addresses.

    Returns:
        ``text`` with every recognised address replaced by
        ``<a href="mailto:ADDR">ADDR</a>``.
    """
    return re.sub(
        r"""(?x)(?i)
        (?P<mailhere>
          \b
          # Capture the adress to mailhere
          (
            \w[-.\w]*                                       # username
            \@
            [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info|ru)    #hostname
          )
        \b)
        """,
        r'<a href="mailto:\g<mailhere>">\g<mailhere></a>',
        text,
    )
def url(text):
    """Wrap each ``http://`` URL found in ``text`` in an HTML link.

    Matching is case-insensitive and only recognises host names ending in
    ``.com``, ``.edu``, ``.info`` or ``.ru``.  An optional path may follow the
    host; a path never ends on ``.``, ``,``, ``?`` or ``!`` so that sentence
    punctuation directly after a URL stays outside the link.

    Args:
        text: Text that may contain URLs.

    Returns:
        ``text`` with every recognised URL replaced by
        ``<a href="URL">URL</a>``.
    """
    return re.sub(
        r"""(?x)(?i)
        (?P<urlhere>
          \b
          # Capture the URL to urlhere
          (
            http:// [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info|ru) \b # hostname
            (
              / [-a-z0-9_:\@&?=+,.!/~*'%\$]*
              (?<![.,?!])   # path must not end with sentence punctuation
            )?
          )
        )
        """,
        r'<a href="\g<urlhere>">\g<urlhere></a>',
        text,
    )
def udalenieOdinakovixPodryadIduschixSlov(text):
    """Remove a word that immediately repeats the word before it.

    (The name is transliterated Russian for "removal of identical
    consecutive words".)  A repeat is the same run of ASCII letters,
    compared case-insensitively, separated from the first occurrence only by
    whitespace and/or ``<TAG>``-like items.  The whole match -- first word,
    separator and second word -- is replaced by the first word alone.

    Args:
        text: Text to clean up.

    Returns:
        ``text`` with doubled words collapsed.
    """
    return re.sub(
        r"""(?x)(?i)
        # (regex starts here)
        ### Need to match one word
        \b # Start of word...
        (?P<wordone> [a-z]+ ) # Grab word, filleng wordone
        ### Now need to allow any number of spaces and/or <TAGS>
        (?P<wordto> # Save what intervenes to $2
          (?: # Non-capturing parens for grouping alternation
            \s #Whitespace (includes newline, which is good).
            | # -or-
            <[^>]+> #Item like <TAG>.
          )+ #Need at least one of the above, but allow more
        )
        ### Now match the first word again:
        (?P<wordtree> (?P=wordone)\b) # \b ensures not embedded. This copy saved to #3.
        # (regex ends here)
        """,
        # Raw string: "\g" is not a valid escape in an ordinary literal.
        r'\g<wordone>',  # highlight: '\e[7m\g<wordone>\e[m\g<wordto>\e[7m\g<wordtree>\e[m'
        text,
    )
f('<B>Billions</B> and <B>Zillions</B> of suns',
  r'''(?x)
  (?:
    <B>              # Match the opening <B>
    (?:              # Now, only as many of the following as needed...
      (?! </?B> )    # If not <B>...
      .              # ...any character is okay
    )*               #
    </B>             # ...until the closing delimiter can match
  )
  ''')

# The replacement strings below are raw: "\g" is not a valid escape in an
# ordinary string literal.
r('1.6257', r'(?P<fpoint>\.\d\d(?=(?P<tmp>[1-9]?))(?P=tmp))\d+', r'\g<fpoint>')
#Atomic grouping with (?> [1-9]?) -> (?=(?P<tmp>[1-9]?))(?P=tmp)
#atomic grouping
# (?>\w+): -> (?=(?P<tmp>\w+))(?P=tmp):
r("1.264646747",
  r"""(?x)
  (?P<fpoint>
    \.\d\d
    (?=(?P<tmp>[1-9]?))(?P=tmp)
  )
  \d+
  """,
  r"\g<fpoint>")

f("Jan 2", r"Jan (?:[12][0-9]|3[01]|0?[0-9])")
f("Jan 1", r"Jan (?:31|[123]0|[012]?[1-9])")
f("Jan 1", r"Jan (?:0[1-9]|[12][0-9]?|3[01]?|[4-9])")
####################################################################
# Sample input shared by the three demos below; physical lines end in a
# backslash.
_continued_src = """SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
missing.c msg.c node.c re.c version.c\\"""

# 1) Dot for the line body (note that (?s) lets the dot cross newlines).
f(_continued_src,
  r"""(?x)(?s)(?m)
  ^
  \w+
  =
  .*
  (?:
    (?: \\\n.*)
  )*
  """)

# 2) Negated class for the line body: neither newline nor backslash.
f(_continued_src,
  r"""(?x)(?s)(?m)
  ^
  \w+
  =
  [^\n\\]*
  (?:
    (?: \\\n[^\n\\]*)
  )*
  """)

# 3) One character at a time: an ordinary character or a backslash pair.
f(_continued_src,
  r"""(?x)(?s)(?m)
  ^
  \w+
  =
  (?:
    [^\n\r\\]
    |
    \\.
  )*
  """)
####################################################################
# ip address
f("0.0.0.1",
  r"""(?x)
  (?<![\w.])                            #(^|\s)
  (?! 0+\.0+\.0+\.0+$)                  # if not 0.0.0.0
  (?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
  (?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
  (?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
  (?: [01]?\d\d?|2[0-4]\d|25[0-5])
  (?![\w.])                             #($|\s)
  """)

# full path -> filename
r('usr/local/bin/gcc', r"^.*/", "")
# The pattern must be raw: the regex needs an escaped backslash (\\).  As an
# ordinary literal "^.*\\" is the regex ^.*\ which fails to compile.
r(r'\Program Files\Yahoo!\Messenger', r"^.*\\", "")

# $depth /194 -- parentheses containing at most one nested level
f(" foo(bar(this), 3.7) + 2 * (that - 1); adfasdfas df",
  r"""(?x)
  \(
    [^()]*
    (
      \(
        [^()]*
      \)
      [^()]*
    )*
  \)
  """)

# integer, or decimal number with optional leading minus sign
f("nothing here, but there .55489 ?",
  r"""(?x)
  -?
  [0-9]+
  (?:
    \.
    [0-9]*
  )?
  |
  -?
  \.
  [0-9]+
  """)
# Raw subject string: the backslashes must survive so that the regex's
# escaped-character branch (\\.) is actually exercised.
f(r'2\"x3\" likeness',
  r"""(?x)
  (
    [^\\"]+
    |
    \\.
  )*
  """)

f('Darth Symbol: "/-|-\\" or "[^-^]"',
  r"""(?x)
  (?=(?P<tmp>
    (
      [^\\"]+
      |
      \\.
    )*
  ))(?P=tmp)
  """)

#HTML
f('<input name=dir value=">">',
  r"""(?x)
  <                  # Opening"<"
  (?=(?P<tmp>        # Any amount of...
    (?:
      "[^"]*"        # double-quoted string,
      |              # or...
      '[^']*'        # single-quoted string,
      |              # or...
      [^'">]         # "other stuff"
    )*               #
  ))(?P=tmp)         #
  >                  # Closing ">"
  """)

#HTML Link
f(""" ?<a href="http://www.oreilly.com">O'Reilly Media</a> """,
  r"""(?x)(?s)
  <a\b
  (?P<HREF> [^>]+)
  >
  (.*?)
  </a>
  """)

#HREF -> URL
f(' href="http://www.oreilly.com"',
  r"""(?x)(?i)
  \b HREF
  \s* = \s*
  (?:
    "(?: [^"]*)"
    |
    '(?: [^']*)'
    |
    (?: [^'">\s]+)
  )
  """)
#URL (?)
f("http://www.oreilly.com/mysite/:8080",
  r"""(?x)(?i)(?s)
  ^http://
  (?P<Host> [^/:]+)
  (?P<Path> /[^:]*)?
  (?P<Port> :(?: \d+))?
  $
  """)

#hostname
# The (?i) flag sits at the very start of the pattern: Python rejects global
# inline flags that appear anywhere else (an error since 3.11).
f("www.oreilly.com",
  r"""(?x)(?i) # apply this regex in a case-instensive manner.
  ^
  # One or more dot-separated parts...
  (?: [a-z0-9]\. | [a-z0-9][-a-z0-9]{0,61}[a-z0-9]\. )*
  # Followed by the final suffix part...
  (?: com|edu|gov|int|mil|net|org|biz|info|name|museum|coop|aero|[a-z][a-z] )
  $
  """)

#URL yahoo-finance
# !!!! lower/uppercase on/off (embedding Flags in Patterns)
f("Read his comments at http://www.OReilly.cOm/asktim/index.html. He ...",
  r"""(?x)
  \b
  # Match the leading part (proto://hostname, or just hostname)
  (
    # ftp://, http://, or https:// leading part
    (ftp|https?)://[-\w]+(\.\w[-\w]*)+
    |
    # or, try to find a hostname with our more specific sub-expression
    (?:
      (?: [a-z0-9] (?: [-a-z0-9]*[a-z0-9])? \. )+ # sub domains
    )
    # Now ending .com, etc. For these, require lowercase
    (?: com\b
      | edu\b
      | biz\b
      | gov\b
      | in(?: t|fo)\b # .int or .info
      | mil\b
      | net\b
      | org\b
      | [a-z][a-z]\b # two-letter country codes
    )
  )
  # Allow an optional port number
  ( : \d+ )?
  # The rest of the URL is optional, and begins with/...
  (
    /
    # The rest are beuristics for what seems to work well
    [^.!,?;"'<>()\[\]{}\s\x7F-\xFF]*
    (?:
      [.!,?]+ [^.!,?;"'<>()\[\]{}\s\x7F-\xFF]+
    )*
  )?
  """)

# !!! "library"(embedded) of regular expression
#
f("0382453144941161521344182950354427275201021744323",
  r"""(?x)
  (?:
    (?!44)
    \d\d\d\d\d)*
  (44\d\d\d)
  """)
# ------------------------------------------------------------- | |
import re | |
import timeit | |
# Parenthesised expression with one level of nesting; the x+ branch is an
# ordinary greedy quantifier inside a repeated group.
re_1 = re.compile(r'''\(
    (
        [^()]+               # x+
      |
        \( [^()]* \)
    )+
    \)
    ''', re.X)

# Same expression, but the x+ branch is captured inside a lookahead and then
# consumed through a backreference, so it cannot be re-split on backtracking.
re_2 = re.compile(r'''\(
    (
        (?=(?P<tmp>[^()]+ ))(?P=tmp)   # Emulate (?> x+)
      |
        \( [^()]* \)
    )+
    \)''', re.X)

# Time both patterns on a subject with no closing parenthesis.  Run only when
# executed as a script so that importing this file does not start the
# benchmark; ``globals=`` hands the compiled patterns to timeit directly.
if __name__ == "__main__":
    print(timeit.timeit("re_1.search('((()' + 'a' * 25)",
                        globals=globals(),
                        number=10))
    print(timeit.timeit("re_2.search('((()' + 'a' * 25)",
                        globals=globals(),
                        number=10))
# Sample pattern and its compiled form for the timing helpers; rebind
# ``pattern`` / ``pat`` to benchmark a different expression.
pattern = r"^\w+:"
pat = re.compile(pattern)
def fs(text, pattern, count):
    """Time ``count`` searches of ``pattern`` in ``text`` and print the result.

    Two lines are printed: the statement being measured, shown as
    ``pat.search("<text>")``, and the total elapsed time in ``%e`` format.

    Args:
        text: Subject string to search.
        pattern: Regular expression (source string or compiled pattern).
        count: Number of times the search is executed.

    Returns:
        None.
    """
    shown = """pat.search("%s")""" % text
    print(shown)
    # The pattern and subject are handed to timeit as objects rather than
    # pasted into source code, so quotes or backslashes in ``text`` cannot
    # break or alter the timed statement.  Compilation is kept outside the
    # timed statement.
    namespace = {"pat": re.compile(pattern), "text": text}
    elapsed = timeit.timeit("pat.search(text)", globals=namespace, number=count)
    print("%e" % elapsed)
# ------------------------------------------------------------- | |
def f1(text, pattern):
    """Walk all matches of ``pattern`` in ``text`` without printing anything.

    For each match the start and end offsets are queried and discarded.
    Returns ``None``.
    """
    for hit in re.finditer(pattern, text):
        # Offsets are looked up but deliberately not used or printed.
        hit.start()
        hit.end()
#Faster failures with atomic grouping | |
def ft(text, pattern, count):
    """Print, in ``%e`` format, the time taken by ``count`` calls of ``f1(text, pattern)``.

    Args:
        text: Subject string passed to ``f1``.
        pattern: Regular expression passed to ``f1``.
        count: Number of timed calls.

    Returns:
        None.
    """
    # A callable is timed instead of generated source text: interpolating the
    # arguments into non-raw string literals would reinterpret backslash
    # sequences (e.g. ``\b``) and break on embedded triple quotes.
    elapsed = timeit.timeit(lambda: f1(text, pattern), number=count)
    print("%e" % elapsed)
# The statement is a raw string so that "\w" is not treated as a (invalid)
# string escape; ``globals=`` makes ``f`` visible to the timed code without
# relying on this file being the ``__main__`` module.
print(timeit.timeit(r"f('Subject', r'^\w+:')",
                    globals=globals(),
                    number=1000))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment