Skip to content

Instantly share code, notes, and snippets.

@yv84
Last active August 29, 2015 14:01
Show Gist options
  • Save yv84/0cdbe77b29743b8ac8f1 to your computer and use it in GitHub Desktop.
Save yv84/0cdbe77b29743b8ac8f1 to your computer and use it in GitHub Desktop.
mastering_regular_expressions_third_edition
def f(text, pattern):
    """Print every match of *pattern* in *text*, then the groups of the last one.

    For each non-overlapping match found by ``re.finditer`` a line of the
    form ``Found "<matched text>" at <start>:<end>`` is printed.  When at
    least one match was found, the ``groups()`` tuple of the final match is
    printed afterwards.  Nothing is printed when there is no match.

    Args:
        text: The string to search.
        pattern: A regular expression (string or compiled pattern).

    Returns:
        None; the results are written to standard output.
    """
    # Imported locally because the file-level ``import re`` appears further
    # down the file than the first calls to this function.
    import re

    match = None
    for match in re.finditer(pattern, text):
        s, e = match.span()
        print('Found "%s" at %d:%d' % (text[s:e], s, e))
    # ``match`` still refers to the last match produced by the loop (if any).
    if match is not None:
        print(match.groups())
def r(text, pattern, repl):
    """Return *text* with every match of *pattern* replaced by *repl*.

    Args:
        text: The string to operate on.
        pattern: Regular expression source to compile.
        repl: Replacement string (may use ``\\g<name>`` references) or callable.

    Returns:
        The string produced by ``re.compile(pattern).sub(repl, text)``.
    """
    # Imported locally because the file-level ``import re`` appears further
    # down the file than the first calls to this helper.
    import re

    return re.compile(pattern).sub(repl, text)
# Lookahead: before the address is consumed, (?= ... ) checks that the rest of
# the input is wrapped in angle brackets on both sides or on neither side.
# NOTE(review): the sample text was garbled in this copy (mangled quote
# characters, redacted address); first.last@example.com is a stand-in -- confirm.
f('First Last <first.last@example.com>',
r"""(?x)
# A name is made up of letters, and may include "."
# for title abbreviations and middle initials.
( (?P<name>
([\w.,]+\s+)*[\w.,]+
)
\s+
) # name is no longer optional
# LOOKAHEAD
# Email addresses are wrapped in angle brackets, but only
# if thet are both present or neither is.
(?= (<.*>$) # remainder wrapped in angle brackets
|
([^<].*[^>]$) # remainder *not* wrapped in angle brackets
)
<? # optional opening angle bracket
# The address itself: username@domain.tld
(?P<email>
[\w\d.+-]+ # username
@
([\w\d.]+\.)+ # domain name prefix
(com|org|edu|ru) # limit the allowed top-level domains
)
>? # optional closing angle bracket
""")
# Negative lookahead: (?!noreply@.*$) rejects the input at the very start when
# it begins with "noreply@"; any other address is matched in full.
# NOTE(review): the sample address was redacted in this copy;
# first.last@example.com is a stand-in -- confirm the intended input.
f('first.last@example.com',
r"""(?x)
^
# An address: username@domain.tld
# Ignore noreply addresses
(?!noreply@.*$)
[\w\d.+-]+ # username
@
([\w\d.]+\.)+ # domain name prefix
(com|org|edu|ru) #limit the allowed top-level domains
$
""")
# Positive lookbehind: (?<=@) requires an "@" immediately before the match
# without including it, so only the handle text itself is reported.
f(r"""This text includes two Twitter handles.
One for @ThePSF, and one for the author, @doughellmann.
""",
r"""(?x)
# A twitter handle: @username
(?<=@)
([\w\d_]+) # username
""")
def amp(text):
    """Escape ``&``, ``<`` and ``>`` in *text* and mark whitespace-only lines.

    Args:
        text: The plain text to convert.

    Returns:
        The text with ``&``, ``<`` and ``>`` replaced by ``&amp;``, ``&lt;``
        and ``&gt;``, and every match of ``^\\s*$`` (multi-line mode)
        replaced by ``<p>``.
    """
    # "&" is replaced first so the entities produced by the later steps are
    # not escaped a second time.
    escaped = r(r(r(text, r"&", '&amp;'), r"<", '&lt;'), ">", '&gt;')
    # Raw literal: "\s" inside a plain string is an invalid escape sequence.
    return r(escaped, r"(?m)^\s*$", '<p>')
def mail(text):
    """Wrap each e-mail address found in *text* in a ``mailto:`` anchor.

    The address is captured in the named group ``mailhere`` and reused for
    both the ``href`` and the visible link text.  Only host names ending in
    com, edu, info or ru are recognised; matching is case-insensitive.
    """
    address_pattern = r"""(?x)(?i)
(?P<mailhere>
\b
# Capture the adress to mailhere
(
\w[-.\w]* # username
\@
[-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info|ru) #hostname
)
\b)
"""
    anchor_template = r'<a href="mailto:\g<mailhere>">\g<mailhere></a>'
    return r(text, address_pattern, anchor_template)
def url(text):
    """Wrap each ``http://`` URL found in *text* in an HTML anchor.

    The URL is captured in the named group ``urlhere`` and reused for both
    the ``href`` and the visible link text.  Only host names ending in com,
    edu, info or ru are recognised; an optional path may follow the host.
    Matching is case-insensitive.
    """
    url_pattern = r"""(?x)(?i)
(?P<urlhere>
\b
# Capture the URL to urlhere
(
http:// [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info|ru) \b # hostname
(
/ [-a-z0-9_:\@&?=+,.!/~*'%\$]*
)?
)
)
"""
    anchor_template = r'<a href="\g<urlhere>">\g<urlhere></a>'
    return r(text, url_pattern, anchor_template)
def udalenieOdinakovixPodryadIduschixSlov(text):
    """Collapse a word that is immediately repeated into a single copy.

    The name is transliterated Russian for "removal of identical consecutive
    words".  A repetition is a word made of ASCII letters, followed by
    whitespace and/or ``<TAG>``-like items, followed by the same word again
    (compared case-insensitively).  The whole match -- first word, separator
    and second word -- is replaced by the first word.

    Args:
        text: The text to clean up.

    Returns:
        The text after one substitution pass.
    """
    doubled_word = r"""(?x)(?i)
# (regex starts here)
### Need to match one word
\b # Start of word...
(?P<wordone> [a-z]+ ) # Grab word, filleng wordone
### Now need to allow any number of spaces and/or <TAGS>
(?P<wordto> # Save what intervenes to $2
(?: # Non-capturing parens for grouping alternation
\s #Whitespace (includes newline, which is good).
| # -or-
<[^>]+> #Item like <TAG>.
)+ #Need at least one of the above, but allow more
)
### Now match the first word again:
(?P<wordtree> (?P=wordone)\b) # \b ensures not embedded. This copy saved to #3.
# (regex ends here)
"""
    # Raw literal: "\g" inside a plain string is an invalid escape sequence.
    # highlight: '\e[7m\g<wordone>\e[m\g<wordto>\e[7m\g<wordtree>\e[m'
    return r(text, doubled_word, r'\g<wordone>')
# Match each <B>...</B> span: the negative lookahead (?! </?B> ) is tested
# before every character, so the inner loop cannot run past a <B> or </B> tag.
f('<B>Billions</B> and <B>Zillions</B> of suns',
r'''(?x)
(?:
<B> # Match the opening <B>
(?: # Now, only as many of the following as needed...
(?! </?B> ) # If not <B>...
. # ...any character is okay
)* #
</B> # ...until the closing delimiter can match
)
''')
# Atomic grouping emulated with a capturing lookahead plus a backreference
# (the (?>...) syntax itself is only accepted by re from Python 3.11 on):
#   (?>[1-9]?)  ->  (?=(?P<tmp>[1-9]?))(?P=tmp)
#   (?>\w+):    ->  (?=(?P<tmp>\w+))(?P=tmp):
# Both calls keep two decimals plus an optional non-zero third one and drop
# the remaining digits; the first call evaluates to '1.625'.
r('1.6257', r'(?P<fpoint>\.\d\d(?=(?P<tmp>[1-9]?))(?P=tmp))\d+', r'\g<fpoint>')
r("1.264646747",
r"""(?x)
(?P<fpoint>
\.\d\d
(?=(?P<tmp>[1-9]?))(?P=tmp)
)
\d+
""",
r"\g<fpoint>")
# Three differently ordered alternations for the day number after "Jan ".
# With a one-digit day only the single-digit branch of each can match.
f("Jan 2", r"Jan (?:[12][0-9]|3[01]|0?[0-9])")
f("Jan 1", r"Jan (?:31|[123]0|[012]?[1-9])")
f("Jan 1", r"Jan (?:0[1-9]|[12][0-9]?|3[01]?|[4-9])")
####################################################################
# Three ways to match a "NAME=value" assignment whose value may continue
# after a backslash.  The sample text is not a raw string, so each "\\"
# below is a single backslash in the text being searched.
# Variant 1: ".*" followed by optional backslash-newline continuations;
# with (?s) the dot also matches newlines.
f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
missing.c msg.c node.c re.c version.c\\""",
r"""(?x)(?s)(?m)
^
\w+
=
.*
(?:
(?: \\\n.*)
)*
""")
# Variant 2: the dot is replaced by [^\n\\], so a line stops at a newline or
# backslash and only a backslash-newline pair continues the match.
f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
missing.c msg.c node.c re.c version.c\\""",
r"""(?x)(?s)(?m)
^
\w+
=
[^\n\\]*
(?:
(?: \\\n[^\n\\]*)
)*
""")
# Variant 3: one loop over "ordinary character" or "backslash plus any
# character" (which, under (?s), includes a newline).
f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
missing.c msg.c node.c re.c version.c\\""",
r"""(?x)(?s)(?m)
^
\w+
=
(?:
[^\n\r\\]
|
\\.
)*
""")
####################################################################
# IP address: four dot-separated numbers, each limited to 0-255 by the
# alternation.  The lookbehind/lookahead keep the match from starting or
# ending next to a word character or dot, and the negative lookahead rejects
# an all-zero address that runs to the end of the input.
f("0.0.0.1",
r"""(?x)
(?<![\w.]) #(^|\s)
(?! 0+\.0+\.0+\.0+$) # if not 0.0.0.0
(?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
(?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
(?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
(?: [01]?\d\d?|2[0-4]\d|25[0-5])
(?![\w.]) #($|\s)
""")
# full path -> filename: delete everything up to and including the last
# path separator.
r('usr/local/bin/gcc', r"^.*/", "")
# The pattern must be a raw string: as a plain literal "^.*\\" reaches the
# regex engine as ^.*\ and is rejected because it ends in a lone backslash.
r(r'\Program Files\Yahoo!\Messenger', r"^.*\\", "")
# $depth /194
# NOTE(review): the heading above presumably points at a "$depth" example on
# page 194 of the book named in the gist title -- confirm.
# Match a parenthesised group that may itself contain parenthesised groups,
# but only one level deep: the inner groups hold no parentheses of their own.
f(" foo(bar(this), 3.7) + 2 * (that - 1); adfasdfas df",
r"""(?x)
\(
[^()]*
(
\(
[^()]*
\)
[^()]*
)*
\)
""")
# Match a decimal number: an optional minus sign followed either by digits
# with an optional fractional part, or by a fraction that starts with the dot.
f("nothing here, but there .55489 ?",
r"""(?x)
-?
[0-9]+
(?:
\.
[0-9]*
)?
|
-?
\.
[0-9]+
""")
# Consume runs of characters that are neither a backslash nor a double quote,
# or a backslash together with the character after it.  The pattern can match
# the empty string, so finditer also reports empty matches.
# NOTE(review): the sample text is not a raw string, so \" is just a quote
# and the text holds no backslashes; if escaped quotes were intended the
# literal would need an r prefix -- confirm.
f('2\"x3\" likeness',
r"""(?x)
(
[^\\"]+
|
\\.
)*
""")
# Same "non-quote run or backslash pair" loop, wrapped in the lookahead plus
# backreference construct so that whatever the loop matched is taken as a
# whole.  In the sample text "\\" is one backslash, placed directly before
# a double quote.
f('Darth Symbol: "/-|-\\" or "[^-^]"',
r"""(?x)
(?=(?P<tmp>
(
[^\\"]+
|
\\.
)*
))(?P=tmp)
""")
#HTML
# Match one HTML tag from "<" to its closing ">".  Quoted attribute values
# are consumed as units, so a ">" inside quotes (as in the sample) does not
# end the tag.
f('<input name=dir value=">">',
r"""(?x)
< # Opening"<"
(?=(?P<tmp> # Any amount of...
(?:
"[^"]*" # double-quoted string,
| # or...
'[^']*' # single-quoted string,
| # or...
[^'">] # "other stuff"
)* #
))(?P=tmp) #
> # Closing ">"
""")
#HTML Link
# Match an <a ...>...</a> element: the text between "<a" and ">" goes to the
# named group HREF and the link text to the following (non-greedy) group.
# (?s) lets the link text span several lines.
f(""" ?<a href="http://www.oreilly.com">O'Reilly Media</a> """,
r"""(?x)(?s)
<a\b
(?P<HREF> [^>]+)
>
(.*?)
</a>
""")
#HREF -> URL
# Match an HREF attribute (case-insensitively) with its value, which may be
# double-quoted, single-quoted, or a bare run without quotes, ">" or spaces.
f(' href="http://www.oreilly.com"',
r"""(?x)(?i)
\b HREF
\s* = \s*
(?:
"(?: [^"]*)"
|
'(?: [^']*)'
|
(?: [^'">\s]+)
)
""")
#URL (?)
# Split an http:// URL into the named groups Host, Path (optional) and
# Port (optional).
# NOTE(review): this pattern looks for the port after the path, and the
# sample text is written that way too; URLs normally carry the port right
# after the host -- confirm which order is intended.
f("http://www.oreilly.com/mysite/:8080",
r"""(?x)(?i)(?s)
^http://
(?P<Host> [^/:]+)
(?P<Path> /[^:]*)?
(?P<Port> :(?: \d+))?
$
""")
#hostname
# Validate a whole string as a host name.  The inline (?i) flag is placed at
# the very start of the pattern: a global flag in any later position is
# rejected by re on Python 3.11 and newer.
f("www.oreilly.com",
r"""(?x)(?i)
^
# (?i) above applies this regex in a case-insensitive manner.
# One or more dot-separated parts...
(?: [a-z0-9]\. | [a-z0-9][-a-z0-9]{0,61}[a-z0-9]\. )*
# Followed by the final suffix part...
(?: com|edu|gov|int|mil|net|org|biz|info|name|museum|coop|aero|[a-z][a-z] )
$
""")
#URL yahoo-finance
# !!!! lower/uppercase on/off (embedding Flags in Patterns)
# Find a URL inside running text.  Two repairs were made inside the pattern:
# the "heuristics" remark is now a regex comment (without the leading "#" it
# was required literal text, so the path part could never match), and the
# port accepts more than one digit.
f("Read his comments at http://www.OReilly.cOm/asktim/index.html. He ...",
r"""(?x)
\b
# Match the leading part (proto://hostname, or just hostname)
(
# ftp://, http://, or https:// leading part
(ftp|https?)://[-\w]+(\.\w[-\w]*)+
|
# or, try to find a hostname with our more specific sub-expression
(?:
(?: [a-z0-9] (?: [-a-z0-9]*[a-z0-9])? \. )+ # sub domains
)
# Now ending .com, etc. For these, require lowercase
(?: com\b
| edu\b
| biz\b
| gov\b
| in(?: t|fo)\b # .int or .info
| mil\b
| net\b
| org\b
| [a-z][a-z]\b # two-letter country codes
)
)
# Allow an optional port number
( : \d+ )?
# The rest of the URL is optional, and begins with/...
(
/
# The rest are heuristics for what seems to work well
[^.!,?;"'<>()\[\]{}\s\x7F-\xFF]*
(?:
[.!,?]+ [^.!,?;"'<>()\[\]{}\s\x7F-\xFF]+
)*
)?
""")
# !!! "library"(embedded) of regular expression
#
# Step through the digit string five digits at a time, skipping chunks that
# do not begin with "44", then capture a five-digit chunk that does.  The
# (?!44) guard keeps the skipping loop from swallowing the wanted chunk.
f("0382453144941161521344182950354427275201021744323",
r"""(?x)
(?:
(?!44)
\d\d\d\d\d)*
(44\d\d\d)
""")
# -------------------------------------------------------------
import re
import timeit
# re_1: a parenthesised group containing runs of non-parenthesis characters
# or one nested group.  The "+" inside the alternation sits under the outer
# "+", so a failing search has many ways to split the same text
# (expected to backtrack heavily on the input used below).
re_1 = re.compile(r'''\(
(
[^()]+ # x+
|
\( [^()]* \)
)+
\)
''', re.X)
# re_2: the same expression with the inner run wrapped in the lookahead plus
# backreference construct that stands in for an atomic group.
re_2 = re.compile(r'''\(
(
(?=(?P<tmp>[^()]+ ))(?P=tmp) # Emulate (?> x+)
|
\( [^()]* \)
)+
\)''', re.X)
# Time 10 searches with each pattern on '((()' followed by 25 "a" characters,
# a text with no closing parenthesis after the run.  The setup strings
# import from __main__, so this only works when the file is run as a script.
print (timeit.timeit("re_1.search('((()' + 'a' * 25)",
setup = "from __main__ import re_1",
number = 10))
print (timeit.timeit("re_2.search('((()' + 'a' * 25)",
setup = "from __main__ import re_2",
number = 10))
# Module-level compiled pattern; fs() below imports it by name in its timeit
# setup string.
# NOTE(review): no module-level `pattern` is assigned in the visible part of
# this file, so these lines raise NameError when the file is run top to
# bottom, and the assignment appears twice.  Presumably a leftover from an
# interactive session where `pattern` was set by hand -- confirm.
pat = re.compile(pattern)
pat = re.compile(pattern)
def fs(text, pattern, count):
    """Time *count* searches for *pattern* in *text* and print the result.

    Two lines are printed: the statement that is timed, and the total time
    in seconds in exponent notation (``%e``).

    Args:
        text: The string to search.
        pattern: Regular expression source, compiled once before timing.
        count: Number of times the search statement is executed.

    Returns:
        None; the results are written to standard output.
    """
    # Compile the pattern that was passed in, instead of relying on a global
    # ``pat`` that must have been prepared beforehand.
    pat = re.compile(pattern)
    # %r yields a valid string literal even when text holds quotes or
    # backslashes; pasting it between double quotes with "%s" does not.
    stmt = "pat.search(%r)" % (text,)
    print(stmt)
    # ``globals`` hands the compiled pattern to the timed statement directly,
    # so nothing has to be importable from __main__.
    print("%e" % timeit.timeit(stmt, globals={"pat": pat}, number=count))
# -------------------------------------------------------------
def f1(text, pattern):
    """Walk over every match of *pattern* in *text* without printing anything.

    Performs the same search and position look-ups as f(), minus the output,
    so that it can be used as the statement under test when timing a pattern.

    Args:
        text: The string to search.
        pattern: A regular expression (string or compiled pattern).

    Returns:
        None.
    """
    for match in re.finditer(pattern, text):
        # Results are discarded on purpose; only the work itself is wanted.
        match.start()
        match.end()
#Faster failures with atomic grouping
def ft(text, pattern, count):
    """Print the time taken by *count* calls of ``f1(text, pattern)``.

    Args:
        text: The string to search.
        pattern: The regular expression to apply.
        count: Number of times the call is repeated.

    Returns:
        None; the total time in seconds is printed in exponent notation.
    """
    # A callable is timed instead of a statement assembled with %-formatting:
    # pasting the arguments into source text turns escapes such as \b into
    # control characters and breaks on embedded quotes.
    print("%e" % timeit.timeit(lambda: f1(text, pattern), number=count))
# Time 1000 calls of f() on a text the pattern cannot match ('Subject' has no
# colon).  The statement is a raw string because it contains "\w".  The setup
# string imports from __main__, so this only works when run as a script.
print(timeit.timeit(r"f('Subject', r'^\w+:')",
                    setup="from __main__ import f",
                    number=1000))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment