yv84 · August 29, 2015 14:01
diff --git a/gistfile1.py b/gistfile1.py
 def f(text, pattern):
     """Print every match of *pattern* in *text* with its span and groups.

     For each non-overlapping match yielded by ``re.finditer`` two lines are
     printed: ``Found "<matched text>" at <start>:<end>`` and then the tuple
     returned by ``match.groups()``.  Nothing is printed when there is no
     match.  Always returns ``None``.

     Args:
         text: The string to search.
         pattern: A regular-expression string or compiled pattern.
     """
     # Imported locally: the file's module-level ``import re`` comes long
     # after the first call to this helper, which would raise NameError.
     import re

     for match in re.finditer(pattern, text):
         s, e = match.span()
         print('Found "%s" at %d:%d' % (text[s:e], s, e))
         # Match objects are always truthy, so no guard is needed here.
         print(match.groups())

 def r(text, pattern, repl):
     """Return *text* with every match of *pattern* replaced by *repl*.

     Args:
         text: The string to transform.
         pattern: A regular-expression string or compiled pattern.
         repl: Replacement string (may use ``\\g<name>`` references) or a
             callable, exactly as accepted by ``Pattern.sub``.
     """
     # Imported locally: the file's module-level ``import re`` comes after
     # the first call to this helper.
     import re

     return re.compile(pattern).sub(repl, text)



 # Lookahead: after the name, the rest of the line must be an address that is
 # either fully wrapped in angle brackets or not wrapped at all.
 # NOTE(review): the sample address (and the one in the pattern comment) was
 # destroyed by e-mail obfuscation and the quotes were mis-encoded as "\92";
 # "first.last@example.com" is a stand-in - confirm against the original.
 f('First Last <first.last@example.com>',
 r"""(?x)
 # A name is made up of letters, and may include "."
 # for title abbreviations and middle initials.
 ( (?P<name>
    ([\w.,]+\s+)*[\w.,]+
  )
  \s+
 ) # name is no longer optional

 # LOOKAHEAD
 # Email addresses are wrapped in angle brackets, but only
 # if thet are both present or neither is.
 (?= (<.*>$)  # remainder wrapped in angle brackets
    |
    ([^<].*[^>]$) # remainder *not* wrapped in angle brackets
  )
 <? # optional opening angle bracket
 # The address itself: username@domain.tld
 (?P<email>
  [\w\d.+-]+   # username
  @
  ([\w\d.]+\.)+   # domain name prefix
  (com|org|edu|ru)  # limit the allowed top-level domains
 )
 >? # optional closing angle bracket

 """)


 # Negative lookahead: accept a whole-string address unless it begins with
 # "noreply@".
 # NOTE(review): the sample address was destroyed by e-mail obfuscation and
 # the quotes were mis-encoded; "first.last@example.com" is a stand-in -
 # confirm against the original.
 f('first.last@example.com',
 r"""(?x)
 ^
 # An address: username@domain.tld
 # Ignore noreply addresses
 (?!noreply@.*$)
 [\w\d.+-]+  # username
 @
 ([\w\d.]+\.)+  # domain name prefix
 (com|org|edu|ru)  #limit the allowed top-level domains
 $

 """)

 # Positive lookbehind: report each run of word characters that directly
 # follows an "@".  The "@" is only asserted by (?<=@), so it is not part of
 # the reported match.
 f(r"""This text includes two Twitter handles.
 One for @ThePSF, and one for the author, @doughellmann.
 """,
 r"""(?x)
 # A twitter handle: @username
 (?<=@)
 ([\w\d_]+)  # username
 """)






 def amp(text):
     """Escape HTML metacharacters in *text* and mark blank lines with ``<p>``.

     ``&``, ``<`` and ``>`` are replaced, in that order, by ``&amp;``,
     ``&lt;`` and ``&gt;``.  Afterwards every match of ``(?m)^\\s*$`` (empty
     or whitespace-only line content) is replaced by ``<p>``.

     Args:
         text: The plain text to convert.

     Returns:
         The converted string.
     """
     # "&" has to go first, otherwise the ampersands introduced by the
     # "&lt;" / "&gt;" entities would themselves be escaped.
     escaped = r(r(r(text, r"&", '&amp;'), r"<", '&lt;'), r">", '&gt;')
     # Raw string: "\s" inside a plain literal is an invalid escape sequence.
     return r(escaped, r"(?m)^\s*$", '<p>')

 def mail(text):
     """Wrap every e-mail address found in *text* in a ``mailto:`` anchor.

     An address is a word-delimited ``username@hostname`` whose hostname ends
     in ``.com``, ``.edu``, ``.info`` or ``.ru`` (matched case-insensitively).
     Each one is replaced by ``<a href="mailto:ADDR">ADDR</a>``.

     Args:
         text: The text to scan.

     Returns:
         The text with the addresses turned into links.
     """
     return r(text, r"""(?x)(?i)
 (?P<mailhere>
 \b
 # Capture the adress to mailhere
 (
    \w[-.\w]*   # username
    \@
    [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info|ru) #hostname
 )
 \b)
 """, r'<a href="mailto:\g<mailhere>">\g<mailhere></a>')



 def url(text):
     """Wrap every ``http://`` URL found in *text* in an HTML anchor.

     A URL is ``http://`` followed by a hostname ending in ``.com``,
     ``.edu``, ``.info`` or ``.ru`` (case-insensitive) and an optional path
     starting with ``/``.  Each one is replaced by
     ``<a href="URL">URL</a>``.

     Args:
         text: The text to scan.

     Returns:
         The text with the URLs turned into links.
     """
     return r(text, r"""(?x)(?i)
 (?P<urlhere>
 \b
 # Capture the URL to urlhere
 (
    http:// [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info|ru) \b # hostname
    (
        / [-a-z0-9_:\@&?=+,.!/~*'%\$]*
    )?
 )
 )
 """, r'<a href="\g<urlhere>">\g<urlhere></a>')


 def udalenieOdinakovixPodryadIduschixSlov(text):
     """Collapse doubled words in *text* to a single occurrence.

     The name is transliterated Russian for "removal of identical
     consecutive words".  A word made of ASCII letters that is followed by
     whitespace and/or ``<TAG>``-like items and then the same word again
     (compared case-insensitively) is replaced by its first occurrence; the
     separator and the second copy are dropped.

     Args:
         text: The text to clean up.

     Returns:
         The text with the doubled words collapsed.
     """
     return r(text, r"""(?x)(?i)
 # (regex starts here)
 ### Need to match one word
 \b  # Start of word...
 (?P<wordone> [a-z]+ ) # Grab word, filleng wordone

 ### Now need to allow any number of spaces and/or <TAGS>
 (?P<wordto> # Save what intervenes to $2
    (?: # Non-capturing parens for grouping alternation
        \s #Whitespace (includes newline, which is good).
        |  # -or-
        <[^>]+> #Item like <TAG>.
    )+  #Need at least one of the above, but allow more
 )

 ### Now match the first word again:
 (?P<wordtree> (?P=wordone)\b) # \b ensures not embedded. This copy saved to #3.

 # (regex ends here)
 """,
     # Raw string: "\g" inside a plain literal is an invalid escape sequence.
     # To highlight instead of delete, the original author's alternative was:
     #   '\e[7m\g<wordone>\e[m\g<wordto>\e[7m\g<wordtree>\e[m'
     r'\g<wordone>')


 # Match each <B>...</B> element.  The negative lookahead is tested before
 # every character of the body, so the body cannot run across another <B>
 # or </B> tag.
 f('<B>Billions</B> and <B>Zillions</B> of suns', 
 r'''(?x)
 (?:
 <B>  # Match the opening <B>
 (?:    # Now, only as many of the following as needed...
    (?! </?B> )  # If not <B>...
    .          #              ...any character is okay
 )*            #
 </B>           # ...until the closing delimiter can match
 )
 ''')


 # Atomic grouping emulation.  An atomic group (?> X ) can be written with a
 # capturing lookahead followed by a backreference:
 #     (?> [1-9]? )   ->   (?=(?P<tmp>[1-9]?))(?P=tmp)
 #     (?>\w+):       ->   (?=(?P<tmp>\w+))(?P=tmp):
 # The two calls below keep two decimals plus an optional non-zero third
 # digit and drop the remaining digits.  Their results are not used; they
 # are evaluated as bare expressions, as in the original notes.
 # The replacement strings are raw because "\g" is an invalid escape in a
 # plain literal.
 r('1.6257', r'(?P<fpoint>\.\d\d(?=(?P<tmp>[1-9]?))(?P=tmp))\d+', r'\g<fpoint>')

 r("1.264646747",
 r"""(?x)
    (?P<fpoint>
    \.\d\d
    (?=(?P<tmp>[1-9]?))(?P=tmp)
    )
    \d+
 """,
 r"\g<fpoint>")


 # Three differently ordered alternations for the day number after "Jan ".
 # NOTE(review): presumably meant to compare how the order of alternatives
 # affects which text is matched - confirm intent.
 f("Jan 2", r"Jan (?:[12][0-9]|3[01]|0?[0-9])")
 f("Jan 1", r"Jan (?:31|[123]0|[012]?[1-9])")
 f("Jan 1", r"Jan (?:0[1-9]|[12][0-9]?|3[01]?|[4-9])")



 ####################################################################
 # Three patterns applied to the same "NAME=value" text whose lines end in
 # a backslash.  Each "\\" in the text literals below is one backslash
 # character, because the text is not a raw string.

 # Variant 1: ".*" with (?s), optionally followed by backslash-newline runs.
 f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
 missing.c msg.c node.c re.c version.c\\""",
 r"""(?x)(?s)(?m)
 ^
 \w+
 =
 .*
 (?:
    (?: \\\n.*)
 )*
 """)

 # Variant 2: the value excludes newlines and backslashes; a continuation
 # is exactly backslash + newline.
 f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
 missing.c msg.c node.c re.c version.c\\""",
 r"""(?x)(?s)(?m)
 ^
 \w+
 =
 [^\n\\]*
 (?:
    (?: \\\n[^\n\\]*)
 )*
 """)

 # Variant 3: either an ordinary character or a backslash followed by any
 # character (including a newline, because of (?s)).
 f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
 missing.c msg.c node.c re.c version.c\\""",
 r"""(?x)(?s)(?m)
 ^
 \w+
 =
 (?:
    [^\n\r\\]
    |
    \\.
 )*
 """)
 ####################################################################

 # IP address: four dot-separated numbers in the range 0-255, not preceded
 # or followed by a word character or ".", and rejected by the negative
 # lookahead when all four numbers consist only of zeros.
 f("0.0.0.1", 
 r"""(?x)
 (?<![\w.])  #(^|\s)
 (?! 0+\.0+\.0+\.0+$)  # if not 0.0.0.0 
 (?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
 (?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
 (?: [01]?\d\d?|2[0-4]\d|25[0-5])\.
 (?: [01]?\d\d?|2[0-4]\d|25[0-5])
 (?![\w.])  #($|\s)
 """)


 # Full path -> file name: delete everything up to and including the last
 # path separator.
 r('usr/local/bin/gcc', r"^.*/", "")
 # The pattern has to be a raw string.  The plain literal "^.*\\" reaches
 # the regex engine as ^.*\ (a dangling backslash) and raises re.error.
 r(r'\Program Files\Yahoo!\Messenger', r"^.*\\", "")

 # Parenthesised text that may contain one further level of parentheses.
 # (Original note: "$depth /194" - presumably a book page reference.)
 f("   foo(bar(this), 3.7) + 2 * (that - 1);   adfasdfas df",
 r"""(?x)
 \(
    [^()]*
    (
        \(
        [^()]*
        \)
        [^()]*
    )*
 \) 
 """)

 # Number: optional minus sign, then either digits with an optional
 # fractional part, or a bare fraction that starts with ".".
 f("nothing here, but there .55489 ?", 
 r"""(?x)
    -?
    [0-9]+
    (?:
        \.
        [0-9]*
    )?
 |
    -?
    \.
    [0-9]+
 """)

 # Runs of "ordinary" characters (neither backslash nor double quote) or
 # backslash-escaped characters.
 # NOTE(review): the text literal is not raw, so \" is just a double quote
 # and the text contains no backslash - confirm whether r'...' was intended.
 f('2\"x3\" likeness', 
 r"""(?x)
 (
    [^\\"]+
    |
    \\.

 )*
 """)

 # Same alternation wrapped in the lookahead + backreference construct that
 # emulates an atomic group.
 f('Darth Symbol: "/-|-\\" or "[^-^]"', 
 r"""(?x)
 (?=(?P<tmp>
    (
        [^\\"]+
        |
        \\.
    )*
 ))(?P=tmp)
 """)


 # HTML tag: "<" ... ">" where a ">" inside a quoted attribute value does
 # not end the tag.  The body is matched through the lookahead +
 # backreference construct that emulates an atomic group.
 f('<input name=dir value=">">',
 r"""(?x)
 <               # Opening"<"
    (?=(?P<tmp> #  Any amount of...
    (?:
        "[^"]*" #   double-quoted string,
        |       #   or...
        '[^']*' #   single-quoted string,
        |       #   or...
        [^'">]  #   "other stuff"
    )*          #
    ))(?P=tmp)  #
 >               # Closing ">"
 """)

 # HTML link: capture the attribute text of an <a ...> tag as group HREF
 # and, non-greedily, the link text up to the first </a>.
 f("""       ?<a href="http://www.oreilly.com">O'Reilly Media</a>    """,
 r"""(?x)(?s)
 <a\b
 (?P<HREF> [^>]+)
 >
 (.*?)
 </a>
 """)

 # HREF attribute: "href = value" (case-insensitive) where the value is
 # double-quoted, single-quoted, or an unquoted run without quotes, ">" or
 # whitespace.
 f(' href="http://www.oreilly.com"', 
 r"""(?x)(?i)
 \b HREF
 \s* = \s*
 (?:
    "(?: [^"]*)"
    |
    '(?: [^']*)'
    |
    (?:  [^'">\s]+)
 )
 """)


 # Split an http:// URL into the named groups Host, Path and Port.
 # NOTE(review): the pattern expects the port *after* the path, which is
 # not where a URL normally carries it; the original "(?)" marker suggests
 # the author was unsure as well.
 f("http://www.oreilly.com/mysite/:8080", 
 r"""(?x)(?i)(?s)
 ^http://
 (?P<Host> [^/:]+)
 (?P<Path> /[^:]*)?
 (?P<Port> :(?: \d+))?
 $
 """)

 # Hostname: dot-separated labels followed by a known suffix, matched
 # case-insensitively.  The "i" flag is merged into the leading (?xi)
 # because a global inline flag anywhere but the very start of a pattern
 # is rejected by Python 3.11 and later.
 f("www.oreilly.com",
 r"""(?xi)
 ^
  # One or more dot-separated parts...
  (?: [a-z0-9]\.  |  [a-z0-9][-a-z0-9]{0,61}[a-z0-9]\.  )*
  # Followed by the final suffix part...
  (?: com|edu|gov|int|mil|net|org|biz|info|name|museum|coop|aero|[a-z][a-z] )
 $
 """)

 # Find a URL inside running text (original label: "URL yahoo-finance").
 # !!!! lower/uppercase on/off (embedding Flags in Patterns)
 # Two fixes relative to the earlier version of this demo:
 #   * the "heuristics" sentence inside the path group had lost its "#", so
 #     in verbose mode it was literal text the URL path had to contain;
 #   * the port group accepted a single digit only.
 f("Read his comments at http://www.OReilly.cOm/asktim/index.html. He ...",
 r"""(?x)
 \b
 # Match the leading part (proto://hostname, or just hostname)
 (
    # ftp://, http://, or https:// leading part
    (ftp|https?)://[-\w]+(\.\w[-\w]*)+
  |
    # or, try to find a hostname with our more specific sub-expression
    (?:
        (?: [a-z0-9]  (?: [-a-z0-9]*[a-z0-9])?  \. )+ # sub domains
    )
    # Now ending .com, etc. For these, require lowercase
    (?:   com\b
        | edu\b
        | biz\b
        | gov\b
        | in(?: t|fo)\b # .int or .info
        | mil\b
        | net\b
        | org\b
        | [a-z][a-z]\b # two-letter country codes
    )
 )

 # Allow an optional port number
 ( : \d+ )?
 # The rest of the URL is optional, and begins with/...
 (
    /
    # The rest are heuristics for what seems to work well
    [^.!,?;"'<>()\[\]{}\s\x7F-\xFF]*
    (?:
        [.!,?]+ [^.!,?;"'<>()\[\]{}\s\x7F-\xFF]+
    )*
 )?
 """)
 

 # !!! "library"(embedded) of regular expression

 # Consume 5-digit groups that do not start with "44", then capture the
 # first 5-digit group that does.
 # NOTE(review): presumably meant to keep the search aligned on 5-digit
 # boundaries so that a "44" straddling two groups is not reported.
 f("0382453144941161521344182950354427275201021744323",
 r"""(?x)
 (?:
 (?!44)
 \d\d\d\d\d)*
 (44\d\d\d)
 """)
 
 
 
 
 
 
 


 # -------------------------------------------------------------
 import re
 import timeit

 # re_1: parenthesised text with one nested level, written with a quantified
 # alternation whose first branch is itself quantified ([^()]+ inside (...)+).
 re_1 = re.compile(r'''\(
                           (
                             [^()]+           # x+
                           |
                             \( [^()]* \)
                           )+
                       \)
                   ''', re.X)

 # re_2: same expression, but the inner [^()]+ goes through the lookahead +
 # backreference construct that emulates an atomic group.
 re_2 = re.compile(r'''\(
                           (
                             (?=(?P<tmp>[^()]+ ))(?P=tmp) # Emulate (?> x+)
                           |
                             \( [^()]* \)
                           )+
                       \)''', re.X)

 # Time 10 searches of each pattern on a subject with unbalanced
 # parentheses.  The setup imports from __main__, so these lines only work
 # when the file is run as a script.
 print (timeit.timeit("re_1.search('((()' + 'a' * 25)",
                    setup  = "from __main__ import re_1",
                    number = 10))

 print (timeit.timeit("re_2.search('((()' + 'a' * 25)",
                    setup  = "from __main__ import re_2",
                    number = 10))

 def fs(text, pattern, count):
     """Time *count* searches of *pattern* in *text* and print the result.

     Two lines are printed: the statement being timed, shown as
     ``pat.search("<text>")``, and the total time in ``%e`` format.

     Args:
         text: The subject string.
         pattern: A regular-expression string or compiled pattern.
         count: How many times the search is executed.
     """
     # Compile here from the argument.  The earlier module-level
     # ``pat = re.compile(pattern)`` referenced an undefined name and made
     # this function ignore its own *pattern* parameter.
     pat = re.compile(pattern)
     t = """pat.search("%s")""" % text
     print (t)
     # A callable is timed instead of the source string ``t`` so that quotes
     # or backslashes in *text* cannot corrupt the timed statement.
     print ("%e" % timeit.timeit(lambda: pat.search(text), number = count))
    

 # -------------------------------------------------------------

 def f1(text, pattern):
     """Walk all matches of *pattern* in *text* without printing anything.

     This is the silent counterpart of ``f`` used as a timing workload: the
     start and end offsets of every match are still computed, but the
     results are discarded.  Returns ``None``.
     """
     for found in re.finditer(pattern, text):
         found.start()
         found.end()
        


 #Faster failures with atomic grouping
 def ft(text, pattern, count):
     """Print, in ``%e`` format, the time taken by *count* calls of ``f1``.

     Args:
         text: The subject string handed to ``f1``.
         pattern: The regular expression handed to ``f1``.
         count: How many times ``f1(text, pattern)`` is executed.
     """
     # A callable keeps *text* and *pattern* as real objects.  Pasting them
     # into a source string lets Python re-interpret their backslashes (a
     # "\b" in the pattern would turn into a backspace) and breaks on input
     # containing triple quotes.  It also removes the dependency on
     # ``from __main__ import f1``.
     print ("%e" % timeit.timeit(lambda: f1(text, pattern), number = count))


 # Time 1000 calls of f() on a subject the pattern cannot match.
 # The statement is a raw string so that "\w" reaches timeit unchanged
 # instead of being an invalid escape sequence in a plain literal.
 print (timeit.timeit(r"f('Subject', r'^\w+:')",
                    setup  = "from __main__ import f",
                    number = 1000))
	def f(text, pattern):
	    """Print every match of *pattern* in *text* with its span and groups.

	    For each non-overlapping match yielded by ``re.finditer`` two lines
	    are printed: ``Found "<matched text>" at <start>:<end>`` and then the
	    tuple returned by ``match.groups()``.  Nothing is printed when there
	    is no match.  Always returns ``None``.

	    Args:
	        text: The string to search.
	        pattern: A regular-expression string or compiled pattern.
	    """
	    # Imported locally: the file's module-level ``import re`` comes long
	    # after the first call to this helper.
	    import re

	    # The body indentation had been lost in this copy; it is restored.
	    for match in re.finditer(pattern, text):
	        s, e = match.span()
	        print('Found "%s" at %d:%d' % (text[s:e], s, e))
	        # Match objects are always truthy, so no guard is needed here.
	        print(match.groups())

	def r(text, pattern, repl):
	    """Return *text* with every match of *pattern* replaced by *repl*.

	    Args:
	        text: The string to transform.
	        pattern: A regular-expression string or compiled pattern.
	        repl: Replacement string or callable, as accepted by
	            ``Pattern.sub``.
	    """
	    # Imported locally: the file's module-level ``import re`` comes after
	    # the first call to this helper.
	    import re

	    return re.compile(pattern).sub(repl, text)



	#lookahead
	f(\92First Last <[email protected]>\92,
	"""(?x)
	# A name is made up of letters, and may include "."
	# for title abbreviations and middle initials.
	( (?P<name>
	([\w.,]+\s+)*[\w.,]+
	)
	\s+
	) # name is no longer optional

	# LOOKAHEAD
	# Email addresses are wrapped in angle brackets, but only
	# if thet are both present or neither is.
	(?= (<.*>$) # remainder wrapped in angle brackets
	\|
	([^<].[^>]$) # remainder not* wrapped in angle brackets
	)
	<? # optional opening angle bracket
	# The address itself: [email protected]
	(?P<email>
	[\w\d.+-]+ # username
	@
	([\w\d.]+\.)+ # domain name prefix
	(com\|org\|edu\|ru) # limit the allowed top-level domains
	)
	>? # optional closing angle bracket

	""")


	# negative lookahead

	f(\[email protected]\92,
	r"""(?x)
	^
	# An address: [email protected]
	# Ignore noreply addresses
	(?!noreply@.*$)
	[\w\d.+-]+ # username
	@
	([\w\d.]+\.)+ # domain name prefix
	(com\|org\|edu\|ru) #limit the allowed top-level domains
	$

	""")

	#positive lookbehind
	f(r"""This text includes two Twitter handles.
	One for @ThePSF, and one for the author, @doughellmann.
	""",
	r"""(?x)
	# A twitter handle: @username
	(?<=@)
	([\w\d_]+) # username
	""")






	def amp(text):
	    """Escape HTML metacharacters in *text* and mark blank lines with ``<p>``.

	    ``&``, ``<`` and ``>`` are replaced, in that order, by ``&amp;``,
	    ``&lt;`` and ``&gt;``.  Afterwards every match of ``(?m)^\\s*$`` is
	    replaced by ``<p>``.

	    Args:
	        text: The plain text to convert.

	    Returns:
	        The converted string.
	    """
	    # The replacement entities had been decoded back to "&", "<" and ">"
	    # in this copy, which turned the escaping into a no-op.  "&" goes
	    # first so the ampersands of the other entities are not re-escaped.
	    escaped = r(r(r(text, r"&", '&amp;'), r"<", '&lt;'), r">", '&gt;')
	    # Raw string: "\s" inside a plain literal is an invalid escape.
	    return r(escaped, r"(?m)^\s*$", '<p>')

	def mail(text):
	    """Wrap every e-mail address found in *text* in a ``mailto:`` anchor.

	    An address is a word-delimited ``username@hostname`` whose hostname
	    ends in ``.com``, ``.edu``, ``.info`` or ``.ru`` (case-insensitive).
	    Each one is replaced by ``<a href="mailto:ADDR">ADDR</a>``.

	    Args:
	        text: The text to scan.

	    Returns:
	        The text with the addresses turned into links.
	    """
	    # The alternation bars had been escaped as "\|" in this copy, which
	    # made them literal pipe characters; they are plain "|" again.
	    return r(text, r"""(?x)(?i)
	(?P<mailhere>
	\b
	# Capture the adress to mailhere
	(
	\w[-.\w]* # username
	\@
	[-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info|ru) #hostname
	)
	\b)
	""", r'<a href="mailto:\g<mailhere>">\g<mailhere></a>')



	def url(text):
	    """Wrap every ``http://`` URL found in *text* in an HTML anchor.

	    A URL is ``http://`` followed by a hostname ending in ``.com``,
	    ``.edu``, ``.info`` or ``.ru`` (case-insensitive) and an optional
	    path starting with ``/``.  Each one is replaced by
	    ``<a href="URL">URL</a>``.

	    Args:
	        text: The text to scan.

	    Returns:
	        The text with the URLs turned into links.
	    """
	    # Repairs in this copy: "\|" is a plain alternation bar again, and
	    # the path character class has its "*" member and its "*" quantifier
	    # back (without the quantifier a path could be one character long).
	    # This matches the intact copy of the pattern earlier in the file.
	    return r(text, r"""(?x)(?i)
	(?P<urlhere>
	\b
	# Capture the URL to urlhere
	(
	http:// [-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info|ru) \b # hostname
	(
	/ [-a-z0-9_:\@&?=+,.!/~*'%\$]*
	)?
	)
	)
	""", r'<a href="\g<urlhere>">\g<urlhere></a>')


	def udalenieOdinakovixPodryadIduschixSlov(text):
	    """Collapse doubled words in *text* to a single occurrence.

	    The name is transliterated Russian for "removal of identical
	    consecutive words".  A word made of ASCII letters that is followed
	    by whitespace and/or ``<TAG>``-like items and then the same word
	    again (compared case-insensitively) is replaced by its first
	    occurrence; the separator and the second copy are dropped.

	    Args:
	        text: The text to clean up.

	    Returns:
	        The text with the doubled words collapsed.
	    """
	    # The alternation bar had been escaped as "\|" in this copy, which
	    # made it a literal pipe character; it is a plain "|" again.
	    return r(text, r"""(?x)(?i)
	# (regex starts here)
	### Need to match one word
	\b # Start of word...
	(?P<wordone> [a-z]+ ) # Grab word, filleng wordone

	### Now need to allow any number of spaces and/or <TAGS>
	(?P<wordto> # Save what intervenes to $2
	(?: # Non-capturing parens for grouping alternation
	\s #Whitespace (includes newline, which is good).
	| # -or-
	<[^>]+> #Item like <TAG>.
	)+ #Need at least one of the above, but allow more
	)

	### Now match the first word again:
	(?P<wordtree> (?P=wordone)\b) # \b ensures not embedded. This copy saved to #3.

	# (regex ends here)
	""",
	    # Raw string: "\g" inside a plain literal is an invalid escape.
	    # To highlight instead of delete, the author's alternative was:
	    #   '\e[7m\g<wordone>\e[m\g<wordto>\e[7m\g<wordtree>\e[m'
	    r'\g<wordone>')


	f('<B>Billions</B> and <B>Zillions</B> of suns',
	r'''(?x)
	(?:
	<B> # Match the opening <B>
	(?: # Now, only as many of the following as needed...
	(?! </?B> ) # If not <B>...
	. # ...any character is okay
	)* #
	</B> # ...until the closing delimiter can match
	)
	''')


	r('1.6257', r'(?P<fpoint>\.\d\d(?=(?P<tmp>[1-9]?))(?P=tmp))\d+', '\g<fpoint>')


	#Atomic grouping with (?> [1-9]?) -> (?=(?P<tmp>[1-9]?))(?P=tmp)




	#atomic grouping
	(?>\w+): -> (?=(?P<tmp>\w+))(?P=tmp):
	>>> r("1.264646747",
	r"""(?x)
	(?P<fpoint>
	\.\d\d
	(?=(?P<tmp>[1-9]?))(?P=tmp)
	)
	\d+
	""",
	"\g<fpoint>")


	f("Jan 2", r"Jan (?:[12][0-9]\|3[01]\|0?[0-9])")
	f("Jan 1", r"Jan (?:31\|[123]0\|[012]?[1-9])")
	f("Jan 1", r"Jan (?:0[1-9]\|[12][0-9]?\|3[01]?\|[4-9])")



	####################################################################
	f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
	missing.c msg.c node.c re.c version.c\\""",
	r"""(?x)(?s)(?m)
	^
	\w+
	=
	.*
	(?:
	(?: \\\n.*)
	)*
	""")

	f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
	missing.c msg.c node.c re.c version.c\\""",
	r"""(?x)(?s)(?m)
	^
	\w+
	=
	[^\n\\]*
	(?:
	(?: \\\n[^\n\\]*)
	)*
	""")

	f("""SRC=array.c \\ builtin.c eval.c field.c gawkmisc.c io.c main.c \\
	missing.c msg.c node.c re.c version.c\\""",
	r"""(?x)(?s)(?m)
	^
	\w+
	=
	(?:
	[^\n\r\\]
	\|
	\\.
	)*
	""")
	####################################################################

	#ip adress
	f("0.0.0.1",
	r"""(?x)
	(?<![\w.]) #(^\|\s)
	(?! 0+\.0+\.0+\.0+$) # if not 0.0.0.0
	(?: [01]?\d\d?\|2[0-4]\d\|25[0-5])\.
	(?: [01]?\d\d?\|2[0-4]\d\|25[0-5])\.
	(?: [01]?\d\d?\|2[0-4]\d\|25[0-5])\.
	(?: [01]?\d\d?\|2[0-4]\d\|25[0-5])
	(?![\w.]) #($\|\s)
	""")


	# full path -> filename
	r('usr/local/bin/gcc', r"^.*/", "")
	r(r'\Program Files\Yahoo!\Messenger', "^.*\\", "")

	# $depth /194
	f(" foo(bar(this), 3.7) + 2 * (that - 1); adfasdfas df",
	r"""(?x)
	\(
	[^()]*
	(
	\(
	[^()]*
	\)
	[^()]*
	)*
	\)
	""")

	#
	f("nothing here, but there .55489 ?",
	r"""(?x)
	-?
	[0-9]+
	(?:
	\.
	[0-9]*
	)?
	\|
	-?
	\.
	[0-9]+
	""")

	f('2\"x3\" likeness',
	r"""(?x)
	(
	[^\\"]+
	\|
	\\.

	)*
	""")

	f('Darth Symbol: "/-\|-\\" or "[^-^]"',
	r"""(?x)
	(?=(?P<tmp>
	(
	[^\\"]+
	\|
	\\.
	)*
	))(?P=tmp)
	""")


	#HTML
	f('<input name=dir value=">">',
	r"""(?x)
	< # Opening"<"
	(?=(?P<tmp> # Any amount of...
	(?:
	"[^"]*" # double-quoted string,
	\| # or...
	'[^']*' # single-quoted string,
	\| # or...
	[^'">] # "other stuff"
	)* #
	))(?P=tmp) #
	> # Closing ">"
	""")

	#HTML Link
	f(""" ?<a href="http://www.oreilly.com">O'Reilly Media</a> """,
	r"""(?x)(?s)
	<a\b
	(?P<HREF> [^>]+)
	>
	(.*?)
	</a>
	""")

	#HREF -> URL
	f(' href="http://www.oreilly.com"',
	r"""(?x)(?i)
	\b HREF
	\s* = \s*
	(?:
	"(?: [^"]*)"
	\|
	'(?: [^']*)'
	\|
	(?: [^'">\s]+)
	)
	""")


	#URL (?)
	f("http://www.oreilly.com/mysite/:8080",
	r"""(?x)(?i)(?s)
	^http://
	(?P<Host> [^/:]+)
	(?P<Path> /[^:]*)?
	(?P<Port> :(?: \d+))?
	$
	""")

	#hostname
	f("www.oreilly.com",
	r"""(?x)
	^
	(?i) # apply this regex in a case-instensive manner.
	# One or more dot-separated parts...
	(?: [a-z0-9]\. \| [a-z0-9][-a-z0-9]{0,61}[a-z0-9]\. )*
	# Followed by the final suffix part...
	(?: com\|edu\|gov\|int\|mil\|net\|org\|biz\|info\|name\|museum\|coop\|aero\|[a-z][a-z] )
	$
	""")

	#URL yahoo-finance
	# !!!! lower/uppercase on/off (embedding Flags in Patterns)
	f("Read his comments at http://www.OReilly.cOm/asktim/index.html. He ...",
	r"""(?x)
	\b
	# Match the leading part (proto://hostname, or just hostname)
	(
	# ftp://, http://, or https:// leading part
	(ftp\|https?)://[-\w]+(\.\w[-\w]*)+
	\|
	# or, try to find a hostname with our more specific sub-expression
	(?:
	(?: [a-z0-9] (?: [-a-z0-9]*[a-z0-9])? \. )+ # sub domains
	)
	# Now ending .com, etc. For these, require lowercase
	(?: com\b
	\| edu\b
	\| biz\b
	\| gov\b
	\| in(?: t\|fo)\b # .int or .info
	\| mil\b
	\| net\b
	\| org\b
	\| [a-z][a-z]\b # two-letter country codes
	)
	)

	# Allow an optional port number
	( : \d )?
	# The rest of the URL is optional, and begins with/...
	(
	/
	The rest are beuristics for what seems to work well
	[^.!,?;"'<>()\[\]{}\s\x7F-\xFF]*
	(?:
	[.!,?]+ [^.!,?;"'<>()\[\]{}\s\x7F-\xFF]+
	)*
	)?
	""")


	# !!! "library"(embedded) of regular expression

	#
	f("0382453144941161521344182950354427275201021744323",
	r"""(?x)
	(?:
	(?!44)
	\d\d\d\d\d)*
	(44\d\d\d)
	""")









	# -------------------------------------------------------------
	import re
	import timeit

	re_1 = re.compile(r'''\(
	(
	[^()]+ # x+
	\|
	\( [^()]* \)
	)+
	\)
	''', re.X)

	re_2 = re.compile(r'''\(
	(
	(?=(?P<tmp>[^()]+ ))(?P=tmp) # Emulate (?> x+)
	\|
	\( [^()]* \)
	)+
	\)''', re.X)

	print (timeit.timeit("re_1.search('((()' + 'a' * 25)",
	setup = "from __main__ import re_1",
	number = 10))

	print (timeit.timeit("re_2.search('((()' + 'a' * 25)",
	setup = "from __main__ import re_2",
	number = 10))

	def fs(text, pattern, count):
	    """Time *count* searches of *pattern* in *text* and print the result.

	    Two lines are printed: the statement being timed, shown as
	    ``pat.search("<text>")``, and the total time in ``%e`` format.

	    Args:
	        text: The subject string.
	        pattern: A regular-expression string or compiled pattern.
	        count: How many times the search is executed.
	    """
	    # Compile here from the argument.  The earlier module-level
	    # ``pat = re.compile(pattern)`` referenced an undefined name and
	    # made this function ignore its own *pattern* parameter.
	    pat = re.compile(pattern)
	    t = """pat.search("%s")""" % text
	    print (t)
	    # A callable is timed instead of the source string ``t`` so that
	    # quotes or backslashes in *text* cannot corrupt the statement.
	    print ("%e" % timeit.timeit(lambda: pat.search(text), number = count))


	# -------------------------------------------------------------

	def f1(text, pattern):
	    """Walk all matches of *pattern* in *text* without printing anything.

	    This is the silent counterpart of ``f`` used as a timing workload:
	    the start and end offsets of every match are still computed, but the
	    results are discarded.  Returns ``None``.
	    """
	    # The body indentation had been lost in this copy; it is restored.
	    for found in re.finditer(pattern, text):
	        found.start()
	        found.end()



	#Faster failures with atomic grouping
	def ft(text, pattern, count):
	    """Print, in ``%e`` format, the time taken by *count* calls of ``f1``.

	    Args:
	        text: The subject string handed to ``f1``.
	        pattern: The regular expression handed to ``f1``.
	        count: How many times ``f1(text, pattern)`` is executed.
	    """
	    # A callable keeps *text* and *pattern* as real objects.  Pasting
	    # them into a source string lets Python re-interpret their
	    # backslashes and breaks on input containing triple quotes.  It also
	    # removes the dependency on ``from __main__ import f1``.
	    print ("%e" % timeit.timeit(lambda: f1(text, pattern), number = count))


	print (timeit.timeit("f('Subject', r'^\w+:')",
	setup = "from __main__ import f",
	number = 1000))