exavolt · January 19, 2011 16:37
diff --git a/urlify.py b/urlify.py
 #!/usr/bin/env python

 import cgi
 import re

 import elml


 def _urlify_re_proc(match):
    """Return the HTML anchor that replaces one URL match.

    ``match`` must expose two groups: group 1 is the text in front of the
    URL (re-emitted unchanged before the anchor) and group 2 is the URL.

    A URL without a ':' is treated as a naked web address: 'http://' is
    prepended to the link target, plus a trailing '/' when the URL has no
    '/' at all.  The display text drops the scheme and the slashes that
    follow it, a lone trailing '/', and, for targets starting with 'http',
    a leading 'www.'.
    """
    # TODO: Truncate the display text for long URLs, e.g. turn
    #   http://example.com/post/this-url-is-too-long-so-it-needs-to-be-truncated-nicely
    # into
    #   example.com/post/this-...-nicely
    # See the code I've written for another app, or see Google search.
    # NOTE(review): the URL is interpolated into the markup unescaped; this
    # presumably relies on the caller having escaped the text -- confirm.
    urll = match.group(2)  # link target (href)
    if ':' not in urll:
        # Naked web URL
        urlt = urll
        if '/' not in urll:
            urll += '/'
        urll = 'http://' + urll
    else:
        # Drop only the scheme: split at the FIRST ':' so that later colons
        # (port numbers, colons inside the path) stay in the display text.
        urlt = urll.split(':', 1)[1].lstrip('/')
        if urlt.endswith('/') and urlt.count('/') == 1:
            urlt = urlt[:-1]
    # Strip 'www.' only when it leads the display text, never from a path.
    if urll.startswith('http') and urlt.startswith('www.'):
        urlt = urlt[len('www.'):]
    return match.group(1) + '<a class="autourl" href="%s">%s</a>' % (urll, urlt)

 # URL matcher, adapted from the pattern published at
 # http://daringfireball.net/2010/07/improved_regex_for_matching_urls
 #
 # Group 1 is the (lazy) run of non-'"' text in front of the URL and group 2
 # is the URL itself; _urlify_re_proc relies on exactly these two numbers.
 #
 # The pattern is assembled from raw-string pieces plus one u'' literal rather
 # than a single ur"""...""" literal: the ``ur`` prefix is a SyntaxError on
 # Python 3, while this form produces the same pattern text on Python 2.
 _urlify_re = re.compile(
    r"""(?xi)
    ([^"]*?)                    # Capture 1: text in front of the URL (no double quote)
    \b
    (                           # Capture 2: entire matched URL
      (?:
        [a-z][\w-]+:                # URL protocol and colon
        (?:
          /{1,3}                        # 1-3 slashes
          |                             #   or
          [a-z0-9%]                     # Single letter or digit or '%'
                                        # (Trying not to match e.g. "URI::Escape")
        )
        |                           #   or
        www\d{0,3}[.]               # "www.", "www1.", "www2." ... "www999."
        |                           #   or
        [a-z0-9.\-]+[.][a-z]{2,4}/  # looks like domain name followed by a slash
      )
      (?:                           # One or more:
        [^\s()<>]+                      # Run of non-space, non-()<>
        |                               #   or
        \(([^\s()<>]+|(\([^\s()<>]+\)))*\)  # balanced parens, up to 2 levels
      )+
      (?:                           # End with:
        \(([^\s()<>]+|(\([^\s()<>]+\)))*\)  # balanced parens, up to 2 levels
        |                                   #   or
        [^\s`!()\[\]{};:'".,<>?"""
    + u'\u00AB\u00BB\u201C\u201D\u2018\u2019'
    + r"""] # not a space or one of these punct chars
      )
    )
    """)


 def urlify(text):
    """Return ``text`` with every URL found by ``_urlify_re`` wrapped in an anchor.

    Each match is rewritten by ``_urlify_re_proc``; the rest of the text is
    passed through unchanged.
    """
    # TODO: naked email address
    # TODO: use Google's Safe Browsing API to check for malicious sites
    linked = _urlify_re.sub(_urlify_re_proc, text)
    return linked

 # Matches an "@name" mention: an '@' that is not directly preceded by a word
 # character (so the '@' in 'user@host' is skipped), followed by a run of
 # ASCII letters, digits or underscores captured as group 1.
 _user_re = re.compile(r"""\B@([0-9a-zA-Z_]+)""")

 #TODO: custom params
 def convert(text):
    """Return ``text`` with URLs and @user mentions turned into HTML links.

    URLs are linked first by urlify(); afterwards every mention matched by
    ``_user_re`` becomes ``@<a href="/user/NAME">NAME</a>``.
    """
    # The pipeline has to start from the ``text`` argument; reading ``outp``
    # before it is assigned raises UnboundLocalError.
    outp = urlify(text)
    # NOTE(review): this pass runs over urlify()'s HTML output, so an '@name'
    # inside a linked URL (e.g. right after a '/') is rewritten as well --
    # confirm whether that is intended.
    outp = _user_re.sub(r'@<a href="/user/\1">\1</a>', outp)
    return outp
	#!/usr/bin/env python

	import cgi
	import re

	import elml


	def _urlify_re_proc(match):
	    """Return the HTML anchor that replaces one URL match.

	    ``match`` must expose two groups: group 1 is the text in front of the
	    URL (re-emitted unchanged before the anchor) and group 2 is the URL.

	    A URL without a ':' is treated as a naked web address: 'http://' is
	    prepended to the link target, plus a trailing '/' when the URL has no
	    '/' at all.  The display text drops the scheme and the slashes that
	    follow it, a lone trailing '/', and, for targets starting with 'http',
	    a leading 'www.'.
	    """
	    # TODO: Truncate the display text for long URLs, e.g. turn
	    #   http://example.com/post/this-url-is-too-long-so-it-needs-to-be-truncated-nicely
	    # into
	    #   example.com/post/this-...-nicely
	    # See the code I've written for another app, or see Google search.
	    # NOTE(review): the URL is interpolated into the markup unescaped; this
	    # presumably relies on the caller having escaped the text -- confirm.
	    urll = match.group(2)  # link target (href)
	    if ':' not in urll:
	        # Naked web URL
	        urlt = urll
	        if '/' not in urll:
	            urll += '/'
	        urll = 'http://' + urll
	    else:
	        # Drop only the scheme: split at the FIRST ':' so that later colons
	        # (port numbers, colons inside the path) stay in the display text.
	        urlt = urll.split(':', 1)[1].lstrip('/')
	        if urlt.endswith('/') and urlt.count('/') == 1:
	            urlt = urlt[:-1]
	    # Strip 'www.' only when it leads the display text, never from a path.
	    if urll.startswith('http') and urlt.startswith('www.'):
	        urlt = urlt[len('www.'):]
	    return match.group(1) + '<a class="autourl" href="%s">%s</a>' % (urll, urlt)

	# URL matcher, adapted from the pattern published at
	# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
	#
	# Group 1 is the (lazy) run of non-'"' text in front of the URL and group 2
	# is the URL itself; _urlify_re_proc relies on exactly these two numbers.
	#
	# Alternation must be a bare '|': an escaped '\|' matches a literal pipe.
	# The pattern is assembled from raw-string pieces plus one u'' literal
	# rather than a single ur"""...""" literal: the ``ur`` prefix is a
	# SyntaxError on Python 3, while this form produces the same pattern text
	# on Python 2.
	_urlify_re = re.compile(
	    r"""(?xi)
	    ([^"]*?)                    # Capture 1: text in front of the URL (no double quote)
	    \b
	    (                           # Capture 2: entire matched URL
	      (?:
	        [a-z][\w-]+:                # URL protocol and colon
	        (?:
	          /{1,3}                        # 1-3 slashes
	          |                             #   or
	          [a-z0-9%]                     # Single letter or digit or '%'
	                                        # (Trying not to match e.g. "URI::Escape")
	        )
	        |                           #   or
	        www\d{0,3}[.]               # "www.", "www1.", "www2." ... "www999."
	        |                           #   or
	        [a-z0-9.\-]+[.][a-z]{2,4}/  # looks like domain name followed by a slash
	      )
	      (?:                           # One or more:
	        [^\s()<>]+                      # Run of non-space, non-()<>
	        |                               #   or
	        \(([^\s()<>]+|(\([^\s()<>]+\)))*\)  # balanced parens, up to 2 levels
	      )+
	      (?:                           # End with:
	        \(([^\s()<>]+|(\([^\s()<>]+\)))*\)  # balanced parens, up to 2 levels
	        |                                   #   or
	        [^\s`!()\[\]{};:'".,<>?"""
	    + u'\u00AB\u00BB\u201C\u201D\u2018\u2019'
	    + r"""] # not a space or one of these punct chars
	      )
	    )
	    """)


	def urlify(text):
	    """Return ``text`` with every URL found by ``_urlify_re`` wrapped in an anchor.

	    Each match is rewritten by ``_urlify_re_proc``; the rest of the text is
	    passed through unchanged.
	    """
	    # TODO: naked email address
	    # TODO: use Google's Safe Browsing API to check for malicious sites
	    return _urlify_re.sub(_urlify_re_proc, text)

	# Matches an "@name" mention: an '@' that is not directly preceded by a word
	# character (so the '@' in 'user@host' is skipped), followed by a run of
	# ASCII letters, digits or underscores captured as group 1.
	_user_re = re.compile(r"""\B@([0-9a-zA-Z_]+)""")

	#TODO: custom params
	def convert(text):
	    """Return ``text`` with URLs and @user mentions turned into HTML links.

	    URLs are linked first by urlify(); afterwards every mention matched by
	    ``_user_re`` becomes ``@<a href="/user/NAME">NAME</a>``.
	    """
	    # The pipeline has to start from the ``text`` argument; reading ``outp``
	    # before it is assigned raises UnboundLocalError.
	    outp = urlify(text)
	    # NOTE(review): this pass runs over urlify()'s HTML output, so an '@name'
	    # inside a linked URL (e.g. right after a '/') is rewritten as well --
	    # confirm whether that is intended.
	    outp = _user_re.sub(r'@<a href="/user/\1">\1</a>', outp)
	    return outp
No results found