Created
January 19, 2011 16:37
-
-
Save exavolt/786418 to your computer and use it in GitHub Desktop.
The name says...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import cgi
import re

import elml
# Matches a leading URL scheme ("http:", "mailto:", ...).  Deliberately the
# same expression as the "URL protocol and colon" branch of _urlify_re, so the
# callback and the pattern agree on what counts as a scheme.
_scheme_re = re.compile(r'(?i)[a-z][\w-]+:')


def _urlify_re_proc(match):
    """Build the replacement text for one ``_urlify_re`` match.

    Group 1 of the match (the text preceding the URL) is emitted unchanged,
    followed by an ``<a class="autourl">`` element for group 2 (the URL).

    * A URL without a scheme ("naked", e.g. ``www.example.com``) gets
      ``http://`` prepended in the href, plus a trailing ``/`` when it
      contains no slash at all.
    * The display text is the URL without its scheme and leading slashes,
      without a lone trailing slash (``example.com/`` -> ``example.com``) and,
      for hrefs starting with ``http``, without a leading ``www.``.
    * Double quotes in the href are written as ``&quot;`` so a URL cannot
      terminate the attribute early.  Nothing else is escaped here.
    """
    #TODO: Truncate the display text for long URLs. e.g.:
    # http://example.com/post/this-url-is-too-long-so-it-needs-to-be-truncated-nicely
    # into
    # example.com/post/this-...-nicely
    # see the codes I've written for another app or see the Google search
    href = match.group(2)
    scheme = _scheme_re.match(href)
    if scheme is None:
        # Naked web URL.  Testing for a real scheme (rather than for any ':')
        # keeps "www.example.com:8080/x" in this branch.
        label = href
        if '/' not in href:
            href += '/'
        href = 'http://' + href
    else:
        # Remove the protocol -- only the leading scheme, so that a port or a
        # colon later in the URL survives in the display text.
        label = href[scheme.end():].lstrip('/')
    if label.endswith('/') and label.count('/') == 1:
        # "example.com/" reads better as "example.com".
        label = label[:-1]
    if href.startswith('http') and label.startswith('www.'):
        # Strip only a leading "www."; one inside the path is left alone.
        label = label[len('www.'):]
    # The pattern allows '"' inside a URL; left raw it would close href="...".
    href = href.replace('"', '&quot;')
    return match.group(1) + '<a class="autourl" href="%s">%s</a>' % (href, label)


# Shamelessly stolen from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
#
# The literal is assembled from adjacent pieces so the non-ASCII punctuation
# can be written as \u escapes in a form accepted by both Python 2 and
# Python 3 (the ``ur`` prefix is a syntax error on Python 3).
#
# The repeated groups consume one character per iteration instead of a
# "[^...]+" run: the set of matched URLs is the same, but a failing match no
# longer backtracks exponentially (e.g. on "www." followed by many "!").
_urlify_re = re.compile(
    r"""(?xi)
    ([^"]*?)                        # Group 1: text before the URL (no double quote)
    \b
    (                               # Group 2: entire matched URL
      (?:
        [a-z][\w-]+:                # URL protocol and colon
        (?:
          /{1,3}                    # 1-3 slashes
          |                         #   or
          [a-z0-9%]                 # Single letter or digit or '%'
                                    # (Trying not to match e.g. "URI::Escape")
        )
        |                           #   or
        www\d{0,3}[.]               # "www.", "www1.", "www2." ... "www999."
        |                           #   or
        [a-z0-9.\-]+[.][a-z]{2,4}/  # looks like domain name followed by a slash
      )
      (?:                           # One or more:
        [^\s()<>]                   # A single non-space, non-()<> character
        |                           #   or
        \(([^\s()<>]|(\([^\s()<>]+\)))*\)  # balanced parens, up to 2 levels
      )+
      (?:                           # End with:
        \(([^\s()<>]|(\([^\s()<>]+\)))*\)  # balanced parens, up to 2 levels
        |                           #   or
        [^\s`!()\[\]{};:'".,<>?"""
    # Guillemets and curly quotes, still inside the character class above.
    u"\u00AB\u00BB\u201C\u201D\u2018\u2019"
    r"""]                           # not a space or one of these punct chars
      )
    )
    """
)


def urlify(text):
    """Return *text* with every URL found by ``_urlify_re`` turned into a link.

    Text that does not contain a URL is returned unchanged.  The text around
    the URLs is copied through as-is; this function does not HTML-escape it.
    """
    #TODO: naked email address
    #TODO: use Google's Safe Browsing API to check for malicious sites
    return _urlify_re.sub(_urlify_re_proc, text)
# "@name" mention.  \B requires that the "@" is not preceded by a word
# character, so the "@" in something like "a@b" is not treated as a mention.
_user_re = re.compile(r"""\B@([0-9a-zA-Z_]+)""")


#TODO: custom params
def convert(text):
    """Return *text* with URLs and ``@name`` mentions turned into links.

    URLs are linked first via ``urlify``; afterwards every ``@name`` becomes
    ``@<a href="/user/name">name</a>``.
    """
    # The input is `text`; the previous code passed the not-yet-assigned local
    # `outp` to urlify() and therefore raised UnboundLocalError on every call.
    outp = urlify(text)
    # NOTE(review): this runs over the markup urlify() produced, so an "@name"
    # inside a linked URL (e.g. ".../@name") is rewritten as well -- confirm
    # whether mentions should be restricted to text outside the generated tags.
    outp = _user_re.sub(r'@<a href="/user/\1">\1</a>', outp)
    return outp
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment