Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save reduxdj/939229 to your computer and use it in GitHub Desktop.

Select an option

Save reduxdj/939229 to your computer and use it in GitHub Desktop.
Here's a simple function that finds URLs in text and wraps them in HTML anchor tags
import re
# Top-level domains recognised as the end of a host name, '|'-separated so the
# string can be dropped straight into a regex alternation.
_TLDS = (
    "AC|AD|AE|AERO|AF|AG|AI|AL|AM|AN|AO|AQ|AR|ARPA|AS|ASIA|AT|AU|AW|AX|AZ|"
    "BA|BB|BD|BE|BF|BG|BH|BI|BIZ|BJ|BM|BN|BO|BR|BS|BT|BV|BW|BY|BZ|"
    "CA|CAT|CC|CD|CF|CG|CH|CI|CK|CL|CM|CN|CO|COM|COOP|CR|CU|CV|CX|CY|CZ|"
    "DE|DJ|DK|DM|DO|DZ|EC|EDU|EE|EG|ER|ES|ET|EU|FI|FJ|FK|FM|FO|FR|"
    "GA|GB|GD|GE|GF|GG|GH|GI|GL|GM|GN|GOV|GP|GQ|GR|GS|GT|GU|GW|GY|"
    "HK|HM|HN|HR|HT|HU|ID|IE|IL|IM|IN|INFO|INT|IO|IQ|IR|IS|IT|"
    "JE|JM|JO|JOBS|JP|KE|KG|KH|KI|KM|KN|KP|KR|KW|KY|KZ|"
    "LA|LB|LC|LI|LK|LR|LS|LT|LU|LV|LY|"
    "MA|MC|MD|ME|MG|MH|MIL|MK|ML|MM|MN|MO|MOBI|MP|MQ|MR|MS|MT|MU|MUSEUM|"
    "MV|MW|MX|MY|MZ|NA|NAME|NC|NE|NET|NF|NG|NI|NL|NO|NP|NR|NU|NZ|OM|ORG|"
    "PA|PE|PF|PG|PH|PK|PL|PM|PN|PR|PRO|PS|PT|PW|PY|QA|RE|RO|RS|RU|RW|"
    "SA|SB|SC|SD|SE|SG|SH|SI|SJ|SK|SL|SM|SN|SO|SR|ST|SU|SV|SY|SZ|"
    "TC|TD|TEL|TF|TG|TH|TJ|TK|TL|TM|TN|TO|TP|TR|TRAVEL|TT|TV|TW|TZ|"
    "UA|UG|UK|US|UY|UZ|VA|VC|VE|VG|VI|VN|VU|WF|WS|XN|XXX|YE|YT|ZA|ZM|ZW"
)

# Compiled once at import time. The first alternative exists only so that an
# anchor element already present in the text is consumed whole and skipped;
# the second alternative is the URL itself.
_URL_PATTERN = re.compile(
    r"<a\b[^>]*>[\s\S]*?</a>"       # existing <a>...</a>: matched only to skip it
    r"|"
    r"(?<![\w@./%-])"               # never start mid-word, mid-e-mail or mid-path
    r"(?P<url>"
    r"(?P<scheme>https?://)?"       # optional scheme
    r"(?:[a-z0-9-]+\.)+"            # one or more dot-terminated host labels
    r"(?:" + _TLDS + r")"           # a known top-level domain
    r"(?![a-z0-9@-])"               # TLD must end the host ("co" must not win over "com")
    r"(?::\d+)?"                    # optional port
    r"(?:[/?#][^\s<>\"']*)?"        # optional path, query or fragment
    r")",
    re.IGNORECASE,
)

# Characters that, at the very end of a match, are treated as sentence
# punctuation rather than as part of the URL.
_TRAILING_PUNCTUATION = ".,;:!?)]"


def _wrap_url_match(match):
    """Return the anchor markup for one ``_URL_PATTERN`` match."""
    url = match.group("url")
    if url is None:
        # The "existing anchor" alternative matched: leave it exactly as it is.
        return match.group(0)
    link = url.rstrip(_TRAILING_PUNCTUATION)
    trailing = url[len(link):]
    # Only bare host names need a scheme; http:// and https:// URLs keep theirs.
    href = link if match.group("scheme") else "http://" + link
    return '<a href="%s">%s</a>%s' % (href, link, trailing)


def parse_content_for_url(content):
    """Wrap every URL found in *content* in an HTML anchor tag.

    A URL is an optional ``http://`` or ``https://`` scheme followed by a host
    name that ends in one of the top-level domains listed in ``_TLDS``
    (matched case-insensitively), optionally followed by a port and a
    path/query/fragment.  Bare host names such as ``test.com`` are linked
    with an ``http://`` prefix in the ``href``; the visible link text is the
    URL exactly as it was written.

    The text is rewritten in a single left-to-right pass, so each occurrence
    is wrapped exactly once, even when the same URL appears several times.
    Trailing sentence punctuation is kept outside the link, ``<a>...</a>``
    elements already present in *content* are left untouched, and dotted
    tokens that do not end in a known top-level domain (``1.2``, ``e.g.``)
    are not linked.

    No HTML escaping is applied to the surrounding text.

    :param content: the text to scan.
    :returns: *content* with every detected URL replaced by
        ``<a href="...">...</a>``.  Empty or ``None`` input is returned
        unchanged.
    """
    if not content:
        return content
    return _URL_PATTERN.sub(_wrap_url_match, content)
# Demo: print the linkified form of a few sample strings.  The parenthesised
# single-argument form behaves the same as the Python 2 print statement and
# is also valid on Python 3.
print(parse_content_for_url('test.com'))
print(parse_content_for_url('http://google.com is this check out test.com'))
print(parse_content_for_url('t test.com'))
print(parse_content_for_url('check out test.com. is my favorite dot.com'))
print(parse_content_for_url('test.com big.com is.com'))
@reduxdj
Copy link
Copy Markdown
Author

reduxdj commented Apr 24, 2011

quadcore-imac:~ dj$ python index_string.py
test.com
http://google.com is this check out test.com
check out test.com. is my favorite dot.com
test.com big.com is.com
quadcore-imac:~ dj$ python index_string.py
http://google.com is this check out test.com
quadcore-imac:~ dj$ python index_string.py
test.com
http://google.com is this check out test.com
t test.com
check out test.com. is my favorite dot.com
test.com big.com is.com

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment