Created
April 24, 2011 01:43
-
-
Save reduxdj/939229 to your computer and use it in GitHub Desktop.
Here's a simple function that finds URLs in text and wraps them in HTML anchor tags.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
# Top-level domains recognised by parse_content_for_url (same set as the
# original pattern, with the duplicated ZW entry removed).
_KNOWN_TLDS = (
    "AC AD AE AERO AF AG AI AL AM AN AO AQ AR ARPA AS ASIA AT AU AW AX AZ "
    "BA BB BD BE BF BG BH BI BIZ BJ BM BN BO BR BS BT BV BW BY BZ "
    "CA CAT CC CD CF CG CH CI CK CL CM CN CO COM COOP CR CU CV CX CY CZ "
    "DE DJ DK DM DO DZ EC EDU EE EG ER ES ET EU FI FJ FK FM FO FR "
    "GA GB GD GE GF GG GH GI GL GM GN GOV GP GQ GR GS GT GU GW GY "
    "HK HM HN HR HT HU ID IE IL IM IN INFO INT IO IQ IR IS IT "
    "JE JM JO JOBS JP KE KG KH KI KM KN KP KR KW KY KZ "
    "LA LB LC LI LK LR LS LT LU LV LY "
    "MA MC MD ME MG MH MIL MK ML MM MN MO MOBI MP MQ MR MS MT MU MUSEUM "
    "MV MW MX MY MZ NA NAME NC NE NET NF NG NI NL NO NP NR NU NZ "
    "OM ORG PA PE PF PG PH PK PL PM PN PR PRO PS PT PW PY QA "
    "RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK SL SM SN SO SR ST SU SV "
    "SY SZ TC TD TEL TF TG TH TJ TK TL TM TN TO TP TR TRAVEL TT TV TW TZ "
    "UA UG UK US UY UZ VA VC VE VG VI VN VU WF WS XN XXX YE YT ZA ZM ZW"
).split()

# One pass over the text finds every host, optionally preceded by a scheme.
#  - The look-behind stops a match from starting inside a longer token, so
#    e-mail addresses and hosts behind an unknown scheme are left alone.
#  - The look-ahead forces the alternation to take the longest TLD that fits
#    ("COM" rather than "CO" for "test.com") and lets a sentence-ending "."
#    stay outside the link.
_HOST_PATTERN = re.compile(
    r"(?<![\w.%@/-])"
    r"(?P<scheme>https?://)?"
    r"(?:[a-z0-9_%-]+\.)+"
    r"(?:" + "|".join(_KNOWN_TLDS) + r")"
    r"(?![a-z0-9_%-])",
    re.IGNORECASE,
)


def _anchor_for(match):
    """Return the HTML anchor that replaces one _HOST_PATTERN match."""
    shown = match.group(0)
    # Bare hosts get an http:// prefix; an explicit http(s):// is kept as-is.
    href = shown if match.group("scheme") else "http://" + shown
    # The pattern only admits [A-Za-z0-9_%.-] plus "http(s)://", so neither
    # value can contain characters that need HTML escaping.
    return '<a href="%s">%s</a>' % (href, shown)


def parse_content_for_url(content):
    """Wrap every host name found in *content* in an HTML anchor tag.

    A host is one or more dot-separated labels ending in a known top-level
    domain, optionally preceded by ``http://`` or ``https://``. Matching is
    case-insensitive. Only the scheme and host are linked; a path or query
    string that follows the host is left outside the anchor.

    The rest of *content* is returned unchanged and is not HTML-escaped.
    The function expects plain text: running it on its own output will
    wrap the hosts inside the generated markup again.

    Args:
        content: the text to scan.

    Returns:
        A copy of *content* with each host replaced by
        ``<a href="URL">matched text</a>``.
    """
    return _HOST_PATTERN.sub(_anchor_for, content)
# Demo: print the linked form of a few sample strings. Single-argument
# print(...) calls behave the same on Python 2 and Python 3, whereas the
# bare print statement is a syntax error on Python 3.
for sample in (
    'test.com',
    'http://google.com is this check out test.com',
    't test.com',
    'check out test.com. is my favorite dot.com',
    'test.com big.com is.com',
):
    print(parse_content_for_url(sample))
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
quadcore-imac:~ dj$ python index_string.py
test.com
http://google.com is this check out test.com
check out test.com. is my favorite dot.com
test.com big.com is.com
quadcore-imac:~ dj$ python index_string.py
http://google.com is this check out test.com
quadcore-imac:~ dj$ python index_string.py
test.com
http://google.com is this check out test.com
t test.com
check out test.com. is my favorite dot.com
test.com big.com is.com