Skip to content

Instantly share code, notes, and snippets.

@penafieljlm
Last active July 31, 2017 00:11
Show Gist options
  • Save penafieljlm/8bc0000653a74d464c623d04faf80824 to your computer and use it in GitHub Desktop.
Save penafieljlm/8bc0000653a74d464c623d04faf80824 to your computer and use it in GitHub Desktop.
import re
import tld
# regex by @diegoperini
# taken from https://mathiasbynens.be/demo/url-regex
# modified by @penafieljlm
# Matches a network location embedded in free text:
#   [user[:password]@] (public IPv4 address | dotted host name) [:port]
#
# The pattern is case-insensitive (as the upstream URL regex is), so
# "Example.COM" is matched whole instead of being skipped or truncated.
#
# Host labels are written as ``X+(?:-X+)*`` rather than the upstream
# ``(?:X+-?)*X+``.  Both accept exactly the same strings (alphanumeric runs
# joined by single hyphens), but the upstream form nests one unbounded
# quantifier inside another and backtracks exponentially on a long
# alphanumeric token that is not followed by a dot.
RE_DOMAIN = re.compile(
    # Optional "user[:password]@" prefix.
    r'(?:\S+(?::\S*)?@)?'
    r'(?:'
    # ---- Alternative 1: dotted-quad IPv4 address ----
    # Reject 10.x.x.x, 127.x.x.x, 169.254.x.x, 192.168.x.x and
    # 172.16.x.x - 172.31.x.x.
    r'(?!10(?:\.\d{1,3}){3})'
    r'(?!127(?:\.\d{1,3}){3})'
    r'(?!169\.254(?:\.\d{1,3}){2})'
    r'(?!192\.168(?:\.\d{1,3}){2})'
    r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})'
    # First octet: 1-223.
    r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
    # Second and third octets: 0-255.
    r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
    # Last octet: 1-254.
    r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
    r'|'
    # ---- Alternative 2: host name ----
    # First label.
    r'(?:[a-z\xa1-\xff0-9]+(?:-[a-z\xa1-\xff0-9]+)*)'
    # Any number of further ".label" parts.
    r'(?:\.[a-z\xa1-\xff0-9]+(?:-[a-z\xa1-\xff0-9]+)*)*'
    # Final ".tld": at least two letters, no digits.
    r'(?:\.(?:[a-z\xa1-\xff]{2,}))'
    r')'
    # Optional ":port" of two to five digits.
    r'(?::\d{2,5})?',
    re.IGNORECASE,
)
def domains(text):
    """Return the set of host strings in *text* that have a recognised TLD.

    Every non-overlapping match of ``RE_DOMAIN`` in *text* is a candidate.
    A candidate is kept only if ``tld.get_tld`` returns a truthy value for
    it once ``http://`` is prepended.  Kept values are the raw regex
    matches, so they may still include a ``user[:password]@`` prefix
    and/or a ``:port`` suffix.

    :param text: string to scan.
    :returns: a ``set`` of matched strings; empty if nothing qualifies.
    """
    # fail_silently=True makes get_tld return None for hosts without a
    # known TLD instead of raising, so unknown hosts are filtered out
    # rather than aborting the whole scan.
    return {
        candidate
        for candidate in RE_DOMAIN.findall(text)
        if tld.get_tld('http://{}'.format(candidate), fail_silently=True)
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment