Skip to content

Instantly share code, notes, and snippets.

@devharsh
Last active February 8, 2018 18:01
Show Gist options
  • Save devharsh/4ef651181d490260cfcf2b39c2b37bb3 to your computer and use it in GitHub Desktop.
Save devharsh/4ef651181d490260cfcf2b39c2b37bb3 to your computer and use it in GitHub Desktop.
Lists all unique internal and external links found on each of the given URLs.
#!/usr/bin/python
"""List all unique internal and external links found on each URL given on the command line."""
# Module metadata.
__author__ = "Devharsh Trivedi"
__copyright__ = "Copyright 2018, Devharsh Trivedi"
__license__ = "GPL"
__version__ = "1.4"
__maintainer__ = "Devharsh Trivedi"
__email__ = "[email protected]"
__status__ = "Production"
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
try:
for link in sys.argv[1:]:
page = requests.get(link)
soup = BeautifulSoup(page.text, "lxml")
extlist = set()
intlist = set()
for a in soup.findAll("a", attrs={"href":True}):
if len(a['href'].strip()) > 1 and a['href'][0] != '#' and 'javascript:' not in a['href'].strip() and 'mailto:' not in a['href'].strip() and 'tel:' not in a['href'].strip():
if 'http' in a['href'].strip() or 'https' in a['href'].strip():
if urlparse(link).netloc.lower() in urlparse(a['href'].strip()).netloc.lower():
intlist.add(a['href'])
else:
extlist.add(a['href'])
else:
intlist.add(a['href'])
print('\n')
print(link)
print('---------------------')
print('\n')
print(str(len(intlist)) + ' internal links found:')
print('\n')
for il in intlist:
print(il)
print('\n')
print(str(len(extlist)) + ' external links found:')
print('\n')
for el in extlist:
print(el)
print('\n')
except Exception as e:
print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment