Last active
February 8, 2018 18:01
-
-
Save devharsh/4ef651181d490260cfcf2b39c2b37bb3 to your computer and use it in GitHub Desktop.
Lists all unique internal and external links found on each of the given URLs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""List the unique internal and external links found on each given URL.

The URLs to inspect are read from the command-line arguments.
"""

__author__ = "Devharsh Trivedi"
__copyright__ = "Copyright 2018, Devharsh Trivedi"
__license__ = "GPL"
__version__ = "1.4"
__maintainer__ = "Devharsh Trivedi"
# NOTE(review): the address below looks like a redaction placeholder left by
# the page export; restore the real contact address if it is known.
__email__ = "[email protected]"
__status__ = "Production"
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Schemes that never lead to a page (script hooks, mail and phone links).
SKIPPED_SCHEMES = ("javascript:", "mailto:", "tel:")

# Seconds to wait for a server before giving up; without a timeout a single
# unresponsive host would block the whole run indefinitely.
REQUEST_TIMEOUT = 30


def is_followable(href):
    """Return True when *href* points at a page rather than a pseudo-link.

    Rejected are values of at most one character after stripping, in-page
    fragments (``#...``) and anything whose scheme is in ``SKIPPED_SCHEMES``.
    The scheme is matched as a case-insensitive prefix, so a URL that merely
    contains ``mailto:`` in its query string is still accepted.
    """
    target = href.strip()
    if len(target) <= 1 or target.startswith("#"):
        return False
    return not target.lower().startswith(SKIPPED_SCHEMES)


def is_internal(page_url, href):
    """Return True when *href* stays on the site that served *page_url*.

    A link without a network location (a relative path) is internal. A link
    with one is internal only if its host equals the page's host or is a
    subdomain of it; a plain substring test would wrongly accept
    ``notexample.com`` for ``example.com``.

    Raises:
        ValueError: if ``urlparse`` rejects *href* as malformed.
    """
    target_host = urlparse(href.strip()).netloc.lower()
    if not target_host:
        return True
    page_host = urlparse(page_url).netloc.lower()
    return target_host == page_host or target_host.endswith("." + page_host)


def classify_links(page_url, hrefs):
    """Split *hrefs* into ``(internal, external)`` sets of stripped links.

    Pseudo-links (see ``is_followable``) and hrefs that ``urlparse`` cannot
    parse are left out of both sets.
    """
    internal = set()
    external = set()
    for href in hrefs:
        target = href.strip()
        if not is_followable(target):
            continue
        try:
            bucket = internal if is_internal(page_url, target) else external
        except ValueError:
            # One malformed href must not discard the rest of the page.
            continue
        bucket.add(target)
    return internal, external


def fetch_hrefs(page_url):
    """Download *page_url* and return the ``href`` of every ``<a>`` that has one."""
    page = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.text, "lxml")
    return [anchor["href"] for anchor in soup.find_all("a", attrs={"href": True})]


def print_report(page_url, internal, external):
    """Print both link groups for *page_url*, sorted so the output is stable."""
    print('\n')
    print(page_url)
    print('---------------------')
    print('\n')
    print(str(len(internal)) + ' internal links found:')
    print('\n')
    for item in sorted(internal):
        print(item)
    print('\n')
    print(str(len(external)) + ' external links found:')
    print('\n')
    for item in sorted(external):
        print(item)
    print('\n')


def main(urls=None):
    """Report the links of every URL in *urls* (default: ``sys.argv[1:]``).

    A URL that cannot be fetched or parsed is reported on stderr and skipped,
    so the remaining URLs are still processed.
    """
    if urls is None:
        urls = sys.argv[1:]
    for page_url in urls:
        try:
            hrefs = fetch_hrefs(page_url)
        except Exception as exc:
            # Top-level boundary of a best-effort script: report and move on.
            print("{}: {}".format(page_url, exc), file=sys.stderr)
            continue
        internal, external = classify_links(page_url, hrefs)
        print_report(page_url, internal, external)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment