from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
# Seed the RNG with the current time; newer Python versions require an
# int/float/str/bytes seed, so pass a timestamp rather than a datetime object
random.seed(datetime.datetime.now().timestamp())

# Retrieves a list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc
    internalLinks = []
    # Finds all links that begin with a "/" or contain the site's own URL;
    # re.escape keeps the dots in the domain from acting as regex wildcards
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+re.escape(includeUrl)+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
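
# Illustrative example (not part of the original gist): with
# includeUrl = "https://example.com", a relative href such as "/about" is
# returned as "https://example.com/about", while an absolute href that
# already contains the site's URL is kept unchanged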

# Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Finds all links that start with "http" or "www" and do
    # not contain the current URL anywhere in the string
    for link in bsObj.findAll("a", href=re.compile(
            "^(http|www)((?!"+re.escape(excludeUrl)+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
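
# How the negative-lookahead pattern above behaves (illustrative, not from
# the original gist): with excludeUrl = "example.com",
#   "http://other.org/page"    matches -> collected as external
#   "http://example.com/about" fails   -> skipped as internal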

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    # Name the parser explicitly, matching the "lxml" choice used below
    bsObj = BeautifulSoup(html, "lxml")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme+"://" \
            + urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

# Hops from one random external link to the next, printing each one;
# recursion continues until a request fails or no link can be found
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link is: "+externalLink)
    followExternalOnly(externalLink)

if __name__ == "__main__":
    html = urlopen("https://www.xxxx.com.br")
    bsObj = BeautifulSoup(html, "lxml")
    # The functions live in this file, so call getInternalLinks directly
    # rather than through an undefined "crawler" module
    links_to_crawl = getInternalLinks(bsObj, "https://www.xxxx.com.br")
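
Since followExternalOnly recurses with no base case, a long walk will eventually hit Python's recursion limit or an unreachable URL. A minimal iterative sketch of the same random walk with a hop cap (the helper name and the max_hops parameter are assumptions, not part of the original gist):

def followExternalOnlyLimited(startingSite, max_hops=10):
    # Same random walk as followExternalOnly, but bounded: stop after
    # max_hops pages instead of recursing indefinitely
    for _ in range(max_hops):
        startingSite = getRandomExternalLink(startingSite)
        print("Random external link is: "+startingSite)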