Created January 11, 2017 20:10
Made with a little help because I don't know Python for shit...
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import traceback


class Crawler(HTMLParser):
    """HTML parser that collects the absolute URLs of all anchor tags on a page."""

    def handle_starttag(self, tag, attribs):
        if tag == 'a':
            for (key, val) in attribs:
                if key == 'href':
                    # Resolve relative links against the page's base URL
                    newUrl = parse.urljoin(self.baseUrl, val)
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        self.links = []
        self.baseUrl = url
        response = urlopen(url)
        # The header often carries a charset suffix (e.g. "text/html; charset=utf-8"),
        # so check the prefix rather than testing for exact equality.
        contentType = response.getheader('Content-Type') or ''
        if contentType.startswith('text/html'):
            htmlBytes = response.read()
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "", []


def crawl(url, word, maxPages):
    """Breadth-first crawl from url, stopping when word is found or maxPages is reached."""
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited + 1
        # Take the next page from the front of the queue
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = Crawler()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
            # Queue every link discovered on this page
            pagesToVisit = pagesToVisit + links
            print(" **Success!** ")
        except Exception:
            print(" **Failure!** ")
            traceback.print_exc()
    if foundWord:
        print("The word \"", word, "\" was found at \"", url, "\"")
    else:
        print("Could not find the word \"", word, "\" anywhere! D:")