Last active
August 29, 2015 14:16
-
-
Save benkant/c93aeb50046aae2ab2e9 to your computer and use it in GitHub Desktop.
check a bunch of sites for a bunch of terms
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import re
import sys
import urllib
from urlparse import urljoin, urlparse, urlunparse
# Crawl each site in `sites`, following same-site links only, and report the
# first page that mentions any of `terms`. Crawling of a site stops as soon as
# one of its pages matches.

# Lower-case search terms; page content is lower-cased before matching.
terms = ['deepmind', 'recursive']
# Start URLs, one per site to crawl.
sites = ['http://torch.ch', 'http://www.arcadelearningenvironment.org/']

# Value of a quoted href attribute. The captured group must not start with a
# quote, so an empty href="" cannot swallow the text of the next attribute,
# and a one-character target such as "/" is still matched.
HREF_RE = re.compile(r'''href=["']([^"']+)["']''', re.I)


def extract_links(content):
    """Return the raw href attribute values found in *content*, in order."""
    return HREF_RE.findall(content)


def find_term(content, search_terms):
    """Return the first entry of *search_terms* found in *content*, else None.

    The content is lower-cased once before matching; the terms themselves are
    compared as given, so they are expected to be lower case.
    """
    lowered = content.lower()
    for term in search_terms:
        if term in lowered:
            return term
    return None


def is_same_site(site_host, host):
    """Return True if *host* is *site_host* itself or one of its subdomains."""
    site_host = site_host.lower()
    host = host.lower()
    return host == site_host or host.endswith('.' + site_host)


def normalise_link(site_host, page_url, link):
    """Resolve *link* found on *page_url* to an absolute, crawlable URL.

    Returns the absolute URL with its fragment removed, or None when the link
    is malformed, is not http(s), or points outside *site_host*.
    """
    try:
        # urljoin resolves relative paths against the page they were found on
        # and turns protocol-relative links ("//host/path") into absolute
        # ones, so the host check below sees where they really point.
        absolute = urlparse(urljoin(page_url, link))
    except ValueError:
        # some garbage
        return None
    if absolute.scheme not in ('http', 'https'):
        return None
    if not is_same_site(site_host, absolute.hostname or ''):
        return None
    # Fragments address a position inside a page, not a different page.
    return absolute._replace(fragment='').geturl()


def fetch(url):
    """Return the body of *url*, or None if it cannot be retrieved."""
    try:
        response = urllib.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()
    except IOError as err:
        # One unreachable page should not abort the whole crawl.
        sys.stderr.write("skipping {}: {}\n".format(url, err))
        return None


def crawl_site(site, search_terms):
    """Crawl *site* until a page containing one of *search_terms* is found.

    Pages are taken from the end of the queue (most recently discovered
    first). Returns a ``(url, term)`` tuple for the first matching page, or
    None when every reachable same-site page has been checked without a match.
    """
    base_url = urlparse(site)
    start = base_url.geturl()
    site_host = base_url.hostname or ''
    # Every URL that has ever been queued, so no page is fetched twice.
    seen = {start}
    url_queue = [start]
    while url_queue:
        page_url = url_queue.pop()
        content = fetch(page_url)
        if content is None:
            continue
        term = find_term(content, search_terms)
        if term is not None:
            # if any terms are found, move to the next site
            return page_url, term
        for link in extract_links(content):
            target = normalise_link(site_host, page_url, link)
            if target is not None and target not in seen:
                seen.add(target)
                url_queue.append(target)
    return None


def main():
    """Check every configured site and print the first match for each."""
    for site in sites:
        hit = crawl_site(site, terms)
        if hit is not None:
            # A single parenthesised argument prints the same text whether
            # print is a statement or a function.
            print("{} contains {}".format(*hit))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment