Crawl a website looking for download links
#!/usr/bin/env python3
import argparse
import os
import re
import shutil
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class Crawler:
    def __init__(self, url_regex="", credentials=None):
        self.todo = []
        self.visited = set()
        self.url_regex = re.compile(url_regex)
        self.credentials = credentials

    def queue_page(self, url):
        """Add a page to the TODO list, if it isn't already visited."""
        if url in self.visited:
            return
        self.visited.add(url)
        self.todo.append(url)
    def download_file(self, url, file_name):
        """Stream-download `url` to the local path `file_name`."""
        print("Downloading " + url + " to " + file_name)

        # Create missing directories
        dir_name = os.path.dirname(file_name)
        if dir_name and not os.path.exists(dir_name):
            os.makedirs(dir_name)

        # Stream the response body straight to disk so large files
        # never have to fit in memory
        with requests.get(url, stream=True, auth=self.credentials) as r:
            with open(file_name, 'wb') as file_handle:
                shutil.copyfileobj(r.raw, file_handle)
    def crawl_page(self, url):
        """Crawl a page, then return a list of the URLs it links to."""
        # Don't follow anything that doesn't match the URL regex
        if not self.url_regex.match(url):
            print("Skipping " + url)
            return []

        # Check we've got HTML on our hands before crawling forward
        head_req = requests.head(url, auth=self.credentials)
        if "text/html" in head_req.headers.get("content-type", ""):
            with requests.get(url, auth=self.credentials) as r:
                soup = BeautifulSoup(r.content, features="html.parser")
                hrefs = soup.find_all('a', href=True)
                return [urljoin(url, link['href']) for link in hrefs]

        # Download anything else, mirroring the URL path locally...
        local_path = urlparse(url).path
        if local_path.startswith("/"):
            local_path = local_path[1:]
        self.download_file(url, local_path)
        return []
    def visit_bounded(self, limit=10000):
        """Process items from the TODO list, visiting at most `limit` pages."""
        count = 0
        while self.todo and count < limit:
            some_url = self.todo.pop()
            print(str(count) + ": " + some_url)
            for new_url in self.crawl_page(some_url):
                self.queue_page(new_url)
            count += 1
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Crawl webpages for download links')
    parser.add_argument('username', metavar='USERNAME', type=str, nargs='?')
    parser.add_argument('password', metavar='PASSWORD', type=str, nargs='?')
    parser.add_argument('url_regex', metavar='REGEX', type=str, nargs='?')
    parser.add_argument('limit_pages', metavar='N', type=int)
    parser.add_argument('start_url', metavar='URL', type=str)
    args = parser.parse_args()

    creds = (args.username, args.password) if args.username and args.password else None
    crawler = Crawler(url_regex=args.url_regex or "", credentials=creds)
    crawler.queue_page(args.start_url)
    crawler.visit_bounded(args.limit_pages)
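
As a quick usage sketch, the Crawler class can also be driven directly from Python rather than through the command line. The module name crawl, the example URL, regex, and credentials below are placeholders, not part of the gist:

# Minimal sketch of using the Crawler class programmatically, assuming the
# script above is saved as crawl.py. The URL, regex, and credentials are
# placeholder values.
from crawl import Crawler

crawler = Crawler(
    url_regex=r"https://example\.org/files/.*",  # only follow URLs matching this pattern
    credentials=("someuser", "somepassword"),    # or None for unauthenticated sites
)
crawler.queue_page("https://example.org/files/")
crawler.visit_bounded(limit=500)                 # visit at most 500 pages

The equivalent command-line invocation passes USERNAME PASSWORD REGEX N URL positionally, in that order, with the first three optional.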