@harpocrates
Created August 23, 2019 03:54
Crawl a website looking for download links
#!/usr/bin/env python3
import requests
import shutil
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import argparse

class Crawler:
    def __init__(self, url_regex="", credentials=None):
        self.todo = []                          # pages queued for crawling
        self.visited = set()                    # URLs already queued or crawled
        self.url_regex = re.compile(url_regex)
        self.credentials = credentials

    def queue_page(self, url):
        """Add a page to the TODO list, unless it has already been seen"""
        if url in self.visited: return
        self.visited.add(url)
        self.todo.append(url)

    def download_file(self, url, file_name):
        """Download a file to the given local path"""
        print("Downloading " + url + " to " + file_name)

        # Create missing directories
        dir_name = os.path.dirname(file_name)
        if dir_name and not os.path.exists(dir_name):
            os.makedirs(dir_name)

        # Stream the file to disk instead of loading it into memory
        with requests.get(url, stream=True, auth=self.credentials) as r:
            r.raw.decode_content = True  # undo any transport compression (gzip/deflate)
            with open(file_name, 'wb') as file_handle:
                shutil.copyfileobj(r.raw, file_handle)

    def crawl_page(self, url):
        """Crawl a page, then return the list of links found on it"""
        # Don't follow anything that doesn't match the URL regex
        if not self.url_regex.match(url):
            print("Skipping " + url)
            return []

        # Check we've got HTML on our hands before crawling forward
        head_req = requests.head(url, auth=self.credentials)
        if "text/html" in head_req.headers.get("content-type", ""):
            with requests.get(url, auth=self.credentials) as r:
                soup = BeautifulSoup(r.content, features="html.parser")
                hrefs = soup.find_all('a', href=True)
                return [urljoin(url, link['href']) for link in hrefs]

        # Download anything else, mirroring the URL path locally
        local_path = urlparse(url).path
        if local_path.startswith("/"): local_path = local_path[1:]
        self.download_file(url, local_path)
        return []

    def visit_bounded(self, limit=10000):
        """Process queued pages, visiting at most `limit` of them"""
        count = 0
        while self.todo and count < limit:
            some_url = self.todo.pop()
            print(str(count) + ": " + some_url)
            count += 1
            for new_url in self.crawl_page(some_url):
                self.queue_page(new_url)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Crawl webpages for download links')
    parser.add_argument('username', metavar='USERNAME', type=str, nargs='?')
    parser.add_argument('password', metavar='PASSWORD', type=str, nargs='?')
    parser.add_argument('url_regex', metavar='REGEX', type=str, nargs='?')
    parser.add_argument('limit_pages', metavar='N', type=int)
    parser.add_argument('start_url', metavar='URL', type=str)
    args = parser.parse_args()

    creds = (args.username, args.password) if args.username and args.password else None

    # Fall back to the match-everything pattern "" when no regex was given
    crawler = Crawler(url_regex=args.url_regex or "", credentials=creds)
    crawler.queue_page(args.start_url)
    crawler.visit_bounded(args.limit_pages)
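
For reference, a minimal usage sketch follows. It assumes the script above is saved as crawl.py; the URLs, regex, and credentials are placeholders, not values from the gist.

# Command-line usage (crawl.py is an assumed file name):
#   ./crawl.py 500 https://example.com/files/
#   ./crawl.py alice s3cret 'https://example\.com/files/.*' 500 https://example.com/files/
#
# The Crawler class can also be driven programmatically:
from crawl import Crawler

crawler = Crawler(url_regex=r"https://example\.com/files/.*",   # only follow matching URLs
                  credentials=("alice", "s3cret"))              # or None for no HTTP basic auth
crawler.queue_page("https://example.com/files/")                # seed the TODO list
crawler.visit_bounded(limit=500)                                # crawl at most 500 pages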