@clayadavis
Created November 29, 2016 20:09
Wiki crawler
'''
(c)2016 Clayton A Davis
'''
import itertools
import random
import time
from html.parser import HTMLParser
from urllib.parse import urlparse, urljoin, unquote

import requests


class WikiLinkParser(HTMLParser):
    '''
    Simple parser for Wikipedia HTML pages. Extends stdlib html.parser:
    https://docs.python.org/3/library/html.parser.html

    For more serious scraping tasks, check out scrapy:
    https://scrapy.org/
    '''
    def __init__(self):
        self.links = set()
        super().__init__()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            attributes = dict(attrs)
            title = attributes.get('title')
            href = attributes.get('href')
            if title and href:
                url_parse = urlparse(href)
                url_path = url_parse.path
                # We want links that look like '/wiki/Social_web' but we want to
                # exclude "special" links like '/wiki/Talk:Network_Science'
                if url_path.startswith('/wiki/') and ':' not in url_path.split('/')[2]:
                    self.links.add(url_path)
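
# Not part of the original gist: a quick, hypothetical illustration of what
# WikiLinkParser keeps and what it filters out (the HTML snippet is made up).
#
#     parser = WikiLinkParser()
#     parser.feed('<a href="/wiki/Graph_theory" title="Graph theory">graphs</a>'
#                 '<a href="/wiki/Talk:Graph_theory" title="Talk">talk</a>'
#                 '<a href="#cite_note-1">[1]</a>')
#     print(parser.links)  # {'/wiki/Graph_theory'}; the Talk: and bare-anchor
#                          # links are excluded.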


class WikiCrawler(object):
    '''
    Crawler that starts at a given Wikipedia page and recursively follows wiki links.

    Arguments:
        start_url: the full URL of the wiki page at which to start the crawl
        links_to_follow (default 10): the total number of links to follow for the crawl
        delay (default 0.2): the delay, in seconds, between HTTP requests
    '''
    def __init__(self, start_url, links_to_follow=10, delay=0.2):
        url_parse = urlparse(start_url)
        self.root = '{scheme}://{netloc}'.format(
            scheme=url_parse.scheme,
            netloc=url_parse.netloc,
        )
        self.start_page = url_parse.path
        self.max_links_to_crawl = links_to_follow
        self.fetch_delay = delay
        # Adjacency list: maps each crawled page path to the set of wiki
        # links found on that page.
        self.adj_list = {}

    def _fetch_links(self, path):
        '''Fetch the page at `path` and record its wiki links in the adjacency list.'''
        print('Fetching {}...'.format(path))
        url = urljoin(self.root, path)
        resp = requests.get(url)
        parser = WikiLinkParser()
        parser.feed(resp.text)
        self.adj_list[path] = parser.links
        print(' ...{} links found.'.format(len(parser.links)))

    def crawl(self):
        '''Crawl pages, starting at `start_page`, until `max_links_to_crawl` is reached.'''
        num_links_crawled = len(self.adj_list)
        num_links_to_crawl = self.max_links_to_crawl - num_links_crawled
        if num_links_to_crawl <= 0:
            return

        if num_links_crawled == 0:
            # First round: the only page to fetch is the starting page.
            links = [self.start_page]
        else:
            # Candidates are all links seen so far that have not themselves been
            # crawled. The set comprehension removes duplicates so the same page
            # is not fetched more than once in a single round.
            possible_links = itertools.chain.from_iterable(self.adj_list.values())
            links = list({l for l in possible_links if l not in self.adj_list})

        if len(links) == 0:
            return
        elif len(links) <= num_links_to_crawl:
            to_crawl = links
        else:
            to_crawl = random.sample(links, num_links_to_crawl)

        for idx, link in enumerate(to_crawl):
            if idx > 0:
                time.sleep(self.fetch_delay)
            self._fetch_links(link)

        # Recurse to follow links discovered in this round.
        self.crawl()
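
A minimal usage sketch, not part of the original gist. The start URL below is just an illustrative choice (any full Wikipedia article URL should work); it runs the crawl and then prints how many links were found on each crawled page using the adjacency list the crawler builds.

if __name__ == '__main__':
    crawler = WikiCrawler('https://en.wikipedia.org/wiki/Network_science',
                          links_to_follow=10, delay=0.2)
    crawler.crawl()

    # adj_list maps each crawled page path to the set of wiki links it contains.
    for page, links in crawler.adj_list.items():
        print('{} -> {} links'.format(page, len(links)))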