Wiki crawler
'''
(c)2016 Clayton A Davis
'''
import itertools
import random
import time
from html.parser import HTMLParser
from urllib.parse import urlparse, urljoin, unquote

import requests


class WikiLinkParser(HTMLParser):
    '''
    Simple parser for Wikipedia HTML pages. Extends stdlib html.parser:
    https://docs.python.org/3/library/html.parser.html

    For more serious scraping tasks, check out scrapy:
    https://scrapy.org/
    '''
    def __init__(self):
        self.links = set()
        super().__init__()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            attributes = dict(attrs)
            title = attributes.get('title')
            href = attributes.get('href')
            if title and href:
                url_parse = urlparse(href)
                url_path = url_parse.path
                # We want links that look like '/wiki/Social_web', but we want to
                # exclude "special" links like '/wiki/Talk:Network_science'.
                if url_path.startswith('/wiki/') and ':' not in url_path.split('/')[2]:
                    self.links.add(url_path)
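
# Illustrative sketch (not part of the original gist): what the filter above
# keeps and what it discards. Article paths such as '/wiki/Social_web' are
# collected, while namespaced paths such as '/wiki/Talk:Network_science' are
# skipped.
#
#     parser = WikiLinkParser()
#     parser.feed('<a href="/wiki/Social_web" title="Social web">x</a>')
#     parser.feed('<a href="/wiki/Talk:Network_science" title="Talk">x</a>')
#     parser.links  # {'/wiki/Social_web'}
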
class WikiCrawler(object):
    '''
    Crawler that starts at a given Wikipedia page and recursively follows wiki links.

    Arguments:
        start_url: the full URL of the wiki page at which to start the crawl
        links_to_follow (default 10): the total number of links to follow during the crawl
        delay (default 0.2): the delay, in seconds, between HTTP requests
    '''
    def __init__(self, start_url, links_to_follow=10, delay=0.2):
        url_parse = urlparse(start_url)
        self.root = '{scheme}://{netloc}'.format(
            scheme=url_parse.scheme,
            netloc=url_parse.netloc,
        )
        self.start_page = url_parse.path
        self.max_links_to_crawl = links_to_follow
        self.fetch_delay = delay
        # Adjacency list: maps each crawled page path to the set of wiki links
        # found on that page.
        self.adj_list = {}

    def _fetch_links(self, path):
        print('Fetching {}...'.format(path))
        url = urljoin(self.root, path)
        resp = requests.get(url)
        parser = WikiLinkParser()
        parser.feed(resp.text)
        self.adj_list[path] = parser.links
        print(' ...{} links found.'.format(len(parser.links)))
    def crawl(self):
        num_links_crawled = len(self.adj_list)
        num_links_to_crawl = self.max_links_to_crawl - num_links_crawled
        if num_links_to_crawl <= 0:
            return
        if num_links_crawled == 0:
            # First pass: the only page we know about is the start page.
            links = [self.start_page]
        else:
            # Candidate links are everything seen so far, minus pages already
            # crawled; deduplicate so the same page isn't fetched twice.
            possible_links = itertools.chain.from_iterable(self.adj_list.values())
            links = list({l for l in possible_links if l not in self.adj_list})
        if len(links) == 0:
            return
        elif len(links) <= num_links_to_crawl:
            to_crawl = links
        else:
            to_crawl = random.sample(links, num_links_to_crawl)
        for idx, link in enumerate(to_crawl):
            if idx > 0:
                time.sleep(self.fetch_delay)
            self._fetch_links(link)
        # Recurse until the link budget is exhausted or no new links remain.
        self.crawl()
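

if __name__ == '__main__':
    # Usage sketch (not part of the original gist): crawl a handful of pages
    # starting from an example article and print the resulting adjacency list.
    # The start URL, link budget, and delay below are illustrative values.
    crawler = WikiCrawler(
        'https://en.wikipedia.org/wiki/Network_science',
        links_to_follow=5,
        delay=0.5,
    )
    crawler.crawl()
    for page, links in sorted(crawler.adj_list.items()):
        print('{}: {} outgoing wiki links'.format(page, len(links)))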