Wiki crawler
'''
(c)2016 Clayton A Davis
'''
import itertools
import random
import time
from html.parser import HTMLParser
from urllib.parse import urlparse, urljoin, unquote

import requests


class WikiLinkParser(HTMLParser):
    '''
    Simple parser for Wikipedia HTML pages. Extends stdlib html.parser:
    https://docs.python.org/3/library/html.parser.html

    For more serious scraping tasks, check out scrapy:
    https://scrapy.org/
    '''
    def __init__(self):
        self.links = set()
        super().__init__()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            attributes = dict(attrs)
            title = attributes.get('title')
            href = attributes.get('href')
            if title and href:
                url_parse = urlparse(href)
                url_path = url_parse.path
                # We want links that look like '/wiki/Social_web', but we want to
                # exclude "special" links like '/wiki/Talk:Network_science'.
                if url_path.startswith('/wiki/') and ':' not in url_path.split('/')[2]:
                    self.links.add(url_path)
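
# Illustrative sketch (not part of the original gist): what the filter above
# keeps and what it discards. Article paths such as '/wiki/Social_web' are
# collected, while namespaced paths such as '/wiki/Talk:Network_science' are
# skipped.
#
#     parser = WikiLinkParser()
#     parser.feed('<a href="/wiki/Social_web" title="Social web">x</a>')
#     parser.feed('<a href="/wiki/Talk:Network_science" title="Talk">x</a>')
#     parser.links  # {'/wiki/Social_web'}
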
class WikiCrawler(object):
    '''
    Crawler that starts at a given Wikipedia page and recursively follows wiki links.

    Arguments:
        start_url: the full URL of the wiki page at which to start the crawl
        links_to_follow (default 10): the total number of links to follow during the crawl
        delay (default 0.2): the delay, in seconds, between HTTP requests
    '''
    def __init__(self, start_url, links_to_follow=10, delay=0.2):
        url_parse = urlparse(start_url)
        self.root = '{scheme}://{netloc}'.format(
            scheme=url_parse.scheme,
            netloc=url_parse.netloc,
        )
        self.start_page = url_parse.path
        self.max_links_to_crawl = links_to_follow
        self.fetch_delay = delay
        # Adjacency list: maps each crawled page path to the set of wiki links
        # found on that page.
        self.adj_list = {}

    def _fetch_links(self, path):
        print('Fetching {}...'.format(path))
        url = urljoin(self.root, path)
        resp = requests.get(url)
        parser = WikiLinkParser()
        parser.feed(resp.text)
        self.adj_list[path] = parser.links
        print(' ...{} links found.'.format(len(parser.links)))
    def crawl(self):
        num_links_crawled = len(self.adj_list)
        num_links_to_crawl = self.max_links_to_crawl - num_links_crawled
        if num_links_to_crawl <= 0:
            return
        if num_links_crawled == 0:
            # First pass: the only page we know about is the start page.
            links = [self.start_page]
        else:
            # Candidate links are everything seen so far, minus pages already
            # crawled; deduplicate so the same page isn't fetched twice.
            possible_links = itertools.chain.from_iterable(self.adj_list.values())
            links = list({l for l in possible_links if l not in self.adj_list})
        if len(links) == 0:
            return
        elif len(links) <= num_links_to_crawl:
            to_crawl = links
        else:
            to_crawl = random.sample(links, num_links_to_crawl)
        for idx, link in enumerate(to_crawl):
            if idx > 0:
                time.sleep(self.fetch_delay)
            self._fetch_links(link)
        # Recurse until the link budget is exhausted or no new links remain.
        self.crawl()
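

if __name__ == '__main__':
    # Usage sketch (not part of the original gist): crawl a handful of pages
    # starting from an example article and print the resulting adjacency list.
    # The start URL, link budget, and delay below are illustrative values.
    crawler = WikiCrawler(
        'https://en.wikipedia.org/wiki/Network_science',
        links_to_follow=5,
        delay=0.5,
    )
    crawler.crawl()
    for page, links in sorted(crawler.adj_list.items()):
        print('{}: {} outgoing wiki links'.format(page, len(links)))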