Created
July 27, 2016 03:11
-
-
Save dmpayton/e8cf14c569cacaf3a6d6212db2238cce to your computer and use it in GitHub Desktop.
FresnoPython web crawler demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1. Create a virtualenv: `mkvirtualenv philosophy -p /usr/bin/python3.5`
2. Install dependencies: `pip install requests beautifulsoup4`
3a. Run `python crawler.py`
3b. Run `python crawler.py <article_slug>`
4. Profit^WPhilosophy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# https://xkcd.com/903/ (2011-05-25) | |
# https://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy | |
import re | |
import sys | |
from itertools import chain | |
from urllib.parse import quote | |
import requests | |
from bs4 import BeautifulSoup | |
# Pages already visited in this run; crawl() consults this to detect cycles.
crawled = list()
def extract_paragraphs(soup):
    """Yield cleaned copies of the article's <p> and <li> elements.

    Per the Getting-to-Philosophy rules, italic text, hatnote divs and
    tables are removed first.  Then, for each paragraph/list item, any
    text inside parentheses is stripped character by character -- while
    markup such as a[href] is left untouched so links survive.

    soup -- a BeautifulSoup tree for the article content.
    Yields: a BeautifulSoup fragment per cleaned paragraph/list item.
    """
    # Remove italic text, hatnotes and tables from the tree in place.
    invalid = chain(
        soup.find_all('i'),
        soup.find_all('div', class_='hatnote'),
        soup.find_all('table'),
    )
    for item in invalid:
        item.extract()
    # Loop through paragraphs, removing text in ()'s and yielding
    # re-parsed cleaned content.
    for paragraph in chain(soup.find_all('p'), soup.find_all('li')):
        paren_depth = 0   # how deeply nested in ()'s we currently are
        in_tag = False    # inside <...>? then ()'s are markup, not prose
        cleaned = ''
        for char in str(paragraph):
            # Track when we enter and exit tags so parens inside
            # attributes (e.g. an href) are not counted.
            if char == '<':
                in_tag = True
            elif char == '>':
                in_tag = False
            if not in_tag:
                if char == '(':
                    paren_depth += 1
                elif char == ')':
                    # Clamp at zero: a stray ')' (smiley, malformed
                    # markup) must not drive the depth negative and
                    # suppress every character that follows.
                    paren_depth = max(paren_depth - 1, 0)
                    continue
            # Outside all ()'s the character is clean; note '(' itself
            # is excluded because the depth is already 1 here.
            if paren_depth == 0:
                cleaned += char
        yield BeautifulSoup(cleaned, 'html.parser')
def crawl(page, n=0):
    """Follow the first in-article link of each page until we reach
    /wiki/Philosophy, printing the numbered chain as we go.

    page -- a wiki path such as '/wiki/Python'
    n    -- recursion depth, used only to number the printed output
    """
    if page == quote('/wiki/Philosophy'):
        # We've arrived
        print('{0}. !! Philosophy !!'.format(n))
        return
    # Track what pages we've crawled so we can detect infinite loops
    # /wiki/Net_register_tonnage -> /wiki/Gross_register_tonnage
    if page in crawled:
        print('{0}. !! Infinite loop detected !!'.format(n))
        print(page)
        return
    crawled.append(page)
    # Get and parse the page content.
    url = 'https://en.wikipedia.org{0}'.format(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Print the numbered title of the page.
    title = soup.find('h1', id='firstHeading')
    article = soup.find(id='mw-content-text')
    print('{0}. {1}'.format(n, title.text))
    # Iterate over the cleaned paragraphs until one contains a plain
    # article link (the regex excludes namespaced pages like 'File:'),
    # then crawl that page.
    for para in extract_paragraphs(article):
        anchor = para.find('a', href=re.compile(r'^/wiki/[^\:]+$'))
        if anchor is not None:
            # Renamed from 'next', which shadows the builtin.
            next_page = dict(anchor.attrs)['href']
            return crawl(next_page, n + 1)
    # No paragraph yielded a usable link.
    print('The trail went cold. :(')
if __name__ == '__main__':
    # Start from the slug given on the command line, or from a random
    # article when none was supplied.
    if len(sys.argv) > 1:
        page = '/wiki/{0}'.format(sys.argv[1])
    else:
        page = '/wiki/Special:Random'
    crawl(page)
Question: does this script use any features of Python 3 which wouldn't work with 2.7?
@MrCsabaToth The only change you should need for Python 2 is to change line 9 to from urllib import quote
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Interesting! French, Spanish and Hungarian languages lead to a lot of infinite loops, so apparently only English articles are well formed enough not to get into infinite loops. With `urllib` the script supports those languages whose translated version of philosophy has accents, like filozófia. Pretty cool.