Created
July 27, 2016 03:11
-
-
Save dmpayton/e8cf14c569cacaf3a6d6212db2238cce to your computer and use it in GitHub Desktop.
FresnoPython web crawler demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1. Create a virtualenv: `mkvirtualenv philosophy -p /usr/bin/python3.5`
2. Install dependencies: `pip install requests beautifulsoup4`
3a. Run `python crawler.py`
3b. Run `python crawler.py <article_slug>`
4. Profit^WPhilosophy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# https://xkcd.com/903/ (2011-05-25) | |
# https://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy | |
import re | |
import sys | |
from itertools import chain | |
from urllib.parse import quote | |
import requests | |
from bs4 import BeautifulSoup | |
# Pages already visited in this run; crawl() consults this to detect cycles.
crawled = list()
def extract_paragraphs(soup):
    """Yield cleaned copies of the article's <p> and <li> elements.

    Per the Getting-to-Philosophy rules, italic text, hatnote divs and
    tables are removed first.  Then, for each paragraph/list item, any
    text inside parentheses is stripped character by character -- while
    markup such as a[href] is left untouched so links survive.

    soup -- a BeautifulSoup tree for the article content.
    Yields: a BeautifulSoup fragment per cleaned paragraph/list item.
    """
    # Remove italic text, hatnotes and tables from the tree in place.
    invalid = chain(
        soup.find_all('i'),
        soup.find_all('div', class_='hatnote'),
        soup.find_all('table'),
    )
    for item in invalid:
        item.extract()
    # Loop through paragraphs, removing text in ()'s and yielding
    # re-parsed cleaned content.
    for paragraph in chain(soup.find_all('p'), soup.find_all('li')):
        paren_depth = 0   # how deeply nested in ()'s we currently are
        in_tag = False    # inside <...>? then ()'s are markup, not prose
        cleaned = ''
        for char in str(paragraph):
            # Track when we enter and exit tags so parens inside
            # attributes (e.g. an href) are not counted.
            if char == '<':
                in_tag = True
            elif char == '>':
                in_tag = False
            if not in_tag:
                if char == '(':
                    paren_depth += 1
                elif char == ')':
                    # Clamp at zero: a stray ')' (smiley, malformed
                    # markup) must not drive the depth negative and
                    # suppress every character that follows.
                    paren_depth = max(paren_depth - 1, 0)
                    continue
            # Outside all ()'s the character is clean; note '(' itself
            # is excluded because the depth is already 1 here.
            if paren_depth == 0:
                cleaned += char
        yield BeautifulSoup(cleaned, 'html.parser')
def crawl(page, n=0):
    """Follow the first in-article link of each page until we reach
    /wiki/Philosophy, printing the numbered chain as we go.

    page -- a wiki path such as '/wiki/Python'
    n    -- recursion depth, used only to number the printed output
    """
    if page == quote('/wiki/Philosophy'):
        # We've arrived
        print('{0}. !! Philosophy !!'.format(n))
        return
    # Track what pages we've crawled so we can detect infinite loops
    # /wiki/Net_register_tonnage -> /wiki/Gross_register_tonnage
    if page in crawled:
        print('{0}. !! Infinite loop detected !!'.format(n))
        print(page)
        return
    crawled.append(page)
    # Get and parse the page content.
    url = 'https://en.wikipedia.org{0}'.format(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Print the numbered title of the page.
    title = soup.find('h1', id='firstHeading')
    article = soup.find(id='mw-content-text')
    print('{0}. {1}'.format(n, title.text))
    # Iterate over the cleaned paragraphs until one contains a plain
    # article link (the regex excludes namespaced pages like 'File:'),
    # then crawl that page.
    for para in extract_paragraphs(article):
        anchor = para.find('a', href=re.compile(r'^/wiki/[^\:]+$'))
        if anchor is not None:
            # Renamed from 'next', which shadows the builtin.
            next_page = dict(anchor.attrs)['href']
            return crawl(next_page, n + 1)
    # No paragraph yielded a usable link.
    print('The trail went cold. :(')
if __name__ == '__main__':
    # Start from the slug given on the command line, or from a random
    # article when none was supplied.
    if len(sys.argv) > 1:
        page = '/wiki/{0}'.format(sys.argv[1])
    else:
        page = '/wiki/Special:Random'
    crawl(page)
Question: does this script use any features of Python 3 which wouldn't work with 2.7?
@MrCsabaToth The only change you should need for Python 2 is to change line 9 to from urllib import quote
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Interesting! French, Spanish and Hungarian languages lead to a lot of infinite loops, so apparently only English articles are well formed enough not to get into infinite loops. With `urllib` the script supports those languages whose translated version of philosophy has accents, like filozófia. Pretty cool.