Print list of all comics from http://questionablecontent.net/archive.php
#!/usr/bin/env python3
'''Print list of all comics from http://questionablecontent.net/archive.php'''
import csv
import re
import sys
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup as BS

__author__ = 'Ed Flanagan'
__email__ = '[email protected]'
__license__ = 'BSD-2-Clause'

BASE_URL = 'http://questionablecontent.net'
ARCHIVE_URL = urljoin(BASE_URL, 'archive.php')


def eprint(*args, **kwargs):
    '''Print to stderr'''
    print(*args, file=sys.stderr, **kwargs)


def scrape_page(url):
    '''Return the decoded response body for url'''
    # urllib's default User-Agent is sometimes blocked, so send a custom one.
    headers = {'User-Agent': 'urllib'}
    req = Request(url, headers=headers)
    # HTTPError/URLError propagate unchanged to the caller, which reports them.
    res = urlopen(req)
    encoding = res.info().get_content_charset(failobj='utf-8')
    content = res.read()
    try:
        return content.decode(encoding)
    except UnicodeDecodeError:
        eprint('Failed to decode response as {!r}; returning raw bytes.'.format(encoding))
        return content
    except LookupError:
        # The server advertised a charset Python does not recognize.
        eprint('Unknown charset {!r}; returning raw bytes.'.format(encoding))
        return content


def get_comic_links(html):
    '''Return a list of comic links sorted by comic id'''
    link_regex = re.compile(r'^view\.php\?comic=(\d+)')
    title_regex = re.compile(r'Comic\s*(\d+):\s*(.+)')
    soup = BS(html, 'html5lib')
    comic_links = soup.find_all('a', href=link_regex)
    links = []
    for link in comic_links:
        title_matches = title_regex.match(link.get_text())
        if title_matches is None:
            # Skip anchors whose text does not follow the 'Comic N: Title' pattern.
            continue
        comic_id = title_matches.group(1)
        comic_title = title_matches.group(2)
        comic_link = urljoin(BASE_URL, link['href'])
        links.append({
            'id': int(comic_id),
            'title': comic_title,
            'link': comic_link
        })
    return sorted(links, key=lambda row: row['id'])


def print_tsv(rows):
    '''Print rows to stdout as headerless TSV in id, title, link order'''
    fieldnames = ['id', 'title', 'link']
    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=fieldnames,
                            quoting=csv.QUOTE_MINIMAL)
    # No header row is written; consumers rely on the fixed column order.
    for row in rows:
        writer.writerow(row)


def main():
    '''Fetch the archive page, extract comic links, and print them as TSV'''
    # Get archive page content
    try:
        raw_html = scrape_page(ARCHIVE_URL)
    except HTTPError as err:
        eprint('urllib HTTPError:')
        eprint('Error code:', err.code)
        eprint(err.reason)
        sys.exit(1)
    except URLError as err:
        eprint('urllib URLError:')
        eprint('Reason:', err.reason)
        sys.exit(1)

    # Extract links from archive page
    links = get_comic_links(raw_html)

    # Print links as TSV to stdout
    print_tsv(links)


if __name__ == '__main__':
    main()
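
Example usage, as a minimal sketch: the script depends on the third-party beautifulsoup4 and html5lib packages, and it can either be run directly (e.g. redirecting stdout to a file, since the TSV goes to stdout) or imported from another script. The filename qc_archive.py below is an assumption for illustration; the gist itself does not name the file.

# Hypothetical driver: reuse the gist's functions without going through main().
# Assumes the gist is saved alongside this file as qc_archive.py and that
# beautifulsoup4 and html5lib are installed.
from qc_archive import ARCHIVE_URL, get_comic_links, scrape_page

html = scrape_page(ARCHIVE_URL)
for row in get_comic_links(html)[:5]:
    # Each row is a dict with 'id', 'title', and 'link' keys, already
    # sorted by id; print the first few in the same TSV column order.
    print('{id}\t{title}\t{link}'.format(**row))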