@ed-flanagan
Last active August 8, 2017 23:26
Print list of all comics from http://questionablecontent.net/archive.php
#!/usr/bin/env python3
'''Print list of all comics from http://questionablecontent.net/archive.php'''

import csv
import re
import sys
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup as BS

__author__ = 'Ed Flanagan'
__email__ = '[email protected]'
__license__ = 'BSD-2-Clause'

BASE_URL = 'http://questionablecontent.net'
ARCHIVE_URL = urljoin(BASE_URL, 'archive.php')


def eprint(*args, **kwargs):
    '''Print to stderr'''
    print(*args, file=sys.stderr, **kwargs)


def scrape_page(url):
    '''Return URL response content'''
    headers = {'User-Agent': 'urllib'}
    req = Request(url, headers=headers)
    try:
        res = urlopen(req)
        encoding = res.info().get_content_charset(failobj='utf-8')
        content = res.read()
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            eprint('Failed to decode to unicode.')
            return content
        except ValueError:
            eprint('Failed to decode with charset {!r}'.format(encoding))
            return content
    except (HTTPError, URLError) as err:
        raise err


def get_comic_links(html):
    '''Return list of comic links sorted by id'''
    link_regex = re.compile(r'^view\.php\?comic=(\d+)')
    title_regex = re.compile(r'Comic\s*(\d+):\s*(.+)')
    soup = BS(html, 'html5lib')
    comic_links = soup.find_all('a', href=link_regex)
    links = []
    for link in comic_links:
        title_matches = title_regex.match(link.get_text())
        if title_matches is None:
            # Skip anchors whose text does not follow the "Comic N: Title" pattern
            continue
        comic_id = title_matches.group(1)
        comic_title = title_matches.group(2)
        comic_link = urljoin(BASE_URL, link['href'])
        links.append({
            'id': int(comic_id),
            'title': comic_title,
            'link': comic_link
        })
    return sorted(links, key=lambda row: row['id'])


def print_tsv(rows):
    '''Print TSV to stdout'''
    fieldnames = ['id', 'title', 'link']
    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=fieldnames,
                            quoting=csv.QUOTE_MINIMAL)
    for row in rows:
        writer.writerow(row)


def main():
    '''Main function'''
    # Get archive page content
    try:
        raw_html = scrape_page(ARCHIVE_URL)
    except HTTPError as err:
        eprint('urllib HTTPError:')
        eprint('Error code: ', err.code)
        eprint(err.reason)
        sys.exit(1)
    except URLError as err:
        eprint('urllib URLError:')
        eprint('Reason: ', err.reason)
        sys.exit(1)

    # Extract links from archive page
    links = get_comic_links(raw_html)

    # Print links as TSV to stdout
    print_tsv(links)


if __name__ == '__main__':
    main()
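
Usage sketch (the filename and output path below are assumptions, not part of the gist): with the beautifulsoup4 and html5lib packages installed and the script saved as qc_archive.py, running

    pip install beautifulsoup4 html5lib
    python3 qc_archive.py > qc_comics.tsv

writes one tab-separated row per comic to stdout, with the numeric comic id, the title, and the full view.php URL, sorted by id.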