Skip to content

Instantly share code, notes, and snippets.

@dansanderson
Created November 6, 2015 19:23
Show Gist options
  • Save dansanderson/e6df90a0fb8ac49bb0e0 to your computer and use it in GitHub Desktop.
Save dansanderson/e6df90a0fb8ac49bb0e0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import datetime
import os
import re
import time
import urllib.request
# Matches a double-quoted cart link in BBS page HTML, for example
# "/bbs/cposts/1/12345.p8.png".  Group 1 is the URL path of the cart; group 2
# is the optional ".png" suffix (callers unpack both groups from findall()).
# [^"]* keeps each match inside a single quoted attribute, so several cart
# links on the same line are reported separately instead of being merged.
CPOSTS_RE = re.compile(r'"(/bbs/cposts/[^"]*\.p8(\.png)?)"')

# Seconds to pause after every HTTP request, to be polite to the server.
SLEEP_SECS = 1
class Error(Exception):
    """Base error raised by this script when a post or cart cannot be fetched."""
def msg(s):
    """Print a progress message prefixed with the current local timestamp.

    Args:
      s: the message text.
    """
    print('{} {}'.format(datetime.datetime.now(), s))
def get_url(u, timeout=30):
    """Get a URL, then sleep a polite duration.

    The sleep happens after every request, whether it succeeded or failed.

    Args:
      u: full URL
      timeout: seconds to wait on the network before giving up.

    Returns:
      The response body as bytes, or None if the request failed.
    """
    msg('Getting URL: {}'.format(u))
    try:
        # urlopen() never returns None: it reports HTTP errors, DNS failures
        # and timeouts by raising.  URLError/HTTPError and socket timeouts
        # are all OSError subclasses, so one handler covers them.
        with urllib.request.urlopen(u, timeout=timeout) as resp:
            return resp.read()
    except OSError as e:
        msg('... Request failed: {}'.format(e))
        return None
    finally:
        time.sleep(SLEEP_SECS)
def save_cart(path):
    """Fetch and save a cart for a path.

    This saves it to a file named after the last component of the path, in
    the current working directory. An existing file of that name is
    overwritten.

    Args:
      path: the URL path for the cart as it might appear in a BBS post. A
        full http:// or https:// URL is used as-is; anything else is taken
        to be a path on www.lexaloffle.com.

    Raises:
      Error: could not fetch cart, or the path has no file name.
    """
    cart_fname = os.path.basename(path)
    if not cart_fname:
        # A path ending in '/' would otherwise fail later in open('').
        raise Error('no file name in path {}'.format(path))
    # Accept both schemes so an https:// URL does not get the host prepended.
    if not path.startswith(('http://', 'https://')):
        path = 'http://www.lexaloffle.com' + path
    cart_data = get_url(path)
    if cart_data is None:
        raise Error('no cart at {}'.format(path))
    msg('Saving cart {} ({} bytes)'.format(cart_fname, len(cart_data)))
    with open(cart_fname, 'wb') as fh:
        fh.write(cart_data)
def get_bbspost(id):
    """Fetch the text of a BBS post given a post ID.

    Args:
      id: The BBS post ID as an int.

    Returns:
      The page text as a str (decoded from UTF-8), or None if the page could
      not be fetched.

    Raises:
      UnicodeDecodeError: the page body is not valid UTF-8.
    """
    data = get_url('http://www.lexaloffle.com/bbs/?tid=' + str(id))
    if data is None:
        return None
    return data.decode('utf-8')
def save_carts_for_post(id):
    """Save all carts on a given BBS post.

    Each distinct cart path found in the page is downloaded once, in the
    order it first appears.

    Args:
      id: The BBS post ID as an int.

    Raises:
      Error: the post could not be fetched, or a cart could not be saved.
    """
    post = get_bbspost(id)
    if post is None:
        raise Error('no post for ID {}'.format(id))
    # A page can reference the same cart more than once; fetching it again
    # would only rewrite the same file and cost an extra request.
    seen = set()
    for path, _ in CPOSTS_RE.findall(post):
        if path in seen:
            continue
        seen.add(path)
        save_cart(path)
def crawl(start=2000, stop=1800):
    """Crawl a descending range of BBS post IDs and save every cart found.

    A post that fails (network error, undecodable page, missing cart) is
    logged and skipped so that one bad post does not abort the whole run.

    Args:
      start: the first (highest) post ID to visit.
      stop: the crawl counts down from start and stops before this ID; it is
        not visited itself.
    """
    # Imported here so the handler below works even if the module only
    # imported urllib.request.
    import urllib.error

    for post_id in range(start, stop, -1):
        msg('Crawling post {}'.format(post_id))
        try:
            save_carts_for_post(post_id)
        except UnicodeDecodeError:
            msg('*** UnicodeDecodeError for post {}, skipping'.format(post_id))
        except (Error, urllib.error.URLError) as e:
            msg('*** Error for post {}: {}; skipping'.format(post_id, e))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment