Created
November 6, 2015 19:23
-
-
Save dansanderson/e6df90a0fb8ac49bb0e0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import datetime
import os
import re
import time
import urllib.error
import urllib.request
# Matches a double-quoted cart path in BBS page HTML. Group 1 is the path
# (ending in ".p8" or ".p8.png"); group 2 is the optional ".png" suffix.
# The path body excludes '"' so that several quoted paths on one line are
# matched individually instead of being merged into one greedy match.
CPOSTS_RE = re.compile(r'"(/bbs/cposts/[^"\n]*\.p8(\.png)?)"')

# Seconds to pause after each HTTP request.
SLEEP_SECS = 1
class Error(Exception):
    """Raised when a BBS post or a cart could not be fetched."""
def msg(s):
    """Print a message to stdout, prefixed with the current timestamp.

    Args:
      s: the message to print.
    """
    timestamp = datetime.datetime.now()
    line = '{} {}'.format(timestamp, s)
    print(line)
def get_url(u):
    """Get a URL, then sleep a polite duration.

    Args:
      u: full URL

    Returns:
      The response body as bytes, or None if the request failed.
    """
    msg('Getting URL: {}'.format(u))
    try:
        # urlopen() raises on failure rather than returning None, so translate
        # those errors into the None result that callers already check for.
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(u) as resp:
            data = resp.read()
    except urllib.error.URLError as e:
        # HTTPError (e.g. a 404) is a subclass of URLError.
        msg('... Request failed: {}'.format(e))
        data = None
    finally:
        # Pause after every request, whether or not it succeeded.
        time.sleep(SLEEP_SECS)
    return data
def save_cart(path):
    """Fetch and save a cart for a path.

    This saves it to a file named after the last component of the path, in
    the current working directory.

    Args:
      path: the URL path for the cart as it might appear in a BBS post. A
        full http:// or https:// URL is also accepted and used unchanged.

    Raises:
      Error: could not fetch cart
    """
    cart_fname = os.path.basename(path)
    # Only site-relative paths need the host prepended; an absolute URL of
    # either scheme must be left alone or the result is not a valid URL.
    if not path.startswith(('http://', 'https://')):
        path = 'http://www.lexaloffle.com' + path
    cart_data = get_url(path)
    if cart_data is None:
        raise Error('no cart at {}'.format(path))
    msg('Saving cart {} ({} bytes)'.format(cart_fname, len(cart_data)))
    # Cart data is raw bytes, so write in binary mode.
    with open(cart_fname, 'wb') as fh:
        fh.write(cart_data)
def get_bbspost(id):
    """Download the page for a BBS post and return it as text.

    Args:
      id: The BBS post ID as an int.

    Returns:
      The page body decoded from UTF-8, or None if nothing was fetched.
    """
    url = 'http://www.lexaloffle.com/bbs/?tid=' + str(id)
    raw = get_url(url)
    return None if raw is None else raw.decode('utf-8')
def save_carts_for_post(id):
    """Download every cart linked from one BBS post.

    Args:
      id: The BBS post ID as an int.

    Raises:
      Error: the post, or one of its carts, could not be fetched.
    """
    page = get_bbspost(id)
    if page is None:
        raise Error('no post for ID {}'.format(id))
    # Group 1 of each match is the cart path; the suffix group is not needed.
    for found in CPOSTS_RE.finditer(page):
        save_cart(found.group(1))
def crawl(start=2000, stop=1800):
    """Crawl a descending range of BBS post IDs, saving the carts on each.

    A post that raises UnicodeDecodeError or Error is logged and skipped so
    that the crawl continues with the next ID.

    Args:
      start: the first (highest) post ID to crawl.
      stop: the post ID at which to stop; it is not itself crawled.
    """
    for post_id in range(start, stop, -1):
        msg('Crawling post {}'.format(post_id))
        try:
            save_carts_for_post(post_id)
        except UnicodeDecodeError:
            msg('*** UnicodeDecodeError for post {}, skipping'.format(post_id))
        except Error as e:
            msg('*** Error for post {}: {}; skipping'.format(post_id, e))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment