Created
January 29, 2012 11:02
-
-
Save huonw/1698286 to your computer and use it in GitHub Desktop.
Link mine Less Wrong
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import psycopg2, re, gzip | |
# NB. Fill in the database connection parameters and the public
# subreddit ids below before running.
# Outputs two tab separated files (possibly gzip'd) where the first 3
# fields of each line are:
#
# 1. id of the item (base 36)
# 2. the date (UTC time)
# 3. total votes
#
# The rest of the line consists of each LW link separated by tabs
# (including the possibility of no links, in which case there is no
# trailing tab)
# Database connection -- the keyword values are intentionally left as
# blank placeholders; the file is not valid Python until they are
# filled in with the LessWrong (reddit-derived) PostgreSQL credentials.
connection = psycopg2.connect(database=,
                              user=,
                              password=,
                              host=)
# Output files (can be skipped by setting to None)
ARTICLE_FILENAME = 'article-links.txt'
COMMENT_FILENAME = 'comment-links.txt'
# use gzip to automatically compress the output?
# (appends .gz to file name)
GZ=False
# the ids of the public subreddits (Main and Discussion)
# NOTE(review): another blank placeholder -- a tuple of integer ids,
# e.g. (1, 2); it is interpolated into the IN (...) clause below.
PUBLIC_SUBREDDIT_IDS=(,)
# CODE
# Single shared cursor; go() below reuses it for each export query.
cursor = connection.cursor()
# Take everything up to the first whitespace (trimming non-link
# information can be done later).
# The '.' is escaped so only the literal domain "lesswrong.com"
# matches; the original pattern's bare '.' matched any character
# (e.g. "lesswrongxcom/...").
SEARCH_RE = re.compile(r'lesswrong\.com/\S*')
# convert to the base-36 identifier used (reddit-style ids: 0-9 then a-z)
digits = '0123456789abcdefghijklmnopqrstuvwxyz'

def int2str(i):
    """Return the base-36 string for a non-negative integer id.

    Fixes the original's edge case where 0 encoded as '' instead
    of '0'.  Behaviour for negative inputs is unspecified (ids from
    the database are non-negative).
    """
    if i == 0:
        return '0'
    build = []
    while i:
        i, rem = divmod(i, 36)
        build.append(digits[rem])
    # digits are produced least-significant first, so reverse
    return ''.join(build[::-1])
def go(fname, query):
    """Run `query` and write one tab-separated line per row to `fname`.

    Each row must be (id, date, karma, data); the output line is:
    base-36 id, UTC ISO date (microseconds dropped), karma, then any
    lesswrong.com links found in `data`, all tab separated.  Honours
    the module-level GZ flag (gzip-compress, appending .gz).
    """
    if GZ:
        out = gzip.open(fname + '.gz', 'w')
    else:
        out = open(fname, 'w')
    # `with` guarantees the handle is closed even if the query or a
    # row raises (the original leaked the file on any exception).
    with out as f:
        cursor.execute(query)
        for thing_id, date, karma, data in cursor:
            links = SEARCH_RE.findall(data)
            # convert to microsecondless, UTC time
            # NOTE(review): assumes `date` is timezone-aware --
            # utcoffset() returns None for naive datetimes and this
            # subtraction would raise; confirm against the column type.
            utc_date = date.replace(microsecond=0, tzinfo=None) - date.utcoffset()
            # print a tab sep line with "post-id date score[ link ...]"
            f.write('\t'.join([int2str(thing_id), utc_date.isoformat(), str(karma)]
                              + links) + '\n')
# Articles (filtered by subreddit, keeping only those specified as
# "public" above)
if ARTICLE_FILENAME:
    # Quoted, comma-separated subreddit ids for the IN (...) clause.
    sr_id_list = ','.join("'%d'" % sr for sr in PUBLIC_SUBREDDIT_IDS)
    # reddit_data_link is joined twice: once for the article body
    # ('article') and once for the subreddit id ('sr_id').
    article_query = '''SELECT thing.thing_id, thing.date, thing.ups - thing.downs, data.value
       FROM reddit_thing_link AS thing
       INNER JOIN reddit_data_link AS data
       ON thing.thing_id = data.thing_id
       INNER JOIN reddit_data_link AS data2
       ON thing.thing_id = data2.thing_id
       WHERE NOT thing.deleted AND NOT thing.spam AND data.key='article' AND
       data2.key='sr_id' AND data2.value IN (%s)''' % sr_id_list
    go(ARTICLE_FILENAME, article_query)
# Comments (I think all comments are public, so no filtering needed?)
if COMMENT_FILENAME:
    comment_query = '''SELECT thing.thing_id, thing.date, thing.ups - thing.downs, data.value
       FROM reddit_thing_comment AS thing
       INNER JOIN reddit_data_comment AS data
       ON thing.thing_id = data.thing_id
       WHERE NOT thing.deleted AND NOT thing.spam AND data.key='body' '''
    go(COMMENT_FILENAME, comment_query)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment