Created
January 29, 2012 11:02
-
-
Save huonw/1698286 to your computer and use it in GitHub Desktop.
Link mine Less Wrong
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import psycopg2, re, gzip | |
# NB. Fill in the database connection parameters and the public
# subreddit ids below before running.
# Outputs two tab separated files (possibly gzip'd) where the first 3
# fields of each line are:
#
# 1. id of the item (base 36)
# 2. the date (UTC time)
# 3. total votes
#
# The rest of the line consists of each LW link separated by tabs
# (including the possibility of no links, in which case there is no
# trailing tab)
# Database connection -- the keyword values are intentionally left as
# blank placeholders; the file is not valid Python until they are
# filled in with the LessWrong (reddit-derived) PostgreSQL credentials.
connection = psycopg2.connect(database=,
                              user=,
                              password=,
                              host=)
# Output files (can be skipped by setting to None)
ARTICLE_FILENAME = 'article-links.txt'
COMMENT_FILENAME = 'comment-links.txt'
# use gzip to automatically compress the output?
# (appends .gz to file name)
GZ=False
# the ids of the public subreddits (Main and Discussion)
# NOTE(review): another blank placeholder -- a tuple of integer ids,
# e.g. (1, 2); it is interpolated into the IN (...) clause below.
PUBLIC_SUBREDDIT_IDS=(,)
# CODE
# Single shared cursor; go() below reuses it for each export query.
cursor = connection.cursor()
# Take everything up to the first whitespace (trimming non-link
# information can be done later).
# The '.' is escaped so only the literal domain "lesswrong.com"
# matches; the original pattern's bare '.' matched any character
# (e.g. "lesswrongxcom/...").
SEARCH_RE = re.compile(r'lesswrong\.com/\S*')
# convert to the base-36 identifier used (reddit-style ids: 0-9 then a-z)
digits = '0123456789abcdefghijklmnopqrstuvwxyz'

def int2str(i):
    """Return the base-36 string for a non-negative integer id.

    Fixes the original's edge case where 0 encoded as '' instead
    of '0'.  Behaviour for negative inputs is unspecified (ids from
    the database are non-negative).
    """
    if i == 0:
        return '0'
    build = []
    while i:
        i, rem = divmod(i, 36)
        build.append(digits[rem])
    # digits are produced least-significant first, so reverse
    return ''.join(build[::-1])
def go(fname, query):
    """Run `query` and write one tab-separated line per row to `fname`.

    Each row must be (id, date, karma, data); the output line is:
    base-36 id, UTC ISO date (microseconds dropped), karma, then any
    lesswrong.com links found in `data`, all tab separated.  Honours
    the module-level GZ flag (gzip-compress, appending .gz).
    """
    if GZ:
        out = gzip.open(fname + '.gz', 'w')
    else:
        out = open(fname, 'w')
    # `with` guarantees the handle is closed even if the query or a
    # row raises (the original leaked the file on any exception).
    with out as f:
        cursor.execute(query)
        for thing_id, date, karma, data in cursor:
            links = SEARCH_RE.findall(data)
            # convert to microsecondless, UTC time
            # NOTE(review): assumes `date` is timezone-aware --
            # utcoffset() returns None for naive datetimes and this
            # subtraction would raise; confirm against the column type.
            utc_date = date.replace(microsecond=0, tzinfo=None) - date.utcoffset()
            # print a tab sep line with "post-id date score[ link ...]"
            f.write('\t'.join([int2str(thing_id), utc_date.isoformat(), str(karma)]
                              + links) + '\n')
# Articles (filtered by subreddit, keeping only those specified as
# "public" above)
if ARTICLE_FILENAME:
    # Quoted, comma-separated subreddit ids for the IN (...) clause.
    sr_id_list = ','.join("'%d'" % sr for sr in PUBLIC_SUBREDDIT_IDS)
    # reddit_data_link is joined twice: once for the article body
    # ('article') and once for the subreddit id ('sr_id').
    article_query = '''SELECT thing.thing_id, thing.date, thing.ups - thing.downs, data.value
       FROM reddit_thing_link AS thing
       INNER JOIN reddit_data_link AS data
       ON thing.thing_id = data.thing_id
       INNER JOIN reddit_data_link AS data2
       ON thing.thing_id = data2.thing_id
       WHERE NOT thing.deleted AND NOT thing.spam AND data.key='article' AND
       data2.key='sr_id' AND data2.value IN (%s)''' % sr_id_list
    go(ARTICLE_FILENAME, article_query)
# Comments (I think all comments are public, so no filtering needed?)
if COMMENT_FILENAME:
    comment_query = '''SELECT thing.thing_id, thing.date, thing.ups - thing.downs, data.value
       FROM reddit_thing_comment AS thing
       INNER JOIN reddit_data_comment AS data
       ON thing.thing_id = data.thing_id
       WHERE NOT thing.deleted AND NOT thing.spam AND data.key='body' '''
    go(COMMENT_FILENAME, comment_query)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment