Scrapes the downloaded index pages and sets up the next stage of the crawl.
'''
Index pages scraper that goes through and finds the most recent pages
'''
import argparse
import os
import sys

import warc
from BeautifulSoup import BeautifulSoup, SoupStrainer
from datetime import datetime

# Only rows whose post date falls inside this window are kept.
START_DATE = datetime(2012, 1, 1, 0, 0, 0)
END_DATE = datetime(2012, 6, 30, 0, 0, 0)

# Rows that fail to parse are logged here instead of crashing the crawl.
ERR_FILE = 'nabble_fuck_ups.err'
sys.stderr = open(ERR_FILE, 'w+')

def handle_nabble_job_dir(nabble_job_dir):
    # Walk the job directory and process every fetched WARC archive.
    for root, dirs, files in os.walk(nabble_job_dir):
        for filename in files:
            if filename.find('.warc.gz') >= 0:
                handle_warc_file(os.path.join(root, filename))

def handle_warc_file(warc_file):
    f = warc.open(warc_file)
    for record in f:
        # Skip anything that is not a successful HTTP response for a real URL.
        if record.header.type != 'response':
            continue
        if not record.header['WARC-Target-URI'].startswith('http'):
            continue
        if is_not_200(record):
            continue
        html = record.payload.read()
        # Restrict parsing to Nabble's thread-listing tables.
        soup_tables = SoupStrainer('table', {'class': 'main medium-border-color'})
        for table in BeautifulSoup(html, parseOnlyThese=soup_tables):
            for row in table.findAll('tr'):
                row_links = row.findAll('a')
                # Thread rows carry exactly four links.
                if row_links and len(row_links) == 4:
                    try:
                        link, epoch_str = parse_row_links(row_links)
                        # Nabble stores the post time as a millisecond Unix epoch.
                        post_date = datetime.fromtimestamp(int(epoch_str) / 1000)
                        if START_DATE < post_date < END_DATE:
                            print link, post_date
                            sys.stdout.flush()
                    except Exception:
                        pass  # malformed rows are already logged by parse_row_links
    f.close()

def parse_row_links(row_links):
    def parse_date_js(js_str):
        # The date cell holds a JavaScript 'new Date(<ms>);' call; strip the wrapper.
        _, date_portion = js_str.split('new Date')
        return date_portion.replace('(', '').replace(')', '').replace(';', '')
    try:
        link = row_links[1]['href']
        js_unix_date = row_links[3].findAll('script')[0].findAll(text=True)[0].strip()
        return link, parse_date_js(js_unix_date)
    except Exception:
        sys.stderr.write(str(row_links))
        sys.stderr.write('\n')
        raise

def is_not_200(record):
    # Reads (and consumes) the HTTP status line; the caller then reads the rest of the payload.
    first_line = record.payload.readline()
    return first_line.find('200') < 0

if __name__ == '__main__':
    def parse_cmdline_args():
        parser = argparse.ArgumentParser()
        parser.add_argument(
            'nabble_jobs_dir',
            metavar='nabble-jobs-dir',
            help='Contains a list of nabble thread links with the time next to them'
        )
        return parser.parse_args()

    parsed = parse_cmdline_args()
    handle_nabble_job_dir(parsed.nabble_jobs_dir)
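
The script prints one `link post_date` pair per qualifying thread to stdout, which seeds the next stage of the crawl. Below is a minimal sketch of the millisecond-epoch extraction each row goes through; the script filename and the sample timestamp are made up for illustration:

    # python scrape_index_pages.py /path/to/nabble-jobs   <- hypothetical filename
    from datetime import datetime

    js_str = 'new Date(1335312000000);'  # sample cell contents; real rows come from row_links[3]
    _, date_portion = js_str.split('new Date')
    epoch_ms = date_portion.replace('(', '').replace(')', '').replace(';', '')
    print datetime.fromtimestamp(int(epoch_ms) / 1000)  # e.g. 2012-04-25 00:00:00 (local time)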