Skip to content

Instantly share code, notes, and snippets.

@shriphani
Created April 25, 2013 10:23
Show Gist options
  • Save shriphani/5458828 to your computer and use it in GitHub Desktop.
Save shriphani/5458828 to your computer and use it in GitHub Desktop.
scrapes the downloaded index pages and sets up the next stage of the crawl
'''
Index pages scraper that goes through and finds the most recent pages
'''
import argparse
import os
import sys
import warc
from BeautifulSoup import BeautifulSoup, SoupStrainer
from datetime import datetime
# Only posts dated inside this window (first half of 2012) are emitted.
START_DATE = datetime(2012, 1, 1, 0, 0, 0)
END_DATE = datetime(2012, 6, 30, 0, 0, 0)
# File that collects rows the scraper failed to parse.
ERR_FILE = 'nabble_fuck_ups.err'
# NOTE(review): redirects the process-wide stderr to a file for the whole
# run; the handle is never closed explicitly, and any traceback text also
# lands in this file.
sys.stderr = open(ERR_FILE, 'w+')
def handle_nabble_job_dir(nabble_job_dir):
    '''Walk nabble_job_dir recursively and process every .warc.gz archive.

    Each matching file is handed to handle_warc_file(), which prints the
    thread links that fall inside the configured date window.
    '''
    for root, dirs, files in os.walk(nabble_job_dir):
        for filename in files:
            # Fix: the original used filename.find('.warc.gz') >= 0, which
            # also matches names like 'x.warc.gz.part'; endswith() checks
            # the actual extension.
            if filename.endswith('.warc.gz'):
                handle_warc_file(os.path.join(root, filename))
def handle_warc_file(warc_file):
    '''Scan one WARC archive and print links whose post date is in range.

    For every HTTP response record whose target URI starts with "http" and
    whose status line contains "200", the payload is parsed for the nabble
    thread-listing table, and each 4-anchor row is reduced to a
    (link, date) pair.  Pairs inside (START_DATE, END_DATE) are printed
    to stdout.
    '''
    f = warc.open(warc_file)
    for record in f:
        # Only HTTP response records are of interest.
        if record.header.type != 'response':
            continue
        # Skip records whose target URI is not a web URL (e.g. metadata).
        if not record.header['WARC-Target-URI'].startswith('http'):
            continue
        # NOTE: is_not_200() consumes the status line from record.payload,
        # so the read() below yields only what follows that first line.
        if is_not_200(record):
            continue
        soup = record.payload.read()
        # Restrict parsing to the thread-listing table nabble renders.
        soup_tables = SoupStrainer('table', {'class' : 'main medium-border-color'})
        for table in BeautifulSoup(soup, parseOnlyThese = soup_tables):
            for row in table.findAll('tr'):
                row_links = row.findAll('a')
                # Thread rows carry exactly four anchors; others are noise.
                if row_links and len(row_links) == 4:
                    try:
                        link, epoch_str = parse_row_links(row_links)
                        # Nabble embeds a JavaScript epoch in milliseconds.
                        post_date = datetime.fromtimestamp(int(epoch_str)/1000)
                        if post_date < END_DATE and post_date > START_DATE:
                            print link, post_date
                            sys.stdout.flush()
                    except:
                        # Best-effort: malformed rows are skipped silently
                        # (parse_row_links logs them to stderr itself).
                        pass
    f.close()
def parse_row_links(row_links):
    '''Extract (href, epoch-millis string) from a 4-anchor nabble table row.

    row_links[1] holds the thread link; row_links[3] contains a <script>
    element whose text is a JavaScript "new Date(<millis>);" expression.
    Returns (link, millis_string), or None when the row does not match
    that shape -- the offending row is logged to stderr.
    '''
    def parse_date_js(js_str):
        # e.g. 'new Date(1335000000000);' -> '1335000000000'
        _, date_portion = js_str.split('new Date')
        return date_portion.replace('(', '').replace(')', '').replace(';', '')
    try:
        link = row_links[1]['href']
        js_unix_date = row_links[3].findAll('script')[0].findAll(text = True)[0].strip()
        return link, parse_date_js(js_unix_date)
    except (KeyError, IndexError, AttributeError, TypeError, ValueError):
        # Fix: the original passed the list itself to stderr.write(), which
        # raises TypeError (write() expects a string) and lost the log line.
        # Also narrowed the bare except to the parse/lookup failures above.
        sys.stderr.write(str(row_links))
        sys.stderr.write('\n')
def is_not_200(record):
    '''Return True when the record's HTTP status line lacks "200".

    Side effect: consumes (reads) the first line of record.payload, so
    subsequent reads start after the status line.
    '''
    status_line = record.payload.readline()
    return '200' not in status_line
if __name__ == '__main__':
    # Command-line entry point: the single positional argument names the
    # directory produced by the download stage of the crawl.
    def parse_cmdline_args():
        '''Parse the command line: one positional nabble jobs directory.'''
        arg_parser = argparse.ArgumentParser()
        arg_parser.add_argument(
            'nabble_jobs_dir',
            metavar = 'nabble-jobs-dir',
            help = 'Contains a list of nabble thread links with the time next to them'
        )
        return arg_parser.parse_args()

    cli_args = parse_cmdline_args()
    handle_nabble_job_dir(cli_args.nabble_jobs_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment