Scrapes the downloaded index pages and sets up the next stage of the crawl.
'''
Index pages scraper that goes through and finds the most recent pages
'''
import argparse
import os
import sys

import warc
from BeautifulSoup import BeautifulSoup, SoupStrainer
from datetime import datetime

# Only rows whose post date falls inside this window are kept.
START_DATE = datetime(2012, 1, 1, 0, 0, 0)
END_DATE = datetime(2012, 6, 30, 0, 0, 0)

# Rows that fail to parse are logged here instead of crashing the crawl.
ERR_FILE = 'nabble_fuck_ups.err'
sys.stderr = open(ERR_FILE, 'w+')

def handle_nabble_job_dir(nabble_job_dir):
    # Walk the job directory and process every fetched WARC archive.
    for root, dirs, files in os.walk(nabble_job_dir):
        for filename in files:
            if filename.find('.warc.gz') >= 0:
                handle_warc_file(os.path.join(root, filename))

def handle_warc_file(warc_file):
    f = warc.open(warc_file)
    for record in f:
        # Skip anything that is not a successful HTTP response for a real URL.
        if record.header.type != 'response':
            continue
        if not record.header['WARC-Target-URI'].startswith('http'):
            continue
        if is_not_200(record):
            continue
        html = record.payload.read()
        # Restrict parsing to Nabble's thread-listing tables.
        soup_tables = SoupStrainer('table', {'class': 'main medium-border-color'})
        for table in BeautifulSoup(html, parseOnlyThese=soup_tables):
            for row in table.findAll('tr'):
                row_links = row.findAll('a')
                # Thread rows carry exactly four links.
                if row_links and len(row_links) == 4:
                    try:
                        link, epoch_str = parse_row_links(row_links)
                        # Nabble stores the post time as a millisecond Unix epoch.
                        post_date = datetime.fromtimestamp(int(epoch_str) / 1000)
                        if START_DATE < post_date < END_DATE:
                            print link, post_date
                            sys.stdout.flush()
                    except Exception:
                        pass  # malformed rows are already logged by parse_row_links
    f.close()

def parse_row_links(row_links):
    def parse_date_js(js_str):
        # The date cell holds a JavaScript 'new Date(<ms>);' call; strip the wrapper.
        _, date_portion = js_str.split('new Date')
        return date_portion.replace('(', '').replace(')', '').replace(';', '')
    try:
        link = row_links[1]['href']
        js_unix_date = row_links[3].findAll('script')[0].findAll(text=True)[0].strip()
        return link, parse_date_js(js_unix_date)
    except Exception:
        sys.stderr.write(str(row_links))
        sys.stderr.write('\n')
        raise

def is_not_200(record):
    # Reads (and consumes) the HTTP status line; the caller then reads the rest of the payload.
    first_line = record.payload.readline()
    return first_line.find('200') < 0

if __name__ == '__main__':
    def parse_cmdline_args():
        parser = argparse.ArgumentParser()
        parser.add_argument(
            'nabble_jobs_dir',
            metavar='nabble-jobs-dir',
            help='Contains a list of nabble thread links with the time next to them'
        )
        return parser.parse_args()

    parsed = parse_cmdline_args()
    handle_nabble_job_dir(parsed.nabble_jobs_dir)
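
The script prints one `link post_date` pair per qualifying thread to stdout, which seeds the next stage of the crawl. Below is a minimal sketch of the millisecond-epoch extraction each row goes through; the script filename and the sample timestamp are made up for illustration:

    # python scrape_index_pages.py /path/to/nabble-jobs   <- hypothetical filename
    from datetime import datetime

    js_str = 'new Date(1335312000000);'  # sample cell contents; real rows come from row_links[3]
    _, date_portion = js_str.split('new Date')
    epoch_ms = date_portion.replace('(', '').replace(')', '').replace(';', '')
    print datetime.fromtimestamp(int(epoch_ms) / 1000)  # e.g. 2012-04-25 00:00:00 (local time)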