
@shriphani
Created April 11, 2013 20:50
Gets a list of files from the new KBA corpus. The list can be used to filter unwanted files out of a wget dump.
#!/usr/bin/env python
'''
Script to download the 2013 corpus
'''
import requests
import urlparse
from BeautifulSoup import BeautifulSoup, SoupStrainer

KBA_SEED = 'http://s3.amazonaws.com/aws-publicdatasets/trec/kba/kba-streamcorpus-2013-v0_2_0/index.html'


def get_kba_directories_list(kba_link):
    '''Yield the absolute URL of each directory linked from the corpus index page.'''
    resp = requests.get(kba_link)
    for link in BeautifulSoup(resp.text, parseOnlyThese=SoupStrainer('a')):
        if link.has_key('href'):
            yield urlparse.urljoin(kba_link, link['href'])


def get_kba_files_list(directory_link):
    '''Yield the absolute URL of each file linked from a directory listing page.'''
    resp = requests.get(directory_link)
    for link in BeautifulSoup(resp.text, parseOnlyThese=SoupStrainer('a')):
        if link.has_key('href'):
            yield urlparse.urljoin(directory_link, link['href'])


if __name__ == '__main__':
    # Walk the index page, then each directory page, printing every file URL.
    for dir_link in get_kba_directories_list(KBA_SEED):
        for file_link in get_kba_files_list(dir_link):
            print file_link
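
The gist description mentions filtering a wget dump with this list. As a hedged illustration only, here is a minimal companion sketch, assuming the script's output was redirected to a file named kba_file_list.txt and that a recursive wget produced a local directory tree rooted at s3.amazonaws.com; both names are hypothetical and not part of the original gist.

#!/usr/bin/env python
'''
Hypothetical companion sketch (not part of the original gist): print files in a
local wget mirror that do not appear in the corpus file list produced above.
'''
import os
import urlparse

LIST_FILE = 'kba_file_list.txt'   # assumed output of: python get_kba_files.py > kba_file_list.txt
MIRROR_ROOT = 's3.amazonaws.com'  # assumed directory created by a recursive wget of the corpus


def wanted_paths(list_file):
    '''Map each corpus URL to the relative path wget would use on a Unix-like system.'''
    with open(list_file) as f:
        for line in f:
            url = line.strip()
            if url:
                parsed = urlparse.urlparse(url)
                # http://s3.amazonaws.com/aws-publicdatasets/... -> s3.amazonaws.com/aws-publicdatasets/...
                yield parsed.netloc + parsed.path


if __name__ == '__main__':
    keep = set(wanted_paths(LIST_FILE))
    for root, _, files in os.walk(MIRROR_ROOT):
        for name in files:
            path = os.path.join(root, name)
            if path not in keep:
                print path  # candidate for removal; review before deleting anything

Alternatively, the file list can be passed to wget's --input-file (-i) option so that only the listed URLs are downloaded in the first place.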