Created
April 11, 2013 20:50
-
-
Save shriphani/5367068 to your computer and use it in GitHub Desktop.
Gets a list of files from the new KBA corpus. You can use this list to filter unwanted files out of a wget dump.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| ''' | |
| Script to list the file links of the 2013 KBA corpus (prints the links; does not download the files) | |
| ''' | |
| import requests | |
| import urlparse | |
| from BeautifulSoup import BeautifulSoup, SoupStrainer | |
| KBA_SEED = 'http://s3.amazonaws.com/aws-publicdatasets/trec/kba/kba-streamcorpus-2013-v0_2_0/index.html' | |
def get_kba_directories_list(kba_link):
    '''
    Lazily yield one absolute URL per anchor found on the page at kba_link.

    The page is fetched with requests.get when iteration starts, parsed
    with only <a> elements retained, and every anchor that carries an
    href attribute has that href resolved against kba_link.

    kba_link -- URL of the page to fetch and scan for links.
    '''
    # NOTE(review): the HTTP status of the response is not inspected here,
    # so an error page is parsed like any other -- confirm this is intended.
    page_markup = requests.get(kba_link).text
    anchors_only = SoupStrainer('a')
    for anchor in BeautifulSoup(page_markup, parseOnlyThese=anchors_only):
        if not anchor.has_key('href'):
            # Anchors without an href contribute no link.
            continue
        yield urlparse.urljoin(kba_link, anchor['href'])
def get_kba_files_list(directory_link):
    '''
    Lazily yield one absolute URL per anchor found on the page at
    directory_link.

    The page is fetched with requests.get when iteration starts, parsed
    with only <a> elements retained, and every anchor that carries an
    href attribute has that href resolved against directory_link.

    directory_link -- URL of the page to fetch and scan for links.
    '''
    # NOTE(review): the HTTP status of the response is not inspected here,
    # so an error page is parsed like any other -- confirm this is intended.
    listing_markup = requests.get(directory_link).text
    anchors_only = SoupStrainer('a')
    for anchor in BeautifulSoup(listing_markup, parseOnlyThese=anchors_only):
        if not anchor.has_key('href'):
            # Anchors without an href contribute no link.
            continue
        yield urlparse.urljoin(directory_link, anchor['href'])
if __name__ == '__main__':
    # Walk every link on the seed page, then every link on each of those
    # pages, writing each resulting URL to stdout on its own line.
    for directory_url in get_kba_directories_list(KBA_SEED):
        for corpus_file_url in get_kba_files_list(directory_url):
            # Parenthesised single-argument form prints exactly what the
            # bare print statement does.
            print(corpus_file_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment