Created
January 22, 2015 17:25
-
-
Save basilesimon/9eb2f7e7449268a39ca0 to your computer and use it in GitHub Desktop.
Scraping CENTCOM's PRs with Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from bs4 import BeautifulSoup | |
from urllib2 import urlopen | |
from urlparse import urljoin | |
BASE_URL = "http://www.centcom.mil" | |
BASE_NEWS_URL = "http://www.centcom.mil/en/news" | |
NEWS_PAGE_URL = BASE_NEWS_URL + "/P" | |
def make_soup(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

    Pins the stdlib "html.parser" backend so parsing is deterministic
    across machines (omitting it makes bs4 guess and emit a warning).
    """
    html = urlopen(url).read()
    return BeautifulSoup(html, "html.parser")
def get_links(section_url):
    """Return absolute press-release URLs listed on one index page.

    Scans the page's "blog" table for "contentheading" cells and
    resolves each anchor href against BASE_URL.
    """
    print('Scraping %s for press release URLs...' % (section_url))
    page = make_soup(section_url)
    listing = page.find("table", "blog")
    heading_cells = listing.findAll("td", "contentheading")
    urls = []
    for cell in heading_cells:
        urls.append(urljoin(BASE_URL, cell.a["href"]))
    return urls
def get_content(link):
    """Fetch one press-release page and return its body text.

    The article lives in the second "contentpaneopen" table; the text
    of each paragraph is joined with blank lines between them.
    """
    print('Scraping press release from %s...' % (link))
    page = make_soup(link)
    article = page.findAll("table", "contentpaneopen")[1]
    paragraphs = [p.text for p in article.findAll("p")]
    return '\n\n'.join(paragraphs)
if __name__ == '__main__':
    # Build the list of index pages to crawl: the first news page plus
    # the paginated pages (CENTCOM paginates in steps of 11 entries).
    links, releases = [], []
    urls = [BASE_NEWS_URL] + [NEWS_PAGE_URL + str(i) for i in range(0, 165, 11)]

    # Collect press-release URLs from every index page.
    for url in urls:
        links.extend(get_links(url))

    # Fetch and extract the text of each press release.
    for link in links:
        releases.append(get_content(link))

    # Persist everything as a JSON array of release bodies.
    with open('press-releases.json', 'w') as f:
        json.dump(releases, f, indent=4)
    # Use the print() call form for consistency with the rest of the
    # file (the original used a bare Python 2 print statement here).
    print("Output result in press-releases.json")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment