Last active
February 2, 2017 20:21
-
-
Save jeremyboggs/57a07f01a7cb7ed36da2484b22c41969 to your computer and use it in GitHub Desktop.
Let's download all the Kim Kardashian articles on US Weekly.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Scrape every Kim Kardashian article listed on US Weekly's celebrity
pages and write (title, url) rows to usweekly-articles.csv."""
import csv
import urllib2
from bs4 import BeautifulSoup

# Base URL for US Magazine. We'll need this to build the full URL for
# individual articles, since it looks like they use relative URLs.
base_url = 'http://www.usmagazine.com'

# Full URL to Kim Kardashian's celebrity page; the page number is appended
# to this string for each request.
celebrity_url = base_url + '/celebrities/kim-kardashian?page='

# Hard code range of pages (1..137 inclusive). Quick and dirty.
pages = range(1, 138)

# `with` guarantees the CSV is flushed and closed even if a request raises.
with open('usweekly-articles.csv', 'w') as newfile:
    wr = csv.writer(newfile, lineterminator='\n', delimiter=',',
                    escapechar='\\', quotechar='"',
                    quoting=csv.QUOTE_NONNUMERIC)
    wr.writerow(['title', 'url'])

    # Loop over the page numbers, appending each one to celebrity_url.
    # BUG FIX: the original iterated `enumerate(pages)`, so `p` was an
    # (index, page) tuple and str(p) produced "(0, 1)" -- every request
    # URL was malformed. Iterate the page numbers themselves.
    for p in pages:
        page_string = str(p)
        # NOTE(review): no explicit parser is passed to BeautifulSoup, so it
        # uses whichever parser is installed -- left unchanged deliberately.
        soup = BeautifulSoup(urllib2.urlopen(celebrity_url + page_string).read())

        # Find all the article entries on this listing page.
        for article in soup.find_all('article', {'class': 'celebrity-news-article'}):
            title = article.h3.contents[0].encode('utf-8')
            # The site uses relative hrefs; prefix with the base URL.
            url = base_url + article.a['href'].encode('utf-8')
            wr.writerow([title, url])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment