|
# Screen scrapes the Knight News Challenge entries (all 64 pages of them) |
|
# and counts the number of votes/hearts for each entry. Then displays them |
|
# in rank order. |
|
# |
|
# This script runs in about 20 seconds. |
|
|
|
import requests |
|
from BeautifulSoup import BeautifulSoup |
|
|
|
def _extract_count(node):
    # Likes/comments are rendered as plain digit text inside the tag; treat a
    # missing tag, empty text, or non-numeric text as zero so one malformed
    # entry can't crash the whole multi-page scrape.
    if node and node.text:
        try:
            return int(node.text.strip())
        except ValueError:
            return 0
    return 0


def _parse_entry(entry):
    """Turn one "postbox" div into an entry dict: likes, comments, title, url.

    Missing pieces come back as 0 (counts) or None (title/url).
    """
    likes = _extract_count(entry.find("", "home-likes"))
    comments = _extract_count(entry.find("", "home-comments"))

    title = entry.find("h2")
    if title:
        title = title.text

    url = entry.find('a', "home-view")
    if url:
        url = url.get('href')

    return {
        'likes': likes,
        'comments': comments,
        'title': title,
        'url': url,
    }


def main():
    """Scrape every paginated listing page, then print entries in rank order."""
    page = 1
    total_entry_count = 0
    entries = []

    while True:
        print(" ---> Found %s entries so far. Now on page: %s" % (len(entries), page))

        knight_url = "http://newschallenge.tumblr.com/page/%s" % (page)
        html = requests.get(knight_url).content
        soup = BeautifulSoup(html)
        postboxes = soup.findAll("div", "postbox")

        # Done if only the sticky entry is left (pages past the end still
        # render the sticky post).
        if len(postboxes) <= 1:
            break

        page += 1

        # 15 entries per page, plus a sticky throwaway entry
        for entry in postboxes:
            if 'stickyPost' in entry.get('class'):
                continue

            total_entry_count += 1
            parsed = _parse_entry(entry)

            # Only record active entries (at least one like or comment).
            if parsed['comments'] or parsed['likes']:
                entries.append(parsed)

    # Rank by total engagement, ascending, so the top entry prints last
    # (rank #1 at the bottom of the terminal scrollback).
    # NOTE(review): the rank is computed from likes + comments but the printed
    # label only says "likes" — confirm that is intended.
    entries.sort(key=lambda e: e['comments'] + e['likes'])
    active_entry_count = len(entries)

    for i, entry in enumerate(entries):
        print(" * #%s: %s likes - [%s](%s)" % (
            active_entry_count - i,
            entry['likes'], entry['title'],
            entry['url']))

    print(" ***> Found %s active entries among %s total applications." % (
        active_entry_count, total_entry_count))


if __name__ == "__main__":
    main()