Created December 13, 2011 23:05
This script uses the Facebook API to fetch stats for a given list of URLs. We use it at year's end to compile a "most shared" story list.
import sys, traceback, optparse, urllib2, time
from lxml import etree
from BeautifulSoup import BeautifulSoup
from datetime import datetime, timedelta

TOP_URLS_FILE = "top_urls.csv"
COUNTING_STATS_FILE = "counting_stats.txt"

TOP_THRESHHOLD = 50
MEDIUM_THRESHOLD = 10
SMALL_THRESHOLD = 1
SLEEP_SECONDS = 2

def add_url_to_top_urls(like_count, total_count, share_count, url):
    f = open(TOP_URLS_FILE, "a")
    record = "%s,%s,%s,%s\n" % (like_count, total_count, share_count, url)
    f.write(record)
    f.close()

def update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict):
    # Tiers are cumulative: a URL that clears TOP_THRESHHOLD also
    # increments the medium and small counters below.
    if total_count >= TOP_THRESHHOLD:
        add_url_to_top_urls(like_count, total_count, share_count, url)
        url_counting_dict['top_urls'] += 1
    if total_count >= MEDIUM_THRESHOLD:
        url_counting_dict['medium_urls'] += 1
    if total_count >= SMALL_THRESHOLD:
        url_counting_dict['small_urls'] += 1

def fetch_facebook_stats_for_url(url, url_counting_dict):
    # Build the FQL query URL and fetch it. URL-decoded, the query reads:
    # select like_count, total_count, share_count, click_count
    #   from link_stat where url="<url>"
    api_url = 'https://api.facebook.com/method/fql.query?query=select%%20%%20like_count,%%20total_count,%%20share_count,%%20click_count%%20from%%20link_stat%%20where%%20url=%%22%s%%22' % url
    facebook_xml = urllib2.urlopen(api_url)
    # Get the XML object
    tree = etree.parse(facebook_xml)
    # Namespace madness in lxml. Punting. Throw string at BeautifulSoup
    tree = etree.tostring(tree)
    soup = BeautifulSoup(tree)
    # Get the stats we want
    like_count = int(soup.like_count.contents[0])
    total_count = int(soup.total_count.contents[0])
    share_count = int(soup.share_count.contents[0])
    # Pass off to the stats functions
    update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)
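
# For reference, the link_stat response the function above parses is XML
# shaped roughly like the sketch below. The numbers are invented for
# illustration, and the exact wrapper element and namespace are assumptions,
# so treat this as approximate:
#
#   <fql_query_response list="true">
#     <link_stat>
#       <like_count>12</like_count>
#       <total_count>34</total_count>
#       <share_count>20</share_count>
#       <click_count>5</click_count>
#     </link_stat>
#   </fql_query_response>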

def update_counting_stats(start, end, url_counting_dict):
    f = open(COUNTING_STATS_FILE, "a")
    record = "Period: %s through %s\n%s\n\n" % (start, end, url_counting_dict)
    f.write(record)
    f.close()

def build_url_list_to_parse(start, end):
    '''
    Rewrite this function to fetch your list of URLs in whatever way
    seems best to you. (A file-based alternative is sketched just below.)
    This version pulls stories from a Django project.
    '''
    from django.conf import settings
    from cannonball.stories.models import Story
    story_list = Story.live_objects.filter(pubdate__gte=start, pubdate__lt=end).order_by('pubdate')
    url_list = []
    for story in story_list:
        url_list.append('http://www.spokesman.com' + story.get_absolute_url())
    return url_list
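
# A minimal sketch of a replacement for build_url_list_to_parse() that needs
# no Django: read one URL per line from a plain text file and ignore the date
# range. The "urls.txt" filename is just an assumption for illustration.
# Uncomment and use it in place of the Django version above.
#
# def build_url_list_to_parse(start, end):
#     url_list = []
#     for line in open('urls.txt'):
#         line = line.strip()
#         if line:
#             url_list.append(line)
#     return url_list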

def process_options(arglist=None):
    global options, args
    parser = optparse.OptionParser()
    parser.add_option(
        '-s', '--start', '--start_date',
        dest='start_date',
        help="Start date for Facebook stat collection.")
    parser.add_option(
        '-e', '--end', '--end_date',
        dest='end_date',
        help="End date for Facebook stat collection.")
    options, args = parser.parse_args(arglist)
    return options, args

def main(args=None):
    """
    To run, enter something like the following at a command line:

    python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31

    A csv file will be created and named based on your TOP_URLS_FILE value;
    it will log any story whose total_count meets or exceeds TOP_THRESHHOLD.

    A txt file will be created and named based on your COUNTING_STATS_FILE
    value; it will log raw numbers for URLs that surpass your TOP_THRESHHOLD,
    MEDIUM_THRESHOLD and SMALL_THRESHOLD values.
    """
    if args is None:
        args = sys.argv[1:]
    options, args = process_options(args)

    # Write the CSV header the first time the top-urls file is created
    try:
        f = open(TOP_URLS_FILE)
        f.close()
    except IOError:
        f = open(TOP_URLS_FILE, "a")
        record = "Likes,Total,Shares,URL\n"
        f.write(record)
        f.close()

    url_counting_dict = {
        'top_urls': 0,
        'medium_urls': 0,
        'small_urls': 0,
    }

    # The end date is inclusive, so bump it a day for the pubdate__lt filter
    start = datetime.strptime(options.start_date, '%Y-%m-%d')
    end = datetime.strptime(options.end_date, '%Y-%m-%d') + timedelta(days=1)

    url_list = build_url_list_to_parse(start, end)
    for url in url_list:
        print url
        fetch_facebook_stats_for_url(url, url_counting_dict)
        time.sleep(SLEEP_SECONDS)

    update_counting_stats(options.start_date, options.end_date, url_counting_dict)

if __name__ == '__main__':
    try:
        main()
    except Exception, e:
        sys.stderr.write('\n')
        traceback.print_exc(file=sys.stderr)
        sys.stderr.write('\n')
        sys.exit(1)
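
For reference, the two output files end up looking roughly like this; every number below is invented for illustration, and the story path is hypothetical:

top_urls.csv:
Likes,Total,Shares,URL
51,83,60,http://www.spokesman.com/stories/example-story/

counting_stats.txt:
Period: 2011-01-01 through 2011-01-31
{'top_urls': 12, 'medium_urls': 87, 'small_urls': 240}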