
@ryanpitts
Created December 13, 2011 23:05

Revisions

  1. ryanpitts revised this gist Dec 15, 2011. 1 changed file with 6 additions and 1 deletion.

     collect_facebook_stats.py: 7 changes (6 additions, 1 deletion)

     @@ -13,13 +13,18 @@
      - create a txt file that's named based on the value of COUNTING_STATS_FILE.
      It logs raw numbers for URLs that surpass the TOP_THRESHHOLD, MEDIUM_THRESHOLD
     -and SMALL_THRESHOLD values.
     +and SMALL_THRESHOLD values. It also logs the total number of URLs scanned.
      This file is updated only when the loop finishes, so an exception or a lost
      connection will drop these stats. I added this tracking just out of curiosity,
      but didn't really feel like adding the overhead of writing to file every
      single time through the loop.
     +- create a file called `failed_urls.txt` to log URLs that ... fail. Once in
     +a while Facebook throws a bad status code, which particularly sucks when
     +you're on day 27 of a 30-day run. Logging failed URLs in this file allows
     +your loop to keep on going. Later on, just run the failed URLs manually.
      Notes:
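     The note about re-running failed URLs manually suggests a companion step. Here's a minimal, hypothetical sketch of that rerun (not part of the gist), assuming fetch_facebook_stats_for_url(), SLEEP_SECONDS, and a url_counting_dict are in scope, and that failed_urls.txt holds one quoted, comma-terminated URL per line as written by the except block added in revision 3 below:

        # Hypothetical helper, not in the gist: retry the URLs logged to
        # failed_urls.txt. Each record looks like: 'http://example.com/story',
        import time

        def rerun_failed_urls(url_counting_dict):
            f = open("failed_urls.txt")
            lines = f.readlines()
            f.close()

            for line in lines:
                # strip the newline, trailing comma, and surrounding quotes
                url = line.strip().rstrip(",").strip("'")
                if url:
                    print url
                    fetch_facebook_stats_for_url(url, url_counting_dict)
                    time.sleep(SLEEP_SECONDS)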
  2. ryanpitts revised this gist Dec 15, 2011. 1 changed file with 5 additions and 3 deletions.

     collect_facebook_stats.py: 8 changes (5 additions, 3 deletions)

     @@ -23,8 +23,8 @@
      Notes:
     -- the sleep interval is set at 2 seconds because Facebook seems to cut you off
     -after ~600 requests at a 1-second interval. At a 2-second interval, I've been
     +- the sleep interval is set at 2.5 seconds because Facebook seems to cut you
     +off after ~600 requests at a 1-second interval. At this interval, I've been
      able to run off a month at a time without trouble. (For us, that's about 3,000
      story URLs.)
     @@ -46,7 +46,7 @@
      MEDIUM_THRESHOLD = 10
      SMALL_THRESHOLD = 1

     -SLEEP_SECONDS = 2
     +SLEEP_SECONDS = 2.5


      # --------------
     @@ -181,12 +181,14 @@ def main(args=None):
              'top_urls': 0,
              'medium_urls': 0,
              'small_urls': 0,
     +        'urls_scanned': 0,
          }

          start = datetime.strptime(options.start_date,'%Y-%m-%d')
          end = datetime.strptime(options.end_date,'%Y-%m-%d')+timedelta(days=1)

          url_list = build_url_list_to_parse(start, end)
     +    url_counting_dict['urls_scanned'] = len(url_list)

          for url in url_list:
              print url
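     For scale, the revised numbers work out to about 3,000 URLs x 2.5 seconds, or roughly 125 minutes of sleeping per monthly run before any network time. If the fixed interval ever proves too regular, one untested variation (a suggestion, not in the gist) is to jitter the sleep around the base value:

        # Untested variation: sleep a random interval centered on SLEEP_SECONDS
        # so requests don't hit the API on a perfectly regular clock.
        import random, time

        SLEEP_SECONDS = 2.5

        def polite_sleep():
            # anywhere from 2.0 to 3.0 seconds when SLEEP_SECONDS is 2.5
            time.sleep(SLEEP_SECONDS + random.uniform(-0.5, 0.5))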
  3. ryanpitts revised this gist Dec 14, 2011. 1 changed file with 20 additions and 14 deletions.

     collect_facebook_stats.py: 34 changes (20 additions, 14 deletions)

     @@ -42,7 +42,7 @@
      TOP_URLS_FILE = "top_urls.csv"
      COUNTING_STATS_FILE = "counting_stats.txt"

     -TOP_THRESHHOLD = 50
     +TOP_THRESHHOLD = 100
      MEDIUM_THRESHOLD = 10
      SMALL_THRESHOLD = 1

     @@ -105,23 +105,29 @@ def update_stats_for_url(like_count, total_count, share_count, url, url_counting
      def fetch_facebook_stats_for_url(url, url_counting_dict):
          # Make the proper FB url and fetch it
          api_url = 'https://api.facebook.com/method/fql.query?query=select%%20%%20like_count,%%20total_count,%%20share_count,%%20click_count%%20from%%20link_stat%%20where%%20url=%%22%s%%22' % url
     -    facebook_xml = urllib2.urlopen(api_url)

     -    # Get the XML object
     -    tree = etree.parse(facebook_xml)
     +    try:
     +        facebook_xml = urllib2.urlopen(api_url)
     +
     +        # Get the XML object
     +        tree = etree.parse(facebook_xml)

     -    # Namespace madness in lxml. Punting. Throw string at BeautifulSoup
     -    tree = etree.tostring(tree)
     -    soup = BeautifulSoup(tree)
     +        # Namespace madness in lxml. Punting. Throw string at BeautifulSoup
     +        tree = etree.tostring(tree)
     +        soup = BeautifulSoup(tree)

     -    #Get the stats we want
     -    like_count = int(soup.like_count.contents[0])
     -    total_count = int(soup.total_count.contents[0])
     -    share_count = int(soup.share_count.contents[0])
     +        #Get the stats we want
     +        like_count = int(soup.like_count.contents[0])
     +        total_count = int(soup.total_count.contents[0])
     +        share_count = int(soup.share_count.contents[0])

     -    # Pass off to the stats functions
     -    update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)
     +        # Pass off to the stats functions
     +        update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)
     +    except:
     +        f = open("failed_urls.txt", "a")
     +        record = "'%s',\n" % (url)
     +        f.write(record)
     +        f.close()


      # -------------
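     One caveat with the bare except: above: it swallows everything, including parsing bugs and a stray KeyboardInterrupt, and files the URL away as "failed." A narrower alternative (a suggestion, not part of the gist) catches only the network errors urllib2 raises; HTTPError is a subclass of URLError, so one clause covers both:

        # Suggested alternative: treat only network-level errors as a failed
        # URL, so a real bug in the parsing code surfaces instead of being
        # silently logged away.
        import urllib2

        def fetch_or_log_failure(api_url, url):
            try:
                return urllib2.urlopen(api_url)
            except urllib2.URLError:
                f = open("failed_urls.txt", "a")
                f.write("'%s',\n" % (url))
                f.close()
                return None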
  4. ryanpitts revised this gist Dec 14, 2011. 1 changed file with 1 addition and 1 deletion.

     collect_facebook_stats.py: 2 changes (1 addition, 1 deletion)

     @@ -46,7 +46,7 @@
      MEDIUM_THRESHOLD = 10
      SMALL_THRESHOLD = 1

     -SLEEP_SECONDS = 2 #Facebook seems to cut you off after ~600 requests
     +SLEEP_SECONDS = 2


      # --------------
  5. ryanpitts revised this gist Dec 14, 2011. 1 changed file with 76 additions and 24 deletions.

     collect_facebook_stats.py: 100 changes (76 additions, 24 deletions)

     @@ -1,18 +1,82 @@
     +"""
     +One of our editors thought it would be fun to run a year-end list of our
     +most-shared stories. This script uses the Facebook API to fetch stats for
     +a given list of URLs.
     +The script will:
     +- create a csv file that's named based on the value of TOP_URLS_FILE. This
     +file logs URLs that have enough shares to surpass your TOP_THRESHHOLD number.
     +This file is updated throughout the loop, on each match.
     +- create a txt file that's named based on the value of COUNTING_STATS_FILE.
     +It logs raw numbers for URLs that surpass the TOP_THRESHHOLD, MEDIUM_THRESHOLD
     +and SMALL_THRESHOLD values.
     +This file is updated only when the loop finishes, so an exception or a lost
     +connection will drop these stats. I added this tracking just out of curiosity,
     +but didn't really feel like adding the overhead of writing to file every
     +single time through the loop.
     +Notes:
     +- the sleep interval is set at 2 seconds because Facebook seems to cut you off
     +after ~600 requests at a 1-second interval. At a 2-second interval, I've been
     +able to run off a month at a time without trouble. (For us, that's about 3,000
     +story URLs.)
     +"""
      import sys, traceback, optparse, urllib2, time
      from lxml import etree
      from BeautifulSoup import BeautifulSoup
      from datetime import datetime, timedelta


     +# ------------------
     +# SET US UP THE VARS
     +# ------------------
     +
      TOP_URLS_FILE = "top_urls.csv"
      COUNTING_STATS_FILE = "counting_stats.txt"

      TOP_THRESHHOLD = 50
      MEDIUM_THRESHOLD = 10
      SMALL_THRESHOLD = 1

     -SLEEP_SECONDS = 2
     +SLEEP_SECONDS = 2 #Facebook seems to cut you off after ~600 requests


     +# --------------
     +# CUSTOMIZE THIS
     +# --------------
     +
     +def build_url_list_to_parse(start, end):
     +    '''
     +    Rewrite this function to fetch your list of URLs in what way
     +    seems best to you.
     +    main() down below will pass in two datetime objects like so:
     +    url_list = build_url_list_to_parse(start, end)
     +    '''
     +    from django.conf import settings
     +    from cannonball.stories.models import Story
     +
     +    story_list = Story.live_objects.filter(pubdate__gte=start, pubdate__lt=end).order_by('pubdate')
     +    url_list = []
     +    for story in story_list:
     +        url_list.append('http://www.spokesman.com'+story.get_absolute_url())
     +
     +    return url_list
     +
     +
     +# ---------
     +# UTILITIES
     +# ---------
     +
      def add_url_to_top_urls(like_count, total_count, share_count, url):
          f = open(TOP_URLS_FILE, "a")
     @@ -21,6 +85,13 @@ def add_url_to_top_urls(like_count, total_count, share_count, url):
          f.close()


     +def update_counting_stats(start, end, url_counting_dict):
     +    f = open(COUNTING_STATS_FILE, "a")
     +    record = "Period: %s through %s\n%s\n\n" % (start, end, url_counting_dict)
     +    f.write(record)
     +    f.close()
     +
     +
      def update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict):
          if total_count >= TOP_THRESHHOLD:
              add_url_to_top_urls(like_count, total_count, share_count, url)
     @@ -52,30 +123,11 @@ def fetch_facebook_stats_for_url(url, url_counting_dict):
          update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)


     -def update_counting_stats(start, end, url_counting_dict):
     -    f = open(COUNTING_STATS_FILE, "a")
     -    record = "Period: %s through %s\n%s\n\n" % (start, end, url_counting_dict)
     -    f.write(record)
     -    f.close()
     -
     -
     -def build_url_list_to_parse(start, end):
     -    '''
     -    Rewrite this function to fetch your list of URLs in what way
     -    seems best to you.
     -    '''
     -    from django.conf import settings
     -    from cannonball.stories.models import Story
     -
     -    story_list = Story.live_objects.filter(pubdate__gte=start, pubdate__lt=end).order_by('pubdate')
     -    url_list = []
     -    for story in story_list:
     -        url_list.append('http://www.spokesman.com'+story.get_absolute_url())
     -
     -    return url_list
     -
     -
     +# -------------
     +# RUN THE THING
     +# e.g. >> python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31
     +# -------------
     +
      def process_options(arglist=None):
          global options, args
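     The "CUSTOMIZE THIS" section added in this revision is the intended extension point; the Django/cannonball query is specific to spokesman.com. As one example of swapping it out, here's a hedged sketch that reads URLs from a plain text file instead (urls.txt is a hypothetical filename, one URL per line; start and end stay in the signature for compatibility but go unused):

        # Hypothetical drop-in replacement for build_url_list_to_parse():
        # read URLs from a text file instead of querying a Django model.
        def build_url_list_to_parse(start, end):
            '''start and end are accepted for interface compatibility, but unused.'''
            f = open("urls.txt")
            url_list = [line.strip() for line in f if line.strip()]
            f.close()
            return url_list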
  6. ryanpitts revised this gist Dec 13, 2011. 1 changed file with 3 additions and 3 deletions.

     collect_facebook_stats.py: 6 changes (3 additions, 3 deletions)

     @@ -98,12 +98,12 @@ def main(args=None):
          python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31
          A csv file will be created and named based on your TOP_URLS_FILE value;
     -    this file will log any story with more than 100 shares.
     +    this file will log any URL that surpass your TOP_THRESHHOLD share number.
          A txt file will be created and named based on your COUNTING_STATS_FILE
          value; it will log raw numbers for URLs that surpass your TOP_THRESHHOLD,
     -    MEDIUM_THRESHOLD = 10
     -    SMALL_THRESHOLD
     +    MEDIUM_THRESHOLD, and SMALL_THRESHOLD values.
          """

          if args is None:
  7. ryanpitts revised this gist Dec 13, 2011. 1 changed file with 7 additions and 2 deletions.

     collect_facebook_stats.py: 9 changes (7 additions, 2 deletions)

     @@ -97,8 +97,13 @@ def main(args=None):
          To run, enter something like the following at a command line:
          python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31
     -    A 'top_stories' csv file will be created, logging Facebook sharing stats
     -    and the URL for any story with more than 100 shares.
     +    A csv file will be created and named based on your TOP_URLS_FILE value;
     +    this file will log any story with more than 100 shares.
     +    A txt file will be created and named based on your COUNTING_STATS_FILE
     +    value; it will log raw numbers for URLs that surpass your TOP_THRESHHOLD,
     +    MEDIUM_THRESHOLD = 10
     +    SMALL_THRESHOLD
          """

          if args is None:
  8. ryanpitts created this gist Dec 13, 2011.

     collect_facebook_stats.py (144 lines, the initial version):

     import sys, traceback, optparse, urllib2, time
     from lxml import etree
     from BeautifulSoup import BeautifulSoup
     from datetime import datetime, timedelta


     TOP_URLS_FILE = "top_urls.csv"
     COUNTING_STATS_FILE = "counting_stats.txt"

     TOP_THRESHHOLD = 50
     MEDIUM_THRESHOLD = 10
     SMALL_THRESHOLD = 1

     SLEEP_SECONDS = 2


     def add_url_to_top_urls(like_count, total_count, share_count, url):
         f = open(TOP_URLS_FILE, "a")
         record = "%s,%s,%s,%s\n" % (like_count, total_count, share_count, url)
         f.write(record)
         f.close()


     def update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict):
         if total_count >= TOP_THRESHHOLD:
             add_url_to_top_urls(like_count, total_count, share_count, url)
             url_counting_dict['top_urls'] += 1
         if total_count >= MEDIUM_THRESHOLD:
             url_counting_dict['medium_urls'] += 1
         if total_count >= SMALL_THRESHOLD:
             url_counting_dict['small_urls'] += 1


     def fetch_facebook_stats_for_url(url, url_counting_dict):
         # Make the proper FB url and fetch it
         api_url = 'https://api.facebook.com/method/fql.query?query=select%%20%%20like_count,%%20total_count,%%20share_count,%%20click_count%%20from%%20link_stat%%20where%%20url=%%22%s%%22' % url
         facebook_xml = urllib2.urlopen(api_url)

         # Get the XML object
         tree = etree.parse(facebook_xml)

         # Namespace madness in lxml. Punting. Throw string at BeautifulSoup
         tree = etree.tostring(tree)
         soup = BeautifulSoup(tree)

         #Get the stats we want
         like_count = int(soup.like_count.contents[0])
         total_count = int(soup.total_count.contents[0])
         share_count = int(soup.share_count.contents[0])

         # Pass off to the stats functions
         update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)


     def update_counting_stats(start, end, url_counting_dict):
         f = open(COUNTING_STATS_FILE, "a")
         record = "Period: %s through %s\n%s\n\n" % (start, end, url_counting_dict)
         f.write(record)
         f.close()


     def build_url_list_to_parse(start, end):
         '''
         Rewrite this function to fetch your list of URLs in what way
         seems best to you.
         '''
         from django.conf import settings
         from cannonball.stories.models import Story

         story_list = Story.live_objects.filter(pubdate__gte=start, pubdate__lt=end).order_by('pubdate')
         url_list = []
         for story in story_list:
             url_list.append('http://www.spokesman.com'+story.get_absolute_url())

         return url_list



     def process_options(arglist=None):
         global options, args
         parser = optparse.OptionParser()
         parser.add_option(
             '-s', '--start', '--start_date',
             dest='start_date',
             help="Start date for Facebook stat collection.")
         parser.add_option(
             '-e', '--end', '--end_date',
             dest='end_date',
             help="End date for Facebook stat collection.")
         options, args = parser.parse_args(arglist)
         return options, args


     def main(args=None):
         """
         To run, enter something like the following at a command line:
         python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31
         A 'top_stories' csv file will be created, logging Facebook sharing stats
         and the URL for any story with more than 100 shares.
         """

         if args is None:
             args = sys.argv[1:]
         options, args = process_options(args)

         try:
             f = open(TOP_URLS_FILE)
             f.close()
         except:
             f = open(TOP_URLS_FILE, "a")
             record = "Likes,Total,Shares,URL\n"
             f.write(record)
             f.close()

         url_counting_dict = {
             'top_urls': 0,
             'medium_urls': 0,
             'small_urls': 0,
         }

         start = datetime.strptime(options.start_date,'%Y-%m-%d')
         end = datetime.strptime(options.end_date,'%Y-%m-%d')+timedelta(days=1)

         url_list = build_url_list_to_parse(start, end)

         for url in url_list:
             print url
             fetch_facebook_stats_for_url(url, url_counting_dict)
             time.sleep(SLEEP_SECONDS)

         update_counting_stats(options.start_date, options.end_date, url_counting_dict)


     if __name__ == '__main__':
         try:
             main()
         except Exception, e:
             sys.stderr.write('\n')
             traceback.print_exc(file=sys.stderr)
             sys.stderr.write('\n')
             sys.exit(1)
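     On the "Namespace madness in lxml" punt: the round-trip through etree.tostring() and BeautifulSoup works, but lxml can also pull the same values directly with a namespace-agnostic XPath. A sketch of that alternative, assuming the FQL response contains like_count, total_count, and share_count elements as the soup lookups above imply:

        # Alternative to the BeautifulSoup round-trip: local-name() matches
        # element names regardless of their XML namespace.
        from lxml import etree

        def extract_counts(facebook_xml):
            tree = etree.parse(facebook_xml)
            def grab(tag):
                values = tree.xpath('//*[local-name() = "%s"]/text()' % tag)
                return int(values[0])
            return grab('like_count'), grab('total_count'), grab('share_count')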