
@ryanpitts
Created December 13, 2011 23:05

Revisions

  1. ryanpitts revised this gist Dec 15, 2011. 1 changed file with 6 additions and 1 deletion.

     collect_facebook_stats.py: 7 changes (6 additions, 1 deletion)

     @@ -13,13 +13,18 @@
      - create a txt file that's named based on the value of COUNTING_STATS_FILE.
      It logs raw numbers for URLs that surpass the TOP_THRESHHOLD, MEDIUM_THRESHOLD
     -and SMALL_THRESHOLD values.
     +and SMALL_THRESHOLD values. It also logs the total number of URLs scanned.
      This file is updated only when the loop finishes, so an exception or a lost
      connection will drop these stats. I added this tracking just out of curiosity,
      but didn't really feel like adding the overhead of writing to file every
      single time through the loop.
     +- create a file called `failed_urls.txt` to log URLs that ... fail. Once in
     +a while Facebook throws a bad status code, which particularly sucks when
     +you're on day 27 of a 30-day run. Logging failed URLs in this file allows
     +your loop to keep on going. Later on, just run the failed URLs manually.
      Notes:
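     The note about re-running failed URLs manually suggests a companion step. Here's a minimal, hypothetical sketch of that rerun (not part of the gist), assuming fetch_facebook_stats_for_url(), SLEEP_SECONDS, and a url_counting_dict are in scope, and that failed_urls.txt holds one quoted, comma-terminated URL per line as written by the except block added in revision 3 below:

        # Hypothetical helper, not in the gist: retry the URLs logged to
        # failed_urls.txt. Each record looks like: 'http://example.com/story',
        import time

        def rerun_failed_urls(url_counting_dict):
            f = open("failed_urls.txt")
            lines = f.readlines()
            f.close()

            for line in lines:
                # strip the newline, trailing comma, and surrounding quotes
                url = line.strip().rstrip(",").strip("'")
                if url:
                    print url
                    fetch_facebook_stats_for_url(url, url_counting_dict)
                    time.sleep(SLEEP_SECONDS)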
  2. ryanpitts revised this gist Dec 15, 2011. 1 changed file with 5 additions and 3 deletions.

     collect_facebook_stats.py: 8 changes (5 additions, 3 deletions)

     @@ -23,8 +23,8 @@
      Notes:
     -- the sleep interval is set at 2 seconds because Facebook seems to cut you off
     -after ~600 requests at a 1-second interval. At a 2-second interval, I've been
     +- the sleep interval is set at 2.5 seconds because Facebook seems to cut you
     +off after ~600 requests at a 1-second interval. At this interval, I've been
      able to run off a month at a time without trouble. (For us, that's about 3,000
      story URLs.)
     @@ -46,7 +46,7 @@
      MEDIUM_THRESHOLD = 10
      SMALL_THRESHOLD = 1

     -SLEEP_SECONDS = 2
     +SLEEP_SECONDS = 2.5


      # --------------
     @@ -181,12 +181,14 @@ def main(args=None):
              'top_urls': 0,
              'medium_urls': 0,
              'small_urls': 0,
     +        'urls_scanned': 0,
          }

          start = datetime.strptime(options.start_date,'%Y-%m-%d')
          end = datetime.strptime(options.end_date,'%Y-%m-%d')+timedelta(days=1)

          url_list = build_url_list_to_parse(start, end)
     +    url_counting_dict['urls_scanned'] = len(url_list)

          for url in url_list:
              print url
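     For scale, the revised numbers work out to about 3,000 URLs x 2.5 seconds, or roughly 125 minutes of sleeping per monthly run before any network time. If the fixed interval ever proves too regular, one untested variation (a suggestion, not in the gist) is to jitter the sleep around the base value:

        # Untested variation: sleep a random interval centered on SLEEP_SECONDS
        # so requests don't hit the API on a perfectly regular clock.
        import random, time

        SLEEP_SECONDS = 2.5

        def polite_sleep():
            # anywhere from 2.0 to 3.0 seconds when SLEEP_SECONDS is 2.5
            time.sleep(SLEEP_SECONDS + random.uniform(-0.5, 0.5))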
  3. ryanpitts revised this gist Dec 14, 2011. 1 changed file with 20 additions and 14 deletions.

     collect_facebook_stats.py: 34 changes (20 additions, 14 deletions)

     @@ -42,7 +42,7 @@
      TOP_URLS_FILE = "top_urls.csv"
      COUNTING_STATS_FILE = "counting_stats.txt"

     -TOP_THRESHHOLD = 50
     +TOP_THRESHHOLD = 100
      MEDIUM_THRESHOLD = 10
      SMALL_THRESHOLD = 1

     @@ -105,23 +105,29 @@ def update_stats_for_url(like_count, total_count, share_count, url, url_counting
      def fetch_facebook_stats_for_url(url, url_counting_dict):
          # Make the proper FB url and fetch it
          api_url = 'https://api.facebook.com/method/fql.query?query=select%%20%%20like_count,%%20total_count,%%20share_count,%%20click_count%%20from%%20link_stat%%20where%%20url=%%22%s%%22' % url
     -    facebook_xml = urllib2.urlopen(api_url)

     -    # Get the XML object
     -    tree = etree.parse(facebook_xml)
     +    try:
     +        facebook_xml = urllib2.urlopen(api_url)
     +
     +        # Get the XML object
     +        tree = etree.parse(facebook_xml)

     -    # Namespace madness in lxml. Punting. Throw string at BeautifulSoup
     -    tree = etree.tostring(tree)
     -    soup = BeautifulSoup(tree)
     +        # Namespace madness in lxml. Punting. Throw string at BeautifulSoup
     +        tree = etree.tostring(tree)
     +        soup = BeautifulSoup(tree)

     -    #Get the stats we want
     -    like_count = int(soup.like_count.contents[0])
     -    total_count = int(soup.total_count.contents[0])
     -    share_count = int(soup.share_count.contents[0])
     +        #Get the stats we want
     +        like_count = int(soup.like_count.contents[0])
     +        total_count = int(soup.total_count.contents[0])
     +        share_count = int(soup.share_count.contents[0])

     -    # Pass off to the stats functions
     -    update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)
     +        # Pass off to the stats functions
     +        update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)
     +    except:
     +        f = open("failed_urls.txt", "a")
     +        record = "'%s',\n" % (url)
     +        f.write(record)
     +        f.close()


      # -------------
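     One caveat with the bare except: above: it swallows everything, including parsing bugs and a stray KeyboardInterrupt, and files the URL away as "failed." A narrower alternative (a suggestion, not part of the gist) catches only the network errors urllib2 raises; HTTPError is a subclass of URLError, so one clause covers both:

        # Suggested alternative: treat only network-level errors as a failed
        # URL, so a real bug in the parsing code surfaces instead of being
        # silently logged away.
        import urllib2

        def fetch_or_log_failure(api_url, url):
            try:
                return urllib2.urlopen(api_url)
            except urllib2.URLError:
                f = open("failed_urls.txt", "a")
                f.write("'%s',\n" % (url))
                f.close()
                return None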
  4. ryanpitts revised this gist Dec 14, 2011. 1 changed file with 1 addition and 1 deletion.

     collect_facebook_stats.py: 2 changes (1 addition, 1 deletion)

     @@ -46,7 +46,7 @@
      MEDIUM_THRESHOLD = 10
      SMALL_THRESHOLD = 1

     -SLEEP_SECONDS = 2 #Facebook seems to cut you off after ~600 requests
     +SLEEP_SECONDS = 2


      # --------------
  5. ryanpitts revised this gist Dec 14, 2011. 1 changed file with 76 additions and 24 deletions.

     collect_facebook_stats.py: 100 changes (76 additions, 24 deletions)

     @@ -1,18 +1,82 @@
     +"""
     +One of our editors thought it would be fun to run a year-end list of our
     +most-shared stories. This script uses the Facebook API to fetch stats for
     +a given list of URLs.
     +The script will:
     +- create a csv file that's named based on the value of TOP_URLS_FILE. This
     +file logs URLs that have enough shares to surpass your TOP_THRESHHOLD number.
     +This file is updated throughout the loop, on each match.
     +- create a txt file that's named based on the value of COUNTING_STATS_FILE.
     +It logs raw numbers for URLs that surpass the TOP_THRESHHOLD, MEDIUM_THRESHOLD
     +and SMALL_THRESHOLD values.
     +This file is updated only when the loop finishes, so an exception or a lost
     +connection will drop these stats. I added this tracking just out of curiosity,
     +but didn't really feel like adding the overhead of writing to file every
     +single time through the loop.
     +Notes:
     +- the sleep interval is set at 2 seconds because Facebook seems to cut you off
     +after ~600 requests at a 1-second interval. At a 2-second interval, I've been
     +able to run off a month at a time without trouble. (For us, that's about 3,000
     +story URLs.)
     +"""
      import sys, traceback, optparse, urllib2, time
      from lxml import etree
      from BeautifulSoup import BeautifulSoup
      from datetime import datetime, timedelta


     +# ------------------
     +# SET US UP THE VARS
     +# ------------------
     +
      TOP_URLS_FILE = "top_urls.csv"
      COUNTING_STATS_FILE = "counting_stats.txt"

      TOP_THRESHHOLD = 50
      MEDIUM_THRESHOLD = 10
      SMALL_THRESHOLD = 1

     -SLEEP_SECONDS = 2
     +SLEEP_SECONDS = 2 #Facebook seems to cut you off after ~600 requests


     +# --------------
     +# CUSTOMIZE THIS
     +# --------------
     +
     +def build_url_list_to_parse(start, end):
     +    '''
     +    Rewrite this function to fetch your list of URLs in what way
     +    seems best to you.
     +    main() down below will pass in two datetime objects like so:
     +    url_list = build_url_list_to_parse(start, end)
     +    '''
     +    from django.conf import settings
     +    from cannonball.stories.models import Story
     +
     +    story_list = Story.live_objects.filter(pubdate__gte=start, pubdate__lt=end).order_by('pubdate')
     +    url_list = []
     +    for story in story_list:
     +        url_list.append('http://www.spokesman.com'+story.get_absolute_url())
     +
     +    return url_list
     +
     +
     +# ---------
     +# UTILITIES
     +# ---------
     +
      def add_url_to_top_urls(like_count, total_count, share_count, url):
          f = open(TOP_URLS_FILE, "a")
     @@ -21,6 +85,13 @@ def add_url_to_top_urls(like_count, total_count, share_count, url):
          f.close()


     +def update_counting_stats(start, end, url_counting_dict):
     +    f = open(COUNTING_STATS_FILE, "a")
     +    record = "Period: %s through %s\n%s\n\n" % (start, end, url_counting_dict)
     +    f.write(record)
     +    f.close()
     +
     +
      def update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict):
          if total_count >= TOP_THRESHHOLD:
              add_url_to_top_urls(like_count, total_count, share_count, url)
     @@ -52,30 +123,11 @@ def fetch_facebook_stats_for_url(url, url_counting_dict):
          update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)


     -def update_counting_stats(start, end, url_counting_dict):
     -    f = open(COUNTING_STATS_FILE, "a")
     -    record = "Period: %s through %s\n%s\n\n" % (start, end, url_counting_dict)
     -    f.write(record)
     -    f.close()
     -
     -
     -def build_url_list_to_parse(start, end):
     -    '''
     -    Rewrite this function to fetch your list of URLs in what way
     -    seems best to you.
     -    '''
     -    from django.conf import settings
     -    from cannonball.stories.models import Story
     -
     -    story_list = Story.live_objects.filter(pubdate__gte=start, pubdate__lt=end).order_by('pubdate')
     -    url_list = []
     -    for story in story_list:
     -        url_list.append('http://www.spokesman.com'+story.get_absolute_url())
     -
     -    return url_list
     -
     -
     +# -------------
     +# RUN THE THING
     +# e.g. >> python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31
     +# -------------
     +
      def process_options(arglist=None):
          global options, args
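     The "CUSTOMIZE THIS" section added in this revision is the intended extension point; the Django/cannonball query is specific to spokesman.com. As one example of swapping it out, here's a hedged sketch that reads URLs from a plain text file instead (urls.txt is a hypothetical filename, one URL per line; start and end stay in the signature for compatibility but go unused):

        # Hypothetical drop-in replacement for build_url_list_to_parse():
        # read URLs from a text file instead of querying a Django model.
        def build_url_list_to_parse(start, end):
            '''start and end are accepted for interface compatibility, but unused.'''
            f = open("urls.txt")
            url_list = [line.strip() for line in f if line.strip()]
            f.close()
            return url_list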
  6. ryanpitts revised this gist Dec 13, 2011. 1 changed file with 3 additions and 3 deletions.

     collect_facebook_stats.py: 6 changes (3 additions, 3 deletions)

     @@ -98,12 +98,12 @@ def main(args=None):
          python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31
          A csv file will be created and named based on your TOP_URLS_FILE value;
     -    this file will log any story with more than 100 shares.
     +    this file will log any URL that surpass your TOP_THRESHHOLD share number.
          A txt file will be created and named based on your COUNTING_STATS_FILE
          value; it will log raw numbers for URLs that surpass your TOP_THRESHHOLD,
     -    MEDIUM_THRESHOLD = 10
     -    SMALL_THRESHOLD
     +    MEDIUM_THRESHOLD, and SMALL_THRESHOLD values.
          """

          if args is None:
  7. ryanpitts revised this gist Dec 13, 2011. 1 changed file with 7 additions and 2 deletions.

     collect_facebook_stats.py: 9 changes (7 additions, 2 deletions)

     @@ -97,8 +97,13 @@ def main(args=None):
          To run, enter something like the following at a command line:
          python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31
     -    A 'top_stories' csv file will be created, logging Facebook sharing stats
     -    and the URL for any story with more than 100 shares.
     +    A csv file will be created and named based on your TOP_URLS_FILE value;
     +    this file will log any story with more than 100 shares.
     +    A txt file will be created and named based on your COUNTING_STATS_FILE
     +    value; it will log raw numbers for URLs that surpass your TOP_THRESHHOLD,
     +    MEDIUM_THRESHOLD = 10
     +    SMALL_THRESHOLD
          """

          if args is None:
  8. ryanpitts created this gist Dec 13, 2011.

     collect_facebook_stats.py (144 lines, the initial version):

     import sys, traceback, optparse, urllib2, time
     from lxml import etree
     from BeautifulSoup import BeautifulSoup
     from datetime import datetime, timedelta


     TOP_URLS_FILE = "top_urls.csv"
     COUNTING_STATS_FILE = "counting_stats.txt"

     TOP_THRESHHOLD = 50
     MEDIUM_THRESHOLD = 10
     SMALL_THRESHOLD = 1

     SLEEP_SECONDS = 2


     def add_url_to_top_urls(like_count, total_count, share_count, url):
         f = open(TOP_URLS_FILE, "a")
         record = "%s,%s,%s,%s\n" % (like_count, total_count, share_count, url)
         f.write(record)
         f.close()


     def update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict):
         if total_count >= TOP_THRESHHOLD:
             add_url_to_top_urls(like_count, total_count, share_count, url)
             url_counting_dict['top_urls'] += 1
         if total_count >= MEDIUM_THRESHOLD:
             url_counting_dict['medium_urls'] += 1
         if total_count >= SMALL_THRESHOLD:
             url_counting_dict['small_urls'] += 1


     def fetch_facebook_stats_for_url(url, url_counting_dict):
         # Make the proper FB url and fetch it
         api_url = 'https://api.facebook.com/method/fql.query?query=select%%20%%20like_count,%%20total_count,%%20share_count,%%20click_count%%20from%%20link_stat%%20where%%20url=%%22%s%%22' % url
         facebook_xml = urllib2.urlopen(api_url)

         # Get the XML object
         tree = etree.parse(facebook_xml)

         # Namespace madness in lxml. Punting. Throw string at BeautifulSoup
         tree = etree.tostring(tree)
         soup = BeautifulSoup(tree)

         #Get the stats we want
         like_count = int(soup.like_count.contents[0])
         total_count = int(soup.total_count.contents[0])
         share_count = int(soup.share_count.contents[0])

         # Pass off to the stats functions
         update_stats_for_url(like_count, total_count, share_count, url, url_counting_dict)


     def update_counting_stats(start, end, url_counting_dict):
         f = open(COUNTING_STATS_FILE, "a")
         record = "Period: %s through %s\n%s\n\n" % (start, end, url_counting_dict)
         f.write(record)
         f.close()


     def build_url_list_to_parse(start, end):
         '''
         Rewrite this function to fetch your list of URLs in what way
         seems best to you.
         '''
         from django.conf import settings
         from cannonball.stories.models import Story

         story_list = Story.live_objects.filter(pubdate__gte=start, pubdate__lt=end).order_by('pubdate')
         url_list = []
         for story in story_list:
             url_list.append('http://www.spokesman.com'+story.get_absolute_url())

         return url_list



     def process_options(arglist=None):
         global options, args
         parser = optparse.OptionParser()
         parser.add_option(
             '-s', '--start', '--start_date',
             dest='start_date',
             help="Start date for Facebook stat collection.")
         parser.add_option(
             '-e', '--end', '--end_date',
             dest='end_date',
             help="End date for Facebook stat collection.")
         options, args = parser.parse_args(arglist)
         return options, args


     def main(args=None):
         """
         To run, enter something like the following at a command line:
         python collect_facebook_stats.py --start=2011-01-01 --end=2011-01-31
         A 'top_stories' csv file will be created, logging Facebook sharing stats
         and the URL for any story with more than 100 shares.
         """

         if args is None:
             args = sys.argv[1:]
         options, args = process_options(args)

         try:
             f = open(TOP_URLS_FILE)
             f.close()
         except:
             f = open(TOP_URLS_FILE, "a")
             record = "Likes,Total,Shares,URL\n"
             f.write(record)
             f.close()

         url_counting_dict = {
             'top_urls': 0,
             'medium_urls': 0,
             'small_urls': 0,
         }

         start = datetime.strptime(options.start_date,'%Y-%m-%d')
         end = datetime.strptime(options.end_date,'%Y-%m-%d')+timedelta(days=1)

         url_list = build_url_list_to_parse(start, end)

         for url in url_list:
             print url
             fetch_facebook_stats_for_url(url, url_counting_dict)
             time.sleep(SLEEP_SECONDS)

         update_counting_stats(options.start_date, options.end_date, url_counting_dict)


     if __name__ == '__main__':
         try:
             main()
         except Exception, e:
             sys.stderr.write('\n')
             traceback.print_exc(file=sys.stderr)
             sys.stderr.write('\n')
             sys.exit(1)
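     On the "Namespace madness in lxml" punt: the round-trip through etree.tostring() and BeautifulSoup works, but lxml can also pull the same values directly with a namespace-agnostic XPath. A sketch of that alternative, assuming the FQL response contains like_count, total_count, and share_count elements as the soup lookups above imply:

        # Alternative to the BeautifulSoup round-trip: local-name() matches
        # element names regardless of their XML namespace.
        from lxml import etree

        def extract_counts(facebook_xml):
            tree = etree.parse(facebook_xml)
            def grab(tag):
                values = tree.xpath('//*[local-name() = "%s"]/text()' % tag)
                return int(values[0])
            return grab('like_count'), grab('total_count'), grab('share_count')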