Skip to content

Instantly share code, notes, and snippets.

@zvoase
Created December 10, 2008 23:04
Show Gist options
  • Save zvoase/34531 to your computer and use it in GitHub Desktop.
Save zvoase/34531 to your computer and use it in GitHub Desktop.
# tweet_correlation.py - Calculate the correlation of two users' tweet clouds,
# based on data from http://tweetstats.com. You need to have visited this site
# before using the app, as this app relies on the cached data made when you
# first use the service.
#
# This module also exports functionality for retrieving and manipulating a
# cached tweet cloud.
##############################################################################
# Copyright (c) 2008 Zachary Voase
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
import math
import operator
import urllib2
try:
import json
except ImportError:
try:
import simplejson as json
except ImportError:
from django.utils import simplejson as json
class TweetCloud(dict):
    """
    A user's cached tweetstats.com tag cloud, stored as a dictionary.

    Each key is an entry of the cloud and each value is the count reported
    for it. Entries whose key starts with '@' are treated as replies and
    all other entries as plain words (see filter_replies and filter_words).

    Unless ``safe`` is true, the constructor downloads the cloud right
    away, so it needs network access and a cloud that tweetstats has
    already cached for ``username``.

    >>> tweet_cloud = TweetCloud('zvoase')
    >>> print repr(tweet_cloud)
    TweetCloud('zvoase')
    >>> print tweet_cloud
    http://tweetstats.com/cache/zvoase/tag_cloud.json
    >>> print tweet_cloud['django'] #doctest: +SKIP
    65
    >>> tweet_cloud_2 = TweetCloud('stephenfry')
    >>> print tweet_cloud.compare(tweet_cloud)
    1.0
    >>> print tweet_cloud.compare(tweet_cloud_2) #doctest: +SKIP
    0.71247443083593875
    >>> comparison_1 = tweet_cloud.compare(tweet_cloud_2)
    >>> comparison_2 = tweet_cloud_2.compare(tweet_cloud)
    >>> comparison_1 == comparison_2
    True
    """

    # Location of the cached cloud for a given username; shared by __str__
    # and _load_data so the two can never drift apart.
    _URL_TEMPLATE = 'http://tweetstats.com/cache/%s/tag_cloud.json'

    def __init__(self, username, safe=False):
        """Create the cloud for ``username``.

        When ``safe`` is true nothing is downloaded and the cloud starts
        out empty; otherwise the cached data is fetched immediately.
        """
        super(TweetCloud, self).__init__()
        self.username = username
        if not safe:
            self._load_data()

    def __repr__(self):
        return 'TweetCloud(%r)' % (self.username,)

    def __str__(self):
        """Return the URL the cloud data is read from."""
        return self._URL_TEMPLATE % (self.username,)

    def _load_data(self):
        """Load tweet cloud data from http://tweetstats.com.

        Raises Exception with a hint to visit tweetstats when the server
        answers 404; any other HTTP error is passed on unchanged.
        """
        try:
            conn = urllib2.urlopen(self._URL_TEMPLATE % (self.username,))
        except urllib2.HTTPError as exc:
            if exc.code == 404:
                raise Exception('Please visit tweetstats '
                    '(http://tweetstats.com) before using this application.')
            # Other HTTP failures used to be swallowed here, after which
            # the code below failed on the unbound ``conn``. Surface the
            # real error instead.
            raise
        try:
            data = json.load(conn)
        finally:
            conn.close()
        # Assumes the payload is a sequence of (word, count) pairs; that
        # is the only shape this unpacking accepts.
        for word, count in data:
            self[word] = count

    def _filtered(self, keep):
        """Return a new cloud with the entries whose key satisfies ``keep``.

        The copy is built with ``safe=True`` so that nothing is downloaded
        again.
        """
        subset = self.__class__(self.username, safe=True)
        for word, count in self.items():
            if keep(word):
                subset[word] = count
        return subset

    def filter_replies(self):
        """Return a new TweetCloud with stats only on this one's replies."""
        return self._filtered(lambda word: word.startswith('@'))

    def filter_words(self):
        """Return a new TweetCloud with stats only on this one's words."""
        return self._filtered(lambda word: not word.startswith('@'))

    def compare(self, tag_cloud):
        """Return a comparison coefficient between two TweetClouds.

        The coefficient is twice the sum, over the keys both clouds share,
        of the smaller of the two counts, divided by the sum of all counts
        in both clouds. It is symmetric in its two operands; identical
        non-empty clouds give 1.0 and clouds without shared keys give 0.0.

        When the counts of both clouds add up to zero (for example two
        empty clouds) there is nothing to compare and 0.0 is returned
        instead of dividing by zero.
        """
        shared = set(self).intersection(tag_cloud)
        overlap = sum(min(self[key], tag_cloud[key]) for key in shared)
        total = sum(self.values()) + sum(tag_cloud.values())
        if not total:
            return 0.0
        return 2 * (float(overlap) / total)
# Values for the ``consider`` argument of compare(): use every entry of the
# clouds, only the plain words, or only the '@' replies.
BOTH, WORDS_ONLY, REPLIES_ONLY = range(3)
def compare(username1, username2, consider=BOTH):
    """Return the comparison coefficient of two users' tweet clouds.

    Both clouds are loaded through TweetCloud, then compared with
    TweetCloud.compare. ``consider`` selects what is compared: WORDS_ONLY
    restricts both clouds to their plain words, REPLIES_ONLY to their
    replies, and any other value (BOTH by default) compares the clouds
    unfiltered.
    """
    first = TweetCloud(username1)
    second = TweetCloud(username2)
    if consider == WORDS_ONLY:
        return first.filter_words().compare(second.filter_words())
    if consider == REPLIES_ONLY:
        return first.filter_replies().compare(second.filter_replies())
    return first.compare(second)
if __name__ == '__main__':
    # Executed as a script: run this module's doctests with verbose output.
    # The TweetCloud examples build clouds without safe=True, so they fetch
    # data from tweetstats.com.
    from doctest import testmod
    testmod(verbose=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment