Created January 17, 2010 19:01
# print a user's tweets
import simplejson
import urllib2

# fetch the user's last 200 tweets as JSON
me = urllib2.urlopen('http://twitter.com/statuses/user_timeline/peteskomoroch.json?count=200')

for tweet in simplejson.loads(me.read()):
    print tweet['text'].encode('utf-8')
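The tf-idf script below reads a local me.stream file with one tweet per line. A minimal sketch for producing that file from this same endpoint (the path is the one hard-coded below; the username and count are just the example values above):

import simplejson
import urllib2

body = urllib2.urlopen('http://twitter.com/statuses/user_timeline/peteskomoroch.json?count=200').read()
out = open('/Users/criccomi/twitter/me.stream', 'w')
for tweet in simplejson.loads(body):
    # one tweet per line, as the tf-idf script expects
    out.write(tweet['text'].encode('utf-8').replace('\n', ' ') + '\n')
out.close()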
# me is a file with a user's tweets (one tweet per line)
# garden hose is a file of tweets sampled from twitter's stream (one tweet per line)
# sw is a stopwords file (one word per line)
# Computes tf-idf over the user's stream, then walks twitter's stream and
# computes a cosine similarity between me's stream and every tweet. In short,
# tries to find tweets related to your stream.
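#
# For reference, the scoring used below is standard tf-idf, with the garden
# hose supplying document frequencies:
#   tf(w)    = count of w in me.stream / total words in me.stream
#   idf(w)   = log(N / (1 + df(w))), N = tweets in garden hose, df(w) = tweets containing w
#   tfidf(w) = tf(w) * idf(w)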
# tfidf on a user's profile
import string
import math

me = open('/Users/criccomi/twitter/me.stream', 'r')
gh = open('/Users/criccomi/twitter/garden.hose.stream', 'r')
sw = open('/Users/criccomi/twitter/stopwords.txt', 'r')

me_words = {}
me_tfs = {}
me_total = 0
gh_words = {}
gh_total = 0
me_tfidfs = {}
stop_words = {}

# load stop words
for word in sw:
    stop_words[word.strip()] = True

# strip punctuation from a string
exclude = set(string.punctuation)

def pstrip(tostrip):
    return ''.join(ch for ch in tostrip if ch not in exclude)
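# e.g. pstrip("don't stop!") returns 'dont stop' (the apostrophe and bang are
# both in string.punctuation)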
# calculate tfs for all words in me.stream
for tweet in me:
    for word in pstrip(tweet).lower().replace("'s", '').split():
        me_words[word] = me_words.get(word, 0) + 1
        me_total = me_total + 1

for word, count in me_words.items():
    if word not in stop_words:
        me_tfs[word] = float(count) / me_total
# calculate idfs for all words in garden.hose.stream (document frequencies:
# count each word at most once per tweet)
for tweet in gh:
    distinct_words = {}
    for word in pstrip(tweet).lower().replace("'s", '').split():
        distinct_words[word] = True
    for word in distinct_words:
        gh_words[word] = gh_words.get(word, 0) + 1
    gh_total = gh_total + 1
# calculate important words in me.stream using tf x idf
# (me_tfs already excludes stop words, so no need to re-check here)
for word, tf in me_tfs.items():
    me_tfidfs[word] = tf * math.log(gh_total / float(1 + gh_words.get(word, 0)))
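# Illustrative numbers (assumed, not from the data): a word appearing 5 times
# in 1,000 words of me.stream (tf = 0.005) and in 10 of 100,000 garden hose
# tweets scores tfidf = 0.005 * log(100000 / 11) ~= 0.046, comfortably above
# the 0.02 cutoff below.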
# sort by tfidf ascending; drop low-scoring words from me_tfs, print the keepers
tfidf_items = sorted(me_tfidfs.items(), key=lambda x: x[1])

for ws in tfidf_items:
    if ws[1] < 0.02:
        del me_tfs[ws[0]]
    else:
        print ws
# re-open to reset stream
gh = open('/Users/criccomi/twitter/garden.hose.stream', 'r')
cosims = {}

# compute cosine similarities between every tweet and my twitter feed
for tweet in gh:
    tweet_words = {}
    tweet_total = 0
    cosim_numerator = 0
    for word in pstrip(tweet).lower().replace("'s", '').split():
        tweet_words[word] = tweet_words.get(word, 0) + 1
        tweet_total = tweet_total + 1
    # numerator: dot product of the tweet's tf vector with me's pruned tf vector
    for word, count in tweet_words.items():
        cosim_numerator = cosim_numerator + (float(count) / tweet_total) * me_tfs.get(word, 0)
    # simplified denominator: raw word counts stand in for the vector norms
    cosim_denominator = tweet_total * me_total
    if cosim_denominator > 0:
        cosims[tweet.strip()] = cosim_numerator / cosim_denominator
# sort by cosim ascending
cosims_items = sorted(cosims.items(), key=lambda x: x[1])

for tweet, score in cosims_items:
    print score, tweet
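The denominator above is a simplification; a textbook cosine similarity divides the dot product by the L2 norms of the two tf vectors. A minimal sketch of that variant, reusing me_tfs, pstrip, and the garden hose file from the script above:

import math

me_norm = math.sqrt(sum(tf * tf for tf in me_tfs.values()))
gh = open('/Users/criccomi/twitter/garden.hose.stream', 'r')
cosims = {}

for tweet in gh:
    tweet_words = {}
    for word in pstrip(tweet).lower().replace("'s", '').split():
        tweet_words[word] = tweet_words.get(word, 0) + 1
    total = float(sum(tweet_words.values()))
    if total == 0 or me_norm == 0:
        continue
    # tf vector for this tweet, its L2 norm, and the dot product with me's vector
    tweet_tfs = dict((w, c / total) for w, c in tweet_words.items())
    tweet_norm = math.sqrt(sum(tf * tf for tf in tweet_tfs.values()))
    dot = sum(tf * me_tfs.get(w, 0) for w, tf in tweet_tfs.items())
    cosims[tweet.strip()] = dot / (tweet_norm * me_norm)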
# unshorten a user's urls
import simplejson
import re
import urllib2

me = urllib2.urlopen('http://twitter.com/statuses/user_timeline/criccomini.json?count=200')

for tweet in simplejson.loads(me.read()):
    url = re.search(r"(?P<url>https?://[^\s]+)", tweet['text'])
    if url:
        url = url.group("url")
        # unshorten: urlopen follows redirects, so geturl() is the final URL
        try:
            url = urllib2.urlopen(url).geturl()
            print url
        except Exception:
            # skip dead or malformed links
            pass
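# This works because urllib2 transparently follows 3xx redirects, so for a
# shortener the returned geturl() is the destination, e.g. (hypothetical link):
#   urllib2.urlopen('http://bit.ly/abc123').geturl()  # -> the expanded URL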