Created January 17, 2010 19:01
# print a user's tweets
import simplejson
import urllib2

# fetch the user's last 200 tweets as JSON
me = urllib2.urlopen('http://twitter.com/statuses/user_timeline/peteskomoroch.json?count=200')

for tweet in simplejson.loads(me.read()):
    print tweet['text'].encode('utf-8')
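The tf-idf script below reads a local me.stream file with one tweet per line. A minimal sketch for producing that file from this same endpoint (the path is the one hard-coded below; the username and count are just the example values above):

import simplejson
import urllib2

body = urllib2.urlopen('http://twitter.com/statuses/user_timeline/peteskomoroch.json?count=200').read()
out = open('/Users/criccomi/twitter/me.stream', 'w')
for tweet in simplejson.loads(body):
    # one tweet per line, as the tf-idf script expects
    out.write(tweet['text'].encode('utf-8').replace('\n', ' ') + '\n')
out.close()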
# me is a file with a user's tweets (one tweet per line)
# garden hose is a file of tweets sampled from twitter's stream (one tweet per line)
# sw is a stopwords file (one word per line)
# Computes tf-idf over the user's stream, then walks twitter's stream and
# computes a cosine similarity between me's stream and every tweet. In short,
# tries to find tweets related to your stream.
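#
# For reference, the scoring used below is standard tf-idf, with the garden
# hose supplying document frequencies:
#   tf(w)    = count of w in me.stream / total words in me.stream
#   idf(w)   = log(N / (1 + df(w))), N = tweets in garden hose, df(w) = tweets containing w
#   tfidf(w) = tf(w) * idf(w)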
# tfidf on a user's profile
import string
import math

me = open('/Users/criccomi/twitter/me.stream', 'r')
gh = open('/Users/criccomi/twitter/garden.hose.stream', 'r')
sw = open('/Users/criccomi/twitter/stopwords.txt', 'r')

me_words = {}
me_tfs = {}
me_total = 0
gh_words = {}
gh_total = 0
me_tfidfs = {}
stop_words = {}

# load stop words
for word in sw:
    stop_words[word.strip()] = True

# strip punctuation from a string
exclude = set(string.punctuation)

def pstrip(tostrip):
    return ''.join(ch for ch in tostrip if ch not in exclude)
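# e.g. pstrip("don't stop!") returns 'dont stop' (the apostrophe and bang are
# both in string.punctuation)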
# calculate tfs for all words in me.stream
for tweet in me:
    for word in pstrip(tweet).lower().replace("'s", '').split():
        me_words[word] = me_words.get(word, 0) + 1
        me_total = me_total + 1

for word, count in me_words.items():
    if word not in stop_words:
        me_tfs[word] = float(count) / me_total
# calculate idfs for all words in garden.hose.stream (document frequencies:
# count each word at most once per tweet)
for tweet in gh:
    distinct_words = {}
    for word in pstrip(tweet).lower().replace("'s", '').split():
        distinct_words[word] = True
    for word in distinct_words:
        gh_words[word] = gh_words.get(word, 0) + 1
    gh_total = gh_total + 1
# calculate important words in me.stream using tf x idf
# (me_tfs already excludes stop words, so no need to re-check here)
for word, tf in me_tfs.items():
    me_tfidfs[word] = tf * math.log(gh_total / float(1 + gh_words.get(word, 0)))
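# Illustrative numbers (assumed, not from the data): a word appearing 5 times
# in 1,000 words of me.stream (tf = 0.005) and in 10 of 100,000 garden hose
# tweets scores tfidf = 0.005 * log(100000 / 11) ~= 0.046, comfortably above
# the 0.02 cutoff below.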
# sort by tfidf ascending; drop low-scoring words from me_tfs, print the keepers
tfidf_items = sorted(me_tfidfs.items(), key=lambda x: x[1])

for ws in tfidf_items:
    if ws[1] < 0.02:
        del me_tfs[ws[0]]
    else:
        print ws
# re-open to reset stream
gh = open('/Users/criccomi/twitter/garden.hose.stream', 'r')
cosims = {}

# compute cosine similarities between every tweet and my twitter feed
for tweet in gh:
    tweet_words = {}
    tweet_total = 0
    cosim_numerator = 0
    for word in pstrip(tweet).lower().replace("'s", '').split():
        tweet_words[word] = tweet_words.get(word, 0) + 1
        tweet_total = tweet_total + 1
    # numerator: dot product of the tweet's tf vector with me's pruned tf vector
    for word, count in tweet_words.items():
        cosim_numerator = cosim_numerator + (float(count) / tweet_total) * me_tfs.get(word, 0)
    # simplified denominator: raw word counts stand in for the vector norms
    cosim_denominator = tweet_total * me_total
    if cosim_denominator > 0:
        cosims[tweet.strip()] = cosim_numerator / cosim_denominator
# sort by cosim ascending
cosims_items = sorted(cosims.items(), key=lambda x: x[1])

for tweet, score in cosims_items:
    print score, tweet
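The denominator above is a simplification; a textbook cosine similarity divides the dot product by the L2 norms of the two tf vectors. A minimal sketch of that variant, reusing me_tfs, pstrip, and the garden hose file from the script above:

import math

me_norm = math.sqrt(sum(tf * tf for tf in me_tfs.values()))
gh = open('/Users/criccomi/twitter/garden.hose.stream', 'r')
cosims = {}

for tweet in gh:
    tweet_words = {}
    for word in pstrip(tweet).lower().replace("'s", '').split():
        tweet_words[word] = tweet_words.get(word, 0) + 1
    total = float(sum(tweet_words.values()))
    if total == 0 or me_norm == 0:
        continue
    # tf vector for this tweet, its L2 norm, and the dot product with me's vector
    tweet_tfs = dict((w, c / total) for w, c in tweet_words.items())
    tweet_norm = math.sqrt(sum(tf * tf for tf in tweet_tfs.values()))
    dot = sum(tf * me_tfs.get(w, 0) for w, tf in tweet_tfs.items())
    cosims[tweet.strip()] = dot / (tweet_norm * me_norm)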
# unshorten a user's urls
import simplejson
import re
import urllib2

me = urllib2.urlopen('http://twitter.com/statuses/user_timeline/criccomini.json?count=200')

for tweet in simplejson.loads(me.read()):
    url = re.search(r"(?P<url>https?://[^\s]+)", tweet['text'])
    if url:
        url = url.group("url")
        # unshorten: urlopen follows redirects, so geturl() is the final URL
        try:
            url = urllib2.urlopen(url).geturl()
            print url
        except Exception:
            # skip dead or malformed links
            pass
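# This works because urllib2 transparently follows 3xx redirects, so for a
# shortener the returned geturl() is the destination, e.g. (hypothetical link):
#   urllib2.urlopen('http://bit.ly/abc123').geturl()  # -> the expanded URL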