Skip to content

Instantly share code, notes, and snippets.

@heuristicfencepost
Created March 29, 2011 05:55
Show Gist options
  • Save heuristicfencepost/891869 to your computer and use it in GitHub Desktop.
Save heuristicfencepost/891869 to your computer and use it in GitHub Desktop.
Python scripts used in an initial analysis of the Cassandra data model
from twitter.api import Twitter
import pycassa
from itertools import ifilterfalse
# Query to use when finding tweets; populate() passes it as the `q`
# argument of the Twitter search call.
searchquery = "#cassandra"
# Borrowed from the itertools docs
def unique_everseen(iterable, key=None):
    """Yield the unique elements of *iterable*, preserving first-seen order.

    Every element (or, when *key* is given, every ``key(element)``) that has
    been produced so far is remembered, so a value is yielded only the first
    time it is encountered.

    Args:
        iterable: Any iterable. The values used for the uniqueness check
            must be hashable, since they are stored in a set.
        key: Optional one-argument callable that maps an element to the
            value used for the uniqueness check. The original element, not
            the key, is what gets yielded.

    Yields:
        The elements of *iterable* in their original order, minus repeats.
    """
    seen = set()
    for element in iterable:
        # With no key function the element itself is the uniqueness marker.
        marker = element if key is None else key(element)
        if marker not in seen:
            seen.add(marker)
            yield element
def populate():
    """Search Twitter for ``searchquery`` and store the results in Cassandra.

    Each tweet returned by the search is inserted into the "tweets" column
    family under its author's screen name (``from_user``), in a column keyed
    by the tweet's ``id_str``. Afterwards the profile of every distinct
    author is fetched and inserted into the "authors" column family under
    the profile's ``screen_name``.

    Progress is reported on standard output. Nothing is returned.
    """
    cassandra = pycassa.connect("twitter")
    authors_cf = pycassa.ColumnFamily(cassandra, "authors")
    tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

    # The Twitter API returns Unicode values for all string results. In
    # addition, Pycassa appears to complain when we give it a string encoded
    # in something other than UTF-8, or a non-string value. To get around
    # this we perform an intelligent string conversion: a Unicode value is
    # returned as its UTF-8 encoding, anything else as its standard string
    # representation.
    def smart_str(val):
        if isinstance(val, unicode):
            return val.encode('utf-8')
        return str(val)

    # Return a copy of a Twitter result mapping with every value run
    # through smart_str, ready to be handed to Pycassa.
    def stringify(record):
        return dict((k, smart_str(v)) for (k, v) in record.iteritems())

    search = Twitter(domain="search.twitter.com")
    twitter = Twitter()

    # NOTE(review): rpp is presumably "results per page" -- confirm against
    # the Twitter search API.
    search_results = search.search(q=searchquery, rpp=100)
    tweets = search_results["results"]

    # Dump one sample tweet for inspection. Guarded so that an empty result
    # set no longer aborts the run with an IndexError.
    if tweets:
        print(tweets[0])

    for tweet in tweets:
        tweets_cf.insert(tweet["from_user"], {tweet["id_str"]: stringify(tweet)})
    print("Found %d tweets" % len(tweets))

    author_names = list(unique_everseen([t["from_user"] for t in tweets]))
    print("Found %d distinct authors" % len(author_names))

    # Convert everything into strings; in Cassandra the name and value of a
    # column are apparently normally converted into strings.
    for name in author_names:
        author_info = twitter.users.show(id=name)
        author_name = author_info['screen_name']
        authors_cf.insert(author_name, stringify(author_info))
        print("Added data for author %s" % author_name)
# Load the data only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    populate()
import pycassa
cassandra = pycassa.connect("twitter")
authors_cf = pycassa.ColumnFamily(cassandra, "authors")

# Print the stored language and follower count for each author row returned
# by the range query. Only these two columns are requested.
# NOTE(review): '!' and '~' are the first and last printable non-space ASCII
# characters; presumably they are meant as start/end row keys that bracket
# every screen name -- confirm against the pycassa get_range documentation.
wanted_columns = ["lang", "followers_count"]
for (author_key, columns) in authors_cf.get_range('!', '~', wanted_columns):
    report = (author_key, columns["lang"], columns["followers_count"])
    print("Key: %s, language: %s, followers: %s" % report)
import pycassa
cassandra = pycassa.connect("twitter")
tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

# Every row of the "tweets" super column family maps tweet IDs to a map of
# per-tweet data. Only the number of tweets matters here, so the size of
# the returned mapping is all that needs to be computed for each row.
for (author, tweets_by_id) in tweets_cf.get_range('!', '~'):
    tweet_total = len(tweets_by_id)
    print("Authors: %s, tweets written: %d" % (author, tweet_total))
from twitter.api import Twitter
import pycassa
from pycassa.index import *
cassandra = pycassa.connect("twitter")
authors_cf = pycassa.ColumnFamily(cassandra, "authors")
tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")
twitter = Twitter()

# Walk the IDs that the Twitter API returns for followers of "spyced" and
# run an index search on the 'id_str' column for each one. Pycassa hands
# back a generator per query, so the total is advanced once for every row
# the inner loop actually produces.
count = 0
for follower_id in twitter.followers.ids(id="spyced"):
    follower_clause = create_index_clause(
        [create_index_expression('id_str', str(follower_id))])
    matches = authors_cf.get_indexed_slices(follower_clause)
    for (matched_key, matched_props) in matches:
        print(matched_key)
        count += 1

# Total number of matching author rows found across all follower IDs.
print(count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment