Created
March 29, 2011 05:55
-
-
Save heuristicfencepost/891869 to your computer and use it in GitHub Desktop.
Python scripts used in an initial analysis of the Cassandra data model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from twitter.api import Twitter | |
import pycassa | |
from itertools import ifilterfalse | |
# Search query used when looking for tweets to load.
searchquery = "#cassandra"
# Adapted from the recipes section of the itertools documentation.
def unique_everseen(iterable, key=None):
    """Yield the unique elements of ``iterable``, preserving first-seen order.

    Every element (or its key) seen so far is remembered, so a repeat is
    suppressed no matter how far it is from the first occurrence.

    Args:
        iterable: Any iterable whose elements (or whose keys, when ``key``
            is given) are hashable.
        key: Optional one-argument callable.  When supplied, uniqueness is
            decided on ``key(element)`` but the original element is yielded.

    Yields:
        Each element whose identity value has not been seen before.

    Example:
        >>> list(unique_everseen('AAAABBBCCDAABBB'))
        ['A', 'B', 'C', 'D']
        >>> list(unique_everseen('ABBCcAD', str.lower))
        ['A', 'B', 'C', 'D']
    """
    seen = set()
    for element in iterable:
        # With no key function the element is its own identity value.
        marker = element if key is None else key(element)
        if marker not in seen:
            seen.add(marker)
            yield element
def populate():
    """Load tweets matching ``searchquery``, and their authors, into Cassandra.

    Up to 100 search results are requested from the Twitter search API.
    Each tweet is inserted into the "tweets" column family under the row
    key of its author (``from_user``), keyed by the tweet's ``id_str``.
    Each distinct author is then looked up through the Twitter users API
    and stored in the "authors" column family under their screen name.

    Progress is reported on standard output.  An empty result set is
    handled gracefully: nothing is inserted and both counts are zero.

    NOTE: this is Python 2 code (it relies on ``unicode`` and
    ``dict.iteritems``).
    """
    cassandra = pycassa.connect("twitter")
    authors_cf = pycassa.ColumnFamily(cassandra, "authors")
    tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

    # The Twitter API returns Unicode values for all string results.  In
    # addition Pycassa appears to complain when we give it a string encoded
    # in something other than UTF-8, or a non-string value.  To get around
    # this we perform an intelligent string conversion: a Unicode value is
    # returned as its UTF-8 encoding, anything else as its standard string
    # representation.
    def smart_str(val):
        if isinstance(val, unicode):
            return val.encode('utf-8')
        return str(val)

    # In Cassandra the names and values of a column are apparently normally
    # strings, so convert every value of a result record before inserting.
    def stringify_values(record):
        return dict((k, smart_str(v)) for (k, v) in record.iteritems())

    search = Twitter(domain="search.twitter.com")
    twitter = Twitter()

    search_results = search.search(q=searchquery, rpp=100)
    tweets = search_results["results"]

    # Show a sample tweet; skipped when the search returned nothing, which
    # previously raised IndexError.
    if tweets:
        print(tweets[0])

    for tweet in tweets:
        tweets_cf.insert(tweet["from_user"],
                         {tweet["id_str"]: stringify_values(tweet)})
    print("Found %d tweets" % len(tweets))

    author_names = list(unique_everseen(t["from_user"] for t in tweets))
    print("Found %d distinct authors" % len(author_names))

    # One users.show request per distinct author, stored as it arrives.
    for name in author_names:
        author_info = twitter.users.show(id=name)
        author_name = author_info['screen_name']
        authors_cf.insert(author_name, stringify_values(author_info))
        print("Added data for author %s" % author_name)
# Run the load only when executed as a script, not when imported.
if __name__ == "__main__":
    populate()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycassa

# Print the language and follower count stored for every author row.
cassandra = pycassa.connect("twitter")
authors_cf = pycassa.ColumnFamily(cassandra, "authors")

# Only the two columns we report on are requested for each row.
# NOTE(review): '!' and '~' presumably bracket the printable-ASCII key
# space so that every author key is covered -- confirm this holds for the
# partitioner in use.
for (key, values) in authors_cf.get_range('!', '~', ["lang", "followers_count"]):
    print("Key: %s, language: %s, followers: %s"
          % (key, values["lang"], values["followers_count"]))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycassa

# Print how many tweets are stored for each author.
cassandra = pycassa.connect("twitter")
tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

# Key in return value from the "tweets" super column family is the
# tweet ID, value is a map of per-tweet data.  We're only interested
# in the number of tweets, so we only need the size of the returned hash.
# NOTE(review): the count is the number of entries get_range returns for
# the row; if the client caps columns per row by default, prolific authors
# would be under-counted -- confirm against the pycassa get_range docs.
for (author, tweets_by_id) in tweets_cf.get_range('!', '~'):
    print("Authors: %s, tweets written: %d" % (author, len(tweets_by_id)))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from twitter.api import Twitter
import pycassa
# Import only the helpers actually used instead of a wildcard import.
from pycassa.index import create_index_clause, create_index_expression

# Count how many followers of the "spyced" account are also stored in the
# "authors" column family, printing the row key of each one found.
cassandra = pycassa.connect("twitter")
authors_cf = pycassa.ColumnFamily(cassandra, "authors")
tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")
twitter = Twitter()

# Iterate through the set of IDs returned by the Twitter API and execute
# an index search against each ID.  The Pycassa API returns a generator
# for each query, so we iterate over it with a for loop and increment the
# total count once per matching row.
count = 0
for authorid in twitter.followers.ids(id="spyced"):
    author_expr = create_index_expression('id_str', str(authorid))
    author_clause = create_index_clause([author_expr])
    for (authorkey, _) in authors_cf.get_indexed_slices(author_clause):
        print(authorkey)
        count += 1

# Total number of followers that were found among the stored authors.
print(count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment