heuristicfencepost · March 29, 2011 05:55
diff --git a/PopulateCassandra.py b/PopulateCassandra.py
 from twitter.api import Twitter
 import pycassa

 from itertools import ifilterfalse

 # Search query sent to the Twitter search API when looking for tweets
 # (read by populate()).
 searchquery = "#cassandra"

 # Borrowed from the itertools docs
 def unique_everseen(iterable, key=None):
    """Yield each distinct element of ``iterable`` once, in first-seen order.

    When ``key`` is given, two elements count as duplicates if ``key``
    returns equal values for them; otherwise the elements themselves are
    compared.  Every value encountered is remembered in a set for as long
    as the generator lives, so elements (or their keys) must be hashable.
    """
    observed = set()
    for item in iterable:
        # The value used for duplicate detection: the item or its key.
        marker = item if key is None else key(item)
        if marker in observed:
            continue
        observed.add(marker)
        yield item

 def populate():
    """Load tweets matching ``searchquery``, and their authors, into Cassandra.

    Runs one Twitter search (``rpp=100``) and inserts every returned tweet
    into the "tweets" column family, using the author name as the row key
    and the tweet's ``id_str`` as the name of the entry holding that tweet's
    fields.  Each distinct author is then looked up through the Twitter API
    and stored in the "authors" column family under the author's screen
    name.  Progress is printed to stdout.

    An empty search result is not an error: nothing is inserted and zero
    counts are printed.
    """

    cassandra = pycassa.connect("twitter")
    authors_cf = pycassa.ColumnFamily(cassandra, "authors")
    tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

    # The Twitter API returns Unicode vals for all string results.  In addition
    # Pycassa appears to complain when we give it a string encoded in something
    # other than UTF-8 or a non-string value.  To get around this we perform
    # an intelligent string conversion; if we get a Unicode type return the
    # UTF-8 encoding of that string, otherwise return the standard string
    # representation.
    def smart_str(val):
        if isinstance(val, unicode):
            return val.encode('utf-8')
        return str(val)

    # Convert every value of a Twitter result dict with smart_str; the keys
    # are passed through unchanged.
    def stringify(record):
        return dict((k, smart_str(v)) for (k, v) in record.iteritems())

    search = Twitter(domain="search.twitter.com")
    twitter = Twitter()

    search_results = search.search(q=searchquery, rpp=100)
    tweets = search_results["results"]

    # Show one sample tweet.  Guarded because indexing an empty result list
    # would raise IndexError before the summary lines are printed.
    if tweets:
        print(tweets[0])
    for tweet in tweets:
        tweets_cf.insert(tweet["from_user"], {tweet["id_str"]: stringify(tweet)})

    print("Found %d tweets" % len(tweets))
    author_names = list(unique_everseen([t["from_user"] for t in tweets]))
    print("Found %d distinct authors" % len(author_names))

    # One Twitter API lookup per distinct author; the profile is stored
    # with all of its values converted to strings.
    for name in author_names:
        author_info = twitter.users.show(id=name)
        author_name = author_info['screen_name']
        authors_cf.insert(author_name, stringify(author_info))
        print("Added data for author %s" % author_name)

 # Populate Cassandra only when this file is executed as a script.
 if __name__ == "__main__":
    populate()
diff --git a/Query1.py b/Query1.py
 import pycassa

 # Open the "authors" column family through the "twitter" connection.
 cassandra = pycassa.connect("twitter")
 authors_cf = pycassa.ColumnFamily(cassandra, "authors")

 # Range over the rows between '!' and '~', asking only for the two columns
 # that are printed for each author.
 wanted = ["lang", "followers_count"]
 for (row_key, columns) in authors_cf.get_range('!', '~', wanted):
    language = columns["lang"]
    followers = columns["followers_count"]
    print("Key: %s, language: %s, followers: %s" % (row_key, language, followers))
diff --git a/Query2.py b/Query2.py
 import pycassa

 cassandra = pycassa.connect("twitter")
 tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

 # Each row returned from the "tweets" super column family maps tweet IDs
 # to per-tweet data.  Only the number of tweets matters here, so the size
 # of that mapping is all that gets computed.
 for (author, tweets_by_id) in tweets_cf.get_range('!', '~'):
    tweet_count = len(tweets_by_id)
    print("Authors: %s, tweets written: %d" % (author, tweet_count))
diff --git a/Query3.py b/Query3.py
 from twitter.api import Twitter
 import pycassa
 from pycassa.index import *

 cassandra = pycassa.connect("twitter")
 authors_cf = pycassa.ColumnFamily(cassandra, "authors")
 tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

 twitter = Twitter()

 # For every follower ID reported by the Twitter API, run an index query on
 # the "id_str" column of the authors column family.  Each query yields its
 # matches lazily, so the inner loop both prints the matching row keys and
 # tallies how many rows were found in total.
 count = 0
 follower_ids = twitter.followers.ids(id="spyced")
 for authorid in follower_ids:
    id_matches = create_index_expression('id_str', str(authorid))
    author_clause = create_index_clause([id_matches])
    matching_rows = authors_cf.get_indexed_slices(author_clause)
    for (authorkey, authorprops) in matching_rows:
        print(authorkey)
        count += 1
 print(count)
	from twitter.api import Twitter
	import pycassa

	from itertools import ifilterfalse

	# Search query sent to the Twitter search API when looking for tweets.
	searchquery = "#cassandra"

	# Borrowed from the itertools docs
	def unique_everseen(iterable, key=None):
	    """List unique elements, preserving order. Remember all elements ever seen.

	    When ``key`` is given, uniqueness is decided by ``key(element)``
	    instead of by the element itself.  Seen values are kept in a set, so
	    elements (or their keys) must be hashable.
	    """
	    seen = set()
	    seen_add = seen.add
	    if key is None:
	        # Skip anything already in ``seen``; the set is updated before the
	        # next element is pulled, so later duplicates are filtered out.
	        for element in ifilterfalse(seen.__contains__, iterable):
	            seen_add(element)
	            yield element
	    else:
	        for element in iterable:
	            k = key(element)
	            if k not in seen:
	                seen_add(k)
	                yield element

	def populate():
	    """Load tweets matching ``searchquery``, and their authors, into Cassandra.

	    Runs one Twitter search (``rpp=100``) and inserts every returned tweet
	    into the "tweets" column family, using the author name as the row key
	    and the tweet's ``id_str`` as the name of the entry holding that
	    tweet's fields.  Each distinct author is then looked up through the
	    Twitter API and stored in the "authors" column family under the
	    author's screen name.  Progress is printed to stdout.

	    An empty search result is not an error: nothing is inserted and zero
	    counts are printed.
	    """

	    cassandra = pycassa.connect("twitter")
	    authors_cf = pycassa.ColumnFamily(cassandra, "authors")
	    tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

	    # The Twitter API returns Unicode vals for all string results. In addition
	    # Pycassa appears to complain when we give it a string encoded in something
	    # other than UTF-8 or a non-string value. To get around this we perform
	    # an intelligent string conversion; if we get a Unicode type return the
	    # UTF-8 encoding of that string, otherwise return the standard string
	    # representation.
	    def smart_str(val):
	        if isinstance(val, unicode):
	            return val.encode('utf-8')
	        else:
	            return str(val)

	    search = Twitter(domain="search.twitter.com")
	    twitter = Twitter()

	    search_results = search.search(q=searchquery, rpp=100)
	    tweets = search_results["results"]

	    # Show one sample tweet.  Guarded because indexing an empty result
	    # list would raise IndexError before the summary lines are printed.
	    if tweets:
	        print(tweets[0])
	    for tweet in tweets:
	        tweet_str = dict((k, smart_str(v)) for (k, v) in tweet.iteritems())
	        tweets_cf.insert(tweet["from_user"], {tweet["id_str"]: tweet_str})

	    print("Found %d tweets" % len(tweets))
	    author_names = list(unique_everseen([t["from_user"] for t in tweets]))
	    print("Found %d distinct authors" % len(author_names))

	    # Convert everything into strings; in Cassandra name and values of a column
	    # are apparently normally converted into strings
	    for author_info in (twitter.users.show(id=name) for name in author_names):

	        author_name = author_info['screen_name']
	        author_info_str = dict((k, smart_str(v)) for (k, v) in author_info.iteritems())
	        authors_cf.insert(author_name, author_info_str)
	        print("Added data for author %s" % author_name)

	# Populate Cassandra only when this file is executed as a script.
	if __name__ == "__main__":
	    populate()
	from twitter.api import Twitter
	import pycassa
	from pycassa.index import *

	cassandra = pycassa.connect("twitter")
	authors_cf = pycassa.ColumnFamily(cassandra, "authors")
	tweets_cf = pycassa.ColumnFamily(cassandra, "tweets")

	twitter = Twitter()

	# Iterate through the set of IDs returned by the Twitter API and execute
	# an index search against each ID. The Pycassa API will return a generator
	# for each query so we make use of the for expression to determine when
	# we should increment the total count.
	count = 0
	for authorid in twitter.followers.ids(id="spyced"):
	    author_expr = create_index_expression('id_str', str(authorid))
	    author_clause = create_index_clause([author_expr])
	    for (authorkey, authorprops) in authors_cf.get_indexed_slices(author_clause):
	        print(authorkey)
	        count += 1
	# Printed once, after every follower ID has been checked.
	print(count)