emaadmanzoor · August 29, 2015 14:18
diff --git a/calculate_cluster_statistics.py b/calculate_cluster_statistics.py
 #!/usr/bin/env python

 MAX_CLUSTER_SIZE = 5
 MAX_POSITION = 4

 from collections import defaultdict
 import cPickle as pickle
 from itertools import groupby

 def update_cluster_stats(user, followees, tweets,
                          cluster_reactions,
                          position_counts, position_reactions):
    """Fold one user's timeline into the running cluster statistics.

    The timeline consists of every tweet whose author (field 0) is in
    ``followees``, ordered by field 1 in descending order (presumably a
    timestamp, i.e. newest first -- confirm against the data source).
    A maximal run of consecutive timeline tweets by the same author is
    a "cluster".  For each cluster of size ``s``:

    * the number of its tweets that ``user`` reacted to is appended to
      ``cluster_reactions[s]``;
    * ``position_counts[(pos, s)]`` is incremented for every 0-based
      position ``pos`` inside the cluster;
    * ``position_reactions[(pos, s)]`` is incremented when the tweet at
      that position was reacted to.

    A tweet counts as "reacted to" when its id (field 4) equals a
    positive field 5 (retweet) or field 6 (reply) of a tweet authored
    by ``user``.

    Args:
        user: author value identifying the user whose timeline is built.
        followees: container of author values the user follows.
        tweets: iterable of indexable tweet records (fields 0-6 used).
        cluster_reactions: mapping size -> list of per-cluster reaction
            counts; must create missing keys (e.g. ``defaultdict(list)``).
        position_counts: mapping (pos, size) -> count
            (e.g. ``defaultdict(int)``).
        position_reactions: mapping (pos, size) -> reaction count
            (e.g. ``defaultdict(int)``).

    Returns:
        The tuple ``(cluster_reactions, position_counts,
        position_reactions)``; the same objects that were passed in,
        updated in place.
    """
    # Timeline: followees' tweets, largest field-1 value first.
    feed = [entry for entry in tweets if entry[0] in followees]
    feed.sort(key=lambda entry: entry[1], reverse=True)

    # Nothing to cluster: hand the accumulators back untouched.
    if not feed:
        return cluster_reactions, position_counts, position_reactions

    # Ids of every tweet the user retweeted (field 5) or replied to
    # (field 6); non-positive values mean "no such reaction".
    reacted_ids = set()
    for entry in tweets:
        if entry[0] == user:
            if entry[5] > 0:
                reacted_ids.add(entry[5])
            if entry[6] > 0:
                reacted_ids.add(entry[6])

    # Each groupby run is one cluster: consecutive tweets by one author.
    for _, run in groupby(feed, key=lambda entry: entry[0]):
        flags = [1 if entry[4] in reacted_ids else 0 for entry in run]
        size = len(flags)

        cluster_reactions[size].append(sum(flags))

        for pos, reacted in enumerate(flags):
            slot = (pos, size)
            position_counts[slot] += 1
            if reacted:
                position_reactions[slot] += 1

    return cluster_reactions, position_counts, position_reactions

 if __name__ == "__main__":
    # Load the pickled inputs; the with-blocks close each file as soon
    # as it has been read instead of leaking the handle.
    # NOTE: pickle.load can execute arbitrary code -- only run this on
    # pickle files from a trusted source.
    with open('all_tweets.p', 'rb') as tweets_file:
        tweets = sorted(pickle.load(tweets_file))
    with open('all_links.p', 'rb') as links_file:
        # groupby below only merges adjacent items, so the links must be
        # sorted by the same key (field 1) that they are grouped on.
        links = sorted(pickle.load(links_file), key=lambda x: x[1])

    # Accumulators shared across all users.
    cluster_reactions = defaultdict(list)
    position_counts = defaultdict(int)
    position_reactions = defaultdict(int)
    for user, followee_links in groupby(links, key=lambda x: x[1]):
        # Field 0 of every link in this user's group is a followee.
        followees = set(link[0] for link in followee_links)

        cluster_reactions, position_counts, position_reactions = \
            update_cluster_stats(user, followees, tweets,
                                 cluster_reactions,
                                 position_counts, position_reactions)

    # One tab-separated row per cluster size:
    # size, reactions, tweets, clusters, clusters with >= 1 reaction.
    for cluster_size in range(1, MAX_CLUSTER_SIZE + 1):
        # statistics for clusters of this size
        reaction_counts = cluster_reactions[cluster_size]
        number_of_clusters = len(reaction_counts)
        number_of_nonempty_clusters = sum(1 for count in reaction_counts
                                          if count > 0)
        number_of_tweets = cluster_size * number_of_clusters
        number_of_reactions = sum(reaction_counts)

        # A parenthesised single argument prints the same text as the
        # original print statement.
        print('%d\t%d\t%d\t%d\t%d' % (cluster_size, number_of_reactions,
                                      number_of_tweets, number_of_clusters,
                                      number_of_nonempty_clusters))
diff --git a/QuantifyingMonotonyAversion.md b/QuantifyingMonotonyAversion.md
	#!/usr/bin/env python

	MAX_CLUSTER_SIZE = 5
	MAX_POSITION = 4

	from collections import defaultdict
	import cPickle as pickle
	from itertools import groupby

	def update_cluster_stats(user, followees, tweets,
	                         cluster_reactions,
	                         position_counts, position_reactions):
	    """Fold one user's timeline into the running cluster statistics.

	    The timeline consists of every tweet whose author (field 0) is in
	    ``followees``, ordered by field 1 in descending order (presumably a
	    timestamp, i.e. newest first -- confirm against the data source).
	    A maximal run of consecutive timeline tweets by the same author is
	    a "cluster".  For each cluster of size ``s``:

	    * the number of its tweets that ``user`` reacted to is appended to
	      ``cluster_reactions[s]``;
	    * ``position_counts[(pos, s)]`` is incremented for every 0-based
	      position ``pos`` inside the cluster;
	    * ``position_reactions[(pos, s)]`` is incremented when the tweet at
	      that position was reacted to.

	    A tweet counts as "reacted to" when its id (field 4) equals a
	    positive field 5 (retweet) or field 6 (reply) of a tweet authored
	    by ``user``.

	    Args:
	        user: author value identifying the user whose timeline is built.
	        followees: container of author values the user follows.
	        tweets: iterable of indexable tweet records (fields 0-6 used).
	        cluster_reactions: mapping size -> list of per-cluster reaction
	            counts; must create missing keys (e.g. ``defaultdict(list)``).
	        position_counts: mapping (pos, size) -> count
	            (e.g. ``defaultdict(int)``).
	        position_reactions: mapping (pos, size) -> reaction count
	            (e.g. ``defaultdict(int)``).

	    Returns:
	        The tuple ``(cluster_reactions, position_counts,
	        position_reactions)``; the same objects that were passed in,
	        updated in place.
	    """
	    # Timeline: followees' tweets, largest field-1 value first.
	    feed = [entry for entry in tweets if entry[0] in followees]
	    feed.sort(key=lambda entry: entry[1], reverse=True)

	    # Nothing to cluster: hand the accumulators back untouched.
	    if not feed:
	        return cluster_reactions, position_counts, position_reactions

	    # Ids of every tweet the user retweeted (field 5) or replied to
	    # (field 6); non-positive values mean "no such reaction".
	    reacted_ids = set()
	    for entry in tweets:
	        if entry[0] == user:
	            if entry[5] > 0:
	                reacted_ids.add(entry[5])
	            if entry[6] > 0:
	                reacted_ids.add(entry[6])

	    # Each groupby run is one cluster: consecutive tweets by one author.
	    for _, run in groupby(feed, key=lambda entry: entry[0]):
	        flags = [1 if entry[4] in reacted_ids else 0 for entry in run]
	        size = len(flags)

	        cluster_reactions[size].append(sum(flags))

	        for pos, reacted in enumerate(flags):
	            slot = (pos, size)
	            position_counts[slot] += 1
	            if reacted:
	                position_reactions[slot] += 1

	    return cluster_reactions, position_counts, position_reactions

	if __name__ == "__main__":
	    # Load the pickled inputs; the with-blocks close each file as soon
	    # as it has been read instead of leaking the handle.
	    # NOTE: pickle.load can execute arbitrary code -- only run this on
	    # pickle files from a trusted source.
	    with open('all_tweets.p', 'rb') as tweets_file:
	        tweets = sorted(pickle.load(tweets_file))
	    with open('all_links.p', 'rb') as links_file:
	        # groupby below only merges adjacent items, so the links must be
	        # sorted by the same key (field 1) that they are grouped on.
	        links = sorted(pickle.load(links_file), key=lambda x: x[1])

	    # Accumulators shared across all users.
	    cluster_reactions = defaultdict(list)
	    position_counts = defaultdict(int)
	    position_reactions = defaultdict(int)
	    for user, followee_links in groupby(links, key=lambda x: x[1]):
	        # Field 0 of every link in this user's group is a followee.
	        followees = set(link[0] for link in followee_links)

	        cluster_reactions, position_counts, position_reactions = \
	            update_cluster_stats(user, followees, tweets,
	                                 cluster_reactions,
	                                 position_counts, position_reactions)

	    # One tab-separated row per cluster size:
	    # size, reactions, tweets, clusters, clusters with >= 1 reaction.
	    for cluster_size in range(1, MAX_CLUSTER_SIZE + 1):
	        # statistics for clusters of this size
	        reaction_counts = cluster_reactions[cluster_size]
	        number_of_clusters = len(reaction_counts)
	        number_of_nonempty_clusters = sum(1 for count in reaction_counts
	                                          if count > 0)
	        number_of_tweets = cluster_size * number_of_clusters
	        number_of_reactions = sum(reaction_counts)

	        # A parenthesised single argument prints the same text as the
	        # original print statement.
	        print('%d\t%d\t%d\t%d\t%d' % (cluster_size, number_of_reactions,
	                                      number_of_tweets, number_of_clusters,
	                                      number_of_nonempty_clusters))