Skip to content

Instantly share code, notes, and snippets.

@emaadmanzoor
Last active August 29, 2015 14:18
Show Gist options
  • Save emaadmanzoor/55f2b1c72764a2ba9bfd to your computer and use it in GitHub Desktop.
Save emaadmanzoor/55f2b1c72764a2ba9bfd to your computer and use it in GitHub Desktop.
Quantifying Monotony Aversion
#!/usr/bin/env python
# Largest cluster size for which a summary row is printed by the script body.
MAX_CLUSTER_SIZE = 5
# NOTE(review): not referenced anywhere in the visible part of this script;
# presumably the largest within-cluster position meant to be reported -- confirm.
MAX_POSITION = 4
from collections import defaultdict
import cPickle as pickle
from itertools import groupby
def update_cluster_stats(user, followees, tweets,
                         cluster_reactions,
                         position_counts, position_reactions):
    """Accumulate cluster and position reaction statistics for one user.

    The user's timeline is rebuilt from the tweets written by their
    followees, sorted in descending order of field 1 (presumably the
    timestamp -- confirm against the data). A *cluster* is a maximal run of
    consecutive timeline tweets that share the same author (field 0).

    Tweets are indexable records. The fields read here are:
    0 (author), 1 (sort key), 4 (tweet id), 5 (id of the retweeted tweet,
    only used when > 0) and 6 (id of the replied-to tweet, only used when
    > 0).

    Args:
        user: Author id of the user whose timeline is analysed.
        followees: Container of author ids the user follows.
        tweets: Iterable of tweet records for all users; iterated twice, so
            it must be re-iterable (e.g. a list).
        cluster_reactions: Mapping from cluster size to a list holding, per
            cluster seen, the number of tweets in it the user reacted to.
            Must create an empty list for a missing key, e.g.
            ``defaultdict(list)``.
        position_counts: Mapping from ``(position, cluster_size)`` to the
            number of tweets seen at that zero-based position. Must default
            missing keys to 0, e.g. ``defaultdict(int)``.
        position_reactions: Same keys as ``position_counts``; counts only
            the tweets the user reacted to. Must default missing keys to 0.

    Returns:
        The tuple ``(cluster_reactions, position_counts,
        position_reactions)``. These are the same objects that were passed
        in, updated in place.
    """
    # Reconstruct the user's timeline from the followees' tweets.
    timeline = sorted((tweet for tweet in tweets if tweet[0] in followees),
                      key=lambda tweet: tweet[1], reverse=True)

    # Nothing to count; skip the scan for reactions altogether.
    if not timeline:
        return cluster_reactions, position_counts, position_reactions

    # Ids of all tweets the user reacted to (retweets and replies).
    reactions = set()
    for tweet in tweets:
        if tweet[0] != user:
            continue
        if tweet[5] > 0:  # retweet
            reactions.add(tweet[5])
        if tweet[6] > 0:  # reply
            reactions.add(tweet[6])

    # groupby splits the timeline into runs of consecutive tweets by the
    # same author, which is exactly the definition of a cluster.
    for _, cluster in groupby(timeline, key=lambda tweet: tweet[0]):
        reacted_flags = [1 if tweet[4] in reactions else 0
                         for tweet in cluster]
        cluster_size = len(reacted_flags)

        cluster_reactions[cluster_size].append(sum(reacted_flags))
        for pos, is_reaction in enumerate(reacted_flags):
            position_counts[(pos, cluster_size)] += 1
            if is_reaction:
                position_reactions[(pos, cluster_size)] += 1

    return cluster_reactions, position_counts, position_reactions
if __name__ == "__main__":
    # Script entry point: load the pickled tweets and follow links, gather
    # cluster statistics for every user, then print one tab-separated row per
    # cluster size from 1 to MAX_CLUSTER_SIZE.
    #
    # NOTE: unpickling can execute arbitrary code -- only run this against
    # pickle files obtained from a trusted source.
    with open('all_tweets.p', 'rb') as tweets_file:
        tweets = sorted(pickle.load(tweets_file))
    with open('all_links.p', 'rb') as links_file:
        # Sorted by field 1 because groupby below only merges adjacent items.
        links = sorted(pickle.load(links_file), key=lambda x: x[1])

    cluster_reactions = defaultdict(list)
    position_counts = defaultdict(int)
    position_reactions = defaultdict(int)

    # Each group holds the links sharing field 1 (used as the user); field 0
    # of every link in the group is collected as one of that user's followees.
    for user, followee_links in groupby(links, key=lambda x: x[1]):
        followees = set(link[0] for link in followee_links)
        cluster_reactions, position_counts, position_reactions = \
            update_cluster_stats(user, followees, tweets,
                                 cluster_reactions,
                                 position_counts, position_reactions)

    for cluster_size in range(1, MAX_CLUSTER_SIZE + 1):
        # Statistics for clusters of this size.
        reaction_counts = cluster_reactions[cluster_size]
        number_of_clusters = len(reaction_counts)
        number_of_nonempty_clusters = sum(1 for count in reaction_counts
                                          if count > 0)
        number_of_tweets = cluster_size * number_of_clusters
        number_of_reactions = sum(reaction_counts)
        # A single parenthesised argument prints identically under the
        # Python 2 print statement.
        print('%d\t%d\t%d\t%d\t%d' % (cluster_size, number_of_reactions,
                                      number_of_tweets, number_of_clusters,
                                      number_of_nonempty_clusters))

See the project website for more details.

Please report any issues to [email protected].

Execution

Running this requires having the following files in the same directory as calculate_cluster_statistics.py:

  • all_links.p
  • all_tweets.p

Run it with a Python 2 interpreter (the script imports `cPickle`, which exists only in Python 2): python calculate_cluster_statistics.py

Output:

1	15895	8437262	8437262	15895
2	2756	1818212	909106	2582
3	715	586551	195517	623
4	301	243848	60962	249
5	126	126250	25250	101

The output is tab-separated, with the columns as follows:

cluster_size | number_of_reactions | number_of_tweets | number_of_clusters | number_of_nonempty_clusters

A non-empty cluster is one in which at least one tweet has been reacted to.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment