|
#!/usr/bin/env python |
|
|
|
# Largest cluster size (number of consecutive timeline tweets by the same
# author) for which the script's report loop prints a statistics row.
MAX_CLUSTER_SIZE = 5

# NOTE(review): not referenced anywhere in the visible code; presumably an
# upper bound on the within-cluster position to analyse -- confirm or remove.
MAX_POSITION = 4
|
|
|
from collections import defaultdict |
|
import cPickle as pickle |
|
from itertools import groupby |
|
|
|
def update_cluster_stats(user, followees, tweets,
                         cluster_reactions,
                         position_counts, position_reactions):
    """Accumulate reaction statistics for the clusters in one user's timeline.

    A "cluster" is a maximal run of consecutive timeline tweets written by
    the same author.  The timeline consists of every tweet whose author is
    in ``followees``, ordered by ``tweet[1]`` descending (presumably a
    timestamp, i.e. newest first -- confirm against the data producer).

    Tweets are indexable records; the fields read here are:
        tweet[0] -- author
        tweet[1] -- sort key of the timeline
        tweet[4] -- id of the tweet
        tweet[5] -- id of the retweeted tweet (only used when > 0)
        tweet[6] -- id of the replied-to tweet (only used when > 0)

    Args:
        user: the user whose timeline is reconstructed.
        followees: container of authors the user follows.
        tweets: iterable of all tweet records (traversed more than once,
            so it must not be a one-shot iterator).
        cluster_reactions: mapping ``cluster size -> list`` that receives
            one entry per cluster: the number of tweets in that cluster
            the user reacted to.
        position_counts: mapping ``(position, cluster size) -> int``
            counting how many tweets were seen at that position.
        position_reactions: mapping ``(position, cluster size) -> int``
            counting how many of those tweets the user reacted to.

    Returns:
        The tuple ``(cluster_reactions, position_counts,
        position_reactions)``; the three mappings are updated in place and
        returned for convenience.
    """
    # reconstruct user's timeline
    timeline = sorted([tweet for tweet in tweets if tweet[0] in followees],
                      reverse=True, key=lambda x: x[1])

    # empty timeline: nothing to record (and no need to scan for reactions)
    if not timeline:
        return cluster_reactions, position_counts, position_reactions

    # ids of all tweets reacted to by user (retweets and replies)
    reactions = set()
    for tweet in tweets:
        if tweet[0] != user:
            continue
        if tweet[5] > 0:  # retweet
            reactions.add(tweet[5])
        if tweet[6] > 0:  # reply
            reactions.add(tweet[6])

    # groupby only merges *adjacent* items with equal keys, which is exactly
    # the definition of a cluster on the ordered timeline.
    for _, cluster in groupby(timeline, key=lambda x: x[0]):
        reaction_flags = [1 if tweet[4] in reactions else 0
                          for tweet in cluster]
        cluster_size = len(reaction_flags)

        # setdefault/get keep this working for plain dicts as well as the
        # defaultdicts used by the script.
        cluster_reactions.setdefault(cluster_size, []).append(
            sum(reaction_flags))

        for pos, is_reaction in enumerate(reaction_flags):
            key = (pos, cluster_size)
            position_counts[key] = position_counts.get(key, 0) + 1
            if is_reaction:
                position_reactions[key] = position_reactions.get(key, 0) + 1

    return cluster_reactions, position_counts, position_reactions
|
|
|
if __name__ == "__main__":
    # Script entry point: load all tweets and follow links, accumulate the
    # cluster statistics over every user's timeline, then print one
    # tab-separated row per cluster size.
    #
    # NOTE(review): unpickling can execute arbitrary code; only run this
    # against dump files from a trusted source.
    with open('all_tweets.p', 'rb') as tweets_file:
        tweets = sorted(pickle.load(tweets_file))

    # groupby below only merges adjacent items, so the links must be sorted
    # by the same key (link[1], the following user) that they are grouped by.
    with open('all_links.p', 'rb') as links_file:
        links = sorted(pickle.load(links_file), key=lambda x: x[1])

    cluster_reactions = defaultdict(list)
    position_counts = defaultdict(int)
    position_reactions = defaultdict(int)
    for user, followee_links in groupby(links, key=lambda x: x[1]):
        # link[0] is the account followed by `user`
        followees = set(link[0] for link in followee_links)

        cluster_reactions, position_counts, position_reactions = \
            update_cluster_stats(user, followees, tweets,
                                 cluster_reactions,
                                 position_counts, position_reactions)

    for cluster_size in range(1, MAX_CLUSTER_SIZE + 1):
        # statistics for clusters of this size
        reaction_counts = cluster_reactions[cluster_size]
        number_of_clusters = len(reaction_counts)
        number_of_nonempty_clusters = sum(1 for count in reaction_counts
                                          if count > 0)
        number_of_tweets = cluster_size * number_of_clusters
        number_of_reactions = sum(reaction_counts)

        # Columns: cluster size, reactions, tweets, clusters, clusters with
        # at least one reaction.  A single parenthesised argument prints
        # identically under Python 2 and Python 3.
        print('%d\t%d\t%d\t%d\t%d' % (cluster_size, number_of_reactions,
                                      number_of_tweets, number_of_clusters,
                                      number_of_nonempty_clusters))