|
#!/usr/bin/env python |
|
|
|
import cPickle as pickle |
|
from itertools import groupby |
|
from multiprocessing import Pool |
|
|
|
# Gap (in hours) between two consecutive follower tweets that is counted
# as the start of a new "login" session (see §2.2 reference below).
LOGIN_THRESHOLD = 8 # hours
|
|
|
def calculate_ap(producer, producer_tweets,
                 follower, tweets, links, login_threshold=8):
    """Estimate the attention potential (ap) of `producer`'s tweets for
    one `follower`.

    Tweets are tuples indexed as (user_id, timestamp_ms, reply_target,
    retweet_target) -- assumed from the indexing below; confirm against
    the pickled data.  Links are (source, target) tuples.

    `login_threshold` is the session gap in hours (default 8, matching
    the module-level LOGIN_THRESHOLD).

    Returns a 5-tuple:
        (num_reactions_to_producer, ap, gamma, delta, rho)
    with all-zero placeholders when the follower has no tweets or when
    gamma/delta are zero.
    """
    # all the follower's tweets (tweets are assumed sorted, so these are
    # in timestamp order)
    follower_tweets = [t for t in tweets if t[0] == follower]
    if not follower_tweets:
        return 0, 0.0, 0, 0, 0

    # all the competitors' tweets (accounts the follower also follows)
    competitors = set(link[1] for link in links if link[0] == follower)
    competitor_tweets = [t for t in tweets if t[0] in competitors]

    # gamma: prior prob of a follower reaction (user-defined weight) --
    # a "reaction" is a tweet with a nonzero reply or retweet target
    follower_reactions = [t for t in follower_tweets
                          if t[2] > 0 or t[3] > 0]
    gamma = len(follower_reactions) / float(len(follower_tweets))

    # delta: prob of a producer tweet getting a follower reaction --
    # the usual measure of (directed) tie strength
    follower_reactions_to_producer = [t for t in follower_reactions
                                      if t[2] == producer or t[3] == producer]
    delta = len(follower_reactions_to_producer) / float(len(producer_tweets))

    if gamma == 0 or delta == 0:
        return len(follower_reactions_to_producer), 0.0, 0, 0, 0

    # rho: prob of leaving the timeline after reading a post.
    # rho parameterizes a geometric distribution, estimated from the
    # average number of posts consumed per login; a "login" is a follower
    # tweet occurring after a gap of `login_threshold` hours (§2.2).
    #
    # num_logins starts at 1: the first tweet is itself a session.  The
    # original started at 0 and raised ZeroDivisionError whenever all of
    # the follower's tweets fell inside a single 8-hour window.
    num_logins = 1
    prev = follower_tweets[0][1] // 1000  # ms -> s
    for ft in follower_tweets[1:]:
        ts = ft[1] // 1000
        if (ts - prev) / 3600. >= login_threshold:  # hours
            num_logins += 1
        prev = ts

    mu = len(follower_reactions) / float(num_logins)
    rho = 1. / (1 + mu)

    # reconstruct the merged timeline, newest first; the tag also breaks
    # timestamp ties (follower=2 before competitor=1 before producer=0)
    timeline = []
    timeline += [(t[1] // 1000, 2) for t in follower_tweets]
    timeline += [(t[1] // 1000, 1) for t in competitor_tweets]
    timeline += [(t[1] // 1000, 0) for t in producer_tweets]
    timeline.sort(reverse=True)

    # ap: geometric user/cluster survival functions.  Consecutive
    # producer posts form a "cluster"; a competitor post flushes the
    # cluster into ap, weighted by delta ** cluster_size.
    ap = 0.0
    depth = 0
    cluster_size = 0
    cluster_ap = 0
    for ts, tag in timeline:
        if tag == 2:  # follower post: reading session boundary
            depth = 0
            continue

        depth += 1

        if tag == 1:  # competitor post flushes the current cluster
            ap += (delta ** cluster_size) * cluster_ap
            cluster_size = 0
            cluster_ap = 0
        elif tag == 0:  # producer post joins the current cluster
            cluster_size += 1
            cluster_ap += (1 - rho) ** depth

    # NOTE(review): a trailing producer cluster (one with no older
    # competitor post) is never flushed into ap.  Preserved as-is --
    # confirm against the paper (§2.2) whether this is intended.

    # toggle comment for with and without gamma weighting
    ap *= gamma

    return len(follower_reactions_to_producer), ap, gamma, delta, rho
|
|
|
def main(): |
|
tweets = sorted(pickle.load(open('all_tweets.p', 'rb'))) |
|
links = sorted(pickle.load(open('all_links.p', 'rb'))) |
|
|
|
# multiprocessing |
|
pool = Pool(8) |
|
results = [] # (producer, follower, #rts, #ap) |
|
|
|
for producer, producer_links in groupby(links, key=lambda x: x[0]): |
|
producer_tweets = [tweet for tweet in tweets |
|
if tweet[0] == producer] |
|
|
|
if len(producer_tweets) == 0: |
|
continue |
|
|
|
# calculate the total attention potential of the producer |
|
for producer_link in producer_links: |
|
follower = producer_link[1] |
|
|
|
results.append((producer, follower, |
|
pool.apply_async(calculate_ap, |
|
(producer, producer_tweets, |
|
follower, tweets, links)))) |
|
|
|
for producer, follower, r in results: |
|
producer_rts, producer_ap, gamma, delta, rho = r.get() |
|
print producer, '\t', follower, '\t', producer_rts, '\t', |
|
print "%.4f\t%.4f\t%.4f\t%.4f" % (producer_ap, gamma, delta, rho) |
|
|
|
# run only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()