|
#!/usr/bin/env python |
|
|
|
import cPickle as pickle |
|
from itertools import groupby |
|
from multiprocessing import Pool |
|
|
|
# Gap (in hours) between two consecutive follower tweets that is counted
# as the start of a new "login" session (see §2.2 reference below).
LOGIN_THRESHOLD = 8 # hours
|
|
|
def calculate_ap(producer, producer_tweets,
                 follower, tweets, links, login_threshold=8):
    """Estimate the attention potential (ap) of `producer`'s tweets for
    one `follower`.

    Tweets are tuples indexed as (user_id, timestamp_ms, reply_target,
    retweet_target) -- assumed from the indexing below; confirm against
    the pickled data.  Links are (source, target) tuples.

    `login_threshold` is the session gap in hours (default 8, matching
    the module-level LOGIN_THRESHOLD).

    Returns a 5-tuple:
        (num_reactions_to_producer, ap, gamma, delta, rho)
    with all-zero placeholders when the follower has no tweets or when
    gamma/delta are zero.
    """
    # all the follower's tweets (tweets are assumed sorted, so these are
    # in timestamp order)
    follower_tweets = [t for t in tweets if t[0] == follower]
    if not follower_tweets:
        return 0, 0.0, 0, 0, 0

    # all the competitors' tweets (accounts the follower also follows)
    competitors = set(link[1] for link in links if link[0] == follower)
    competitor_tweets = [t for t in tweets if t[0] in competitors]

    # gamma: prior prob of a follower reaction (user-defined weight) --
    # a "reaction" is a tweet with a nonzero reply or retweet target
    follower_reactions = [t for t in follower_tweets
                          if t[2] > 0 or t[3] > 0]
    gamma = len(follower_reactions) / float(len(follower_tweets))

    # delta: prob of a producer tweet getting a follower reaction --
    # the usual measure of (directed) tie strength
    follower_reactions_to_producer = [t for t in follower_reactions
                                      if t[2] == producer or t[3] == producer]
    delta = len(follower_reactions_to_producer) / float(len(producer_tweets))

    if gamma == 0 or delta == 0:
        return len(follower_reactions_to_producer), 0.0, 0, 0, 0

    # rho: prob of leaving the timeline after reading a post.
    # rho parameterizes a geometric distribution, estimated from the
    # average number of posts consumed per login; a "login" is a follower
    # tweet occurring after a gap of `login_threshold` hours (§2.2).
    #
    # num_logins starts at 1: the first tweet is itself a session.  The
    # original started at 0 and raised ZeroDivisionError whenever all of
    # the follower's tweets fell inside a single 8-hour window.
    num_logins = 1
    prev = follower_tweets[0][1] // 1000  # ms -> s
    for ft in follower_tweets[1:]:
        ts = ft[1] // 1000
        if (ts - prev) / 3600. >= login_threshold:  # hours
            num_logins += 1
        prev = ts

    mu = len(follower_reactions) / float(num_logins)
    rho = 1. / (1 + mu)

    # reconstruct the merged timeline, newest first; the tag also breaks
    # timestamp ties (follower=2 before competitor=1 before producer=0)
    timeline = []
    timeline += [(t[1] // 1000, 2) for t in follower_tweets]
    timeline += [(t[1] // 1000, 1) for t in competitor_tweets]
    timeline += [(t[1] // 1000, 0) for t in producer_tweets]
    timeline.sort(reverse=True)

    # ap: geometric user/cluster survival functions.  Consecutive
    # producer posts form a "cluster"; a competitor post flushes the
    # cluster into ap, weighted by delta ** cluster_size.
    ap = 0.0
    depth = 0
    cluster_size = 0
    cluster_ap = 0
    for ts, tag in timeline:
        if tag == 2:  # follower post: reading session boundary
            depth = 0
            continue

        depth += 1

        if tag == 1:  # competitor post flushes the current cluster
            ap += (delta ** cluster_size) * cluster_ap
            cluster_size = 0
            cluster_ap = 0
        elif tag == 0:  # producer post joins the current cluster
            cluster_size += 1
            cluster_ap += (1 - rho) ** depth

    # NOTE(review): a trailing producer cluster (one with no older
    # competitor post) is never flushed into ap.  Preserved as-is --
    # confirm against the paper (§2.2) whether this is intended.

    # toggle comment for with and without gamma weighting
    ap *= gamma

    return len(follower_reactions_to_producer), ap, gamma, delta, rho
|
|
|
def main(): |
|
tweets = sorted(pickle.load(open('all_tweets.p', 'rb'))) |
|
links = sorted(pickle.load(open('all_links.p', 'rb'))) |
|
|
|
# multiprocessing |
|
pool = Pool(8) |
|
results = [] # (producer, follower, #rts, #ap) |
|
|
|
for producer, producer_links in groupby(links, key=lambda x: x[0]): |
|
producer_tweets = [tweet for tweet in tweets |
|
if tweet[0] == producer] |
|
|
|
if len(producer_tweets) == 0: |
|
continue |
|
|
|
# calculate the total attention potential of the producer |
|
for producer_link in producer_links: |
|
follower = producer_link[1] |
|
|
|
results.append((producer, follower, |
|
pool.apply_async(calculate_ap, |
|
(producer, producer_tweets, |
|
follower, tweets, links)))) |
|
|
|
for producer, follower, r in results: |
|
producer_rts, producer_ap, gamma, delta, rho = r.get() |
|
print producer, '\t', follower, '\t', producer_rts, '\t', |
|
print "%.4f\t%.4f\t%.4f\t%.4f" % (producer_ap, gamma, delta, rho) |
|
|
|
# run only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()