Last active
January 2, 2016 06:09
-
-
Save askmeegs/8262030 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Burstiness.py
#by Megan O'Keefe
import bisect
import json
import math
import os
from collections import Counter, defaultdict
from datetime import datetime, timedelta

from sets import Set

from numpy import *
from pandas import *
from pandas.tseries.offsets import Hour, Minute
def create_timeseries(times, tint):
    """Build a tweets-per-interval timeseries from a list of timestamps.

    :param times: datetime objects extracted from a list of tweets
    :type times: list
    :param tint: a pandas resample rule, e.g. "30min"
    :type tint: string
    :returns: pandas Series indexed by interval start, valued with tweet counts
    """
    # One "1" per tweet; resampling then sums the 1s into per-interval counts.
    timeline = Series([1] * len(times), index=times)
    print(timeline[:3])
    # .resample(rule).sum() replaces the removed resample(rule, how='sum') API.
    timeseries = timeline.resample(tint).sum()
    return timeseries
def burstiness(ts, intervalSecs):
    """Score each consecutive pair of intervals with a burst value in [0, 1).

    The burst is 1 - cos(theta), where theta is the angle of the vector
    (intervalSecs, count_delta): larger jumps in tweet volume between
    adjacent intervals yield values closer to 1.

    :param ts: pandas timeseries produced by create_timeseries
    :param intervalSecs: width of one timeseries interval, in seconds
    :type intervalSecs: float
    :returns: list of (interval_start, interval_end, burst) tuples
    """
    print("burstiness function began. got", ts)
    bursts = []
    stamps = ts.index
    counts = list(ts.values)
    for i in range(len(ts) - 1):
        ti, tj = stamps[i], stamps[i + 1]
        tweet_diff = counts[i + 1] - counts[i]
        # math.hypot computes sqrt(a*a + b*b); the original called math.sqrt
        # without ever importing math, which raised NameError at runtime.
        root = math.hypot(intervalSecs, tweet_diff)
        burst = 1 - (intervalSecs / root)
        print("calculated", burst)
        bursts.append((ti, tj, burst))
    return bursts
def _load_tweet_file(filepath):
    """Load one JSON tweet file, unwrapping double-encoded JSON if needed.

    Some files were dumped as a JSON *string* whose content is itself JSON;
    detect that case explicitly instead of the original bare `except:` retry.
    The file handle is closed via `with` (the original leaked handles).
    """
    with open(filepath) as fh:
        payload = json.load(fh)
    if isinstance(payload, str):
        payload = json.loads(payload)
    return payload

def findCascadeStart(dir):
    """Locate the strongest 30-minute tweet burst in a directory of tweet files.

    Loads every 'relevant_*' JSON file under `dir`, builds a timeseries of
    tweet counts, scores adjacent intervals with burstiness(), then re-scans
    the files to collect the tweets (and their authors) falling inside the
    top-scoring interval. Writes them to <prefix>out/interval.json and
    <prefix>out/users.json.

    :param dir: path to the original tweet files (name kept for interface
                compatibility although it shadows the builtin `dir`)
    :type dir: string
    :returns: (interval.json path, users.json path) tuple
    """
    path = str(dir)
    print("path is", path)
    files = [f for f in os.listdir(path) if f.startswith('relevant_')]

    # First pass: keep only tweets that carry a creation date.
    data = []
    for f in files:
        tweets = _load_tweet_file(os.path.join(path, f))
        data.extend(t for t in tweets if 'created_at' in t)
        print("completed loading", f, "\n")
    print("number of tweets loaded:", len(data))

    intervalString = "30min"
    intervalSecs = 1800.0  # seconds in 30 minutes; must match intervalString

    # Parallel id/time lists, sorted together by tweet ID.
    times = [datetime.strptime(t['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
             for t in data]
    print(times[:3])
    ids = [t['id'] for t in data if 'id' in t]
    pairs = sorted(zip(ids, times))
    ids = [el[0] for el in pairs]
    times = [el[1] for el in pairs]

    ts = create_timeseries(times, intervalString)
    print("Created timeseries of length", len(ts), "\n")

    # Rank intervals, strongest burst first.
    bursts = burstiness(ts, intervalSecs)
    bursts.sort(key=lambda el: el[2], reverse=True)
    print(bursts)
    print("length of burst list:", len(bursts))
    start, end = bursts[0][0], bursts[0][1]
    i1 = bisect.bisect(times, start)
    i2 = bisect.bisect(times, end)
    print("top interval:", i1, i2, "with a burst:", bursts[0][2])
    # Set membership is O(1); the original list made the re-scan O(n*m).
    pinpointed = set(ids[i1:i2])

    # Second pass: collect the tweets/users inside the top interval.
    newList = []
    userList = defaultdict(int)
    for f in files:
        for tweet in _load_tweet_file(os.path.join(path, f)):
            if tweet.get('id') in pinpointed:
                newList.append(tweet)
                userList[tweet['user']['id']] = tweet['user']['friends_count']
        print("completed reloading", f, "\n")
    newList.sort(key=lambda el: el['id'])
    print([t['id'] for t in newList[:10]])
    print("gathered", len(newList), "tweets in time interval.")
    print("gathered", len(userList), "users.")

    outPath = path.split("allSearch")[0] + "out"
    if not os.path.exists(outPath):
        os.makedirs(outPath)
    intervalFile = os.path.join(outPath, 'interval.json')
    usersFile = os.path.join(outPath, 'users.json')
    with open(intervalFile, 'w') as fh:
        json.dump(newList, fh)
    with open(usersFile, 'w') as fh:
        json.dump(userList, fh)
    return intervalFile, usersFile
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment