Skip to content

Instantly share code, notes, and snippets.

@askmeegs
Last active January 2, 2016 06:09
Show Gist options
  • Save askmeegs/8262030 to your computer and use it in GitHub Desktop.
Save askmeegs/8262030 to your computer and use it in GitHub Desktop.
#Burstiness.py
#by Megan O'Keefe
import bisect
import json
import math
import os
from collections import Counter, defaultdict
from datetime import datetime, timedelta
from sets import Set
from numpy import *
from pandas import *
from pandas.tseries.offsets import Hour, Minute
def create_timeseries(times, tint):
""" Returns a timeseries from a list of datetime objects based on a specified string time interval
:param times: a list of datetime objects, extracted from a list of tweets
:type times: list
:param tint: a time interval, eg. "30mins"
:type tint: string
"""
timeline = Series(len(times)*[1], index=times)
print timeline[:3]
timeseries = timeline.resample(tint, how='sum')
return timeseries
def burstiness(ts, intervalSecs):
""" Given a timeseries and an int interval, uses the burstiness formula to generate a number between 0-1 (cosine of the burst "angle") - therefore higher numbers are stronger bursts. Returns a tuples list of bursts, containing the time start/end intervals and the burst value.
:param ts: Pandas timeseries object with the timeseries created in create_timeseries
:param intervalSecs: timeseries interval in seconds
:type intervalSecs: float
"""
print "burstiness function began. got ", ts
bursts = []
allTimes = ts.index
pairs = zip(allTimes, allTimes[1:])
values = list(ts.values)
for i in range(0, len(ts)-1):
ti, tj = pairs[i] #get the times
tweetDiff = values[i+1]-values[i]
root = math.sqrt((intervalSecs*intervalSecs) + (tweetDiff*tweetDiff))
burst = 1 - (intervalSecs/root)
print "calculated ", burst
temp = (ti, tj, burst)
bursts.append(temp)
return bursts
def _load_tweet_file(filepath):
    """Load one tweet file, handling both plain and doubly-encoded JSON.

    Some crawl files store the tweet list doubly encoded (a JSON string whose
    contents are themselves JSON); decode the inner layer when present.
    """
    with open(filepath) as handle:
        raw = json.load(handle)
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        # raw was already a decoded list/dict, not a nested JSON string.
        return raw


def findCascadeStart(dir):
    """Locate the strongest tweet burst in a directory of tweet files.

    Loads every 'relevant_*' tweet file under *dir*, builds a 30-minute
    timeseries of tweet counts, scores each interval with the burstiness
    formula, then re-reads the files to collect the tweets (and their
    authors) falling inside the top-scoring interval.  The tweets and the
    user id -> friends_count map are dumped to JSON files.

    :param dir: the path to the original tweet files
    :type dir: string
    :returns: (path to interval.json, path to users.json)
    """
    path = str(dir)
    print("path is ", path)
    files = [f for f in os.listdir(path) if f.startswith('relevant_')]
    data = []
    for f in files:
        temp = _load_tweet_file(os.path.join(path, str(f)))
        # Keep only tweets that carry a timestamp.
        data.extend(tweet for tweet in temp if 'created_at' in tweet)
        print("completed loading", str(f), "\n")
    print("number of files loaded: ", str(len(data)))
    intervalString = "30min"
    intervalSecs = 1800.0  # seconds per bin; keep in sync with intervalString
    # Parse Twitter's created_at format into datetimes.
    times = [datetime.strptime(t['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
             for t in data]
    print(times[:3])
    ids = [t['id'] for t in data if 'id' in t]
    # Sort (id, time) pairs by tweet ID, then split back into parallel lists.
    pairs = sorted(zip(ids, times))
    ids = [el[0] for el in pairs]
    times = [el[1] for el in pairs]
    ts = create_timeseries(times, intervalString)
    print("Created timeseries of length", len(ts), "\n")
    bursts = burstiness(ts, intervalSecs)
    # Strongest burst first.
    bursts = sorted(bursts, key=lambda el: el[2], reverse=True)
    print(bursts)
    print("length of burst list: ", len(bursts))
    start = bursts[0][0]
    end = bursts[0][1]
    # Map the winning interval's boundary times back to index positions in
    # the ID-sorted times list.
    i1 = bisect.bisect(times, start)
    i2 = bisect.bisect(times, end)
    print("top interval: ", i1, i2, " with a burst: ", bursts[0][2])
    # Use a set: the re-scan below does one membership test per tweet, and
    # the original list made that pass O(n^2).
    pinpointed = set(ids[i1:i2])
    # Re-gather the full tweet objects for the pinpointed IDs and record
    # each author's friends_count (duplicates collapse on user id).
    newList = []
    userList = defaultdict(int)
    for f in files:
        temp = _load_tweet_file(os.path.join(path, str(f)))
        for tweet in temp:
            if tweet['id'] in pinpointed:
                newList.append(tweet)
                userList[tweet['user']['id']] = tweet['user']['friends_count']
        print("completed reloading", str(f), "\n")
    newList = sorted(newList, key=lambda el: el['id'])
    print([t['id'] for t in newList[:10]])
    print("gathered ", str(len(newList)), " tweets in time interval.")
    print("gathered ", str(len(userList)), " users.")
    outPath = path.split("allSearch")[0] + "out"
    if not os.path.exists(outPath):
        os.makedirs(outPath)
    intervalFile = os.path.join(outPath, 'interval.json')
    usersFile = os.path.join(outPath, 'users.json')
    with open(intervalFile, 'w') as out:
        json.dump(newList, out)
    with open(usersFile, 'w') as out:
        json.dump(userList, out)
    return intervalFile, usersFile
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment