Last active
January 2, 2016 06:09
-
-
Save askmeegs/8262030 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Burstiness.py
#by Megan O'Keefe
import bisect
import json
import math
import os
from collections import Counter, defaultdict
from datetime import datetime, timedelta

from sets import Set

from numpy import *
from pandas import *
from pandas.tseries.offsets import Hour, Minute
def create_timeseries(times, tint):
    """Build a tweets-per-interval timeseries from a list of timestamps.

    :param times: datetime objects extracted from a list of tweets
    :type times: list
    :param tint: a pandas resample rule, e.g. "30min"
    :type tint: string
    :returns: pandas Series indexed by interval start, valued with tweet counts
    """
    # One "1" per tweet; resampling then sums the 1s into per-interval counts.
    timeline = Series([1] * len(times), index=times)
    print(timeline[:3])
    # .resample(rule).sum() replaces the removed resample(rule, how='sum') API.
    timeseries = timeline.resample(tint).sum()
    return timeseries
def burstiness(ts, intervalSecs):
    """Score each consecutive pair of intervals with a burst value in [0, 1).

    The burst is 1 - cos(theta), where theta is the angle of the vector
    (intervalSecs, count_delta): larger jumps in tweet volume between
    adjacent intervals yield values closer to 1.

    :param ts: pandas timeseries produced by create_timeseries
    :param intervalSecs: width of one timeseries interval, in seconds
    :type intervalSecs: float
    :returns: list of (interval_start, interval_end, burst) tuples
    """
    print("burstiness function began. got", ts)
    bursts = []
    stamps = ts.index
    counts = list(ts.values)
    for i in range(len(ts) - 1):
        ti, tj = stamps[i], stamps[i + 1]
        tweet_diff = counts[i + 1] - counts[i]
        # math.hypot computes sqrt(a*a + b*b); the original called math.sqrt
        # without ever importing math, which raised NameError at runtime.
        root = math.hypot(intervalSecs, tweet_diff)
        burst = 1 - (intervalSecs / root)
        print("calculated", burst)
        bursts.append((ti, tj, burst))
    return bursts
def _load_tweet_file(filepath):
    """Load one JSON tweet file, unwrapping double-encoded JSON if needed.

    Some files were dumped as a JSON *string* whose content is itself JSON;
    detect that case explicitly instead of the original bare `except:` retry.
    The file handle is closed via `with` (the original leaked handles).
    """
    with open(filepath) as fh:
        payload = json.load(fh)
    if isinstance(payload, str):
        payload = json.loads(payload)
    return payload

def findCascadeStart(dir):
    """Locate the strongest 30-minute tweet burst in a directory of tweet files.

    Loads every 'relevant_*' JSON file under `dir`, builds a timeseries of
    tweet counts, scores adjacent intervals with burstiness(), then re-scans
    the files to collect the tweets (and their authors) falling inside the
    top-scoring interval. Writes them to <prefix>out/interval.json and
    <prefix>out/users.json.

    :param dir: path to the original tweet files (name kept for interface
                compatibility although it shadows the builtin `dir`)
    :type dir: string
    :returns: (interval.json path, users.json path) tuple
    """
    path = str(dir)
    print("path is", path)
    files = [f for f in os.listdir(path) if f.startswith('relevant_')]

    # First pass: keep only tweets that carry a creation date.
    data = []
    for f in files:
        tweets = _load_tweet_file(os.path.join(path, f))
        data.extend(t for t in tweets if 'created_at' in t)
        print("completed loading", f, "\n")
    print("number of tweets loaded:", len(data))

    intervalString = "30min"
    intervalSecs = 1800.0  # seconds in 30 minutes; must match intervalString

    # Parallel id/time lists, sorted together by tweet ID.
    times = [datetime.strptime(t['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
             for t in data]
    print(times[:3])
    ids = [t['id'] for t in data if 'id' in t]
    pairs = sorted(zip(ids, times))
    ids = [el[0] for el in pairs]
    times = [el[1] for el in pairs]

    ts = create_timeseries(times, intervalString)
    print("Created timeseries of length", len(ts), "\n")

    # Rank intervals, strongest burst first.
    bursts = burstiness(ts, intervalSecs)
    bursts.sort(key=lambda el: el[2], reverse=True)
    print(bursts)
    print("length of burst list:", len(bursts))
    start, end = bursts[0][0], bursts[0][1]
    i1 = bisect.bisect(times, start)
    i2 = bisect.bisect(times, end)
    print("top interval:", i1, i2, "with a burst:", bursts[0][2])
    # Set membership is O(1); the original list made the re-scan O(n*m).
    pinpointed = set(ids[i1:i2])

    # Second pass: collect the tweets/users inside the top interval.
    newList = []
    userList = defaultdict(int)
    for f in files:
        for tweet in _load_tweet_file(os.path.join(path, f)):
            if tweet.get('id') in pinpointed:
                newList.append(tweet)
                userList[tweet['user']['id']] = tweet['user']['friends_count']
        print("completed reloading", f, "\n")
    newList.sort(key=lambda el: el['id'])
    print([t['id'] for t in newList[:10]])
    print("gathered", len(newList), "tweets in time interval.")
    print("gathered", len(userList), "users.")

    outPath = path.split("allSearch")[0] + "out"
    if not os.path.exists(outPath):
        os.makedirs(outPath)
    intervalFile = os.path.join(outPath, 'interval.json')
    usersFile = os.path.join(outPath, 'users.json')
    with open(intervalFile, 'w') as fh:
        json.dump(newList, fh)
    with open(usersFile, 'w') as fh:
        json.dump(userList, fh)
    return intervalFile, usersFile
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment