twitter API tools: one script for analysing tweets stored in MongoDB (timelines, keyword comparisons, user and word statistics) and one for collecting tweets, followers and friends through the Twitter REST API.
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from __future__ import unicode_literals
from pylab import *
import matplotlib.pyplot as plt  # plt.figure() is used below
import json
import codecs
import pymongo
import time
import sys
import os
import re
def getOldTweets(filename):
    input_file = open(filename, "r")
    tweets = []
    for lines in input_file:
        tweets.append(json.loads(lines))
    return tweets

def get_users(filename):
    users = []
    f = open(filename)
    for lines in f:
        users.append(lines.strip())
    return users

def stdoutStatus(jsonarray):
    for tweet in jsonarray:
        print tweet['text']
def inputDb(collection_name, filename):
    '''
    load a file of line-delimited tweet JSON into a MongoDB collection
    '''
    input_file = open(filename, "r")
    for lines in input_file:  # line-by-line loop keeps memory use low
        data = {}
        tweet = json.loads(lines)
        data['created_at'] = time.mktime(time.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))
        data['user_name'] = tweet['user']['screen_name']
        data['user_id'] = tweet['user']['id_str']
        data['_id'] = tweet['id']
        data['lang'] = tweet['lang']
        data['text'] = tweet['text']
        #print tweet['user']['screen_name']
        #print tweet['entities']['hashtags']
        data['hashtags'] = tweet['entities']['hashtags']
        if collection_name.find_one({'_id': data['_id']}):
            continue
        else:
            collection_name.insert(data)  # skipping existing _ids takes care of duplicates
    input_file.close()
    #posts.remove({u'lang': {'$nin': [u'tr']}})  # remove non-Turkish tweets
def inputUser(collection_name, filename):
    '''
    load a file of line-delimited user JSON into a MongoDB collection
    '''
    input_file = open(filename, "r")
    for lines in input_file:  # line-by-line loop keeps memory use low
        data = {}
        user = json.loads(lines)
        try:
            user['id']
        except KeyError:
            print user
            continue  # skip entries without an id (e.g. stored API error objects)
        data['created_at'] = time.mktime(time.strptime(user['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))
        data['user_name'] = user['screen_name']
        #data['user_id'] = tweet['user']['id_str']
        data['_id'] = user['id']
        data['listed_count'] = user['listed_count']
        data['description'] = user['description']
        data['followers_count'] = user['followers_count']
        data['friends_count'] = user['friends_count']
        data['statuses_count'] = user['statuses_count']
        if collection_name.find_one({'_id': data['_id']}):
            continue
        else:
            collection_name.insert(data)  # skipping existing _ids takes care of duplicates
    input_file.close()
    #posts.remove({u'lang': {'$nin': [u'tr']}})  # remove non-Turkish tweets
def KeywordFilter(InCollection, regex, sortindex, searchindex):
    histogram_array = []
    for entry in InCollection.find({searchindex: {'$in': [re.compile('%s' % (regex))]}}):
        histogram_array.append(entry[sortindex])  # in seconds
    return histogram_array

def KeywordFilterSimple(InCollection, regex, sortindex, searchindex):
    histogram_array = []
    for entry in InCollection.find({searchindex: {'$in': [regex]}}):
        histogram_array.append(entry[sortindex])  # in seconds
    return histogram_array
def KeywordComparison(posts):
    # NOTE: superseded by the KeywordComparison redefinition below, which takes an explicit
    # time window and keyword list; only that later definition is used in __main__.
    posts.create_index([("created_at", pymongo.ASCENDING)])
    OCdates = []
    RTdates = []
    for post in posts.find().sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    for post in posts.find({'text': {'$nin': [re.compile('RT @')]}}).sort([("created_at", pymongo.ASCENDING)]):
        OCdates.append(post['created_at'])  # in seconds
    mindate = min(min(RTdates), min(OCdates))  # in UTC seconds
    maxdate = max(max(RTdates), max(OCdates))  # in UTC seconds
    nRTdates = (array(RTdates) - array(mindate)) / array(3600.)  # convert to hours
    nOCdates = (array(OCdates) - array(mindate)) / array(3600.)  # convert to hours
    nmindate = min(min(nRTdates), min(nOCdates))
    nmaxdate = max(max(nRTdates), max(nOCdates))
    #my_bin = linspace(mindate, maxdate, 200)
    my_bin = linspace(nmindate, nmaxdate, 200)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    nRT, bins, patch = ax.hist(nRTdates, bins=my_bin)
    #nOC, bins, patch = ax.hist(nOCdates, bins=my_bin)
    #ax.xaxis.set_minor_locator(MultipleLocator(3600))
    #ax.xaxis.set_major_locator(MultipleLocator(12*3600))
    '''
    comparison keyword
    '''
    polis = 'polis*'
    saldiri = 'sald?r*'
    pdates = KeywordFilter(posts, polis, 'created_at', 'text')
    npdates = (array(pdates) - array(mindate)) / array(3600.)
    np, bins, patch = ax.hist(npdates, bins=my_bin)
    print len(list(np))
    start_time_formatted = time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(mindate + 2 * 3600))  # Turkey local time
    print max(list(nRT))
    ax.text(nmindate, max(list(nRT)), start_time_formatted)
    ylabel(r"birim zaman basina twit sayisi")  # "tweets per unit time"
    xlabel(r'ilk twitten itibaren gecen zaman')  # "time elapsed since the first tweet"
    show()
def KeywordComparison(posts, start_time, end_time, binsize, keywords):
    posts.create_index([("created_at", pymongo.ASCENDING)])
    RTdates = []
    for post in posts.find({"created_at": {"$gte": start_time, "$lte": end_time}}).sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    mindate = min(RTdates)  # in UTC seconds
    maxdate = max(RTdates)  # in UTC seconds
    print 'mindate =', mindate
    nRTdates = (array(RTdates) - array(mindate)) / array(60.)  # convert to minutes
    nmindate = min(nRTdates)
    nmaxdate = max(nRTdates)
    #my_bin = linspace(mindate, maxdate, 200)
    #my_bin = linspace(nmindate, nmaxdate, 20)
    #binsize = 2
    my_bin = arange(nmindate, nmaxdate + binsize, binsize)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    nRT, bins, patch = ax.hist(nRTdates, bins=my_bin)
    clf()
    #nOC, bins, patch = ax.hist(nOCdates, bins=my_bin)
    #ax.xaxis.set_minor_locator(MultipleLocator(3600))
    #ax.xaxis.set_major_locator(MultipleLocator(12*3600))
    print len(my_bin[1:]), len(nRT)
    '''
    comparison keywords
    '''
    nps = []
    #keywords = ['gcalvetbarot','Camacho','herrerajoan','Albert_Rivera','HiginiaRoig']
    for key in keywords:
        keydates = KeywordFilterSimple(posts, key, 'created_at', 'text')
        nkeydates = (array(keydates) - array(mindate)) / array(60.)
        np, bins, patch = hist(nkeydates, bins=my_bin)
        nps.append(np)
    clf()
    plot(my_bin[1:] - array(binsize / 2.), nRT, 'ro-', linewidth=2)
    for h in nps:
        plot(my_bin[1:] - array(binsize / 2.), h, 'o-', linewidth=2)
    #legend(handles=keywords)
    start_time_formatted = time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(mindate + 2 * 3600))  # Turkey local time
    print max(list(nRT))
    ax.text(nmindate, max(list(nRT)), start_time_formatted)
    ylabel(r"birim zaman basina twit sayisi")  # "tweets per unit time"
    xlabel(r'ilk twitten itibaren gecen zaman')  # "time elapsed since the first tweet"
def Timeline(posts, start_time, end_time, binsize):
    posts.create_index([("created_at", pymongo.ASCENDING)])
    RTdates = []
    for post in posts.find({"created_at": {"$gte": start_time, "$lte": end_time}}).sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    mindate = min(RTdates)  # in UTC seconds
    maxdate = max(RTdates)  # in UTC seconds
    print 'mindate =', mindate
    nRTdates = (array(RTdates) - array(mindate)) / array(60.)  # convert to minutes
    nmindate = min(nRTdates)
    nmaxdate = max(nRTdates)
    #my_bin = linspace(mindate, maxdate, 200)
    #my_bin = linspace(nmindate, nmaxdate, 20)
    #binsize = 2
    my_bin = arange(nmindate, nmaxdate + binsize, binsize)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    nRT, bins, patch = ax.hist(nRTdates, bins=my_bin)
    clf()
    #nOC, bins, patch = ax.hist(nOCdates, bins=my_bin)
    #ax.xaxis.set_minor_locator(MultipleLocator(3600))
    #ax.xaxis.set_major_locator(MultipleLocator(12*3600))
    print len(my_bin[1:]), len(nRT)
    plot(my_bin[1:] - array(binsize / 2.), nRT, 'ro-', linewidth=2)
    start_time_formatted = time.strftime("%d %b %Y %H:%M:%S", time.gmtime(mindate + 4 * 3600.))  # Spain local time
    print max(list(nRT))
    #text(nmindate, 1.15*max(list(nRT)), start_time_formatted)
    figtext(0.13, 0.91, start_time_formatted, horizontalalignment='left')
    v = axis()
    #ylabel(r"birim zaman basina twit sayisi")
    #xlabel(r'ilk twitten itibaren gecen zaman')
    ylabel(r"Quantitat dels tuits per %2.1f minuts" % binsize)  # "number of tweets per %2.1f minutes"
    xlabel(r'Temps a partir del primer tuit (mins)')  # "time since the first tweet (mins)"
    #savefig('frequencia231114.png')
    #savefig('frequencia300914.png')
    return max(nRT), min(nRT), mindate + 4 * 3600., maxdate
def VerticalLine(times, mintime, maxtweet):
    #plot([(time-mintime)/60., (time-mintime)/60.], [0.6*maxtweet, 0.9*maxtweet], 'k-', linewidth=2)
    #print (time-mintime)/60.
    for t in times:
        time_formatted = time.strftime("%H:%M", time.gmtime(mintime + t * 60.))  # Spain local time
        print t, time_formatted
        plot([t, t], [0.2 * maxtweet, 0.55 * maxtweet], 'k-', linewidth=2)
        text(t, 3, time_formatted, horizontalalignment='center', verticalalignment='bottom', rotation='vertical')
def WOReTweets(posts, since, reftime, binsize, filename):
    posts.create_index([("created_at", pymongo.ASCENDING)])
    RTdates = []
    for post in posts.find({"created_at": {"$gte": since}}).sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    mindate = min(RTdates)
    filtered = posts.find({"created_at": {"$gte": mindate + (reftime - binsize / 2.) * 60., "$lte": mindate + (reftime + binsize / 2.) * 60.}})
    untw = []
    counts = []
    for post in filtered:
        if post['text'] in untw:
            counts[untw.index(post['text'])] += 1
        else:
            untw.append(post['text'])
            counts.append(1)
    unique_tweets = [(untw[i], counts[i]) for i in range(len(counts))]
    sorted_rts = sorted(unique_tweets, key=lambda a: a[1], reverse=True)
    outname = '%s_rts.dat' % filename
    out = codecs.open(outname, encoding='utf-8', mode='a+')
    out.write('#at %3.1f: %i\n' % (reftime, filtered.count()))
    for i in range(3):  # assuming there are more than 3 tweets in the chunk
        out.write('%i, %s\n' % (sorted_rts[i][1], sorted_rts[i][0]))
    out.close()
    #for i in range(len(untw)):
    #    print counts[i], untw[i]

def WOTweets(posts, since, reftime, binsize, filename):
    posts.create_index([("created_at", pymongo.ASCENDING)])
    RTdates = []
    for post in posts.find({"created_at": {"$gte": since}}).sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    mindate = min(RTdates)
    filtered = posts.find({"created_at": {"$gte": mindate + (reftime - binsize / 2.) * 60., "$lte": mindate + (reftime + binsize / 2.) * 60.}})
    outname = '%s_tw.dat' % filename
    out = codecs.open(outname, encoding='utf-8', mode='a+')
    out.write('#at %3.1f: %i\n' % (reftime, filtered.count()))
    for post in filtered:
        out.write('%s\n' % (post['text']))
    out.close()
def UserContributions(posts, filename):
    unUs = []
    counts = []
    for post in posts.find():
        if post['user_name'] in unUs:
            counts[unUs.index(post['user_name'])] += 1
        else:
            unUs.append(post['user_name'])
            counts.append(1)
    unique_users = [(unUs[i], counts[i]) for i in range(len(counts))]
    sorted_users = sorted(unique_users, key=lambda a: a[1], reverse=True)
    outname = '%s_user_statistics.dat' % filename
    if os.path.isfile(outname):
        os.remove(outname)  # start from a clean file
    out = codecs.open(outname, encoding='utf-8', mode='a+')
    #out.write('#at %3.1f: %i\n'%(reftime, filtered.count()))
    for i in range(len(sorted_users)):  # one line per user, most active first
        out.write('%i, %s\n' % (sorted_users[i][1], sorted_users[i][0]))
    out.close()

def clean(word):
    newword = word.replace(',', '').replace(':', '').replace('…', '').replace('"', '').replace('.', '').replace(')', '').replace('(', '').replace('!', '').replace('?', '')
    return newword
def WordFrequency(posts, filterWords, filename):
    unWords = []
    counts = []
    for post in posts.find():
        postWords = post['text'].split()
        for postWord in postWords:
            cleanPostWord = clean(postWord)
            if ('http' in cleanPostWord) or (cleanPostWord in filterWords):
                continue
            elif cleanPostWord in unWords:
                counts[unWords.index(cleanPostWord)] += 1
            else:
                unWords.append(cleanPostWord)
                counts.append(1)
    unique_words = [(unWords[i], counts[i]) for i in range(len(counts))]
    sorted_words = sorted(unique_words, key=lambda a: a[1], reverse=True)
    outname = '%s_word_frequency.dat' % filename
    if os.path.isfile(outname):
        os.remove(outname)  # start from a clean file
    out = codecs.open(outname, encoding='utf-8', mode='a+')
    #out.write('#at %3.1f: %i\n'%(reftime, filtered.count()))
    for i in range(len(sorted_words)):  # one line per word, most frequent first
        out.write('%s: %i\n' % (sorted_words[i][0], sorted_words[i][1]))
        #out.write('%s\n' % sorted_words[i][0])
    out.close()
def FollowerOps():
    '''
    #screen_name = user_info['screen_name']
    #followers = user_info['followers_count']
    #influence = float(followers)/float(user_info['friends_count'])
    #statuses = user_info['statuses_count']
    #profile_description = user_info['description'].replace('\n',' ')
    #out_string = '%s\t%s\t%s\t%f\t%s\t\'%s\''%(user_id,screen_name,followers, influence, statuses, profile_description)
    #print out_string
    #out.write('%s\n'%out_string)
    #out2.write('%s\n'%user_info)
    '''
if __name__ == '__main__':
    client = pymongo.MongoClient()
    db = client['twitter-test']
    posts = db.posts  # this is a collection
    #inputDb(posts, 'smt.json')
    '''
    write a followers collection and a friends collection
    '''
    user_name = 'guanyem'
    users = db.users
    #inputUser(users, 'guanyem_follower_data_full.dat')
    since = time.mktime(time.strptime('14 Nov 2014 07:00:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    until = time.mktime(time.strptime('20 Dec 2014 15:20:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    '''
    followback algorithm starts here
    #get follower list
    followerIds = []
    for user in users.find():
        followerIds.append(str(user['_id']))
    #get friends list
    friendIds = get_users('%s_firen.txt'%user_name)
    #pull unique elements from the database
    uniqueIds = {}
    posts.find().sort([("created_at", pymongo.ASCENDING)])
    for post in posts.find({"created_at": {"$gte": since, "$lte": until}}):
        if post['user_id'] in uniqueIds:
            continue
        elif post['user_name'] == user_name:
            continue
        else:
            #uniqueIds.append({post['user_id']: post['user_name']})
            uniqueIds[post['user_id']] = post['user_name']
    print 'unique ids in this set of posts:', len(uniqueIds)
    #get target group
    targetUsers = []
    for userId in uniqueIds.keys():
        if userId in followerIds:
            continue
        elif userId in friendIds:
            continue
        else:
            targetUsers.append(userId)
    print 'target ids in this set of posts:', len(targetUsers)
    out = codecs.open('target.dat', encoding='utf-8', mode='w')
    for ids in targetUsers:
        out.write('%s\t%s\n'%(ids, uniqueIds[ids]))
    out.close()
    followback algorithm ends here
    '''
    #since = time.mktime(time.strptime('10 Oct 2014 07:00:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    #until = time.mktime(time.strptime('11 Oct 2014 15:20:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    since = time.mktime(time.strptime('14 Nov 2014 07:00:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    until = time.mktime(time.strptime('20 Nov 2014 15:20:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    #print since, until
    binsize = 4
    maxtweets, mintweets, mintime, maxtime = Timeline(posts, since, until, binsize)
    #text(0.9*(maxtime-mintime)/60, maxtweets, '#pujol324', ha='right')
    #figtext(0.88, 0.85, '#pujol324', ha='right', size=15)
    #times = [117.0, 127, 137.0, 145.0, 159.0, 165.0, 181.0, 191., 197.]  # selected by hand
    #VerticalLine(times, mintime, maxtweets)
    #keywords = ['gcalvetbarot','Camacho*','herrerajoan','Albert_Rivera','HiginiaRoig','jorditurull']
    keywords = ['guanyemDialogant']
    #keywords = ['mac?']
    KeywordComparison(posts, since, until, binsize, keywords)
    #savefig('frequencia_noucodietic02.png')
    show()
    #UserContributions(posts, 'noucodietic')
    #FilterWords = [lines.replace('\n','') for lines in codecs.open('filterWords.dat', encoding='utf-8', mode='r')]
    #WordFrequency(posts, FilterWords, 'noucodietic02')
    '''
    outcore = 'AdaColauMAC_051014'
    outrt = '%s_rts.dat'%outcore
    if os.path.isfile(outrt):
        os.popen('rm %s'%outrt)
    outtw = '%s_tw.dat'%outcore
    if os.path.isfile(outtw):
        os.popen('rm %s'%outtw)
    for t in times:
        #print t
        WOTweets(posts, since, t, binsize, outcore)
        WOReTweets(posts, since, t, binsize, outcore)
    '''
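For reference, a minimal usage sketch for the reporting helpers that the __main__ block above only shows commented out. It assumes posts is an already-populated collection and that a stop-word file (one word per line) exists; the file names are illustrative, not part of the gist.

# minimal sketch, assuming `posts` is already populated and 'filterWords.dat'
# (one stop word per line) exists; both file names are illustrative
import codecs

UserContributions(posts, 'noucodietic')            # writes noucodietic_user_statistics.dat
FilterWords = [w.strip() for w in codecs.open('filterWords.dat', encoding='utf-8', mode='r')]
WordFrequency(posts, FilterWords, 'noucodietic')   # writes noucodietic_word_frequency.dat

The second file in the gist handles the collection side, talking to the Twitter REST API: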
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from __future__ import unicode_literals
import requests
from requests_oauthlib import OAuth1
from urlparse import parse_qs
import json
import codecs
from math import ceil
import os.path
import time
import random

REQUEST_TOKEN_URL = "https://api.twitter.com/oauth/request_token"
AUTHORIZE_URL = "https://api.twitter.com/oauth/authorize?oauth_token="
ACCESS_TOKEN_URL = "https://api.twitter.com/oauth/access_token"

# Go to http://dev.twitter.com and create an app.
# The consumer key and secret will be generated for you afterwards.
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
# After the step above, you will be redirected to your app's page.
# Create an access token under the "Your access token" section.
# These four values must be filled in before the Grab* helpers below will return data.
OAUTH_TOKEN = ""
OAUTH_TOKEN_SECRET = ""
def setup_oauth():
    """Authorize your app via identifier."""
    # Request token
    oauth = OAuth1(CONSUMER_KEY, client_secret=CONSUMER_SECRET)
    r = requests.post(url=REQUEST_TOKEN_URL, auth=oauth)
    credentials = parse_qs(r.content)
    resource_owner_key = credentials.get('oauth_token')[0]
    resource_owner_secret = credentials.get('oauth_token_secret')[0]
    # Authorize
    authorize_url = AUTHORIZE_URL + resource_owner_key
    print 'Please go here and authorize: ' + authorize_url
    verifier = raw_input('Please input the verifier: ')
    oauth = OAuth1(CONSUMER_KEY,
                   client_secret=CONSUMER_SECRET,
                   resource_owner_key=resource_owner_key,
                   resource_owner_secret=resource_owner_secret,
                   verifier=verifier)
    # Finally, obtain the access token
    r = requests.post(url=ACCESS_TOKEN_URL, auth=oauth)
    credentials = parse_qs(r.content)
    token = credentials.get('oauth_token')[0]
    secret = credentials.get('oauth_token_secret')[0]
    return token, secret

def get_oauth():
    oauth = OAuth1(CONSUMER_KEY,
                   client_secret=CONSUMER_SECRET,
                   resource_owner_key=OAUTH_TOKEN,
                   resource_owner_secret=OAUTH_TOKEN_SECRET)
    return oauth
def get_users(filename):
    users = []
    f = open(filename)
    for lines in f:
        users.append(lines.strip())
    return users

def loadData(inputFile):
    data = []
    for line in inputFile:
        if line.startswith("#"):
            continue
        data.append([v for v in line.strip().split()])
    return data

def loadDataS(inputFile, separator):
    data = []
    for line in inputFile:
        if line.startswith("#"):
            continue
        data.append([v for v in line.strip().split(separator)])
    return data

def transpose(data):
    return [[data[j][i] for j in range(len(data))] for i in range(len(data[0]))]

def getFollowers(user):
    falovirs = []
    f = open(user + '_falovir.txt')  # one follower id per line, as written by __main__ below
    for lines in f:
        falovirs.append(lines.strip())
    return falovirs

def get_unique(data):
    result = []
    result.append(data[0])
    for i in range(len(data)):
        if data[i] in result:
            continue
        else:
            result.append(data[i])
    return result
def getCommonFollowerCount(source_user, target_user):
    '''
    target is the account we are interested in
    '''
    source_followers = getFollowers(source_user)
    target_followers = getFollowers(target_user)
    source_count = len(source_followers)
    target_count = len(target_followers)
    count = 0
    for f in source_followers:
        if f in target_followers:
            count += 1
    return count, source_count, float(count)/float(source_count), float(count)/float(target_count)

def getCommonFollowerArrayCount(source_followers, target_followers):
    '''
    target is the account we are interested in
    '''
    source_count = len(source_followers)
    target_count = len(target_followers)
    count = 0
    target_copy = list(target_followers)  # work on a copy so the caller's list is untouched
    for f in source_followers:
        if f in target_copy:
            count += 1
            target_copy.remove(f)
    '''
    for efficiency the matching elements are taken out;
    if the user lists are badly made, repeating elements will not be noticed
    '''
    return count, source_count, float(count)/float(source_count), float(count)/float(target_count)
def getOldTweets(filename):
    input_file = open(filename, "r")
    tweets = []
    for lines in input_file:
        tweets.append(json.loads(lines))
    return tweets

def getOldTweetsID(filename):
    input_file = open(filename, "r")
    tweetIDs = []
    for lines in input_file:
        tweet = json.loads(lines)
        tweetIDs.append(tweet["id"])
    return tweetIDs
def GrabSearch(hashtag, since):
    #since = -1 #default?
    PROFILE = "https://api.twitter.com/1.1/search/tweets.json?q=%s&max_id=%i&result_type=recent&count=100" % (hashtag, since)
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        search = r.json()
        #print taym['statuses']
        return search

def GrabTweets(name):
    PROFILE = "https://api.twitter.com/1.1/statuses/user_timeline.json?include_entities=true&include_rts=true&screen_name=%s" % name
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        taym = r.json()
        #print taym['statuses']
        return taym
def GrabFollowers(name, no_followers):
    if no_followers < 5001:
        PROFILE = "https://api.twitter.com/1.1/followers/ids.json?cursor=-1&screen_name=%s&count=5000" % name
        if not OAUTH_TOKEN:
            token, secret = setup_oauth()
            print "OAUTH_TOKEN: " + token
            print "OAUTH_TOKEN_SECRET: " + secret
        else:
            oauth = get_oauth()
            r = requests.get(url=PROFILE, auth=oauth)
            print r
            followers = r.json()
            time.sleep(61)  # pace the requests to stay under the rate limit
            return followers['ids']
    elif no_followers > 5000:
        no_cursor = int(ceil(no_followers / 5000.))
        followers_rest = no_followers % 5000
        followers = []
        next_cursor = -1
        for i in range(no_cursor):
            if i == no_cursor - 1:
                f_query = followers_rest
            else:
                f_query = 5000
            PROFILE = "https://api.twitter.com/1.1/followers/ids.json?cursor=%i&screen_name=%s&count=%i" % (next_cursor, name, f_query)
            print next_cursor
            if not OAUTH_TOKEN:
                token, secret = setup_oauth()
                print "OAUTH_TOKEN: " + token
                print "OAUTH_TOKEN_SECRET: " + secret
            else:
                oauth = get_oauth()
                r = requests.get(url=PROFILE, auth=oauth)
                print r
                next_cursor = r.json()['next_cursor']
                #print next_cursor
                followers += r.json()['ids']
                time.sleep(61)  # pace the requests to stay under the rate limit
        return followers

def GrabFollowerCount(name):
    PROFILE = "https://api.twitter.com/1.1/statuses/user_timeline.json?include_entities=true&include_rts=false&count=1&screen_name=%s" % name
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        taym = r.json()
        no_followers = int(taym[0]['user']['followers_count'])
        print no_followers
        return no_followers
def GrabFriends(name, no_followers):
    if no_followers < 5001:
        PROFILE = "https://api.twitter.com/1.1/friends/ids.json?cursor=-1&screen_name=%s&count=5000" % name
        if not OAUTH_TOKEN:
            token, secret = setup_oauth()
            print "OAUTH_TOKEN: " + token
            print "OAUTH_TOKEN_SECRET: " + secret
        else:
            oauth = get_oauth()
            r = requests.get(url=PROFILE, auth=oauth)
            print r
            followers = r.json()
            time.sleep(61)  # pace the requests to stay under the rate limit
            return followers['ids']
    elif no_followers > 5000:
        no_cursor = int(ceil(no_followers / 5000.))
        followers_rest = no_followers % 5000
        followers = []
        next_cursor = -1
        for i in range(no_cursor):
            if i == no_cursor - 1:
                f_query = followers_rest
            else:
                f_query = 5000
            PROFILE = "https://api.twitter.com/1.1/friends/ids.json?cursor=%i&screen_name=%s&count=%i" % (next_cursor, name, f_query)
            print next_cursor
            if not OAUTH_TOKEN:
                token, secret = setup_oauth()
                print "OAUTH_TOKEN: " + token
                print "OAUTH_TOKEN_SECRET: " + secret
            else:
                oauth = get_oauth()
                r = requests.get(url=PROFILE, auth=oauth)
                print r
                next_cursor = r.json()['next_cursor']
                #print next_cursor
                followers += r.json()['ids']
                time.sleep(61)  # pace the requests to stay under the rate limit
        return followers

def GrabFriendCount(name):
    PROFILE = "https://api.twitter.com/1.1/statuses/user_timeline.json?include_entities=true&include_rts=false&count=1&screen_name=%s" % name
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        taym = r.json()
        no_friends = int(taym[0]['user']['friends_count'])
        print no_friends
        return no_friends
def GrabTweetsExp(name, max_id):
    PROFILE = "https://api.twitter.com/1.1/statuses/user_timeline.json?include_entities=true&include_rts=true&count=200&max_id=%s&screen_name=%s" % (max_id, name)
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        taym = r.json()
        #print taym['statuses']
        return taym

def GrabHistory(user):
    '''
    very ugly workaround for grabbing the whole timeline (3200 tweets max):
    keep paging backwards with max_id until nothing new turns up
    '''
    switch = 0
    for i in range(100):  # crude cap on the number of paging rounds
        if switch < 2:
            last = os.popen('tail -1 %s_taymlayn.txt' % user).read()
            last_tweet = json.loads(last)
            last_id = str(last_tweet['id'])
            print "%s taymlayninda yeni bir sey var mi?" % user  # "anything new in %s's timeline?"
            taymlayn = GrabTweetsExp(user, last_id)
            filename = '%s_taymlayn.txt' % user
            if os.path.isfile(filename):
                OldTweets = getOldTweetsID(filename)
                #print OldTweets[0]['text']
                out = open(filename, 'a+')
                for tweet in taymlayn:
                    if tweet['id'] in OldTweets:
                        print tweet['id'], ' tibiti dosyada'  # "tweet already in the file"
                        switch = switch + 1
                        continue
                    else:
                        #print 'mujde! yeni tibit: '#, tweet['text']
                        switch = 0
                        json.dump(tweet, out)
                        out.write('\n')
                out.close()
            else:
                writeTweets(taymlayn, filename)
def GrabUserInfo(user_id):
    PROFILE = "https://api.twitter.com/1.1/users/show.json?user_id=%s&include_entities=false" % user_id
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        user = r.json()
        print r
        #print r.json()
        return user

def GetNewTimelines(users):
    for user in users:
        print "%s taymlayninda yeni bir sey var mi?" % user  # "anything new in %s's timeline?"
        taymlayn = GrabTweets(user)
        filename = '%s_taymlayn.txt' % user
        if os.path.isfile(filename):
            OldTweets = getOldTweetsID(filename)
            #print OldTweets[0]['text']
            out = open(filename, 'a+')
            for tweet in taymlayn:
                if tweet['id'] in OldTweets:
                    print tweet['id'], ' tibiti dosyada'  # "tweet already in the file"
                    continue
                else:
                    print 'mujde! yeni tibit: '  # "good news! a new tweet"  #, tweet['text']
                    json.dump(tweet, out)
                    out.write('\n')
            out.close()
        else:
            writeTweets(taymlayn, filename)

def writeTweets(tweets, filename):
    out = codecs.open(filename, encoding='utf-8', mode='w')
    for tweet in tweets:
        json.dump(tweet, out)
        out.write('\n')
    out.close()
def GetFollowerCorrelation(target_name, users):
    target_followers = getFollowers(target_name)
    out = codecs.open(target_name + '_comparison.dat', encoding='utf-8', mode='w')
    for user in users:
        source_followers = getFollowers(user)
        common_count, source_count, source_frac, target_frac = getCommonFollowerArrayCount(source_followers, target_followers)
        out_string = '%s\t%i\t%i\t%f\t%f\n' % (user, common_count, source_count, source_frac, target_frac)
        print out_string
        out.write('%s' % out_string)
    out.close()

def GetSearchResults(hashtag):
    #hashtag = 'GuanyemSantAntoni'
    #since = -1 #default
    outname = '%s_search.json' % hashtag
    #if os.path.isfile(outname):
    #    os.popen('rm %s'%outname)
    if os.path.isfile(outname):
        out = codecs.open(outname, encoding='utf-8', mode='a+')
        finalt = json.loads(os.popen('tail -1 %s' % outname).read())
        since = finalt['id']
        print 'starting from %s' % since
    else:
        out = codecs.open(outname, encoding='utf-8', mode='a+')
        since = -1
        print 'new file'
    search = GrabSearch(hashtag, since)
    try:
        print search['search_metadata']
    except KeyError:
        print search['errors'][0]['message']
        if search['errors'][0]['code'] == 88:  # rate limit exceeded
            print 'waiting...'
            time.sleep(15 * 60 + 1)  # wait out the 15-minute rate-limit window
            search = GrabSearch(hashtag, since)
    since = search['search_metadata']['max_id']
    for tweet in search['statuses']:
        current_max_date = tweet['created_at']
        current_max_id = tweet['id']
        print current_max_date, tweet['text']
        json.dump(tweet, out)
        out.write('\n')
    out.close()
    print 'compare with', current_max_date, current_max_id
def ReadUserIds(input_file):
    #input_file = file(filename, "r")
    Ids = []
    for lines in input_file:  # line-by-line loop keeps memory use low
        user = json.loads(lines)
        try:
            user['id']
        except KeyError:
            print user
            continue  # skip entries without an id (e.g. stored API error objects)
        Ids.append(str(user['id']))  # conversion to string for getFollowers comparison
    return Ids
if __name__ == "__main__":
    #users = get_users('userlistGuanyem3.txt')
    #followers = get_users('guanyem_falovir.txt')
    #OldTweets = getOldTweets('')
    #account_name = 'guanyem'
    #users += [account_name]
    users = ['guanyem']
    #GetFollowerCorrelation(account_name, users)
    '''
    account_name = 'guanyem'
    followers = get_users(account_name + '_falovir.txt')
    if os.path.isfile(account_name + '_follower_data_full.dat'):
        was_open = True
    else:
        was_open = False
    out2 = codecs.open(account_name + '_follower_data_full.dat', encoding='utf-8', mode='a+')
    if was_open:
        recorded_ids = ReadUserIds(out2)
        print len(recorded_ids)
    else:
        recorded_ids = []
    count = 0
    #print 'waiting...'
    #time.sleep(15*60+1)
    for user_id in followers:
        if user_id in recorded_ids:
            continue
        else:
            user_info = GrabUserInfo(user_id)
            try:
                print user_info['screen_name']
            except KeyError:
                print user_info['errors'][0]['message']
                if user_info['errors'][0]['code'] == 34 or user_info['errors'][0]['code'] == 63:
                    continue
                elif user_info['errors'][0]['code'] == 88:
                    print 'waiting...'
                    time.sleep(15*60+1)
                    user_info = GrabUserInfo(user_id)  # not ideal: the retried request can still fail
            json.dump(user_info, out2)
            out2.write('\n')
    out2.close()
    '''
    for user in users:
        print user
        friends = GrabFriends(user, GrabFriendCount(user))
        filename = '%s_firen.txt' % user
        out = codecs.open(filename, encoding='utf-8', mode='w')
        for friend in friends:
            out.write('%s\n' % friend)
        out.close()
    for user in users:
        print user
        followers = GrabFollowers(user, GrabFollowerCount(user))
        filename = '%s_falovir.txt' % user
        out = codecs.open(filename, encoding='utf-8', mode='w')
        for follower in followers:
            out.write('%s\n' % follower)
        out.close()
    '''
    try:
        print tweet['id']
    except KeyError:
        print tweet['errors'][0]['message']
        if user_info['errors'][0]['code'] == 88:
            print 'waiting...'
            time.sleep(15*60+1)
    print tweet['user']['screen_name']
    print tweet['status']['text']
    json.dump(tweet, out)
    out.write('\n')
    out.close()
    '''
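A minimal sketch of how the collection helpers might be driven once the OAuth constants at the top are filled in; the hashtag and screen name below come from comments in the code and are only illustrative.

# minimal sketch; assumes CONSUMER_KEY/SECRET and OAUTH_TOKEN/SECRET are set above
GetSearchResults('GuanyemSantAntoni')   # appends a page of recent results to GuanyemSantAntoni_search.json
GetNewTimelines(['guanyem'])            # appends fresh tweets to guanyem_taymlayn.txt
GrabHistory('guanyem')                  # then pages backwards through the older timeline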