twitter API tools: one script for analysing tweets stored in MongoDB (timelines, keyword comparisons, user and word statistics) and one for collecting tweets, followers and friends through the Twitter REST API.
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from __future__ import unicode_literals
from pylab import *
import matplotlib.pyplot as plt  # plt.figure() is used below
import json
import codecs
import pymongo
import time
import sys
import os
import re
def getOldTweets(filename):
    input_file = open(filename, "r")
    tweets = []
    for lines in input_file:
        tweets.append(json.loads(lines))
    return tweets

def get_users(filename):
    users = []
    f = open(filename)
    for lines in f:
        users.append(lines.strip())
    return users

def stdoutStatus(jsonarray):
    for tweet in jsonarray:
        print tweet['text']
def inputDb(collection_name, filename):
    '''
    load a file of line-delimited tweet JSON into a MongoDB collection
    '''
    input_file = open(filename, "r")
    for lines in input_file:  # line-by-line loop keeps memory use low
        data = {}
        tweet = json.loads(lines)
        data['created_at'] = time.mktime(time.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))
        data['user_name'] = tweet['user']['screen_name']
        data['user_id'] = tweet['user']['id_str']
        data['_id'] = tweet['id']
        data['lang'] = tweet['lang']
        data['text'] = tweet['text']
        #print tweet['user']['screen_name']
        #print tweet['entities']['hashtags']
        data['hashtags'] = tweet['entities']['hashtags']
        if collection_name.find_one({'_id': data['_id']}):
            continue
        else:
            collection_name.insert(data)  # skipping existing _ids takes care of duplicates
    input_file.close()
    #posts.remove({u'lang': {'$nin': [u'tr']}})  # remove non-Turkish tweets
def inputUser(collection_name, filename):
    '''
    load a file of line-delimited user JSON into a MongoDB collection
    '''
    input_file = open(filename, "r")
    for lines in input_file:  # line-by-line loop keeps memory use low
        data = {}
        user = json.loads(lines)
        try:
            user['id']
        except KeyError:
            print user
            continue  # skip entries without an id (e.g. stored API error objects)
        data['created_at'] = time.mktime(time.strptime(user['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))
        data['user_name'] = user['screen_name']
        #data['user_id'] = tweet['user']['id_str']
        data['_id'] = user['id']
        data['listed_count'] = user['listed_count']
        data['description'] = user['description']
        data['followers_count'] = user['followers_count']
        data['friends_count'] = user['friends_count']
        data['statuses_count'] = user['statuses_count']
        if collection_name.find_one({'_id': data['_id']}):
            continue
        else:
            collection_name.insert(data)  # skipping existing _ids takes care of duplicates
    input_file.close()
    #posts.remove({u'lang': {'$nin': [u'tr']}})  # remove non-Turkish tweets
def KeywordFilter(InCollection, regex, sortindex, searchindex):
    histogram_array = []
    for entry in InCollection.find({searchindex: {'$in': [re.compile('%s' % (regex))]}}):
        histogram_array.append(entry[sortindex])  # in seconds
    return histogram_array

def KeywordFilterSimple(InCollection, regex, sortindex, searchindex):
    histogram_array = []
    for entry in InCollection.find({searchindex: {'$in': [regex]}}):
        histogram_array.append(entry[sortindex])  # in seconds
    return histogram_array
def KeywordComparison(posts):
    # NOTE: superseded by the KeywordComparison redefinition below, which takes an explicit
    # time window and keyword list; only that later definition is used in __main__.
    posts.create_index([("created_at", pymongo.ASCENDING)])
    OCdates = []
    RTdates = []
    for post in posts.find().sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    for post in posts.find({'text': {'$nin': [re.compile('RT @')]}}).sort([("created_at", pymongo.ASCENDING)]):
        OCdates.append(post['created_at'])  # in seconds
    mindate = min(min(RTdates), min(OCdates))  # in UTC seconds
    maxdate = max(max(RTdates), max(OCdates))  # in UTC seconds
    nRTdates = (array(RTdates) - array(mindate)) / array(3600.)  # convert to hours
    nOCdates = (array(OCdates) - array(mindate)) / array(3600.)  # convert to hours
    nmindate = min(min(nRTdates), min(nOCdates))
    nmaxdate = max(max(nRTdates), max(nOCdates))
    #my_bin = linspace(mindate, maxdate, 200)
    my_bin = linspace(nmindate, nmaxdate, 200)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    nRT, bins, patch = ax.hist(nRTdates, bins=my_bin)
    #nOC, bins, patch = ax.hist(nOCdates, bins=my_bin)
    #ax.xaxis.set_minor_locator(MultipleLocator(3600))
    #ax.xaxis.set_major_locator(MultipleLocator(12*3600))
    '''
    comparison keyword
    '''
    polis = 'polis*'
    saldiri = 'sald?r*'
    pdates = KeywordFilter(posts, polis, 'created_at', 'text')
    npdates = (array(pdates) - array(mindate)) / array(3600.)
    np, bins, patch = ax.hist(npdates, bins=my_bin)
    print len(list(np))
    start_time_formatted = time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(mindate + 2 * 3600))  # Turkey local time
    print max(list(nRT))
    ax.text(nmindate, max(list(nRT)), start_time_formatted)
    ylabel(r"birim zaman basina twit sayisi")  # "tweets per unit time"
    xlabel(r'ilk twitten itibaren gecen zaman')  # "time elapsed since the first tweet"
    show()
def KeywordComparison(posts, start_time, end_time, binsize, keywords):
    posts.create_index([("created_at", pymongo.ASCENDING)])
    RTdates = []
    for post in posts.find({"created_at": {"$gte": start_time, "$lte": end_time}}).sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    mindate = min(RTdates)  # in UTC seconds
    maxdate = max(RTdates)  # in UTC seconds
    print 'mindate =', mindate
    nRTdates = (array(RTdates) - array(mindate)) / array(60.)  # convert to minutes
    nmindate = min(nRTdates)
    nmaxdate = max(nRTdates)
    #my_bin = linspace(mindate, maxdate, 200)
    #my_bin = linspace(nmindate, nmaxdate, 20)
    #binsize = 2
    my_bin = arange(nmindate, nmaxdate + binsize, binsize)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    nRT, bins, patch = ax.hist(nRTdates, bins=my_bin)
    clf()
    #nOC, bins, patch = ax.hist(nOCdates, bins=my_bin)
    #ax.xaxis.set_minor_locator(MultipleLocator(3600))
    #ax.xaxis.set_major_locator(MultipleLocator(12*3600))
    print len(my_bin[1:]), len(nRT)
    '''
    comparison keywords
    '''
    nps = []
    #keywords = ['gcalvetbarot','Camacho','herrerajoan','Albert_Rivera','HiginiaRoig']
    for key in keywords:
        keydates = KeywordFilterSimple(posts, key, 'created_at', 'text')
        nkeydates = (array(keydates) - array(mindate)) / array(60.)
        np, bins, patch = hist(nkeydates, bins=my_bin)
        nps.append(np)
    clf()
    plot(my_bin[1:] - array(binsize / 2.), nRT, 'ro-', linewidth=2)
    for h in nps:
        plot(my_bin[1:] - array(binsize / 2.), h, 'o-', linewidth=2)
    #legend(handles=keywords)
    start_time_formatted = time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(mindate + 2 * 3600))  # Turkey local time
    print max(list(nRT))
    ax.text(nmindate, max(list(nRT)), start_time_formatted)
    ylabel(r"birim zaman basina twit sayisi")  # "tweets per unit time"
    xlabel(r'ilk twitten itibaren gecen zaman')  # "time elapsed since the first tweet"
def Timeline(posts, start_time, end_time, binsize):
    posts.create_index([("created_at", pymongo.ASCENDING)])
    RTdates = []
    for post in posts.find({"created_at": {"$gte": start_time, "$lte": end_time}}).sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    mindate = min(RTdates)  # in UTC seconds
    maxdate = max(RTdates)  # in UTC seconds
    print 'mindate =', mindate
    nRTdates = (array(RTdates) - array(mindate)) / array(60.)  # convert to minutes
    nmindate = min(nRTdates)
    nmaxdate = max(nRTdates)
    #my_bin = linspace(mindate, maxdate, 200)
    #my_bin = linspace(nmindate, nmaxdate, 20)
    #binsize = 2
    my_bin = arange(nmindate, nmaxdate + binsize, binsize)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    nRT, bins, patch = ax.hist(nRTdates, bins=my_bin)
    clf()
    #nOC, bins, patch = ax.hist(nOCdates, bins=my_bin)
    #ax.xaxis.set_minor_locator(MultipleLocator(3600))
    #ax.xaxis.set_major_locator(MultipleLocator(12*3600))
    print len(my_bin[1:]), len(nRT)
    plot(my_bin[1:] - array(binsize / 2.), nRT, 'ro-', linewidth=2)
    start_time_formatted = time.strftime("%d %b %Y %H:%M:%S", time.gmtime(mindate + 4 * 3600.))  # Spain local time
    print max(list(nRT))
    #text(nmindate, 1.15*max(list(nRT)), start_time_formatted)
    figtext(0.13, 0.91, start_time_formatted, horizontalalignment='left')
    v = axis()
    #ylabel(r"birim zaman basina twit sayisi")
    #xlabel(r'ilk twitten itibaren gecen zaman')
    ylabel(r"Quantitat dels tuits per %2.1f minuts" % binsize)  # "number of tweets per %2.1f minutes"
    xlabel(r'Temps a partir del primer tuit (mins)')  # "time since the first tweet (mins)"
    #savefig('frequencia231114.png')
    #savefig('frequencia300914.png')
    return max(nRT), min(nRT), mindate + 4 * 3600., maxdate
def VerticalLine(times, mintime, maxtweet):
    #plot([(time-mintime)/60., (time-mintime)/60.], [0.6*maxtweet, 0.9*maxtweet], 'k-', linewidth=2)
    #print (time-mintime)/60.
    for t in times:
        time_formatted = time.strftime("%H:%M", time.gmtime(mintime + t * 60.))  # Spain local time
        print t, time_formatted
        plot([t, t], [0.2 * maxtweet, 0.55 * maxtweet], 'k-', linewidth=2)
        text(t, 3, time_formatted, horizontalalignment='center', verticalalignment='bottom', rotation='vertical')
def WOReTweets(posts, since, reftime, binsize, filename):
    posts.create_index([("created_at", pymongo.ASCENDING)])
    RTdates = []
    for post in posts.find({"created_at": {"$gte": since}}).sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    mindate = min(RTdates)
    filtered = posts.find({"created_at": {"$gte": mindate + (reftime - binsize / 2.) * 60., "$lte": mindate + (reftime + binsize / 2.) * 60.}})
    untw = []
    counts = []
    for post in filtered:
        if post['text'] in untw:
            counts[untw.index(post['text'])] += 1
        else:
            untw.append(post['text'])
            counts.append(1)
    unique_tweets = [(untw[i], counts[i]) for i in range(len(counts))]
    sorted_rts = sorted(unique_tweets, key=lambda a: a[1], reverse=True)
    outname = '%s_rts.dat' % filename
    out = codecs.open(outname, encoding='utf-8', mode='a+')
    out.write('#at %3.1f: %i\n' % (reftime, filtered.count()))
    for i in range(3):  # assuming there are more than 3 tweets in the chunk
        out.write('%i, %s\n' % (sorted_rts[i][1], sorted_rts[i][0]))
    out.close()
    #for i in range(len(untw)):
    #    print counts[i], untw[i]

def WOTweets(posts, since, reftime, binsize, filename):
    posts.create_index([("created_at", pymongo.ASCENDING)])
    RTdates = []
    for post in posts.find({"created_at": {"$gte": since}}).sort([("created_at", pymongo.ASCENDING)]):
        RTdates.append(post['created_at'])  # in seconds
    mindate = min(RTdates)
    filtered = posts.find({"created_at": {"$gte": mindate + (reftime - binsize / 2.) * 60., "$lte": mindate + (reftime + binsize / 2.) * 60.}})
    outname = '%s_tw.dat' % filename
    out = codecs.open(outname, encoding='utf-8', mode='a+')
    out.write('#at %3.1f: %i\n' % (reftime, filtered.count()))
    for post in filtered:
        out.write('%s\n' % (post['text']))
    out.close()
def UserContributions(posts, filename):
    unUs = []
    counts = []
    for post in posts.find():
        if post['user_name'] in unUs:
            counts[unUs.index(post['user_name'])] += 1
        else:
            unUs.append(post['user_name'])
            counts.append(1)
    unique_users = [(unUs[i], counts[i]) for i in range(len(counts))]
    sorted_users = sorted(unique_users, key=lambda a: a[1], reverse=True)
    outname = '%s_user_statistics.dat' % filename
    if os.path.isfile(outname):
        os.remove(outname)  # start from a clean file
    out = codecs.open(outname, encoding='utf-8', mode='a+')
    #out.write('#at %3.1f: %i\n'%(reftime, filtered.count()))
    for i in range(len(sorted_users)):  # one line per user, most active first
        out.write('%i, %s\n' % (sorted_users[i][1], sorted_users[i][0]))
    out.close()

def clean(word):
    newword = word.replace(',', '').replace(':', '').replace('…', '').replace('"', '').replace('.', '').replace(')', '').replace('(', '').replace('!', '').replace('?', '')
    return newword
def WordFrequency(posts, filterWords, filename):
    unWords = []
    counts = []
    for post in posts.find():
        postWords = post['text'].split()
        for postWord in postWords:
            cleanPostWord = clean(postWord)
            if ('http' in cleanPostWord) or (cleanPostWord in filterWords):
                continue
            elif cleanPostWord in unWords:
                counts[unWords.index(cleanPostWord)] += 1
            else:
                unWords.append(cleanPostWord)
                counts.append(1)
    unique_words = [(unWords[i], counts[i]) for i in range(len(counts))]
    sorted_words = sorted(unique_words, key=lambda a: a[1], reverse=True)
    outname = '%s_word_frequency.dat' % filename
    if os.path.isfile(outname):
        os.remove(outname)  # start from a clean file
    out = codecs.open(outname, encoding='utf-8', mode='a+')
    #out.write('#at %3.1f: %i\n'%(reftime, filtered.count()))
    for i in range(len(sorted_words)):  # one line per word, most frequent first
        out.write('%s: %i\n' % (sorted_words[i][0], sorted_words[i][1]))
        #out.write('%s\n' % sorted_words[i][0])
    out.close()
def FollowerOps():
    '''
    #screen_name = user_info['screen_name']
    #followers = user_info['followers_count']
    #influence = float(followers)/float(user_info['friends_count'])
    #statuses = user_info['statuses_count']
    #profile_description = user_info['description'].replace('\n',' ')
    #out_string = '%s\t%s\t%s\t%f\t%s\t\'%s\''%(user_id,screen_name,followers, influence, statuses, profile_description)
    #print out_string
    #out.write('%s\n'%out_string)
    #out2.write('%s\n'%user_info)
    '''
if __name__ == '__main__':
    client = pymongo.MongoClient()
    db = client['twitter-test']
    posts = db.posts  # this is a collection
    #inputDb(posts, 'smt.json')
    '''
    write a followers collection and a friends collection
    '''
    user_name = 'guanyem'
    users = db.users
    #inputUser(users, 'guanyem_follower_data_full.dat')
    since = time.mktime(time.strptime('14 Nov 2014 07:00:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    until = time.mktime(time.strptime('20 Dec 2014 15:20:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    '''
    followback algorithm starts here
    #get follower list
    followerIds = []
    for user in users.find():
        followerIds.append(str(user['_id']))
    #get friends list
    friendIds = get_users('%s_firen.txt'%user_name)
    #pull unique elements from the database
    uniqueIds = {}
    posts.find().sort([("created_at", pymongo.ASCENDING)])
    for post in posts.find({"created_at": {"$gte": since, "$lte": until}}):
        if post['user_id'] in uniqueIds:
            continue
        elif post['user_name'] == user_name:
            continue
        else:
            #uniqueIds.append({post['user_id']: post['user_name']})
            uniqueIds[post['user_id']] = post['user_name']
    print 'unique ids in this set of posts:', len(uniqueIds)
    #get target group
    targetUsers = []
    for userId in uniqueIds.keys():
        if userId in followerIds:
            continue
        elif userId in friendIds:
            continue
        else:
            targetUsers.append(userId)
    print 'target ids in this set of posts:', len(targetUsers)
    out = codecs.open('target.dat', encoding='utf-8', mode='w')
    for ids in targetUsers:
        out.write('%s\t%s\n'%(ids, uniqueIds[ids]))
    out.close()
    followback algorithm ends here
    '''
    #since = time.mktime(time.strptime('10 Oct 2014 07:00:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    #until = time.mktime(time.strptime('11 Oct 2014 15:20:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    since = time.mktime(time.strptime('14 Nov 2014 07:00:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    until = time.mktime(time.strptime('20 Nov 2014 15:20:00 +0000', "%d %b %Y %H:%M:%S +0000"))
    #print since, until
    binsize = 4
    maxtweets, mintweets, mintime, maxtime = Timeline(posts, since, until, binsize)
    #text(0.9*(maxtime-mintime)/60, maxtweets, '#pujol324', ha='right')
    #figtext(0.88, 0.85, '#pujol324', ha='right', size=15)
    #times = [117.0, 127, 137.0, 145.0, 159.0, 165.0, 181.0, 191., 197.]  # selected by hand
    #VerticalLine(times, mintime, maxtweets)
    #keywords = ['gcalvetbarot','Camacho*','herrerajoan','Albert_Rivera','HiginiaRoig','jorditurull']
    keywords = ['guanyemDialogant']
    #keywords = ['mac?']
    KeywordComparison(posts, since, until, binsize, keywords)
    #savefig('frequencia_noucodietic02.png')
    show()
    #UserContributions(posts, 'noucodietic')
    #FilterWords = [lines.replace('\n','') for lines in codecs.open('filterWords.dat', encoding='utf-8', mode='r')]
    #WordFrequency(posts, FilterWords, 'noucodietic02')
    '''
    outcore = 'AdaColauMAC_051014'
    outrt = '%s_rts.dat'%outcore
    if os.path.isfile(outrt):
        os.popen('rm %s'%outrt)
    outtw = '%s_tw.dat'%outcore
    if os.path.isfile(outtw):
        os.popen('rm %s'%outtw)
    for t in times:
        #print t
        WOTweets(posts, since, t, binsize, outcore)
        WOReTweets(posts, since, t, binsize, outcore)
    '''
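For reference, a minimal usage sketch for the reporting helpers that the __main__ block above only shows commented out. It assumes posts is an already-populated collection and that a stop-word file (one word per line) exists; the file names are illustrative, not part of the gist.

# minimal sketch, assuming `posts` is already populated and 'filterWords.dat'
# (one stop word per line) exists; both file names are illustrative
import codecs

UserContributions(posts, 'noucodietic')            # writes noucodietic_user_statistics.dat
FilterWords = [w.strip() for w in codecs.open('filterWords.dat', encoding='utf-8', mode='r')]
WordFrequency(posts, FilterWords, 'noucodietic')   # writes noucodietic_word_frequency.dat

The second file in the gist handles the collection side, talking to the Twitter REST API: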
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from __future__ import unicode_literals
import requests
from requests_oauthlib import OAuth1
from urlparse import parse_qs
import json
import codecs
from math import ceil
import os.path
import time
import random

REQUEST_TOKEN_URL = "https://api.twitter.com/oauth/request_token"
AUTHORIZE_URL = "https://api.twitter.com/oauth/authorize?oauth_token="
ACCESS_TOKEN_URL = "https://api.twitter.com/oauth/access_token"

# Go to http://dev.twitter.com and create an app.
# The consumer key and secret will be generated for you afterwards.
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
# After the step above, you will be redirected to your app's page.
# Create an access token under the "Your access token" section.
# These four values must be filled in before the Grab* helpers below will return data.
OAUTH_TOKEN = ""
OAUTH_TOKEN_SECRET = ""
def setup_oauth():
    """Authorize your app via identifier."""
    # Request token
    oauth = OAuth1(CONSUMER_KEY, client_secret=CONSUMER_SECRET)
    r = requests.post(url=REQUEST_TOKEN_URL, auth=oauth)
    credentials = parse_qs(r.content)
    resource_owner_key = credentials.get('oauth_token')[0]
    resource_owner_secret = credentials.get('oauth_token_secret')[0]
    # Authorize
    authorize_url = AUTHORIZE_URL + resource_owner_key
    print 'Please go here and authorize: ' + authorize_url
    verifier = raw_input('Please input the verifier: ')
    oauth = OAuth1(CONSUMER_KEY,
                   client_secret=CONSUMER_SECRET,
                   resource_owner_key=resource_owner_key,
                   resource_owner_secret=resource_owner_secret,
                   verifier=verifier)
    # Finally, obtain the access token
    r = requests.post(url=ACCESS_TOKEN_URL, auth=oauth)
    credentials = parse_qs(r.content)
    token = credentials.get('oauth_token')[0]
    secret = credentials.get('oauth_token_secret')[0]
    return token, secret

def get_oauth():
    oauth = OAuth1(CONSUMER_KEY,
                   client_secret=CONSUMER_SECRET,
                   resource_owner_key=OAUTH_TOKEN,
                   resource_owner_secret=OAUTH_TOKEN_SECRET)
    return oauth
def get_users(filename):
    users = []
    f = open(filename)
    for lines in f:
        users.append(lines.strip())
    return users

def loadData(inputFile):
    data = []
    for line in inputFile:
        if line.startswith("#"):
            continue
        data.append([v for v in line.strip().split()])
    return data

def loadDataS(inputFile, separator):
    data = []
    for line in inputFile:
        if line.startswith("#"):
            continue
        data.append([v for v in line.strip().split(separator)])
    return data

def transpose(data):
    return [[data[j][i] for j in range(len(data))] for i in range(len(data[0]))]

def getFollowers(user):
    falovirs = []
    f = open(user + '_falovir.txt')  # one follower id per line, as written by __main__ below
    for lines in f:
        falovirs.append(lines.strip())
    return falovirs

def get_unique(data):
    result = []
    result.append(data[0])
    for i in range(len(data)):
        if data[i] in result:
            continue
        else:
            result.append(data[i])
    return result
def getCommonFollowerCount(source_user, target_user):
    '''
    target is the account we are interested in
    '''
    source_followers = getFollowers(source_user)
    target_followers = getFollowers(target_user)
    source_count = len(source_followers)
    target_count = len(target_followers)
    count = 0
    for f in source_followers:
        if f in target_followers:
            count += 1
    return count, source_count, float(count)/float(source_count), float(count)/float(target_count)

def getCommonFollowerArrayCount(source_followers, target_followers):
    '''
    target is the account we are interested in
    '''
    source_count = len(source_followers)
    target_count = len(target_followers)
    count = 0
    target_copy = list(target_followers)  # work on a copy so the caller's list is untouched
    for f in source_followers:
        if f in target_copy:
            count += 1
            target_copy.remove(f)
    '''
    for efficiency the matching elements are taken out;
    if the user lists are badly made, repeating elements will not be noticed
    '''
    return count, source_count, float(count)/float(source_count), float(count)/float(target_count)
def getOldTweets(filename):
    input_file = open(filename, "r")
    tweets = []
    for lines in input_file:
        tweets.append(json.loads(lines))
    return tweets

def getOldTweetsID(filename):
    input_file = open(filename, "r")
    tweetIDs = []
    for lines in input_file:
        tweet = json.loads(lines)
        tweetIDs.append(tweet["id"])
    return tweetIDs
def GrabSearch(hashtag, since):
    #since = -1 #default?
    PROFILE = "https://api.twitter.com/1.1/search/tweets.json?q=%s&max_id=%i&result_type=recent&count=100" % (hashtag, since)
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        search = r.json()
        #print taym['statuses']
        return search

def GrabTweets(name):
    PROFILE = "https://api.twitter.com/1.1/statuses/user_timeline.json?include_entities=true&include_rts=true&screen_name=%s" % name
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        taym = r.json()
        #print taym['statuses']
        return taym
def GrabFollowers(name, no_followers):
    if no_followers < 5001:
        PROFILE = "https://api.twitter.com/1.1/followers/ids.json?cursor=-1&screen_name=%s&count=5000" % name
        if not OAUTH_TOKEN:
            token, secret = setup_oauth()
            print "OAUTH_TOKEN: " + token
            print "OAUTH_TOKEN_SECRET: " + secret
        else:
            oauth = get_oauth()
            r = requests.get(url=PROFILE, auth=oauth)
            print r
            followers = r.json()
            time.sleep(61)  # pace the requests to stay under the rate limit
            return followers['ids']
    elif no_followers > 5000:
        no_cursor = int(ceil(no_followers / 5000.))
        followers_rest = no_followers % 5000
        followers = []
        next_cursor = -1
        for i in range(no_cursor):
            if i == no_cursor - 1:
                f_query = followers_rest
            else:
                f_query = 5000
            PROFILE = "https://api.twitter.com/1.1/followers/ids.json?cursor=%i&screen_name=%s&count=%i" % (next_cursor, name, f_query)
            print next_cursor
            if not OAUTH_TOKEN:
                token, secret = setup_oauth()
                print "OAUTH_TOKEN: " + token
                print "OAUTH_TOKEN_SECRET: " + secret
            else:
                oauth = get_oauth()
                r = requests.get(url=PROFILE, auth=oauth)
                print r
                next_cursor = r.json()['next_cursor']
                #print next_cursor
                followers += r.json()['ids']
                time.sleep(61)  # pace the requests to stay under the rate limit
        return followers

def GrabFollowerCount(name):
    PROFILE = "https://api.twitter.com/1.1/statuses/user_timeline.json?include_entities=true&include_rts=false&count=1&screen_name=%s" % name
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        taym = r.json()
        no_followers = int(taym[0]['user']['followers_count'])
        print no_followers
        return no_followers
def GrabFriends(name, no_followers):
    if no_followers < 5001:
        PROFILE = "https://api.twitter.com/1.1/friends/ids.json?cursor=-1&screen_name=%s&count=5000" % name
        if not OAUTH_TOKEN:
            token, secret = setup_oauth()
            print "OAUTH_TOKEN: " + token
            print "OAUTH_TOKEN_SECRET: " + secret
        else:
            oauth = get_oauth()
            r = requests.get(url=PROFILE, auth=oauth)
            print r
            followers = r.json()
            time.sleep(61)  # pace the requests to stay under the rate limit
            return followers['ids']
    elif no_followers > 5000:
        no_cursor = int(ceil(no_followers / 5000.))
        followers_rest = no_followers % 5000
        followers = []
        next_cursor = -1
        for i in range(no_cursor):
            if i == no_cursor - 1:
                f_query = followers_rest
            else:
                f_query = 5000
            PROFILE = "https://api.twitter.com/1.1/friends/ids.json?cursor=%i&screen_name=%s&count=%i" % (next_cursor, name, f_query)
            print next_cursor
            if not OAUTH_TOKEN:
                token, secret = setup_oauth()
                print "OAUTH_TOKEN: " + token
                print "OAUTH_TOKEN_SECRET: " + secret
            else:
                oauth = get_oauth()
                r = requests.get(url=PROFILE, auth=oauth)
                print r
                next_cursor = r.json()['next_cursor']
                #print next_cursor
                followers += r.json()['ids']
                time.sleep(61)  # pace the requests to stay under the rate limit
        return followers

def GrabFriendCount(name):
    PROFILE = "https://api.twitter.com/1.1/statuses/user_timeline.json?include_entities=true&include_rts=false&count=1&screen_name=%s" % name
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        taym = r.json()
        no_friends = int(taym[0]['user']['friends_count'])
        print no_friends
        return no_friends
def GrabTweetsExp(name, max_id):
    PROFILE = "https://api.twitter.com/1.1/statuses/user_timeline.json?include_entities=true&include_rts=true&count=200&max_id=%s&screen_name=%s" % (max_id, name)
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        taym = r.json()
        #print taym['statuses']
        return taym

def GrabHistory(user):
    '''
    very ugly workaround for grabbing the whole timeline (3200 tweets max):
    keep paging backwards with max_id until nothing new turns up
    '''
    switch = 0
    for i in range(100):  # crude cap on the number of paging rounds
        if switch < 2:
            last = os.popen('tail -1 %s_taymlayn.txt' % user).read()
            last_tweet = json.loads(last)
            last_id = str(last_tweet['id'])
            print "%s taymlayninda yeni bir sey var mi?" % user  # "anything new in %s's timeline?"
            taymlayn = GrabTweetsExp(user, last_id)
            filename = '%s_taymlayn.txt' % user
            if os.path.isfile(filename):
                OldTweets = getOldTweetsID(filename)
                #print OldTweets[0]['text']
                out = open(filename, 'a+')
                for tweet in taymlayn:
                    if tweet['id'] in OldTweets:
                        print tweet['id'], ' tibiti dosyada'  # "tweet already in the file"
                        switch = switch + 1
                        continue
                    else:
                        #print 'mujde! yeni tibit: '#, tweet['text']
                        switch = 0
                        json.dump(tweet, out)
                        out.write('\n')
                out.close()
            else:
                writeTweets(taymlayn, filename)
def GrabUserInfo(user_id):
    PROFILE = "https://api.twitter.com/1.1/users/show.json?user_id=%s&include_entities=false" % user_id
    if not OAUTH_TOKEN:
        token, secret = setup_oauth()
        print "OAUTH_TOKEN: " + token
        print "OAUTH_TOKEN_SECRET: " + secret
    else:
        oauth = get_oauth()
        r = requests.get(url=PROFILE, auth=oauth)
        user = r.json()
        print r
        #print r.json()
        return user

def GetNewTimelines(users):
    for user in users:
        print "%s taymlayninda yeni bir sey var mi?" % user  # "anything new in %s's timeline?"
        taymlayn = GrabTweets(user)
        filename = '%s_taymlayn.txt' % user
        if os.path.isfile(filename):
            OldTweets = getOldTweetsID(filename)
            #print OldTweets[0]['text']
            out = open(filename, 'a+')
            for tweet in taymlayn:
                if tweet['id'] in OldTweets:
                    print tweet['id'], ' tibiti dosyada'  # "tweet already in the file"
                    continue
                else:
                    print 'mujde! yeni tibit: '  # "good news! a new tweet"  #, tweet['text']
                    json.dump(tweet, out)
                    out.write('\n')
            out.close()
        else:
            writeTweets(taymlayn, filename)

def writeTweets(tweets, filename):
    out = codecs.open(filename, encoding='utf-8', mode='w')
    for tweet in tweets:
        json.dump(tweet, out)
        out.write('\n')
    out.close()
def GetFollowerCorrelation(target_name, users):
    target_followers = getFollowers(target_name)
    out = codecs.open(target_name + '_comparison.dat', encoding='utf-8', mode='w')
    for user in users:
        source_followers = getFollowers(user)
        common_count, source_count, source_frac, target_frac = getCommonFollowerArrayCount(source_followers, target_followers)
        out_string = '%s\t%i\t%i\t%f\t%f\n' % (user, common_count, source_count, source_frac, target_frac)
        print out_string
        out.write('%s' % out_string)
    out.close()

def GetSearchResults(hashtag):
    #hashtag = 'GuanyemSantAntoni'
    #since = -1 #default
    outname = '%s_search.json' % hashtag
    #if os.path.isfile(outname):
    #    os.popen('rm %s'%outname)
    if os.path.isfile(outname):
        out = codecs.open(outname, encoding='utf-8', mode='a+')
        finalt = json.loads(os.popen('tail -1 %s' % outname).read())
        since = finalt['id']
        print 'starting from %s' % since
    else:
        out = codecs.open(outname, encoding='utf-8', mode='a+')
        since = -1
        print 'new file'
    search = GrabSearch(hashtag, since)
    try:
        print search['search_metadata']
    except KeyError:
        print search['errors'][0]['message']
        if search['errors'][0]['code'] == 88:  # rate limit exceeded
            print 'waiting...'
            time.sleep(15 * 60 + 1)  # wait out the 15-minute rate-limit window
            search = GrabSearch(hashtag, since)
    since = search['search_metadata']['max_id']
    for tweet in search['statuses']:
        current_max_date = tweet['created_at']
        current_max_id = tweet['id']
        print current_max_date, tweet['text']
        json.dump(tweet, out)
        out.write('\n')
    out.close()
    print 'compare with', current_max_date, current_max_id
def ReadUserIds(input_file):
    #input_file = file(filename, "r")
    Ids = []
    for lines in input_file:  # line-by-line loop keeps memory use low
        user = json.loads(lines)
        try:
            user['id']
        except KeyError:
            print user
            continue  # skip entries without an id (e.g. stored API error objects)
        Ids.append(str(user['id']))  # conversion to string for getFollowers comparison
    return Ids
if __name__ == "__main__":
    #users = get_users('userlistGuanyem3.txt')
    #followers = get_users('guanyem_falovir.txt')
    #OldTweets = getOldTweets('')
    #account_name = 'guanyem'
    #users += [account_name]
    users = ['guanyem']
    #GetFollowerCorrelation(account_name, users)
    '''
    account_name = 'guanyem'
    followers = get_users(account_name + '_falovir.txt')
    if os.path.isfile(account_name + '_follower_data_full.dat'):
        was_open = True
    else:
        was_open = False
    out2 = codecs.open(account_name + '_follower_data_full.dat', encoding='utf-8', mode='a+')
    if was_open:
        recorded_ids = ReadUserIds(out2)
        print len(recorded_ids)
    else:
        recorded_ids = []
    count = 0
    #print 'waiting...'
    #time.sleep(15*60+1)
    for user_id in followers:
        if user_id in recorded_ids:
            continue
        else:
            user_info = GrabUserInfo(user_id)
            try:
                print user_info['screen_name']
            except KeyError:
                print user_info['errors'][0]['message']
                if user_info['errors'][0]['code'] == 34 or user_info['errors'][0]['code'] == 63:
                    continue
                elif user_info['errors'][0]['code'] == 88:
                    print 'waiting...'
                    time.sleep(15*60+1)
                    user_info = GrabUserInfo(user_id)  # not ideal: the retried request can still fail
            json.dump(user_info, out2)
            out2.write('\n')
    out2.close()
    '''
    for user in users:
        print user
        friends = GrabFriends(user, GrabFriendCount(user))
        filename = '%s_firen.txt' % user
        out = codecs.open(filename, encoding='utf-8', mode='w')
        for friend in friends:
            out.write('%s\n' % friend)
        out.close()
    for user in users:
        print user
        followers = GrabFollowers(user, GrabFollowerCount(user))
        filename = '%s_falovir.txt' % user
        out = codecs.open(filename, encoding='utf-8', mode='w')
        for follower in followers:
            out.write('%s\n' % follower)
        out.close()
    '''
    try:
        print tweet['id']
    except KeyError:
        print tweet['errors'][0]['message']
        if user_info['errors'][0]['code'] == 88:
            print 'waiting...'
            time.sleep(15*60+1)
    print tweet['user']['screen_name']
    print tweet['status']['text']
    json.dump(tweet, out)
    out.write('\n')
    out.close()
    '''
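A minimal sketch of how the collection helpers might be driven once the OAuth constants at the top are filled in; the hashtag and screen name below come from comments in the code and are only illustrative.

# minimal sketch; assumes CONSUMER_KEY/SECRET and OAUTH_TOKEN/SECRET are set above
GetSearchResults('GuanyemSantAntoni')   # appends a page of recent results to GuanyemSantAntoni_search.json
GetNewTimelines(['guanyem'])            # appends fresh tweets to guanyem_taymlayn.txt
GrabHistory('guanyem')                  # then pages backwards through the older timeline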