Last active
August 29, 2015 13:57
-
-
Save ferayebend/9497021 to your computer and use it in GitHub Desktop.
ascii json olarak yazılmış tweetleri okumak/işlemek için bir grup edevat
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- encoding: utf-8 -*- | |
from __future__ import unicode_literals | |
import json | |
import codecs | |
import sys | |
import os | |
def getOldTweets(filename):
    """Load tweets from a file that holds one JSON document per line.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with the decoded JSON value of every non-blank line, in file
        order.

    Raises:
        IOError: If the file cannot be opened.
        ValueError: If a non-blank line is not valid JSON.
    """
    # ``open`` replaces the Python-2-only ``file`` builtin, and the ``with``
    # block closes the handle even when a line fails to parse.
    with open(filename, "r") as input_file:
        # Whitespace-only lines (e.g. a trailing blank line) hold no tweet.
        return [json.loads(line) for line in input_file if line.strip()]
def writeTweets(tweets, filename):
    """Write tweets to *filename* as UTF-8 text, one JSON document per line.

    The file is opened in ``'w'`` mode, so existing content is replaced.

    Args:
        tweets: Iterable of JSON-serialisable tweet objects.
        filename: Path of the file to write.
    """
    # The ``with`` block closes the file even if serialisation fails midway.
    with codecs.open(filename, encoding='utf-8', mode='w') as out:
        for tweet in tweets:
            json.dump(tweet, out)
            out.write('\n')
def stdoutStatus(jsonarray):
    """Print the ``text`` field of every tweet in *jsonarray*, one per line."""
    for status in jsonarray:
        text = status['text']
        # A single parenthesised argument prints the same way under the
        # Python 2 print statement and the Python 3 print function.
        print(text)
def getImages(jsonarray):
    """Collect the large-size HTTPS URL of the first media item of each tweet.

    Tweets whose ``entities`` have no ``media`` key are skipped, and a URL
    that was already collected is silently skipped as well.

    Args:
        jsonarray: Iterable of tweet dicts; ``tweet['entities']`` is read
            for every tweet.

    Returns:
        A list of ``media_url_https`` values with ``':large'`` appended, in
        first-seen order and without repeats.
    """
    seen = set()
    mediaHTTPS = []
    for tweet in jsonarray:
        entities = tweet['entities']
        if 'media' not in entities:
            continue
        https = entities['media'][0]['media_url_https']
        # Compare the bare URL: the collected entries carry the ':large'
        # suffix, so testing against them would never detect a repeat.
        if https in seen:
            continue
        seen.add(https)
        mediaHTTPS.append(https + ':large')
    return mediaHTTPS
def countMentions(mentions):
    """Count how often each mention occurs.

    Args:
        mentions: Iterable of hashable mention values.

    Returns:
        A ``(unique, sayi)`` tuple of two parallel lists: the distinct
        mentions in first-seen order, and the number of occurrences of each.
    """
    unique = []
    sayi = []
    # Maps a mention to its position in both lists, so every update is one
    # dict lookup instead of repeated linear ``list.index`` scans.
    slot_of = {}
    for mention in mentions:
        slot = slot_of.get(mention)
        if slot is None:
            slot_of[mention] = len(unique)
            unique.append(mention)
            sayi.append(1)
        else:
            sayi[slot] += 1
    return unique, sayi
def getMentions(jsonarray):
    """Return the screen name of the first user mentioned in each tweet.

    Tweets whose ``user_mentions`` list is empty contribute nothing to the
    result.
    """
    return [
        tweet['entities']['user_mentions'][0]['screen_name']
        for tweet in jsonarray
        if tweet['entities']['user_mentions'] != []
    ]
def WOMentionStats(filename):
    """Write the mention counts of a tweet file to ``<filename>.csv``.

    The tweets in *filename* are loaded with ``getOldTweets``, their first
    mentions extracted with ``getMentions`` and tallied with
    ``countMentions``. Every output row is ``mention,count``.

    Args:
        filename: Path of the tweet file; the output path is this name with
            ``.csv`` appended.
    """
    # Tally first, so a failure while reading the input does not leave an
    # empty CSV file behind.
    unique, sayi = countMentions(getMentions(getOldTweets(filename)))
    name = filename + '.csv'
    with open(name, 'w') as outf:
        for mention, count in zip(unique, sayi):
            outf.write('%s,%i\n' % (mention, count))
def dowloadAllImages(filename):
    """Fetch every image referenced by the tweets in *filename* with ``wget``.

    The tweets are loaded with ``getOldTweets`` and the image URLs collected
    with ``getImages``; ``wget`` is then run once per URL. Its exit status
    is ignored, so a failed download does not stop the remaining ones.

    Args:
        filename: Path of the tweet file to scan for images.

    Raises:
        OSError: If the ``wget`` executable cannot be started.
    """
    # Local import: the module header does not import subprocess.
    import subprocess

    tweets = getOldTweets(filename)
    medias = getImages(tweets)
    for media in medias:
        # The URL comes from tweet data, so hand it over as a separate
        # argument instead of interpolating it into a shell command line.
        subprocess.call(['wget', media])
def getAllUserMentions(directory):
    """Run ``WOMentionStats`` on every ``*taymlayn.txt`` file in *directory*.

    Args:
        directory: Directory that is searched (non-recursively) for files
            whose name ends in ``taymlayn.txt``.
    """
    # Local import: the module header does not import glob.
    import glob

    # Globbing in-process avoids building a shell command from the directory
    # name and keeps paths containing whitespace in one piece.
    pattern = os.path.join(directory, '*taymlayn.txt')
    # sorted() gives a deterministic processing order.
    for user in sorted(glob.glob(pattern)):
        WOMentionStats(user)
def mergeTweets(basetw, addedtw):
    """Append to *basetw* the tweets of *addedtw* whose ``id`` is not there yet.

    *basetw* is modified in place and nothing is returned. A tweet id that
    occurs several times in *addedtw* is added only once.

    Args:
        basetw: List of tweet dicts that receives the new tweets.
        addedtw: Iterable of tweet dicts to merge in.
    """
    ids = set(tweet['id'] for tweet in basetw)
    for tweet in addedtw:
        if tweet['id'] in ids:
            continue
        basetw.append(tweet)
        # Remember the id so a repeat later in addedtw is skipped as well.
        ids.add(tweet['id'])
def WOmergeTweets():
    """Command-line entry point: merge the tweet file argv[2] into argv[1].

    Reads the two file names from ``sys.argv``. With fewer than two
    arguments the usage text is printed, and when the first file does not
    exist a message is printed; both cases end the process with exit
    status 1. Otherwise the work is delegated to
    ``mergeAndWriteLargeTweetFiles``.
    """
    if len(sys.argv) < 3:
        print('''
tibit dosyalarini birlestirmek istiyorsun, dosya isimlerini vermiyorsun ;_;
kullanim sekli:
> berkin_toolkit.py input1.json input2.json
input2.json dosyasini ayiklayarak input1.json a append eder.
''')
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    file1 = sys.argv[1]
    file2 = sys.argv[2]
    print('''
\"%s\" \"%s\" dosyalarini alip, tekrarlari ayiklayip \"%s\"a yazdirmaya calisiyorsun.
eminsin insaAllah u_u''' % (file1, file2, file1))
    if not os.path.isfile(file1):
        print('''
%s dosyasi yox, ne is?''' % file1)
        sys.exit(1)
    mergeAndWriteLargeTweetFiles(file1, file2)
def mergeAndWriteLargeTweetFiles(basefile, addedfile):
    """Append to *basefile* the tweets of *addedfile* it does not contain yet.

    Both files hold one JSON tweet per line. Only the ids of the base file
    are collected up front; the added file is then read line by line and
    every tweet with an unseen ``id`` is appended to *basefile*. Blank lines
    in either file are ignored.

    Args:
        basefile: Path of the tweet file that is extended in place.
        addedfile: Path of the tweet file whose new tweets are appended.

    Raises:
        IOError: If either file cannot be opened.
        ValueError: If a non-blank line is not valid JSON.
    """
    ids = set()
    # Tracks whether the base file currently ends with a line break; an
    # empty file counts as "yes" because nothing needs separating.
    ends_with_newline = True
    with open(basefile, "r") as base:
        for line in base:
            ends_with_newline = line[-1:] == '\n'
            if line.strip():
                ids.add(json.loads(line)['id'])
    print("ilk faslin idlerini okudum kaydettim. ayiklanacak dosyayi aciyorum.")
    with codecs.open(basefile, encoding='utf-8', mode='a') as baseout:
        with open(addedfile, "r") as added:
            for line in added:
                if not line.strip():
                    continue
                tweet = json.loads(line)
                if tweet['id'] in ids:
                    continue
                if not ends_with_newline:
                    # Without this the first appended tweet would be glued
                    # onto the last line of the base file.
                    baseout.write('\n')
                    ends_with_newline = True
                json.dump(tweet, baseout)
                baseout.write('\n')
                ids.add(tweet['id'])
    print("halloldu insallaa, masallaa.")
def FilemergeTweets():
    """Command-line entry point: merge the tweet files listed in argv[1].

    ``sys.argv[1]`` names a text file holding whitespace-separated tweet
    file names and ``sys.argv[2]`` names the output file. The listed files
    are loaded with ``getOldTweets``, merged with ``mergeTweets`` and the
    result is written with ``writeTweets``.

    The process ends with exit status 1 when fewer than two arguments are
    given, when the output file already exists (it is never overwritten),
    or when the list file names no files.
    """
    if len(sys.argv) < 3:
        print('''
tibit dosyalarini birlestirmek istiyorsun, dosya isimlerini nereden alacagimi soylemiyorsun
kullanim sekli:
> berkin_toolkit.py inputliste.txt output.json
''')
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    liste = sys.argv[1]
    outfile = sys.argv[2]
    print('''
dosyalarin listesi surada -->\"%s\", tekrarlari ayiklayip \"%s\"a yazdirmaya calisiyorsun.
eminsin insaAllah u_u''' % (liste, outfile))
    if os.path.isfile(outfile):
        print('''
%s dosyasi hali hazirda var. silinmesin yazik.''' % outfile)
        sys.exit(1)

    # The ``with`` block closes the list file as soon as it has been read.
    with open(liste) as list_file:
        files = list_file.read().split()
    print(files)
    if not files:
        # An empty list would otherwise fail with an IndexError below.
        print("%s listesi bos, birlestirecek dosya yok." % liste)
        sys.exit(1)
    print("dosyalarini birlestirecegiz")

    base = getOldTweets(files[0])
    for f in files[1:]:
        print("%s dosyasi okunuyor" % f)
        eklenen = getOldTweets(f)
        print("gorulmustur. simdi de ayiklayalim")
        mergeTweets(base, eklenen)
        # Drop the reference before the next file is loaded.
        del eklenen
    writeTweets(base, outfile)
# Script entry point: when the file is executed directly, run the two-file
# merge command.
if __name__ == "__main__":
    WOmergeTweets()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment