Last active
February 26, 2019 09:13
-
-
Save hanasuru/0a8e1f932b4ec3b1820f30e9709d9c3c to your computer and use it in GitHub Desktop.
A simple tweet dumper, built on the Twitter API, for gathering tweets that match specific keywords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python2
from datetime import datetime, timedelta | |
from requests_oauthlib import OAuth1 | |
from email.utils import parsedate_tz | |
import requests | |
import json, time | |
class TwitDumperConfig:
    """Hold the API credentials and the table of error status codes.

    ``RESP_CODE`` maps the HTTP status codes that are treated as errors to a
    short description.  ``CREDENTIALS`` holds placeholder defaults; each
    instance works on its own copy, so credentials set on one configuration
    never leak into another instance or into the class-level defaults.
    """

    # HTTP status codes treated as API errors, mapped to a description.
    RESP_CODE = {
        400: 'Bad Request',
        401: 'Unauthorized',
        403: 'Forbidden',
        404: 'Not Found',
        406: 'Not Acceptable',
        410: 'Gone: This resource is gone',
        420: 'Enhance Your Calm',
        422: 'Unprocessable Entity',
        429: 'Too Many Requests',
        500: 'Internal Server Error',
        502: 'Bad Gateway',
        503: 'Service Unavailable',
        504: 'Gateway timeout'
    }

    # Placeholder defaults; copied per instance in __init__.
    CREDENTIALS = {
        'consumer-key'    : 'CONSUMER-KEY-HERE',
        'consumer-secret' : 'CONSUMER-SECRET-HERE',
        'access-key'      : 'ACCESS-KEY-HERE',
        'access-secret'   : 'ACCESS-SECRET-HERE',
    }

    def __init__(self):
        # Shadow the class-level dict with a private copy: the setters below
        # mutate the dict in place, which previously changed the credentials
        # of every instance (and of the class itself) at once.
        self.CREDENTIALS = dict(self.CREDENTIALS)

    def setConsumerCreds(self, conKey, conSec):
        """Store the consumer key and consumer secret on this instance."""
        self.CREDENTIALS['consumer-key'] = conKey
        self.CREDENTIALS['consumer-secret'] = conSec

    def setAccessCreds(self, accKey, accSec):
        """Store the access key and access secret on this instance."""
        self.CREDENTIALS['access-key'] = accKey
        self.CREDENTIALS['access-secret'] = accSec

    def get_config(self):
        """Return this instance's credentials dict (not a copy)."""
        return self.CREDENTIALS

    @staticmethod
    def get_resp_code(self=None):
        """Return the status-code -> description table.

        This is a static method that used to *require* an explicit ``self``
        argument, so ``config.get_resp_code()`` raised ``TypeError``.  The
        argument is now optional: without it the class-level table is
        returned, while the old explicit form ``get_resp_code(obj)`` still
        returns ``obj.RESP_CODE``.
        """
        owner = TwitDumperConfig if self is None else self
        return owner.RESP_CODE
class TwitDumper:
    """Dump tweets matching a search query as comma-joined lines on stdout.

    Talks to the Twitter REST API v1.1 (see ``BASE_URL``) through an
    OAuth1-signed ``requests`` session.  Typical use is to build an instance
    from a ``TwitDumperConfig`` and call ``search_query`` once per keyword.
    """

    BASE_URL = 'https://api.twitter.com/1.1/'
    VERIFY_URL = 'account/verify_credentials.json'
    SEARCH_URL = 'search/tweets.json'
    USER_URL = 'statuses/user_timeline.json'

    def __init__(self, config):
        """Store the credentials from *config*, set default search
        parameters and verify the credentials against the API.

        Raises:
            Exception: if the verification request returns a status code
                listed in ``TwitDumperConfig.RESP_CODE``.
        """
        self.config = config.get_config()
        # Defined up front so search() also works before search_query().
        self.count = 0
        self.params = dict()
        self.params['lang'] = 'id'
        self.params['count'] = 100
        self.params['tweet_mode'] = 'extended'
        self.params['result_type'] = 'recent'
        # Date part (YYYY-MM-DD) of the current local time.
        self.params['until'] = str(datetime.now()).split()[0]
        self.authenticate()

    def __repr__(self):
        return '<<%s %s>>' % (self.__class__.__name__, self.config['access-key'])

    def authenticate(self):
        """Create the session and OAuth1 signer, then hit the
        verify-credentials endpoint and check its status code."""
        self.sess = requests.session()
        self.oauth = OAuth1(self.config['consumer-key'],
                            client_secret=self.config['consumer-secret'],
                            resource_owner_key=self.config['access-key'],
                            resource_owner_secret=self.config['access-secret'])
        r = self.sess.get(self.BASE_URL + self.VERIFY_URL, auth=self.oauth)
        return self.check_status(r.status_code)

    def check_status(self, code):
        """Raise ``Exception`` if *code* is a known error status code."""
        exceptions = TwitDumperConfig.RESP_CODE
        if code in exceptions:
            raise Exception('%s Error' % (code), 'Message : %s' % (exceptions[code]))

    def date_parse(self, datestring):
        """Convert a date string with an optional UTC offset to a
        ``'YYYY-MM-DD HH:MM:SS'`` string normalised to UTC.

        Raises:
            ValueError: if *datestring* cannot be parsed at all.
        """
        time_tuple = parsedate_tz(datestring.strip())
        if time_tuple is None:
            raise ValueError('Unparseable date: %r' % (datestring,))
        # The offset is None when the string carries no usable time zone;
        # treat that as "no shift" instead of crashing inside timedelta().
        offset = time_tuple[-1] or 0
        return str(datetime(*time_tuple[:6]) - timedelta(seconds=offset))

    @staticmethod
    def _to_native(text):
        """Return *text* as the interpreter's native ``str`` type."""
        # On Python 2 a ``unicode`` value is not a ``str`` and is encoded as
        # UTF-8 (as the original code did); on Python 3 text passes through.
        if isinstance(text, str):
            return text
        return text.encode('utf-8')

    def parse_tweet(self, status):
        """Flatten one status dict into a single comma-joined line.

        Fields, in order: author screen name, author account creation date,
        tweet text (newlines replaced by spaces), tweet creation date.

        NOTE(review): fields are joined with ',' without any quoting, so a
        comma inside the tweet text is indistinguishable from a separator.
        """
        fields = [
            status['user']['screen_name'].strip(),
            self.date_parse(status['user']['created_at']),
            status['full_text'].strip().replace('\n', ' '),
            self.date_parse(status['created_at']),
        ]
        return ','.join(self._to_native(field) for field in fields)

    def search(self):
        """Fetch one page of results for ``self.params`` and print it.

        Returns:
            ``(lowest_id, highest_id)`` of the tweets on the page, or
            ``(None, None)`` when the page is empty or the request failed.
        """
        import sys  # local import: only needed to report failures

        try:
            # Fixed pause before every request (presumably to stay under the
            # API rate limit).
            time.sleep(1.0)
            # Let requests build and URL-encode the query string; joining
            # 'key=value' by hand broke on spaces, '#', '&' and non-ASCII.
            resp = self.sess.get(self.BASE_URL + self.SEARCH_URL,
                                 params=self.params, auth=self.oauth)
            # print resp.headers
            self.check_status(resp.status_code)
            statuses = json.loads(resp.text)['statuses']
            if not statuses:
                return (None, None)
            post_id = [status['id'] for status in statuses]
            lines = [self.parse_tweet(status) for status in statuses]
        except Exception as err:
            # Still best-effort, but no longer silent, and no longer
            # swallowing KeyboardInterrupt/SystemExit like a bare except.
            sys.stderr.write('search failed: %s\n' % (err,))
            return (None, None)
        self.count += len(post_id)
        print('\n'.join(lines))
        return (min(post_id), max(post_id))

    def backward(self, limit, id):
        """Page towards older tweets, starting below *id*, until at least
        *limit* tweets have been counted or a page comes back empty."""
        self.params.pop('since_id', None)
        while id and self.count < limit:
            # max_id is inclusive: asking for ``id`` itself would return the
            # oldest tweet again on every page and inflate the counter.
            self.params['max_id'] = id - 1
            id = self.search()[0]

    def forward(self, limit, id):
        """Page towards newer tweets, starting above *id*, until at least
        *limit* tweets have been counted or a page comes back empty."""
        self.params.pop('max_id', None)
        while id and self.count < limit:
            self.params['since_id'] = id
            id = self.search()[1]

    def search_query(self, limit, **kwargs):
        """Run a search and keep paging until about *limit* tweets are seen.

        *kwargs* are merged into the request parameters (e.g. ``q='term'``).
        *limit* is checked between pages, so the final count may exceed it
        by up to one page.
        """
        self.count = 0
        self.params.pop('max_id', None)
        self.params.pop('since_id', None)
        self.params.update(kwargs)
        oldest, newest = self.search()
        # Both helpers take (limit, id); the limit used to be omitted, which
        # raised TypeError right after the first page.
        self.backward(limit, oldest)
        self.forward(limit, newest)
def main():
    """Build a configuration, create the dumper and run one search of up to
    roughly 100 tweets for each hard-coded keyword."""
    settings = TwitDumperConfig()
    settings.setConsumerCreds('==REDACTED==', '==REDACTED==')
    settings.setAccessCreds('==REDACTED==', '==REDACTED==')
    dumper = TwitDumper(settings)

    keywords = (
        'UninstallBukaLapak',
        'ShutdownJokowi',
        'PokoknyaPrabowoSandi',
        'DebatKeduaPilpres2019',
        'DebatPilpres2019',
    )
    for term in keywords:
        dumper.search_query(100, q=term)
# Run the dump only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment