Skip to content

Instantly share code, notes, and snippets.

@hanasuru
Last active February 26, 2019 09:13
Show Gist options
  • Save hanasuru/0a8e1f932b4ec3b1820f30e9709d9c3c to your computer and use it in GitHub Desktop.
Save hanasuru/0a8e1f932b4ec3b1820f30e9709d9c3c to your computer and use it in GitHub Desktop.
A simple tweet dumper, based on the Twitter API, for gathering tweets that match specific keywords
#!/usr/bin/python2
import json
import sys
import time
from datetime import datetime, timedelta
from email.utils import parsedate_tz

import requests
from requests_oauthlib import OAuth1
class TwitDumperConfig:
    """Container for the OAuth credentials and the table of error status codes.

    Every instance owns a private copy of the credential mapping, so setting
    credentials on one configuration never affects another instance or the
    class-level placeholder template.
    """

    # HTTP status codes treated as failures, mapped to a short description.
    RESP_CODE = {
        400: 'Bad Request',
        401: 'Unauthorized',
        403: 'Forbidden',
        404: 'Not Found',
        406: 'Not Acceptable',
        410: 'Gone: This resource is gone',
        420: 'Enhance Your Calm',
        422: 'Unprocessable Entity',
        429: 'Too Many Requests',
        500: 'Internal Server Error',
        502: 'Bad Gateway',
        503: 'Service Unavailable',
        504: 'Gateway timeout',
    }

    # Placeholder template; __init__ copies it so instances do not share state.
    CREDENTIALS = {
        'consumer-key': 'CONSUMER-KEY-HERE',
        'consumer-secret': 'CONSUMER-SECRET-HERE',
        'access-key': 'ACCESS-KEY-HERE',
        'access-secret': 'ACCESS-SECRET-HERE',
    }

    def __init__(self):
        """Give this instance its own copy of the credential template."""
        # Shadow the class attribute with a per-instance dict: the setters
        # below mutate the mapping in place and would otherwise overwrite the
        # credentials of every other configuration object.
        self.CREDENTIALS = dict(self.CREDENTIALS)

    def setConsumerCreds(self, conKey, conSec):
        """Store the consumer key and consumer secret."""
        self.CREDENTIALS['consumer-key'] = conKey
        self.CREDENTIALS['consumer-secret'] = conSec

    def setAccessCreds(self, accKey, accSec):
        """Store the access key and access secret."""
        self.CREDENTIALS['access-key'] = accKey
        self.CREDENTIALS['access-secret'] = accSec

    def get_config(self):
        """Return this instance's credential mapping (the live dict, not a copy)."""
        return self.CREDENTIALS

    @staticmethod
    def get_resp_code(self=None):
        """Return the status-code table.

        The method is a staticmethod that historically demanded an explicit
        ``self`` argument, which made ``TwitDumperConfig.get_resp_code()``
        fail. The argument is now optional: when omitted the class-level
        table is returned; when given, the table is read from that object
        exactly as before.
        """
        holder = TwitDumperConfig if self is None else self
        return holder.RESP_CODE
class TwitDumper:
    """Dump tweets matching a query through the Twitter 1.1 search endpoint.

    Each fetched tweet is printed to stdout as one comma-joined line
    (screen name, account creation date, text, tweet date). Failures during a
    search are reported on stderr and end the current paging run.
    """

    BASE_URL = 'https://api.twitter.com/1.1/'
    VERIFY_URL = 'account/verify_credentials.json'
    SEARCH_URL = 'search/tweets.json'
    USER_URL = 'statuses/user_timeline.json'

    def __init__(self, config):
        """Store the credentials from *config*, set default query parameters
        and verify the credentials against the API.

        Raises:
            Exception: if the verification request returns a status code
                listed in ``TwitDumperConfig.RESP_CODE``.
        """
        self.config = config.get_config()
        # Initialised here so search() can be called without search_query().
        self.count = 0
        self.params = {
            'lang': 'id',
            'count': 100,
            'tweet_mode': 'extended',
            'result_type': 'recent',
            # Today's date as YYYY-MM-DD.
            'until': str(datetime.now()).split()[0],
        }
        self.authenticate()

    def __repr__(self):
        return '<<%s %s>>' % (self.__class__.__name__, self.config['access-key'])

    def authenticate(self):
        """Create the HTTP session and OAuth1 signer, then hit the
        verify-credentials endpoint.

        Raises:
            Exception: via check_status() when the response code is an error.
        """
        self.sess = requests.Session()
        self.oauth = OAuth1(self.config['consumer-key'],
                            client_secret=self.config['consumer-secret'],
                            resource_owner_key=self.config['access-key'],
                            resource_owner_secret=self.config['access-secret'])
        r = self.sess.get(self.BASE_URL + self.VERIFY_URL, auth=self.oauth)
        return self.check_status(r.status_code)

    def check_status(self, code):
        """Raise ``Exception`` if *code* is one of the known error codes;
        otherwise return ``None``."""
        exceptions = TwitDumperConfig.RESP_CODE
        if code in exceptions:
            raise Exception('%s Error' % (code), 'Message : %s' % (exceptions[code]))

    def date_parse(self, datestring):
        """Convert an RFC 2822-style date string to ``'YYYY-MM-DD HH:MM:SS'``
        with the UTC offset subtracted.

        Raises:
            ValueError: if *datestring* cannot be parsed.
        """
        time_tuple = parsedate_tz(datestring.strip())
        if time_tuple is None:
            raise ValueError('Unparseable date: %r' % (datestring,))
        # parsedate_tz yields None as the offset when the string carries no
        # usable timezone; treat that as a zero offset instead of crashing.
        offset = time_tuple[-1] or 0
        dt = datetime(*time_tuple[:6])
        return str(dt - timedelta(seconds=offset))

    @staticmethod
    def _native(text):
        """Return *text* as the interpreter's native ``str`` type.

        On Python 2 unicode is encoded to UTF-8 bytes (as the original code
        did) so joining/printing never triggers an implicit ASCII encode; on
        Python 3 the text is returned unchanged.
        """
        if str is bytes and not isinstance(text, str):
            return text.encode('utf-8')
        return text

    def parse_tweet(self, status):
        """Flatten one status dict into a comma-joined line.

        Reads ``status['user']['screen_name']``,
        ``status['user']['created_at']``, ``status['full_text']`` and
        ``status['created_at']``. Newlines in the text become spaces.
        NOTE(review): commas inside the tweet text are not escaped, so the
        line is not strictly valid CSV.
        """
        user = status['user']
        fields = [
            self._native(user['screen_name'].strip()),
            self.date_parse(user['created_at']),
            self._native(status['full_text'].strip()).replace('\n', ' '),
            self.date_parse(status['created_at']),
        ]
        return ','.join(fields)

    def search(self):
        """Run one search request with the current ``self.params``.

        Prints the parsed tweets, adds their number to ``self.count`` and
        returns ``(lowest_id, highest_id)`` of the page. Returns
        ``(None, None)`` when the page is empty or when the request/parsing
        fails; failures are reported on stderr rather than silently dropped.
        """
        # Fixed one-second pause before every request.
        time.sleep(1.0)
        try:
            # Let requests build the query string so values are URL-encoded.
            resp = self.sess.get(self.BASE_URL + self.SEARCH_URL,
                                 params=self.params, auth=self.oauth)
            self.check_status(resp.status_code)
            statuses = json.loads(resp.text)['statuses']
            post_ids = [item['id'] for item in statuses]
            lines = [self.parse_tweet(item) for item in statuses]
        except Exception as err:
            # Best-effort contract is kept (callers stop paging on None), but
            # the cause is surfaced and KeyboardInterrupt is no longer eaten.
            sys.stderr.write('search failed: %s\n' % (err,))
            return (None, None)
        if not post_ids:
            return (None, None)
        self.count += len(post_ids)
        print('\n'.join(lines))
        return (min(post_ids), max(post_ids))

    def backward(self, limit, id):
        """Page towards older tweets, starting below tweet *id*, until
        ``self.count`` reaches *limit* or a search yields nothing."""
        self.params.pop('since_id', None)
        while id is not None and self.count < limit:
            # max_id is inclusive on the API side; step one below the oldest
            # tweet already seen so it is not fetched (and counted) again.
            self.params['max_id'] = id - 1
            id = self.search()[0]

    def forward(self, limit, id):
        """Page towards newer tweets, starting above tweet *id*, until
        ``self.count`` reaches *limit* or a search yields nothing."""
        self.params.pop('max_id', None)
        while id is not None and self.count < limit:
            self.params['since_id'] = id
            id = self.search()[1]

    def search_query(self, limit, **kwargs):
        """Fetch roughly *limit* tweets for the query given in *kwargs*.

        *kwargs* are merged into the request parameters (e.g. ``q='term'``)
        and stay set for later calls. One initial search is made, then older
        and newer pages are requested while ``self.count`` is below *limit*.
        """
        self.count = 0
        self.params.pop('max_id', None)
        self.params.pop('since_id', None)
        self.params.update(kwargs)
        oldest, newest = self.search()
        # Pass the limit through: both pagers require it.
        self.backward(limit, oldest)
        self.forward(limit, newest)
def main():
    """Build a configuration, authenticate, and dump tweets for each keyword."""
    settings = TwitDumperConfig()
    settings.setConsumerCreds('==REDACTED==', '==REDACTED==')
    settings.setAccessCreds('==REDACTED==', '==REDACTED==')
    dumper = TwitDumper(settings)

    # Search terms to dump, processed in this order.
    terms = (
        'UninstallBukaLapak',
        'ShutdownJokowi',
        'PokoknyaPrabowoSandi',
        'DebatKeduaPilpres2019',
        'DebatPilpres2019',
    )
    for term in terms:
        dumper.search_query(100, q=term)


# Run the dump only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment