Skip to content

Instantly share code, notes, and snippets.

@thinrhino
Created May 2, 2014 10:23
Show Gist options
  • Save thinrhino/10334d84b0c8e0c46c1e to your computer and use it in GitHub Desktop.
Save thinrhino/10334d84b0c8e0c46c1e to your computer and use it in GitHub Desktop.
# Script to populate MongoDB with tweets read from Twitter's public sample stream.
import twitter
import time
import logging
from pymongo import MongoClient
# Twitter API credentials. The angle-bracket values are placeholders to be
# replaced with real keys before running.
CONSUMER_KEY = '<twitter_consumer_key>'
CONSUMER_SECRET = '<twitter_secret_key>'
OAUTH_TOKEN = '<twitter_oauth_token>'
OAUTH_TOKEN_SECRET = '<twitter_oauth_token_secret>'

# MongoDB access ip & port, followed by a handle on the 'twitter' database.
client = MongoClient('<mongodb_ip_address>', 50000)
db = client['twitter']

# OAuth object passed to the streaming client; note the argument order is
# token, token secret, consumer key, consumer secret.
auth = twitter.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

# Root logger writes DEBUG and above to push_data.log; filemode 'w' starts the
# file afresh on every run.
logging.basicConfig(filename='push_data.log', filemode='w', level=logging.DEBUG)

# Additionally echo the same records through a stream handler on the root logger.
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
logging.getLogger('').addHandler(console)
# Read the sample stream and insert each accepted message into the
# `twitter_data` collection. After every 10000 inserts, log the collection's
# total document count and the seconds spent on that batch, then restart the
# batch timer and counter. Any exception is logged as critical and re-raised.
try:
    twitter_stream = twitter.TwitterStream(auth=auth)
    iterator = twitter_stream.statuses.sample()
    start_time = time.time()
    data_count = 0
    for tweet in iterator:
        # Skip messages carrying a 'delete' key (presumably status-deletion
        # notices) and messages without a 'lang' key. Membership is tested on
        # the mapping itself: `.keys()` builds a throwaway list on Python 2.
        if 'delete' in tweet or 'lang' not in tweet:
            continue
        # NOTE(review): `Collection.insert` and `Cursor.count` are kept for
        # compatibility with the PyMongo version this script was written
        # against; both were removed in PyMongo 4 (use `insert_one` and
        # `count_documents({})` when upgrading).
        db.twitter_data.insert(tweet)
        data_count += 1
        if data_count == 10000:
            time_taken = time.time() - start_time
            total_data_count = db['twitter_data'].find().count()
            # Lazy %-args: same rendered message, formatted by the logger.
            logging.info('%s,%s\n', total_data_count, time_taken)
            start_time = time.time()
            data_count = 0
except Exception as e:  # `as` form is valid on Python 2.6+ and Python 3
    logging.critical('Error : %s', e)
    # Bare `raise` re-raises with the original traceback intact
    # (`raise e` would reset it on Python 2).
    raise
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment