Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save gdsaxton/b0d36c10bbdb80e26b692a1d1a3e11de to your computer and use it in GitHub Desktop.
Save gdsaxton/b0d36c10bbdb80e26b692a1d1a3e11de to your computer and use it in GitHub Desktop.
Downloading Tweets by a List of Users -- SQLite Version
#!/usr/bin/env python
"""
Social_Metrics_Tutorial_Script_User_Timeline_All_Pages.py - DOWNLOADS ALL AVAILABLE RECENT
TWEETS FROM 5 MLB ACCOUNTS INTO SQLITE DATABASE
BEFORE RUNNING THIS SCRIPT, YOU WILL NEED TO:
1. HAVE ANACONDA PYTHON 2.7 INSTALLED
2. HAVE CREATED CSV FILE (E.G., IN EXCEL) CONTAINING TWITTER HANDLES YOU
WISH TO DOWNLOAD (SEE TUTORIAL FOR DETAILS)
THE CODE IS DIVIDED INTO SIX PARTS:
1. Importing necessary Python packages
2. Importing Twython and Twitter app key and access token
- YOU NEED TO MODIFY THIS SECTION IN ORDER TO GET THE SCRIPT TO WORK (THE APP_KEY AND ACCESS_TOKEN LINES IN PART II)
3. Defining function for getting Twitter data
4. Set up columns for database to allow use of SQLAlchemy
5. Writing function for parsing data returned by Twitter API/creating variables to store
6. Main loop over each of the Twitter handles in the accounts table of the database.
- YOU CAN MODIFY THE DATABASE NAME BY CHANGING 'test.sqlite' IN PART VI
- THE # OF PAGES OF TWEETS DOWNLOADED PER ACCOUNT CAN BE CHANGED IN THE PAGE LOOP IN PART VI
"""
###### PART I: IMPORT PYTHON PACKAGES (ALL BUT SIMPLEJSON & TWYTHON COME W/ ANACONDA) ######
import sys
import string
import sqlite3
import time
import datetime
from pprint import pprint
import sqlalchemy
from sqlalchemy.orm import mapper, sessionmaker
from sqlalchemy import Column, Integer, String, ForeignKey, Text, DateTime, Unicode, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import exc
from types import *
from datetime import datetime, date, time
import simplejson #NEEDS TO BE INSTALLED SEPARATELY ONCE: pip install simplejson
from twython import Twython #NEEDS TO BE INSTALLED SEPARATELY ONCE: pip install Twython
###### PART II: IMPORT TWYTHON, ADD TWITTER APP KEY & ACCESS TOKEN (TO ACCESS API) ######
#REPLACE 'APP_KEY' WITH YOUR APP KEY, ACCESS_TOKEN WITH YOUR ACCESS TOKEN, IN THE NEXT TWO LINES
# Credentials must be filled in before running; the script uses Twython's
# access-token form of authentication (app key + access token).
APP_KEY = ' '
ACCESS_TOKEN = ' '
# Single Twython client instance shared by the whole script.
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
###### PART III: DEFINE TWYTHON FUNCTION FOR GETTING ALL AVAILABLE PAGES OF TWEETS PER USER ######
def get_data_user_timeline_all_pages(kid, page):
try:
'''
'count' specifies the number of tweets to try and retrieve, up to a maximum of 200 per distinct request.
The value of count is best thought of as a limit to the number of tweets to return because suspended or
deleted content is removed after the count has been applied. We include retweets in the count,
even if include_rts is not supplied. It is recommended you always send include_rts=1 when using this API method.
THE SCRIPT IS CURRENT SET UP TO DOWNLOAD ALL AVAILABLE PAGES -- 16 PAGES OF 200 TWEETS = 3,200 TWEETS/ACCOUNT TOTAL
'''
d = twitter.get_user_timeline(screen_name=kid, count="200", page=page, include_entities="true", include_rts="1")
except Exception, e:
print "Error reading id %s, exception: %s" % (kid, e)
return None
#print len(d), #d[0] #NUMBER OF ENTRIES RETURNED, FIRST ENTRY
#print "d.keys(): ", d[0].keys()
return d
###### PART IV: SET UP COLUMNS FOR TWEET AND ACCOUNT TABLES IN DATABASE ######
'''
THIS PART OF THE SCRIPT CAN BE SPLIT OFF TO A SEPARATE FILE -- WILL ALSO BE USED
FOR OTHER SCRIPTS TO ANALYZE THE DATA AFTER DOWNLOADING THE TWEETS
'''
Base = declarative_base()
#TWEET TABLE - THIS TABLE WILL BE CREATED AND POPULATED BY THIS PYTHON SCRIPT
#TWEET TABLE - THIS TABLE WILL BE CREATED AND POPULATED BY THIS PYTHON SCRIPT
class TWEET(Base):
    """ORM mapping for one downloaded status (table 'user_timeline').

    One row per tweet. tweet_id_str carries a UNIQUE constraint, which lets
    the insert loop skip tweets that are already in the database.
    """
    __tablename__ = 'user_timeline'  # name of the table in the SQLite file
    rowid = Column(Integer, primary_key=True)
    query = Column(String)
    tweet_id = Column(String)
    tweet_id_str = Column(String, unique=True)  ##### UNIQUE CONSTRAINT #####
    inserted_date = Column(DateTime)  # when the row was written, not when the tweet was posted
    truncated = Column(String)
    language = Column(String)
    possibly_sensitive = Column(String)
    # Geographic location reported by the user or client application, geoJSON
    # order (longitude first, then latitude), stored as a comma-joined string.
    coordinates = Column(String)
    retweeted_status = Column(String)
    withheld_in_countries = Column(String)
    withheld_scope = Column(String)
    created_at_text = Column(String)  # raw UTC creation string from the API
    created_at = Column(DateTime)     # the same instant parsed to a datetime
    month = Column(String)
    year = Column(String)
    content = Column(Text)
    # screen_names are unique but subject to change; from_user_id is the
    # stable identifier for the account.
    from_user_screen_name = Column(String)
    from_user_id = Column(String)
    from_user_followers_count = Column(Integer)   # followers the account currently has
    from_user_friends_count = Column(Integer)     # accounts this user is following
    from_user_listed_count = Column(Integer)      # public lists the user is a member of
    from_user_favourites_count = Column(Integer)  # British spelling matches the API field name
    from_user_statuses_count = Column(Integer)    # lifetime tweets, retweets included
    from_user_description = Column(String)        # user-defined profile description
    from_user_location = Column(String)           # user-defined profile location
    from_user_created_at = Column(String)         # UTC datetime the account was created
    retweet_count = Column(Integer)
    favorite_count = Column(Integer)  # approximate "favorited" tally
    entities_urls = Column(Unicode(255))
    entities_urls_count = Column(Integer)
    entities_hashtags = Column(Unicode(255))
    entities_hashtags_count = Column(Integer)
    entities_mentions = Column(Unicode(255))
    entities_mentions_count = Column(Integer)
    in_reply_to_screen_name = Column(String)
    in_reply_to_status_id = Column(String)
    source = Column(String)
    entities_expanded_urls = Column(Text)
    entities_media_count = Column(Integer)
    media_expanded_url = Column(Text)
    media_url = Column(Text)
    media_type = Column(Text)
    video_link = Column(Integer)  # 1 when an expanded URL points at a video host
    photo_link = Column(Integer)  # 1 when an expanded URL points at a photo host
    twitpic = Column(Integer)     # 1 when a twitpic link is present
    num_characters = Column(Integer)  # tweet length, spaces included
    num_words = Column(Integer)
    retweeted_user = Column(Text)
    retweeted_user_description = Column(Text)
    retweeted_user_screen_name = Column(Text)
    retweeted_user_followers_count = Column(Integer)
    retweeted_user_listed_count = Column(Integer)
    retweeted_user_statuses_count = Column(Integer)
    retweeted_user_location = Column(Text)
    retweeted_tweet_created_at = Column(DateTime)
    Unique_ID = Column(Integer)  # specific to each account (from the accounts table)

    def __init__(self, query, tweet_id, tweet_id_str, inserted_date, truncated, language, possibly_sensitive, coordinates,
                 retweeted_status, withheld_in_countries, withheld_scope, created_at_text, created_at, month, year, content,
                 from_user_screen_name, from_user_id, from_user_followers_count, from_user_friends_count,
                 from_user_listed_count, from_user_favourites_count, from_user_statuses_count, from_user_description,
                 from_user_location, from_user_created_at, retweet_count, favorite_count, entities_urls, entities_urls_count,
                 entities_hashtags, entities_hashtags_count, entities_mentions, entities_mentions_count,
                 in_reply_to_screen_name, in_reply_to_status_id, source, entities_expanded_urls,
                 entities_media_count, media_expanded_url, media_url, media_type, video_link, photo_link, twitpic,
                 num_characters, num_words,
                 retweeted_user, retweeted_user_description, retweeted_user_screen_name, retweeted_user_followers_count,
                 retweeted_user_listed_count, retweeted_user_statuses_count, retweeted_user_location,
                 retweeted_tweet_created_at,
                 Unique_ID,
                 ):
        # Every constructor argument maps onto the identically-named column,
        # so assign them in bulk instead of one line per attribute.
        values = dict(locals())
        del values['self']
        for field, value in values.items():
            setattr(self, field, value)

    def __repr__(self):
        return "<sender, created_at('%s', '%s')>" % (self.from_user_screen_name,self.created_at)
#ACCOUNTS TABLE -- THIS IS WHAT YOU IMPORTED INTO SQLITE EARLIER IN THE TUTORIAL
#ACCOUNTS TABLE -- THIS IS WHAT YOU IMPORTED INTO SQLITE EARLIER IN THE TUTORIAL
class ACCOUNT(Base):
    """ORM mapping for the 'accounts' table (the CSV of Twitter handles)."""
    __tablename__ = 'accounts'
    Unique_ID = Column(Integer, primary_key=True)
    org_name = Column(String)
    org_URL = Column(String)
    Twitter_URL = Column(String)
    Twitter_handle = Column(String)
    earliest_tweet_in_db = Column(String)   # bookkeeping columns updated by later
    number_of_tweets_in_db = Column(Integer)  # analysis scripts, not by this one

    def __init__(self, org_name, org_URL, Twitter_URL, Twitter_handle,
                 earliest_tweet_in_db, number_of_tweets_in_db,
                 ):
        self.org_name = org_name
        self.org_URL = org_URL
        self.Twitter_URL = Twitter_URL
        self.Twitter_handle = Twitter_handle
        self.earliest_tweet_in_db = earliest_tweet_in_db
        self.number_of_tweets_in_db = number_of_tweets_in_db

    def __repr__(self):
        # BUG FIX: the original referenced self.org_type, an attribute that does
        # not exist on this class, so any repr() raised AttributeError.
        return "<Twitter handle, org_name('%s', '%s')>" % (self.Twitter_handle, self.org_name)
###### PART V: DEFINE FUNCTION FOR PARSING JSON-BASED TWEET DATA RETURNED BY TWITTER API -- CREATE VARIABLES TO BE STORED ######
def write_data(self, d, Twitter_handle, Unique_ID):
Unique_ID = Unique_ID
query = Twitter_handle #d['search_metadata']['query'] #THIS IS DIFFERENT FROM MENTIONS AND DMS FILES
#for sender in d['results']:
for entry in d: #d['statuses']: #THIS IS DIFFERENT FROM MENTIONS AND DMS FILES
#json_output = str(entry)
#print json_output
#json_output = unicode.join(u'\n',map(unicode,json_output_string))
#ids.append(sender['id'])
#print sender.keys()
tweet_id = entry['id']
tweet_id_str = entry['id_str']
inserted_date = datetime.now() #CURRENT DATE AND TIME
##### NEW VARIABLES -- UNTESTED #####
truncated = entry['truncated']
language = entry['lang']
#WE NEED THE 'IF' CONDITION CHECK HERE BECAUSE THERE IS NOT A VALUE FOR ALL TWEETS, UNLIKE,
#SAY, WITH THE 'TEXT' VARIABLE
if 'possibly_sensitive' in entry:
possibly_sensitive= entry['possibly_sensitive']
else:
possibly_sensitive = ''
coordinates = []
if 'coordinates' in entry and entry['coordinates'] != None:
#print entry['coordinates']['coordinates']
#[-98.0738001, 27.75346355]
#-98.0738001 <type 'float'>
#27.75346355 <type 'float'>
for coordinate in entry['coordinates']['coordinates']:
#print coordinate, type(coordinate)
#coordinates.append(str(coordinate))
coordinates.append(coordinate)
#print "HERE ARE THE COORDINATES---------------------->", "".join(format(x, "10.8f") for x in coordinates)
coordinates = ', '.join(map(str, coordinates)) #WILL NOT WORK
#coordinates = entry['coordinates']['coordinates']
#coordinates = ",".join("'{0}'".format(n) for n in coordinates) #THIS IS A LIST OF INTEGERS
#print type(coordinates), len(coordinates), coordinates
#print type(coordinates), coordinates
else:
coordinates = ''
if 'retweeted_status' in entry:
retweeted_status = 'THIS IS A RETWEET'
retweeted_user = entry['retweeted_status']['user']['id_str'] ###I SHOULD USE ALL STR OR NOT
retweeted_user_description = entry['retweeted_status']['user']['description']
retweeted_user_screen_name = entry['retweeted_status']['user']['screen_name']
retweeted_user_followers_count = entry['retweeted_status']['user']['followers_count']
retweeted_user_listed_count = entry['retweeted_status']['user']['listed_count']
retweeted_user_statuses_count = entry['retweeted_status']['user']['statuses_count']
retweeted_user_location = entry['retweeted_status']['user']['location']
retweeted_tweet_created_at_text = entry['retweeted_status']['created_at']
retweeted_tweet_created_at = datetime.strptime(retweeted_tweet_created_at_text, '%a %b %d %H:%M:%S +0000 %Y') #<type 'datetime.datetime'>
else:
retweeted_status = ''
retweeted_user = ''
retweeted_user_description = ''
retweeted_user_screen_name = ''
retweeted_user_followers_count = ''
retweeted_user_listed_count = ''
retweeted_user_statuses_count = ''
retweeted_user_location = ''
retweeted_tweet_created_at = None
if 'withheld_in_countries' in entry:
withheld_in_countries = 'WITHHELD --> CHECK JSON'
else:
withheld_in_countries = ''
if 'withheld_scope' in entry:
withheld_scope = entry['withheld_scope']
else:
withheld_scope = ''
#print sender['text']
# init field values
content = entry['text']
content = content.replace('\n','')
print '\n', content.encode('utf-8')
num_characters = len(content) #NUMBER OF CHARACTERS (SPACES INCLUDED)
words = content.split()
num_words = len(words)
created_at_text = entry['created_at'] ##### Fri Jun 24 18:14:34 +0000 2011 --> <type 'str'>
##### EXAMPLE OF HOW TO CONVERT STRING TO DATETIME FORMAT AND THEN MODIFY
#t = datetime.strptime("20091229050936", "%Y%m%d%H%M%S")
#print t.strftime('%H:%M %d %B %Y (UTC)')
#fred = t.strftime('%H:%M %d %B %Y (UTC)') ## 05:09 29 December 2009 (UTC) --> <type 'str'>
#date_test = datetime.strptime('Fri Jun 24 18:14:34 +0000 2011', '%a %b %d %H:%M:%S +0000 %Y') #<type 'datetime.datetime'>
created_at = datetime.strptime(created_at_text, '%a %b %d %H:%M:%S +0000 %Y') #<type 'datetime.datetime'>
#date = twitter_date.strftime('%Y-%B-%d %H:%M') ### BETTER OPTION BELOW (HAS MONTH, DAY AS NUMBERS)
created_at2 = created_at.strftime('%Y-%m-%d %H:%M:%S') ### THIS WORKS WELL, BUT IS SAME AS twitter_date ABOVE
#month = datetime.strptime(created_at_text, '%b')
#year = datetime.strptime(created_at_text, '%Y') #ValueError: time data 'Thu Oct 17 17:47:34 +0000 2013' does not match format '%Y'
month = created_at.strftime('%m')
year = created_at.strftime('%Y')
#print 'month', month, 'year', year
#print created_at, created_at2
#from_user = sender['from_user'] ##### THIS NEEDS TO BE UPDATED --> 7/11/13 IT'S UNDER user, WHICH HAS SUB-KEYS
from_user_screen_name = entry['user']['screen_name']
#from_user_id = sender['from_user_id'] ##### THIS NEEDS TO BE UPDATED --> 7/11/13 IT'S UNDER user, WHICH HAS SUB-KEYS
from_user_id = entry['user']['id']
from_user_followers_count = entry['user']['followers_count']
from_user_friends_count = entry['user']['friends_count']
from_user_listed_count = entry['user']['listed_count']
from_user_favourites_count = entry['user']['favourites_count']
print '\n', 'from_user_favourites_count-------------->', from_user_favourites_count
from_user_statuses_count = entry['user']['statuses_count']
from_user_description = entry['user']['description']
from_user_location = entry['user']['location']
from_user_created_at = entry['user']['created_at']
retweet_count = entry['retweet_count']
favorite_count = entry['favorite_count']
in_reply_to_screen_name = entry['in_reply_to_screen_name']
in_reply_to_status_id = entry['in_reply_to_status_id']
#GENERATES VARIABLES FOR #URLS, HASHTAGS, AND MENTIONS
entities_urls_count = len(entry['entities']['urls'])
entities_hashtags_count = len(entry['entities']['hashtags'])
entities_mentions_count = len(entry['entities']['user_mentions'])
source = entry['source']
#NOW WE ARE GOING TO CREATE VARIABLES FOR URLS, TAGS, AND MENTIONS --> THE DIFFICULTY HERE
#IS THAT THERE IS NO UNIFORM NUMBER OF, SAY, TAGS IN EACH TWEET, SO WE CANNOT CREATE
#A FIXED NUMBER OF COLUMNS/VARIABLES BEFOREHAND. INSTEAD, WE'LL CREATE A STRING FOR EACH
#VARIABLE --> CAN BE PARSED OUT OR LOOPED OVER LATER AS NEEDED.
entities_urls, entities_expanded_urls, entities_hashtags, entities_mentions = [], [], [], []
#print type(entry['entities']['urls']) ##### IT'S A LIST
#urls = entry['entities']['urls']
for link in entry['entities']['urls']:
if 'url' in link:
url = link['url']
expanded_url = link['expanded_url']
#print link['url'], link['expanded_url']
entities_urls.append(url)
entities_expanded_urls.append(expanded_url)
else:
print "No urls in entry"
for hashtag in entry['entities']['hashtags']:
if 'text' in hashtag:
tag = hashtag['text']
#print hashtag['text']
entities_hashtags.append(tag)
else:
print "No hashtags in entry"
for at in entry['entities']['user_mentions']:
if 'screen_name' in at:
mention = at['screen_name']
#print at['screen_name']
entities_mentions.append(mention)
else:
print "No mentions in entry"
entities_mentions = string.join(entities_mentions, u", ")
entities_hashtags = string.join(entities_hashtags, u", ")
entities_urls = string.join(entities_urls, u", ")
entities_expanded_urls = string.join(entities_expanded_urls, u", ")
video_link = 0
if 'vimeo' in entities_expanded_urls or 'youtube' in entities_expanded_urls or 'youtu' in entities_expanded_urls or 'vine' in entities_expanded_urls:
video_link = 1 #All of these videos show up in 'View Media' in tweets
#print "FOUND A VIDEO!!!"
else:
video_link = 0
#if photo_text.find('twitpic'): # or photo_text.find('instagram') or photo_text.find('instagr'):
#if 'twitpic' in photo_text:
if 'twitpic' in entities_expanded_urls:
twitpic = 1 #twitpic images show up in 'View Media' in tweets
#print "FOUND A TWITPIC LINK!"
else:
twitpic = 0
if 'twitpic' in entities_expanded_urls or 'instagram' in entities_expanded_urls or 'instagr' in entities_expanded_urls:
photo_link = 1 #instagram images DO NOT show up in 'View Media' in tweets
#print "FOUND A TWITPIC OR INSTAGRAM LINK!!!"
else:
photo_link = 0
#CONVERT TO UNICODE FOR INSERTION INTO SQLITE DB
entities_urls = unicode(entities_urls)
entities_expanded_urls = unicode(entities_expanded_urls)
entities_hashtags = unicode(entities_hashtags)
entities_mentions = unicode(entities_mentions)
#print "urls...?....", entities_urls
#print "user_mentions...?....", entities_mentions
#print "hashtags...?....", entities_hashtags
#if 'symbols' in entry['entities']:
# print "HERE ARE THE SYMBOLS.......", entry['entities']['symbols']
#else:
# print "THERE AIN'T NO entry['entities']['symbols']"
if 'media' in entry['entities']:
#print "HERE ARE THE MEDIA.......", entry['entities']['media']
entities_media_count = len(entry['entities']['media'])
#'expanded_url' #The fully resolved media URL [FULL TWEET PLUS PICTURE] --> e.g., http://twitter.com/StJude/status/347801636351135744/photo/1
#'display_url' # Not a URL but a string to display instead of the media URL --> e.g., pic.twitter.com/hO4BjuqrnE
#'media_url_https' #The SSL URL of the media file --> e.g., https://pbs.twimg.com/media/BNOjvtuCUAA4S66.jpg
#'type' # only photo for now --> e.g., photo
#'media_url' # The URL of the media file --> e.g., http://pbs.twimg.com/media/BNOjvtuCUAA4S66.jpg
else:
entities_media_count = ''
#for a in entry['entities']['media']:
#print a['expanded_url'], a['media_url'] #, a['media_type']
#bob = entry['entities']['media'][0]
#print bob
#print bob['expanded_url']
if 'media' in entry['entities']:
if 'expanded_url' in entry['entities']['media'][0]:
media_expanded_url = entry['entities']['media'][0]['expanded_url']
else:
#print "THERE AIN'T NO expanded_url in entry['entities']['media']"
media_expanded_url = ''
if 'media_url' in entry['entities']['media'][0]:
media_url = entry['entities']['media'][0]['media_url']
else:
#print "THERE AIN'T NO media_url in entry['entities']['media']"
media_url = ''
if 'type' in entry['entities']['media'][0]:
media_type = entry['entities']['media'][0]['type']
else:
#print "THERE AIN'T NO type in entry['entities']['media']"
media_type = ''
else:
media_type = ''
media_url = ''
media_expanded_url = ''
upd = TWEET(query, tweet_id, tweet_id_str, inserted_date, truncated, language, possibly_sensitive,
coordinates, retweeted_status, withheld_in_countries, withheld_scope, created_at_text,
created_at, month, year, content, from_user_screen_name, from_user_id, from_user_followers_count,
from_user_friends_count, from_user_listed_count, from_user_favourites_count, from_user_statuses_count, from_user_description,
from_user_location, from_user_created_at, retweet_count, favorite_count, entities_urls, entities_urls_count,
entities_hashtags, entities_hashtags_count, entities_mentions, entities_mentions_count,
in_reply_to_screen_name, in_reply_to_status_id, source, entities_expanded_urls, #json_output,
entities_media_count, media_expanded_url, media_url, media_type, video_link, photo_link, twitpic,
num_characters, num_words,
retweeted_user, retweeted_user_description, retweeted_user_screen_name, retweeted_user_followers_count,
retweeted_user_listed_count, retweeted_user_statuses_count, retweeted_user_location,
retweeted_tweet_created_at,
Unique_ID,
)
self.session.add(upd)
try:
self.session.commit()
except exc.SQLAlchemyError:
self.session.rollback()
print " NOT INSERTING --> IT'S A DUPLICATE"
###### PART VI: MAIN LOOP OVER EACH ID STORED IN 'ACCOUNTS' TABLE OF DATABASE - 16 PAGES OF 200 TWEETS PER ID ######
class Scrape:
def __init__(self):
engine = sqlalchemy.create_engine("sqlite:///test.sqlite", echo=False) # YOUR DATABASE NAME HERE
Session = sessionmaker(bind=engine)
self.session = Session()
Base.metadata.create_all(engine)
def main(self):
all_ids = self.session.query(ACCOUNT).all()
print 'len(all_ids)', len(all_ids)
#CHECK IF THE 'ACCOUNTS' TABLE HAS BEEN POPULATED; IF NOT, ADD DATA FROM
#CSV FILE TO THE TABLE
if len(all_ids) < 1:
conn = sqlite3.connect('test.sqlite')
import pandas as pd
df = pd.read_csv('accounts.csv')
df.to_sql('accounts', conn, if_exists='append', index=False)
all_ids = self.session.query(ACCOUNT).all()
keys = []
for i in all_ids:
Unique_ID = i.Unique_ID
Twitter_handle = i.Twitter_handle
kid = Twitter_handle
rowid = i.Unique_ID
print '\n', "\rprocessing id %s/%s -- %s" % (rowid, len(all_ids), Twitter_handle),
sys.stdout.flush()
page = 1
while page < 17: #TO DOWNLOAD FEWER PAGES OF TWEETS, MODIFY THIS PLUS LINE 621
print "------XXXXXX------ STARTING PAGE", page
d = get_data_user_timeline_all_pages(kid, page)
if not d:
print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
break
if len(d)==0:
print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
break
write_data(self, d, Twitter_handle, Unique_ID)
#self.session.commit()
#print 'pausing for 1 second'
#time.sleep(1) #PAUSE FOR 1 SECOND
page += 1
if page > 16:
print "WE'RE AT THE END OF PAGE 16!!!!!"
break
self.session.commit()
print '\n', '\n', '\n', "FINISHED WITH ALL IDS"
self.session.close()
if __name__ == "__main__":
    # Build the scraper (opens/creates test.sqlite) and run the download loop.
    scraper = Scrape()
    scraper.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment