Downloading Tweets by a List of Users -- SQLite Version
#!/usr/bin/env python
"""
Social_Metrics_Tutorial_Script_User_Timeline_All_Pages.py - DOWNLOADS ALL AVAILABLE RECENT
TWEETS FROM 5 MLB ACCOUNTS INTO SQLITE DATABASE

BEFORE RUNNING THIS SCRIPT, YOU WILL NEED TO:
  1. HAVE ANACONDA PYTHON 2.7 INSTALLED
  2. HAVE CREATED A CSV FILE (E.G., IN EXCEL) CONTAINING THE TWITTER HANDLES YOU
     WISH TO DOWNLOAD (SEE TUTORIAL FOR DETAILS)

THE CODE IS DIVIDED INTO SIX PARTS:
  1. Importing necessary Python packages
  2. Importing Twython and the Twitter app key and access token
     - YOU NEED TO MODIFY THIS SECTION TO GET THE SCRIPT TO WORK (APP_KEY AND ACCESS_TOKEN IN PART II)
  3. Defining a function for getting Twitter data
  4. Setting up columns for the database to allow use of SQLAlchemy
  5. Writing a function for parsing data returned by the Twitter API and creating variables to store
  6. Main loop over each of the Twitter handles in the accounts table of the database
     - YOU CAN MODIFY THE DATABASE NAME BY CHANGING 'test.sqlite' IN PART VI
     - THE # OF PAGES OF TWEETS DOWNLOADED PER ACCOUNT CAN BE CHANGED IN THE MAIN LOOP IN PART VI
"""

###### PART I: IMPORT PYTHON PACKAGES (ALL BUT SIMPLEJSON & TWYTHON COME W/ ANACONDA) ######
import sys
import string
import sqlite3
import time
import datetime
from pprint import pprint
import sqlalchemy
from sqlalchemy.orm import mapper, sessionmaker
from sqlalchemy import Column, Integer, String, ForeignKey, Text, DateTime, Unicode, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import exc
from types import *
from datetime import datetime, date   #NOTE: IMPORTING 'time' FROM datetime HERE WOULD SHADOW THE time MODULE IMPORTED ABOVE
import simplejson                     #NEEDS TO BE INSTALLED SEPARATELY ONCE: pip install simplejson
from twython import Twython           #NEEDS TO BE INSTALLED SEPARATELY ONCE: pip install Twython

###### PART II: IMPORT TWYTHON, ADD TWITTER APP KEY & ACCESS TOKEN (TO ACCESS API) ######
#REPLACE 'APP_KEY' WITH YOUR APP KEY AND 'ACCESS_TOKEN' WITH YOUR ACCESS TOKEN IN THE NEXT TWO LINES
APP_KEY = ' '
ACCESS_TOKEN = ' '
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
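#OPTIONAL SANITY CHECK (A SKETCH, NOT PART OF THE ORIGINAL WORKFLOW) -- UNCOMMENT TO CONFIRM THE
#TOKEN IS ACCEPTED BY PRINTING THE RATE-LIMIT STATUS FOR THE user_timeline ENDPOINT:
#rate_status = twitter.get_application_rate_limit_status()
#print rate_status['resources']['statuses']['/statuses/user_timeline']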

###### PART III: DEFINE TWYTHON FUNCTION FOR GETTING ALL AVAILABLE PAGES OF TWEETS PER USER ######
def get_data_user_timeline_all_pages(kid, page):
    try:
        '''
        'count' specifies the number of tweets to try to retrieve, up to a maximum of 200 per distinct request.
        The value of count is best thought of as a limit on the number of tweets returned, because suspended or
        deleted content is removed after the count has been applied. Retweets are included in the count
        even if include_rts is not supplied; it is recommended to always send include_rts=1 when using this API method.
        THE SCRIPT IS CURRENTLY SET UP TO DOWNLOAD ALL AVAILABLE PAGES -- 16 PAGES OF 200 TWEETS = 3,200 TWEETS/ACCOUNT TOTAL
        '''
        d = twitter.get_user_timeline(screen_name=kid, count="200", page=page, include_entities="true", include_rts="1")
    except Exception as e:
        print "Error reading id %s, exception: %s" % (kid, e)
        return None
    #print len(d), #d[0]   #NUMBER OF ENTRIES RETURNED, FIRST ENTRY
    #print "d.keys(): ", d[0].keys()
    return d
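#OPTIONAL QUICK TEST OF THE FUNCTION ABOVE (A SKETCH -- 'MLB' IS JUST A HYPOTHETICAL HANDLE; UNCOMMENT TO TRY):
#first_page = get_data_user_timeline_all_pages('MLB', 1)
#if first_page:
#    print len(first_page), "TWEETS RETURNED ON PAGE 1"
#    print first_page[0]['id_str'], first_page[0]['text'].encode('utf-8')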

###### PART IV: SET UP COLUMNS FOR TWEET AND ACCOUNT TABLES IN DATABASE ######
'''
THIS PART OF THE SCRIPT CAN BE SPLIT OFF TO A SEPARATE FILE -- WILL ALSO BE USED
FOR OTHER SCRIPTS TO ANALYZE THE DATA AFTER DOWNLOADING THE TWEETS
'''
Base = declarative_base()

#TWEET TABLE -- THIS TABLE WILL BE CREATED AND POPULATED BY THIS PYTHON SCRIPT
class TWEET(Base):
    __tablename__ = 'user_timeline'   #NAME OF TABLE HERE
    rowid = Column(Integer, primary_key=True)
    query = Column(String)
    tweet_id = Column(String)
    tweet_id_str = Column(String, unique=True)   ##### UNIQUE CONSTRAINT #####
    inserted_date = Column(DateTime)
    truncated = Column(String)
    language = Column(String)
    possibly_sensitive = Column(String)   ### NEW
    coordinates = Column(String)   #Represents the geographic location of this Tweet as reported by the user or client application. The inner coordinates array is formatted as geoJSON (longitude first, then latitude).
    retweeted_status = Column(String)
    withheld_in_countries = Column(String)
    withheld_scope = Column(String)
    created_at_text = Column(String)   #UTC time when this Tweet was created.
    created_at = Column(DateTime)
    month = Column(String)
    year = Column(String)
    content = Column(Text)
    from_user_screen_name = Column(String)   #The screen name, handle, or alias that this user identifies themselves with. screen_names are unique but subject to change. Use id_str as a user identifier whenever possible.
    from_user_id = Column(String)
    from_user_followers_count = Column(Integer)   #The number of followers this account currently has.
    from_user_friends_count = Column(Integer)   #The number of users this account is following (AKA their "followings").
    from_user_listed_count = Column(Integer)   #The number of public lists that this user is a member of.
    from_user_favourites_count = Column(Integer)   #The number of tweets this user has favorited in the account's lifetime. British spelling used in the field name for historical reasons.
    from_user_statuses_count = Column(Integer)   #The number of tweets (including retweets) issued by the user.
    from_user_description = Column(String)   #The user-defined UTF-8 string describing their account.
    from_user_location = Column(String)   #The user-defined location for this account's profile.
    from_user_created_at = Column(String)   #The UTC datetime that the user account was created on Twitter.
    retweet_count = Column(Integer)
    favorite_count = Column(Integer)   #Indicates approximately how many times this Tweet has been "favorited" by Twitter users.
    entities_urls = Column(Unicode(255))
    entities_urls_count = Column(Integer)
    entities_hashtags = Column(Unicode(255))
    entities_hashtags_count = Column(Integer)
    entities_mentions = Column(Unicode(255))
    entities_mentions_count = Column(Integer)
    in_reply_to_screen_name = Column(String)
    in_reply_to_status_id = Column(String)
    source = Column(String)
    entities_expanded_urls = Column(Text)
    entities_media_count = Column(Integer)
    media_expanded_url = Column(Text)
    media_url = Column(Text)
    media_type = Column(Text)
    video_link = Column(Integer)
    photo_link = Column(Integer)
    twitpic = Column(Integer)
    num_characters = Column(Integer)
    num_words = Column(Integer)
    retweeted_user = Column(Text)
    retweeted_user_description = Column(Text)
    retweeted_user_screen_name = Column(Text)
    retweeted_user_followers_count = Column(Integer)
    retweeted_user_listed_count = Column(Integer)
    retweeted_user_statuses_count = Column(Integer)
    retweeted_user_location = Column(Text)
    retweeted_tweet_created_at = Column(DateTime)
    Unique_ID = Column(Integer)   #SPECIFIC TO EACH ACCOUNT
    def __init__(self, query, tweet_id, tweet_id_str, inserted_date, truncated, language, possibly_sensitive, coordinates,
                 retweeted_status, withheld_in_countries, withheld_scope, created_at_text, created_at, month, year, content,
                 from_user_screen_name, from_user_id, from_user_followers_count, from_user_friends_count,
                 from_user_listed_count, from_user_favourites_count, from_user_statuses_count, from_user_description,
                 from_user_location, from_user_created_at, retweet_count, favorite_count, entities_urls, entities_urls_count,
                 entities_hashtags, entities_hashtags_count, entities_mentions, entities_mentions_count,
                 in_reply_to_screen_name, in_reply_to_status_id, source, entities_expanded_urls,
                 entities_media_count, media_expanded_url, media_url, media_type, video_link, photo_link, twitpic,
                 num_characters, num_words,
                 retweeted_user, retweeted_user_description, retweeted_user_screen_name, retweeted_user_followers_count,
                 retweeted_user_listed_count, retweeted_user_statuses_count, retweeted_user_location,
                 retweeted_tweet_created_at,
                 Unique_ID,
                 ):
        self.query = query
        self.tweet_id = tweet_id
        self.tweet_id_str = tweet_id_str
        self.inserted_date = inserted_date
        self.truncated = truncated
        self.language = language
        self.possibly_sensitive = possibly_sensitive
        self.coordinates = coordinates
        self.retweeted_status = retweeted_status
        self.withheld_in_countries = withheld_in_countries
        self.withheld_scope = withheld_scope
        self.created_at_text = created_at_text
        self.created_at = created_at
        self.month = month
        self.year = year
        self.content = content
        self.from_user_screen_name = from_user_screen_name
        self.from_user_id = from_user_id
        self.from_user_followers_count = from_user_followers_count
        self.from_user_friends_count = from_user_friends_count
        self.from_user_listed_count = from_user_listed_count
        self.from_user_favourites_count = from_user_favourites_count
        self.from_user_statuses_count = from_user_statuses_count
        self.from_user_description = from_user_description
        self.from_user_location = from_user_location
        self.from_user_created_at = from_user_created_at
        self.retweet_count = retweet_count
        self.favorite_count = favorite_count
        self.entities_urls = entities_urls
        self.entities_urls_count = entities_urls_count
        self.entities_hashtags = entities_hashtags
        self.entities_hashtags_count = entities_hashtags_count
        self.entities_mentions = entities_mentions
        self.entities_mentions_count = entities_mentions_count
        self.in_reply_to_screen_name = in_reply_to_screen_name
        self.in_reply_to_status_id = in_reply_to_status_id
        self.source = source
        self.entities_expanded_urls = entities_expanded_urls
        self.entities_media_count = entities_media_count
        self.media_expanded_url = media_expanded_url
        self.media_url = media_url
        self.media_type = media_type
        self.video_link = video_link
        self.photo_link = photo_link
        self.twitpic = twitpic
        self.num_characters = num_characters
        self.num_words = num_words
        self.retweeted_user = retweeted_user
        self.retweeted_user_description = retweeted_user_description
        self.retweeted_user_screen_name = retweeted_user_screen_name
        self.retweeted_user_followers_count = retweeted_user_followers_count
        self.retweeted_user_listed_count = retweeted_user_listed_count
        self.retweeted_user_statuses_count = retweeted_user_statuses_count
        self.retweeted_user_location = retweeted_user_location
        self.retweeted_tweet_created_at = retweeted_tweet_created_at
        self.Unique_ID = Unique_ID

    def __repr__(self):
        return "<sender, created_at('%s', '%s')>" % (self.from_user_screen_name, self.created_at)

#ACCOUNTS TABLE -- THIS IS WHAT YOU IMPORTED INTO SQLITE EARLIER IN THE TUTORIAL
class ACCOUNT(Base):
    __tablename__ = 'accounts'
    Unique_ID = Column(Integer, primary_key=True)
    org_name = Column(String)
    org_URL = Column(String)
    Twitter_URL = Column(String)
    Twitter_handle = Column(String)
    earliest_tweet_in_db = Column(String)
    number_of_tweets_in_db = Column(Integer)

    def __init__(self, org_name, org_URL, Twitter_URL, Twitter_handle,
                 earliest_tweet_in_db, number_of_tweets_in_db,
                 ):
        self.org_name = org_name
        self.org_URL = org_URL
        self.Twitter_URL = Twitter_URL
        self.Twitter_handle = Twitter_handle
        self.earliest_tweet_in_db = earliest_tweet_in_db
        self.number_of_tweets_in_db = number_of_tweets_in_db

    def __repr__(self):
        return "<Twitter handle, org_name('%s', '%s')>" % (self.Twitter_handle, self.org_name)

###### PART V: DEFINE FUNCTION FOR PARSING JSON-BASED TWEET DATA RETURNED BY TWITTER API -- CREATE VARIABLES TO BE STORED ######
def write_data(self, d, Twitter_handle, Unique_ID):
    Unique_ID = Unique_ID
    query = Twitter_handle   #d['search_metadata']['query']   #THIS IS DIFFERENT FROM MENTIONS AND DMS FILES
    #for sender in d['results']:
    for entry in d:   #d['statuses']:   #THIS IS DIFFERENT FROM MENTIONS AND DMS FILES
        #json_output = str(entry)
        #print json_output
        #json_output = unicode.join(u'\n',map(unicode,json_output_string))
        #ids.append(sender['id'])
        #print sender.keys()
        tweet_id = entry['id']
        tweet_id_str = entry['id_str']
        inserted_date = datetime.now()   #CURRENT DATE AND TIME
        ##### NEW VARIABLES -- UNTESTED #####
        truncated = entry['truncated']
        language = entry['lang']
        #WE NEED THE 'IF' CONDITION CHECK HERE BECAUSE THERE IS NOT A VALUE FOR ALL TWEETS, UNLIKE,
        #SAY, WITH THE 'TEXT' VARIABLE
        if 'possibly_sensitive' in entry:
            possibly_sensitive = entry['possibly_sensitive']
        else:
            possibly_sensitive = ''
        coordinates = []
        if 'coordinates' in entry and entry['coordinates'] != None:
            #print entry['coordinates']['coordinates']
            #[-98.0738001, 27.75346355]
            #-98.0738001 <type 'float'>
            #27.75346355 <type 'float'>
            for coordinate in entry['coordinates']['coordinates']:
                #print coordinate, type(coordinate)
                #coordinates.append(str(coordinate))
                coordinates.append(coordinate)
            #print "HERE ARE THE COORDINATES---------------------->", "".join(format(x, "10.8f") for x in coordinates)
            coordinates = ', '.join(map(str, coordinates))   #CONVERTS THE [longitude, latitude] LIST OF FLOATS INTO A SINGLE COMMA-SEPARATED STRING FOR STORAGE
            #coordinates = entry['coordinates']['coordinates']
            #coordinates = ",".join("'{0}'".format(n) for n in coordinates)
            #print type(coordinates), len(coordinates), coordinates
            #print type(coordinates), coordinates
        else:
            coordinates = ''
        if 'retweeted_status' in entry:
            retweeted_status = 'THIS IS A RETWEET'
            retweeted_user = entry['retweeted_status']['user']['id_str']   ###I SHOULD USE ALL STR OR NOT
            retweeted_user_description = entry['retweeted_status']['user']['description']
            retweeted_user_screen_name = entry['retweeted_status']['user']['screen_name']
            retweeted_user_followers_count = entry['retweeted_status']['user']['followers_count']
            retweeted_user_listed_count = entry['retweeted_status']['user']['listed_count']
            retweeted_user_statuses_count = entry['retweeted_status']['user']['statuses_count']
            retweeted_user_location = entry['retweeted_status']['user']['location']
            retweeted_tweet_created_at_text = entry['retweeted_status']['created_at']
            retweeted_tweet_created_at = datetime.strptime(retweeted_tweet_created_at_text, '%a %b %d %H:%M:%S +0000 %Y')   #<type 'datetime.datetime'>
        else:
            retweeted_status = ''
            retweeted_user = ''
            retweeted_user_description = ''
            retweeted_user_screen_name = ''
            retweeted_user_followers_count = ''
            retweeted_user_listed_count = ''
            retweeted_user_statuses_count = ''
            retweeted_user_location = ''
            retweeted_tweet_created_at = None
        if 'withheld_in_countries' in entry:
            withheld_in_countries = 'WITHHELD --> CHECK JSON'
        else:
            withheld_in_countries = ''
        if 'withheld_scope' in entry:
            withheld_scope = entry['withheld_scope']
        else:
            withheld_scope = ''
        #print sender['text']
        # init field values
        content = entry['text']
        content = content.replace('\n', '')
        print '\n', content.encode('utf-8')
        num_characters = len(content)   #NUMBER OF CHARACTERS (SPACES INCLUDED)
        words = content.split()
        num_words = len(words)
        created_at_text = entry['created_at']   ##### Fri Jun 24 18:14:34 +0000 2011 --> <type 'str'>
        ##### EXAMPLE OF HOW TO CONVERT STRING TO DATETIME FORMAT AND THEN MODIFY
        #t = datetime.strptime("20091229050936", "%Y%m%d%H%M%S")
        #print t.strftime('%H:%M %d %B %Y (UTC)')
        #fred = t.strftime('%H:%M %d %B %Y (UTC)')   ## 05:09 29 December 2009 (UTC) --> <type 'str'>
        #date_test = datetime.strptime('Fri Jun 24 18:14:34 +0000 2011', '%a %b %d %H:%M:%S +0000 %Y')   #<type 'datetime.datetime'>
        created_at = datetime.strptime(created_at_text, '%a %b %d %H:%M:%S +0000 %Y')   #<type 'datetime.datetime'>
        #date = twitter_date.strftime('%Y-%B-%d %H:%M')   ### BETTER OPTION BELOW (HAS MONTH, DAY AS NUMBERS)
        created_at2 = created_at.strftime('%Y-%m-%d %H:%M:%S')   ### THIS WORKS WELL, BUT CARRIES THE SAME INFORMATION AS created_at ABOVE
        #month = datetime.strptime(created_at_text, '%b')
        #year = datetime.strptime(created_at_text, '%Y')   #ValueError: time data 'Thu Oct 17 17:47:34 +0000 2013' does not match format '%Y'
        month = created_at.strftime('%m')
        year = created_at.strftime('%Y')
        #print 'month', month, 'year', year
        #print created_at, created_at2
        #from_user = sender['from_user']   ##### THIS NEEDS TO BE UPDATED --> 7/11/13 IT'S UNDER user, WHICH HAS SUB-KEYS
        from_user_screen_name = entry['user']['screen_name']
        #from_user_id = sender['from_user_id']   ##### THIS NEEDS TO BE UPDATED --> 7/11/13 IT'S UNDER user, WHICH HAS SUB-KEYS
        from_user_id = entry['user']['id']
        from_user_followers_count = entry['user']['followers_count']
        from_user_friends_count = entry['user']['friends_count']
        from_user_listed_count = entry['user']['listed_count']
        from_user_favourites_count = entry['user']['favourites_count']
        print '\n', 'from_user_favourites_count-------------->', from_user_favourites_count
        from_user_statuses_count = entry['user']['statuses_count']
        from_user_description = entry['user']['description']
        from_user_location = entry['user']['location']
        from_user_created_at = entry['user']['created_at']
        retweet_count = entry['retweet_count']
        favorite_count = entry['favorite_count']
        in_reply_to_screen_name = entry['in_reply_to_screen_name']
        in_reply_to_status_id = entry['in_reply_to_status_id']
        #GENERATE COUNT VARIABLES FOR THE # OF URLS, HASHTAGS, AND MENTIONS
        entities_urls_count = len(entry['entities']['urls'])
        entities_hashtags_count = len(entry['entities']['hashtags'])
        entities_mentions_count = len(entry['entities']['user_mentions'])
        source = entry['source']
        #NOW WE ARE GOING TO CREATE VARIABLES FOR URLS, TAGS, AND MENTIONS --> THE DIFFICULTY HERE
        #IS THAT THERE IS NO UNIFORM NUMBER OF, SAY, TAGS IN EACH TWEET, SO WE CANNOT CREATE
        #A FIXED NUMBER OF COLUMNS/VARIABLES BEFOREHAND. INSTEAD, WE'LL CREATE A STRING FOR EACH
        #VARIABLE --> IT CAN BE PARSED OUT OR LOOPED OVER LATER AS NEEDED (SEE THE COMMENTED SKETCH BELOW).
        entities_urls, entities_expanded_urls, entities_hashtags, entities_mentions = [], [], [], []
        #print type(entry['entities']['urls'])   ##### IT'S A LIST
        #urls = entry['entities']['urls']
        for link in entry['entities']['urls']:
            if 'url' in link:
                url = link['url']
                expanded_url = link['expanded_url']
                #print link['url'], link['expanded_url']
                entities_urls.append(url)
                entities_expanded_urls.append(expanded_url)
            else:
                print "No urls in entry"
        for hashtag in entry['entities']['hashtags']:
            if 'text' in hashtag:
                tag = hashtag['text']
                #print hashtag['text']
                entities_hashtags.append(tag)
            else:
                print "No hashtags in entry"
        for at in entry['entities']['user_mentions']:
            if 'screen_name' in at:
                mention = at['screen_name']
                #print at['screen_name']
                entities_mentions.append(mention)
            else:
                print "No mentions in entry"
        entities_mentions = string.join(entities_mentions, u", ")
        entities_hashtags = string.join(entities_hashtags, u", ")
        entities_urls = string.join(entities_urls, u", ")
        entities_expanded_urls = string.join(entities_expanded_urls, u", ")
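        #A SKETCH (NOT IN THE ORIGINAL SCRIPT) OF HOW THESE JOINED STRINGS CAN BE PARSED BACK OUT
        #LATER, E.G., IN AN ANALYSIS SCRIPT THAT READS THE DATABASE:
        #hashtag_list = entities_hashtags.split(u", ") if entities_hashtags else []
        #mention_list = entities_mentions.split(u", ") if entities_mentions else []
        #expanded_url_list = entities_expanded_urls.split(u", ") if entities_expanded_urls else []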
        video_link = 0
        if 'vimeo' in entities_expanded_urls or 'youtube' in entities_expanded_urls or 'youtu' in entities_expanded_urls or 'vine' in entities_expanded_urls:
            video_link = 1   #All of these videos show up in 'View Media' in tweets
            #print "FOUND A VIDEO!!!"
        else:
            video_link = 0
        #if photo_text.find('twitpic'):   # or photo_text.find('instagram') or photo_text.find('instagr'):
        #if 'twitpic' in photo_text:
        if 'twitpic' in entities_expanded_urls:
            twitpic = 1   #twitpic images show up in 'View Media' in tweets
            #print "FOUND A TWITPIC LINK!"
        else:
            twitpic = 0
        if 'twitpic' in entities_expanded_urls or 'instagram' in entities_expanded_urls or 'instagr' in entities_expanded_urls:
            photo_link = 1   #instagram images DO NOT show up in 'View Media' in tweets
            #print "FOUND A TWITPIC OR INSTAGRAM LINK!!!"
        else:
            photo_link = 0
        #CONVERT TO UNICODE FOR INSERTION INTO SQLITE DB
        entities_urls = unicode(entities_urls)
        entities_expanded_urls = unicode(entities_expanded_urls)
        entities_hashtags = unicode(entities_hashtags)
        entities_mentions = unicode(entities_mentions)
        #print "urls...?....", entities_urls
        #print "user_mentions...?....", entities_mentions
        #print "hashtags...?....", entities_hashtags
        #if 'symbols' in entry['entities']:
        #    print "HERE ARE THE SYMBOLS.......", entry['entities']['symbols']
        #else:
        #    print "THERE AIN'T NO entry['entities']['symbols']"
        if 'media' in entry['entities']:
            #print "HERE ARE THE MEDIA.......", entry['entities']['media']
            entities_media_count = len(entry['entities']['media'])
            #'expanded_url'      #The fully resolved media URL [FULL TWEET PLUS PICTURE] --> e.g., http://twitter.com/StJude/status/347801636351135744/photo/1
            #'display_url'       #Not a URL but a string to display instead of the media URL --> e.g., pic.twitter.com/hO4BjuqrnE
            #'media_url_https'   #The SSL URL of the media file --> e.g., https://pbs.twimg.com/media/BNOjvtuCUAA4S66.jpg
            #'type'              #Only photo for now --> e.g., photo
            #'media_url'         #The URL of the media file --> e.g., http://pbs.twimg.com/media/BNOjvtuCUAA4S66.jpg
        else:
            entities_media_count = ''
        #for a in entry['entities']['media']:
        #    print a['expanded_url'], a['media_url']   #, a['media_type']
        #bob = entry['entities']['media'][0]
        #print bob
        #print bob['expanded_url']
        if 'media' in entry['entities']:
            if 'expanded_url' in entry['entities']['media'][0]:
                media_expanded_url = entry['entities']['media'][0]['expanded_url']
            else:
                #print "THERE AIN'T NO expanded_url in entry['entities']['media']"
                media_expanded_url = ''
            if 'media_url' in entry['entities']['media'][0]:
                media_url = entry['entities']['media'][0]['media_url']
            else:
                #print "THERE AIN'T NO media_url in entry['entities']['media']"
                media_url = ''
            if 'type' in entry['entities']['media'][0]:
                media_type = entry['entities']['media'][0]['type']
            else:
                #print "THERE AIN'T NO type in entry['entities']['media']"
                media_type = ''
        else:
            media_type = ''
            media_url = ''
            media_expanded_url = ''
        upd = TWEET(query, tweet_id, tweet_id_str, inserted_date, truncated, language, possibly_sensitive,
                    coordinates, retweeted_status, withheld_in_countries, withheld_scope, created_at_text,
                    created_at, month, year, content, from_user_screen_name, from_user_id, from_user_followers_count,
                    from_user_friends_count, from_user_listed_count, from_user_favourites_count, from_user_statuses_count, from_user_description,
                    from_user_location, from_user_created_at, retweet_count, favorite_count, entities_urls, entities_urls_count,
                    entities_hashtags, entities_hashtags_count, entities_mentions, entities_mentions_count,
                    in_reply_to_screen_name, in_reply_to_status_id, source, entities_expanded_urls,   #json_output,
                    entities_media_count, media_expanded_url, media_url, media_type, video_link, photo_link, twitpic,
                    num_characters, num_words,
                    retweeted_user, retweeted_user_description, retweeted_user_screen_name, retweeted_user_followers_count,
                    retweeted_user_listed_count, retweeted_user_statuses_count, retweeted_user_location,
                    retweeted_tweet_created_at,
                    Unique_ID,
                    )
        self.session.add(upd)
        try:
            self.session.commit()
        except exc.SQLAlchemyError:
            self.session.rollback()
            print " NOT INSERTING --> IT'S A DUPLICATE"

###### PART VI: MAIN LOOP OVER EACH ID STORED IN 'ACCOUNTS' TABLE OF DATABASE -- 16 PAGES OF 200 TWEETS PER ID ######
class Scrape:
    def __init__(self):
        engine = sqlalchemy.create_engine("sqlite:///test.sqlite", echo=False)   # YOUR DATABASE NAME HERE
        Session = sessionmaker(bind=engine)
        self.session = Session()
        Base.metadata.create_all(engine)

    def main(self):
        all_ids = self.session.query(ACCOUNT).all()
        print 'len(all_ids)', len(all_ids)
        #CHECK IF THE 'ACCOUNTS' TABLE HAS BEEN POPULATED; IF NOT, ADD DATA FROM
        #CSV FILE TO THE TABLE
        if len(all_ids) < 1:
            conn = sqlite3.connect('test.sqlite')
            import pandas as pd
            df = pd.read_csv('accounts.csv')
            df.to_sql('accounts', conn, if_exists='append', index=False)
            all_ids = self.session.query(ACCOUNT).all()
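        #A NOTE ON THE EXPECTED accounts.csv (AN ASSUMPTION BASED ON THE COLUMNS OF THE ACCOUNT TABLE
        #IN PART IV -- SEE THE TUTORIAL FOR THE AUTHORITATIVE FORMAT): A HEADER ROW SUCH AS THE
        #FOLLOWING, WITH ONE ROW PER TWITTER HANDLE TO DOWNLOAD:
        #Unique_ID,org_name,org_URL,Twitter_URL,Twitter_handle,earliest_tweet_in_db,number_of_tweets_in_db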
        keys = []
        for i in all_ids:
            Unique_ID = i.Unique_ID
            Twitter_handle = i.Twitter_handle
            kid = Twitter_handle
            rowid = i.Unique_ID
            print '\n', "\rprocessing id %s/%s -- %s" % (rowid, len(all_ids), Twitter_handle),
            sys.stdout.flush()
            page = 1
            while page < 17:   #TO DOWNLOAD FEWER PAGES OF TWEETS, MODIFY THIS AND THE 'if page > 16' CHECK BELOW
                print "------XXXXXX------ STARTING PAGE", page
                d = get_data_user_timeline_all_pages(kid, page)
                if not d:
                    print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
                    break
                if len(d) == 0:
                    print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
                    break
                write_data(self, d, Twitter_handle, Unique_ID)
                #self.session.commit()
                #print 'pausing for 1 second'
                #time.sleep(1)   #PAUSE FOR 1 SECOND
                page += 1
                if page > 16:
                    print "WE'RE AT THE END OF PAGE 16!!!!!"
                    break
        self.session.commit()
        print '\n', '\n', '\n', "FINISHED WITH ALL IDS"
        self.session.close()

if __name__ == "__main__":
    s = Scrape()
    s.main()
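
#TO RUN, FROM A TERMINAL (ONCE APP_KEY/ACCESS_TOKEN IN PART II ARE FILLED IN AND accounts.csv IS IN
#THE SAME FOLDER AS THIS SCRIPT):
#    python Social_Metrics_Tutorial_Script_User_Timeline_All_Pages.py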