Last active
October 19, 2017 14:36
-
-
Save giocomai/ea5fa8bd0732728abc05befedf8a2ac7 to your computer and use it in GitHub Desktop.
Extract the full Twitter history of all current members of the European Parliament with R (rtweet) #rstats
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pacman takes care of package installation/loading: p_load() installs a
# package when it is missing and then attaches it.
# requireNamespace() is used for the availability check so that a missing
# pacman is detected without attaching anything as a side effect.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load("tidyverse")
pacman::p_load("ROAuth")
pacman::p_load("rtweet")
# This assumes authentication has been taken care of as explained here:
# http://rtweet.info/articles/auth.html
# It should still work without access tokens, but the API rate limits are then
# much stricter (it would take *a lot* longer to get all tweets).
# It is expected that this script will need to be run more than once:
# interim files are automatically stored and recovered if the script is re-run.

# Create the output folders. recursive = TRUE creates "TwitterMEP" and
# "TwitterMEP/data" in one call; showWarnings = FALSE keeps re-runs quiet when
# the folders already exist.
dir.create(
  path = file.path("TwitterMEP", "data"),
  showWarnings = FALSE,
  recursive = TRUE
)
## Get the list of Twitter accounts of all MEPs.
# On the first run the list is downloaded, the leading "@" is stripped from
# the screen names, and the result is cached on disk. On later runs the cached
# copy is loaded instead, so that the OldestReached flags set by previous runs
# are preserved.
if (!file.exists(file.path("TwitterMEP", "data", "MEPsDF.rds"))) {
  MEPsDF <- read_csv(file = "https://raw.githubusercontent.com/eliflab/European-Parliament-Open-Data/master/meps_full_list_with_twitter_accounts.csv") %>%
    mutate(
      SCREEN_NAME = stringr::str_replace_all(
        string = SCREEN_NAME,
        pattern = stringr::fixed("@"),
        replacement = ""
      )
    )
  # Control column: switched to TRUE once the oldest available tweet of an MEP
  # has been reached, so that later runs can skip that account.
  MEPsDF$OldestReached <- FALSE
  saveRDS(object = MEPsDF, file = file.path("TwitterMEP", "data", "MEPsDF.rds"))
} else {
  MEPsDF <- readRDS(file.path("TwitterMEP", "data", "MEPsDF.rds"))
}
# If this is not the first run, load the tweets collected so far; otherwise
# start from a list with one empty (NULL) slot per row of MEPsDF, named after
# the screen names. Slots are later addressed by row index, so the list must
# keep the same order as MEPsDF.
if (file.exists(file.path("TwitterMEP", "data", "allMEPtweets.rds"))) {
  allMEPtweets <- readRDS(file.path("TwitterMEP", "data", "allMEPtweets.rds"))
} else {
  allMEPtweets <- setNames(
    vector("list", length(MEPsDF$SCREEN_NAME)),
    MEPsDF$SCREEN_NAME
  )
}
# Return the numerically smallest status id in `ids`, or NA_character_ when
# there is none (empty input or only NAs, as returned for suspended accounts).
# Assumes ids are character strings of decimal digits without leading zeros,
# as rtweet returns them: plain min() would compare them lexicographically and
# rank a short (older) id after a longer one, so only the shortest strings are
# compared.
oldestStatusId <- function(ids) {
  ids <- as.character(ids)
  ids <- ids[!is.na(ids)]
  if (length(ids) == 0) {
    return(NA_character_)
  }
  shortest <- ids[nchar(ids) == min(nchar(ids))]
  min(shortest)
}

# Request up to 3200 tweets from a user's timeline, optionally only those with
# an id not newer than `maxId`. Returns NULL (after reporting the reason) when
# the request fails, so the caller can move on and retry on a later run.
fetchTimeline <- function(screenName, maxId = NULL) {
  tryCatch(
    expr = if (is.null(maxId)) {
      get_timeline(user = screenName, n = 3200)
    } else {
      get_timeline(user = screenName, n = 3200, max_id = maxId)
    },
    error = function(e) {
      message(
        "Request for user ", screenName, " failed: ", conditionMessage(e)
      )
      NULL
    }
  )
}

# Visit MEPs that are on Twitter in random order. Indexing with sample.int()
# avoids the sample(x) pitfall where a single index x is read as 1:x.
onTwitter <- which(!is.na(MEPsDF$SCREEN_NAME))
for (i in onTwitter[sample.int(length(onTwitter))]) {
  # Skip accounts whose oldest available tweet was reached in a previous run.
  if (isTRUE(MEPsDF$OldestReached[i])) {
    next
  }
  screenName <- MEPsDF$SCREEN_NAME[i]

  # Start from tweets collected previously, or ask for the latest ones if
  # nothing has been collected yet for this MEP.
  tweets <- allMEPtweets[[i]]
  if (is.null(tweets)) {
    tweets <- fetchTimeline(screenName)
  }
  if (is.null(tweets)) {
    next # request failed: nothing to store, try again on the next run
  }

  # Page backwards until no older tweet is returned. An NA here means there is
  # no usable id to page from (e.g. suspended account), so paging is skipped.
  oldest <- oldestStatusId(tweets$status_id)
  while (!is.na(oldest)) {
    older <- fetchTimeline(screenName, maxId = oldest)
    if (is.null(older)) {
      break # request failed: leave OldestReached unset so a later run retries
    }
    olderOldest <- oldestStatusId(older$status_id)
    if (is.na(olderOldest)) {
      break # no usable ids in the response: stop without marking as complete
    }
    if (oldestStatusId(c(oldest, olderOldest)) == oldest) {
      # Nothing older than what is already stored: the end has been reached.
      MEPsDF$OldestReached[i] <- TRUE
      break
    }
    tweets <- bind_rows(tweets, older)
    # Advance the cursor *before* the next request; reusing the previous id
    # would just repeat the same request.
    oldest <- olderOldest
    Sys.sleep(time = 1)
  }

  # Store interim results after every MEP so the script can be interrupted and
  # resumed; max_id is treated as inclusive, hence the de-duplication.
  allMEPtweets[[i]] <- tweets %>% distinct()
  saveRDS(object = allMEPtweets, file = file.path("TwitterMEP", "data", "allMEPtweets.rds"))
  saveRDS(object = MEPsDF, file = file.path("TwitterMEP", "data", "MEPsDF.rds"))
  message(paste("New tweets for user", screenName, "stored.")) # inform of progress
  Sys.sleep(time = 1)
}
# Transform the per-MEP list into a single data frame. bind_rows() accepts the
# list directly and ignores the NULL slots of MEPs without collected tweets.
allMEPtweetsDF <- bind_rows(allMEPtweets) %>% distinct()
# Report the number of unique tweets collected (already de-duplicated above).
nrow(allMEPtweetsDF)
# Merge with the initial data frame to include more details on MEPs.
# NOTE(review): the join is an exact, case-sensitive match between the screen
# name in the MEP list and the one returned with the tweets — confirm that the
# two sources agree on capitalisation, otherwise MEP details will be NA.
allMEPfull <- left_join(
  allMEPtweetsDF,
  MEPsDF %>% rename(screen_name = SCREEN_NAME),
  by = "screen_name"
)
# Store the final dataset.
saveRDS(object = allMEPfull, file = file.path("TwitterMEP", "data", "allMEPfull.rds"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment