giocomai · October 19, 2017 14:36
diff --git a/MEPsOnTwitter_rtweet.R b/MEPsOnTwitter_rtweet.R
 if (!require("pacman")) install.packages("pacman") # for taking care of package installation/loading
 pacman::p_load("tidyverse")
 pacman::p_load("ROAuth")
 pacman::p_load("rtweet")

 # this assumes authentication has been taken care of as explained here: http://rtweet.info/articles/auth.html
 # it should still work without access tokens, but the API rate limits are then much stricter (it would take *a lot* longer to get all tweets)

 # it is expected that this script will need to be run more than once
 # interim files are automatically stored and recovered if the script is re-run

 # make sure the output folder and its parent exist (folders that are already there are silently accepted)
 dir.create(path = file.path("TwitterMEP", "data"), showWarnings = FALSE, recursive = TRUE)

 ## table of all MEPs with their Twitter accounts (downloaded once, then cached on disk)
 if (file.exists(file.path("TwitterMEP", "data", "MEPsDF.rds"))) {
   # a cached copy is available: reuse it instead of downloading the list again
   MEPsDF <- read_rds(file.path("TwitterMEP", "data", "MEPsDF.rds"))
 } else {
   MEPsDF <- read_csv(file = "https://raw.githubusercontent.com/eliflab/European-Parliament-Open-Data/master/meps_full_list_with_twitter_accounts.csv")
   # strip the "@" from the handles
   MEPsDF <- mutate(MEPsDF, SCREEN_NAME = stringr::str_replace_all(SCREEN_NAME, stringr::fixed("@"), ""))
   # control column: switched to TRUE once the oldest tweet of an MEP has been reached,
   # so that the account can be skipped on later runs
   MEPsDF$OldestReached <- FALSE

   saveRDS(MEPsDF, file = file.path("TwitterMEP", "data", "MEPsDF.rds"))
 }

 # resume from the tweets stored by a previous run, if there are any
 if (!file.exists(file.path("TwitterMEP", "data", "allMEPtweets.rds"))) {
   # first run: one empty (NULL) slot per row of MEPsDF, named after the Twitter handle
   allMEPtweets <- vector(mode = "list", length = length(MEPsDF$SCREEN_NAME))
   names(allMEPtweets) <- MEPsDF$SCREEN_NAME
 } else {
   allMEPtweets <- readRDS(file.path("TwitterMEP", "data", "allMEPtweets.rds"))
 }


 # Pick the oldest (numerically smallest) status id from a vector of ids.
 # The ids are assumed to be strings of digits, as rtweet returns them (confirm
 # against the rtweet version in use). A plain min() on such strings compares them
 # alphabetically, so "98" would rank after "920"; ordering by number of digits
 # first and by the digits second gives the numeric order without converting the
 # ids to (imprecise) doubles.
 # Returns NA_character_ when there is no usable id (empty input or only NAs).
 oldest_status_id <- function(ids) {
   ids <- as.character(ids)
   ids <- ids[!is.na(ids)]
   if (length(ids) == 0) {
     return(NA_character_)
   }
   ids[order(nchar(ids), ids, method = "radix")][1]
 }

 # Ask for up to 3200 tweets of one account; further arguments (e.g. max_id) are
 # handed over to get_timeline().
 # Returns NULL when the request throws an error, after reporting the reason.
 fetch_mep_timeline <- function(user, ...) {
   tryCatch(
     expr = get_timeline(user = user, n = 3200, ...),
     error = function(e) {
       message("Request for user ", user, " failed: ", conditionMessage(e))
       NULL
     }
   )
 }

 # Go through the MEPs in random order, excluding those not on Twitter.
 # The permutation is built with sample.int() because sample(x) treats a single
 # number n as "sample from 1:n", which would visit the wrong rows if only one
 # MEP were left to process.
 MEPsOnTwitter <- which(!is.na(MEPsDF$SCREEN_NAME))
 for (i in MEPsOnTwitter[sample.int(length(MEPsOnTwitter))]) {
   # the oldest tweet of this MEP has already been found: nothing left to ask for
   if (isTRUE(MEPsDF$OldestReached[i])) {
     next
   }
   if (is.null(allMEPtweets[[i]])) {
     # no tweet has previously been collected for this MEP: ask for the latest tweets
     temp <- fetch_mep_timeline(MEPsDF$SCREEN_NAME[i])
   } else {
     # some tweets are already present: continue from them
     temp <- allMEPtweets[[i]]
   }
   # the request failed: leave this MEP for a later run
   if (is.null(temp)) {
     next
   }

   minId1 <- oldest_status_id(temp$status_id) # id of the oldest tweet collected so far
   # suspended accounts throw back a data_frame of NAs: there is no id to page from,
   # so the result is stored as it is and no older tweets are requested
   keepPaging <- !is.na(minId1)
   while (keepPaging) { # until the oldest tweet is found, keep on asking for previous tweets
     temp2 <- fetch_mep_timeline(MEPsDF$SCREEN_NAME[i], max_id = minId1)
     minId2 <- if (is.null(temp2)) NA_character_ else oldest_status_id(temp2$status_id)
     if (is.na(minId2)) {
       # request failed or returned nothing usable: stop here without flagging
       # the account, so that it is tried again on the next run
       keepPaging <- FALSE
     } else if (identical(minId2, minId1)) {
       # nothing older than what is already stored came back
       MEPsDF$OldestReached[i] <- TRUE
       keepPaging <- FALSE
     } else {
       temp <- bind_rows(temp, temp2)
       previousMin <- minId1
       # refresh the paging cursor *before* the next request, so that the next
       # page starts from the oldest tweet now known
       minId1 <- oldest_status_id(temp$status_id)
       if (identical(minId1, previousMin)) {
         # the new page contained nothing older: stop instead of requesting the
         # same page forever
         keepPaging <- FALSE
       }
     }
     Sys.sleep(time = 1)
   }

   # store interim results after every MEP, so that the script can be re-run and resume
   allMEPtweets[[i]] <- distinct(temp)
   saveRDS(object = allMEPtweets, file = file.path("TwitterMEP", "data", "allMEPtweets.rds"))
   saveRDS(object = MEPsDF, file = file.path("TwitterMEP", "data", "MEPsDF.rds"))
   message(paste("New tweets for user", MEPsDF$SCREEN_NAME[i], "stored.")) # inform of progress
   Sys.sleep(time = 1)
 }

 # collapse the per-MEP list into one data frame and drop duplicated rows
 allMEPtweetsDF <- distinct(map_df(allMEPtweets, bind_rows))

 # number of distinct rows collected
 nrow(distinct(allMEPtweetsDF))

 # add the details on each MEP to the tweets, matching the tweets' screen_name
 # with the SCREEN_NAME column of the initial data frame
 allMEPfull <- left_join(allMEPtweetsDF, MEPsDF, by = c("screen_name" = "SCREEN_NAME"))

 # store the final dataset
 saveRDS(allMEPfull, file = file.path("TwitterMEP", "data", "allMEPfull.rds"))
	if (!require("pacman")) install.packages("pacman") # for taking care of package installation/loading
	pacman::p_load("tidyverse")
	pacman::p_load("ROAuth")
	pacman::p_load("rtweet")

	# this assumes authentication has been taken care of as explained here: http://rtweet.info/articles/auth.html
	# it should still work without access tokens, but the API rate limits are then much stricter (it would take a lot longer to get all tweets)

	# it is expected that this script will need to be run more than once
	# interim files are automatically stored and recovered if the script is re-run

	# make sure the output folder and its parent exist (folders that are already there are silently accepted)
	dir.create(path = file.path("TwitterMEP", "data"), showWarnings = FALSE, recursive = TRUE)

	## table of all MEPs with their Twitter accounts (downloaded once, then cached on disk)
	if (file.exists(file.path("TwitterMEP", "data", "MEPsDF.rds"))) {
	  # a cached copy is available: reuse it instead of downloading the list again
	  MEPsDF <- read_rds(file.path("TwitterMEP", "data", "MEPsDF.rds"))
	} else {
	  MEPsDF <- read_csv(file = "https://raw.githubusercontent.com/eliflab/European-Parliament-Open-Data/master/meps_full_list_with_twitter_accounts.csv")
	  # strip the "@" from the handles
	  MEPsDF <- mutate(MEPsDF, SCREEN_NAME = stringr::str_replace_all(SCREEN_NAME, stringr::fixed("@"), ""))
	  # control column: switched to TRUE once the oldest tweet of an MEP has been reached,
	  # so that the account can be skipped on later runs
	  MEPsDF$OldestReached <- FALSE

	  saveRDS(MEPsDF, file = file.path("TwitterMEP", "data", "MEPsDF.rds"))
	}

	# resume from the tweets stored by a previous run, if there are any
	if (!file.exists(file.path("TwitterMEP", "data", "allMEPtweets.rds"))) {
	  # first run: one empty (NULL) slot per row of MEPsDF, named after the Twitter handle
	  allMEPtweets <- vector(mode = "list", length = length(MEPsDF$SCREEN_NAME))
	  names(allMEPtweets) <- MEPsDF$SCREEN_NAME
	} else {
	  allMEPtweets <- readRDS(file.path("TwitterMEP", "data", "allMEPtweets.rds"))
	}


	# Pick the oldest (numerically smallest) status id from a vector of ids.
	# The ids are assumed to be strings of digits, as rtweet returns them (confirm
	# against the rtweet version in use). A plain min() on such strings compares them
	# alphabetically, so "98" would rank after "920"; ordering by number of digits
	# first and by the digits second gives the numeric order without converting the
	# ids to (imprecise) doubles.
	# Returns NA_character_ when there is no usable id (empty input or only NAs).
	oldest_status_id <- function(ids) {
	  ids <- as.character(ids)
	  ids <- ids[!is.na(ids)]
	  if (length(ids) == 0) {
	    return(NA_character_)
	  }
	  ids[order(nchar(ids), ids, method = "radix")][1]
	}

	# Ask for up to 3200 tweets of one account; further arguments (e.g. max_id) are
	# handed over to get_timeline().
	# Returns NULL when the request throws an error, after reporting the reason.
	fetch_mep_timeline <- function(user, ...) {
	  tryCatch(
	    expr = get_timeline(user = user, n = 3200, ...),
	    error = function(e) {
	      message("Request for user ", user, " failed: ", conditionMessage(e))
	      NULL
	    }
	  )
	}

	# Go through the MEPs in random order, excluding those not on Twitter.
	# The permutation is built with sample.int() because sample(x) treats a single
	# number n as "sample from 1:n", which would visit the wrong rows if only one
	# MEP were left to process.
	MEPsOnTwitter <- which(!is.na(MEPsDF$SCREEN_NAME))
	for (i in MEPsOnTwitter[sample.int(length(MEPsOnTwitter))]) {
	  # the oldest tweet of this MEP has already been found: nothing left to ask for
	  if (isTRUE(MEPsDF$OldestReached[i])) {
	    next
	  }
	  if (is.null(allMEPtweets[[i]])) {
	    # no tweet has previously been collected for this MEP: ask for the latest tweets
	    temp <- fetch_mep_timeline(MEPsDF$SCREEN_NAME[i])
	  } else {
	    # some tweets are already present: continue from them
	    temp <- allMEPtweets[[i]]
	  }
	  # the request failed: leave this MEP for a later run
	  if (is.null(temp)) {
	    next
	  }

	  minId1 <- oldest_status_id(temp$status_id) # id of the oldest tweet collected so far
	  # suspended accounts throw back a data_frame of NAs: there is no id to page from,
	  # so the result is stored as it is and no older tweets are requested
	  keepPaging <- !is.na(minId1)
	  while (keepPaging) { # until the oldest tweet is found, keep on asking for previous tweets
	    temp2 <- fetch_mep_timeline(MEPsDF$SCREEN_NAME[i], max_id = minId1)
	    minId2 <- if (is.null(temp2)) NA_character_ else oldest_status_id(temp2$status_id)
	    if (is.na(minId2)) {
	      # request failed or returned nothing usable: stop here without flagging
	      # the account, so that it is tried again on the next run
	      keepPaging <- FALSE
	    } else if (identical(minId2, minId1)) {
	      # nothing older than what is already stored came back
	      MEPsDF$OldestReached[i] <- TRUE
	      keepPaging <- FALSE
	    } else {
	      temp <- bind_rows(temp, temp2)
	      previousMin <- minId1
	      # refresh the paging cursor *before* the next request, so that the next
	      # page starts from the oldest tweet now known
	      minId1 <- oldest_status_id(temp$status_id)
	      if (identical(minId1, previousMin)) {
	        # the new page contained nothing older: stop instead of requesting the
	        # same page forever
	        keepPaging <- FALSE
	      }
	    }
	    Sys.sleep(time = 1)
	  }

	  # store interim results after every MEP, so that the script can be re-run and resume
	  allMEPtweets[[i]] <- distinct(temp)
	  saveRDS(object = allMEPtweets, file = file.path("TwitterMEP", "data", "allMEPtweets.rds"))
	  saveRDS(object = MEPsDF, file = file.path("TwitterMEP", "data", "MEPsDF.rds"))
	  message(paste("New tweets for user", MEPsDF$SCREEN_NAME[i], "stored.")) # inform of progress
	  Sys.sleep(time = 1)
	}

	# collapse the per-MEP list into one data frame and drop duplicated rows
	allMEPtweetsDF <- distinct(map_df(allMEPtweets, bind_rows))

	# number of distinct rows collected
	nrow(distinct(allMEPtweetsDF))

	# add the details on each MEP to the tweets, matching the tweets' screen_name
	# with the SCREEN_NAME column of the initial data frame
	allMEPfull <- left_join(allMEPtweetsDF, MEPsDF, by = c("screen_name" = "SCREEN_NAME"))

	# store the final dataset
	saveRDS(allMEPfull, file = file.path("TwitterMEP", "data", "allMEPfull.rds"))