Last active
February 23, 2016 21:19
-
-
Save bearloga/255073053a9be8236bb2 to your computer and use it in GitHub Desktop.
Scrapes some basic info from my Twitter followers' bios. I used this to get an approximate lower bound on how many data science-y folks follow me.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Twitter handle whose follower list is scraped.
your_handle <- "bearloga"
# Note: must be logged in to twitter to view your own or anyone's list of followers
library(magrittr) # install.packages('magrittr')
library(rvest) # install.packages('rvest')
library(RSelenium) # install.packages('RSelenium')

# Scrolling solution by NicE (http://stackoverflow.com/a/29965233/1091835):
checkForServer()
startServer()
remote_driver <- remoteDriver$new()
remote_driver$open()

# Load the followers page and scroll so more follower cards are rendered,
# then grab the resulting page source. `finally` closes the browser session
# even if navigation or scrolling fails, so a failed run does not leave a
# stray browser window/session behind.
page_source <- tryCatch(
  {
    remote_driver$navigate(paste0("https://twitter.com/", your_handle, "/followers"))
    for (i in seq_len(25)) {
      # Scroll down N times, waiting for the page to load at each time
      remote_driver$executeScript(paste("scroll(0, ", i * 10000, ");"))
      Sys.sleep(3)
    }
    remote_driver$getPageSource()
  },
  finally = remote_driver$close()
)
rm(i) # the loop index is not needed past this point

# Offline test: skip the browser steps above and parse a saved copy directly,
# e.g. html <- read_html("~/Desktop/followers.html")
# getPageSource() returns a list; its first element is parsed as the HTML.
html <- read_html(page_source[[1]])
# Text of every follower card's bio paragraph inside the profile timeline,
# with surrounding whitespace trimmed.
bios <- html_text(
  html_nodes(html, 'div[data-test-selector="ProfileTimeline"] p.ProfileCard-bio'),
  trim = TRUE
)
# One logical column per topic: TRUE when the follower's bio matches that
# topic's keyword pattern(s). The final column flags bios matching any topic.
stats_keywords <- local({
  # Does each bio match `pattern`? Case-insensitive unless stated otherwise.
  has <- function(pattern, ignore_case = TRUE) {
    grepl(pattern, bios, ignore.case = ignore_case)
  }
  flags <- data.frame(
    Biostatistics = has("biostat"),
    Statistics = has("stats") | has("statist"),
    Data = has("data"),
    # A standalone capital "R" is matched case-sensitively so that ordinary
    # lowercase "r" words are not counted.
    R = has("\\bR\\b", ignore_case = FALSE) | has("rstats"),
    Datavis = has("datavi[sz]") | has("visualiz"),
    ML = has("\\bML\\b") | has("machine\\s?learning"),
    Ecology = has("ecolo") | has("marine life"),
    # Previously this reused the Ecology pattern ("ecolo"), so the Math column
    # merely duplicated part of Ecology and never detected maths-related bios.
    Math = has("math"),
    Analytics = has("analyst") | has("analytic")
  )
  # cbind() on a data frame keeps the non-syntactic column name as given.
  cbind(flags, "Data Science-y\n(Any)" = apply(flags, 1, any))
})
# Follower handles (prefixed with "@") and display names, scraped from the
# same profile-timeline container as the bios.
handles <- html %>%
  html_nodes('div[data-test-selector="ProfileTimeline"] span.u-linkComplex-target') %>%
  html_text(trim = TRUE) %>%
  paste0("@", .)
display_names <- html %>%
  html_nodes('div[data-test-selector="ProfileTimeline"] a.ProfileNameTruncated-link') %>%
  html_text(trim = TRUE)

# The three vectors are scraped independently, so nothing guarantees that they
# line up. Binding them would silently recycle a shorter vector whenever the
# lengths divide evenly (and paste0("@", character(0)) yields a single "@"),
# pairing handles with the wrong bios. Fail loudly instead.
if (length(handles) != length(bios) || length(display_names) != length(bios)) {
  stop(
    "Scraped field counts differ: ", length(handles), " handles, ",
    length(display_names), " display names, ", length(bios), " bios. ",
    "Check that you are logged in and that the page selectors still match.",
    call. = FALSE
  )
}

# One row per follower: identity columns, keyword flags, then the raw bio.
followers <- cbind(handle = handles, name = display_names, stats_keywords, bio = bios)
# Put followers whose bio matched any keyword first.
followers <- followers[order(followers$`Data Science-y\n(Any)`, decreasing = TRUE), ]
# View(followers)
# Share of followers with / without any matching keyword
# (~36.9% matched in the author's original run).
prop.table(table(followers$`Data Science-y\n(Any)`))
# Bar chart of the proportion of followers whose bio matches each keyword.
# Requires tidyr, dplyr, ggplot2 and scales to be installed. Their functions
# are namespace-qualified because this script never attaches those packages.
followers %>%
  # Long format: one row per (follower, keyword) with its TRUE/FALSE indicator.
  tidyr::gather("keyword", "indicator", Biostatistics:`Data Science-y\n(Any)`) %>%
  dplyr::group_by(keyword) %>%
  # The mean of a logical vector is the proportion of TRUE values.
  dplyr::summarize(prop = mean(indicator)) %>%
  # Bars are ordered from the most to the least common keyword.
  ggplot2::ggplot(data = ., ggplot2::aes(y = prop, x = reorder(keyword, -prop))) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::scale_y_continuous("Proportion of total followers",
                              labels = scales::percent_format()) +
  ggplot2::xlab("Keyword") +
  ggplot2::ggtitle("Keywords found in followers' Twitter bios") +
  # Percentage label placed just above each bar.
  ggplot2::geom_text(ggplot2::aes(label = sprintf("%.1f%%", 100 * prop), y = prop + 0.01),
                     position = ggplot2::position_dodge(width = 1))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: remoteDriver$new() might have problems if you use Chrome, or at least it did for me when I specified browserName = "chrome" inside of new(). Using Firefox solved the issue for me.