Last active
November 15, 2017 23:19
-
-
Save seandavi/1308c15707d443f1771c3cadeef78547 to your computer and use it in GitHub Desktop.
script skeleton to mine tweets for software projects
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mine tweets from a meeting using the meeting hashtag. | |
# | |
# Looks for URLs in tweets that match: | |
# - github | |
# - github pages (docs) | |
# - bitbucket | |
# - CRAN | |
# - Bioconductor
# | |
# Results in a tidy data.frame that can be further manipulated | |
# or saved as is. | |
library(rtweet) | |
library(dplyr) | |
library(purrr) | |
# Twitter API credentials. Replace the three placeholders with the values
# for your own Twitter app (see the rtweet package docs for how to get them).
consumer_key <- "SECRET_KEY_FROM_TWITTER"
consumer_secret <- "SECRET_FROM_TWITTER"
app <- "TWITTER_APP_NAME"

# Token passed to search_tweets() further down.
twitter_token <- create_token(
  app = app,
  consumer_key = consumer_key,
  consumer_secret = consumer_secret
)
# Regular expressions used to "mine" the tweets. Capture groups are what the
# process_*() helpers read: for hosted repositories group 1 is the
# owner/user and group 2 the project; for package archives group 1 is the
# package name. Literal dots are escaped and both http:// and https:// are
# accepted everywhere.

# github.com/<user>/<repo>; user and repo may contain hyphens.
re_github <- "https?://github\\.com/([\\w-]+)/([\\w-]+).*"
# <user>.github.io/<project>/...
re_ghpages <- "https?://([\\w-]+)\\.github\\.io/([\\w-]+)/.*"
# bitbucket.org/<user>/<repo>
re_bitbucket <- "https?://bitbucket\\.org/([\\w-]+)/([\\w-]+).*"
# PubMed record; group 1 is the full numeric id.
re_pubmed <- "https?://www\\.ncbi\\.nlm\\.nih\\.gov/pubmed/(\\d+)"
# bioRxiv "early" URLs; the four groups are the numeric path components.
re_biorxiv <- "https?://www\\.biorxiv\\.org/content/.*early/(\\d+)/(\\d+)/(\\d+)/(\\d+).*"
# Bioconductor package pages; group 1 is the last path component's leading
# word (e.g. "DESeq2" from ".../html/DESeq2.html").
re_bioconductor <- "https?://(?:www\\.)?bioconductor\\.org/packages.*/(\\w+).*"
# CRAN package pages on any cran* host, in either the ".../packages/<pkg>"
# or the ".../package=<pkg>" form. Package names may contain dots but the
# captured name never ends in one.
re_cran <- "https?://(?i:cran).*/(?:packages/|package=)([A-Za-z][\\w.]*\\w).*"
# Extract CRAN packages referenced in `urls`.
#
# Args:
#   urls: character vector of URLs or free text. NA entries are ignored and
#         every match inside an element is reported, not only the first.
#   re:   regular expression (interpreted with perl = TRUE) whose first
#         capture group is the package name.
#
# Returns:
#   A de-duplicated data.frame with character columns `url` (canonical CRAN
#   package URL), `name`, `user` (always NA) and `type` ("CRAN"). When
#   nothing matches, a zero-row data.frame with the same columns.
process_cran <- function(urls, re = re_cran) {
  text <- as.character(urls)
  text <- text[!is.na(text)]
  # Base-R regex functions are used so that no string package has to be
  # attached; perl = TRUE is needed for the \w classes in the patterns.
  found <- as.character(unlist(
    regmatches(text, gregexpr(re, text, perl = TRUE)),
    use.names = FALSE
  ))
  # Re-match each hit on its own to pull out the capture groups.
  groups <- regmatches(found, regexec(re, found, perl = TRUE))
  pkg <- vapply(groups, `[`, character(1), 2L)
  # Constant columns are expanded with rep() so that an empty result builds
  # a zero-row data.frame instead of raising a row-count error.
  unique(data.frame(
    url = sprintf("https://CRAN.R-project.org/package=%s", pkg),
    name = pkg,
    user = rep(NA_character_, length(pkg)),
    type = rep("CRAN", length(pkg)),
    stringsAsFactors = FALSE
  ))
}
# Extract Bioconductor packages referenced in `urls`.
#
# Args:
#   urls: character vector of URLs or free text. NA entries are ignored and
#         every match inside an element is reported, not only the first.
#   re:   regular expression (interpreted with perl = TRUE) whose first
#         capture group is the package name.
#
# Returns:
#   A de-duplicated data.frame with character columns `url`
#   (https://bioconductor.org/packages/<name>), `name`, `user` (always NA)
#   and `type` ("Bioconductor"). When nothing matches, a zero-row
#   data.frame with the same columns.
process_bioc <- function(urls, re = re_bioconductor) {
  text <- as.character(urls)
  text <- text[!is.na(text)]
  # Base-R regex functions are used so that no string package has to be
  # attached; perl = TRUE is needed for the \w classes in the patterns.
  found <- as.character(unlist(
    regmatches(text, gregexpr(re, text, perl = TRUE)),
    use.names = FALSE
  ))
  # Re-match each hit on its own to pull out the capture groups.
  groups <- regmatches(found, regexec(re, found, perl = TRUE))
  pkg <- vapply(groups, `[`, character(1), 2L)
  # Constant columns are expanded with rep() so that an empty result builds
  # a zero-row data.frame instead of raising a row-count error.
  unique(data.frame(
    url = sprintf("https://bioconductor.org/packages/%s", pkg),
    name = pkg,
    user = rep(NA_character_, length(pkg)),
    type = rep("Bioconductor", length(pkg)),
    stringsAsFactors = FALSE
  ))
}
# Extract GitHub repositories referenced in `urls`.
#
# Args:
#   urls: character vector of URLs or free text (e.g. whole pages of an
#         abstract book). NA entries are ignored and every match inside an
#         element is reported, not only the first.
#   re:   regular expression (interpreted with perl = TRUE) whose capture
#         groups are, in order, the account name and the repository name.
#
# Returns:
#   A de-duplicated data.frame with character columns `url`
#   (https://github.com/<user>/<name>), `name`, `user` and `type`
#   ("github"). When nothing matches, a zero-row data.frame with the same
#   columns.
process_github <- function(urls, re = re_github) {
  text <- as.character(urls)
  text <- text[!is.na(text)]
  # Base-R regex functions are used so that no string package has to be
  # attached; perl = TRUE is needed for the \w classes in the patterns.
  found <- as.character(unlist(
    regmatches(text, gregexpr(re, text, perl = TRUE)),
    use.names = FALSE
  ))
  # Re-match each hit on its own to pull out the capture groups.
  groups <- regmatches(found, regexec(re, found, perl = TRUE))
  owner <- vapply(groups, `[`, character(1), 2L)
  repo <- vapply(groups, `[`, character(1), 3L)
  # The constant column is expanded with rep() so that an empty result
  # builds a zero-row data.frame instead of raising a row-count error.
  unique(data.frame(
    url = sprintf("https://github.com/%s/%s", owner, repo),
    name = repo,
    user = owner,
    type = rep("github", length(repo)),
    stringsAsFactors = FALSE
  ))
}
# Extract GitHub Pages sites referenced in `urls` and report them as the
# GitHub repository with the same account and project name.
#
# Args:
#   urls: character vector of URLs or free text. NA entries are ignored and
#         every match inside an element is reported, not only the first.
#   re:   regular expression (interpreted with perl = TRUE) whose capture
#         groups are, in order, the account name and the project name.
#
# Returns:
#   A de-duplicated data.frame with character columns `url`
#   (https://github.com/<user>/<name>), `name`, `user` and `type`
#   ("github"). When nothing matches, a zero-row data.frame with the same
#   columns.
process_ghpages <- function(urls, re = re_ghpages) {
  text <- as.character(urls)
  text <- text[!is.na(text)]
  # Base-R regex functions are used so that no string package has to be
  # attached; perl = TRUE is needed for the \w classes in the patterns.
  found <- as.character(unlist(
    regmatches(text, gregexpr(re, text, perl = TRUE)),
    use.names = FALSE
  ))
  # Re-match each hit on its own to pull out the capture groups.
  groups <- regmatches(found, regexec(re, found, perl = TRUE))
  owner <- vapply(groups, `[`, character(1), 2L)
  project <- vapply(groups, `[`, character(1), 3L)
  # The constant column is expanded with rep() so that an empty result
  # builds a zero-row data.frame instead of raising a row-count error.
  unique(data.frame(
    url = sprintf("https://github.com/%s/%s", owner, project),
    name = project,
    user = owner,
    type = rep("github", length(project)),
    stringsAsFactors = FALSE
  ))
}
# Extract Bitbucket repositories referenced in `urls`.
#
# Args:
#   urls: character vector of URLs or free text. NA entries are ignored and
#         every match inside an element is reported, not only the first.
#   re:   regular expression (interpreted with perl = TRUE) whose capture
#         groups are, in order, the account name and the repository name.
#
# Returns:
#   A de-duplicated data.frame with character columns `url`
#   (https://bitbucket.org/<user>/<name>), `name`, `user` and `type`
#   ("bitbucket"). When nothing matches, a zero-row data.frame with the
#   same columns.
process_bitbucket <- function(urls, re = re_bitbucket) {
  text <- as.character(urls)
  text <- text[!is.na(text)]
  # Base-R regex functions are used so that no string package has to be
  # attached; perl = TRUE is needed for the \w classes in the patterns.
  found <- as.character(unlist(
    regmatches(text, gregexpr(re, text, perl = TRUE)),
    use.names = FALSE
  ))
  # Re-match each hit on its own to pull out the capture groups.
  groups <- regmatches(found, regexec(re, found, perl = TRUE))
  owner <- vapply(groups, `[`, character(1), 2L)
  repo <- vapply(groups, `[`, character(1), 3L)
  # The constant column is expanded with rep() so that an empty result
  # builds a zero-row data.frame instead of raising a row-count error.
  unique(data.frame(
    url = sprintf("https://bitbucket.org/%s/%s", owner, repo),
    name = repo,
    user = owner,
    type = rep("bitbucket", length(repo)),
    stringsAsFactors = FALSE
  ))
}
# Search Twitter for the meeting hashtag; edit the hashtag in the query for
# a different meeting. Tweets are limited to the last 7 days, so this will
# not work forever after a meeting.
tweets <- search_tweets(
  "#GI2017 AND (github OR bitbucket OR bioconductor OR cran)",
  n = 5000,
  include_rts = FALSE,
  token = twitter_token
)

# One flat character vector holding the expanded URLs of all tweets.
urls <- purrr::flatten_chr(tweets$urls_expanded_url)

# Details of "saving" below should change.
# Stack the per-source tables, drop duplicate rows and write them to disk.
write.csv(
  unique(rbind(
    process_bioc(urls),
    process_cran(urls),
    process_ghpages(urls),
    process_bitbucket(urls),
    process_github(urls)
  )),
  file = "GI2017_software.csv",
  quote = FALSE,
  row.names = FALSE
)

# Hand the CSV to the `gist` command-line tool.
system("gist --filename Genome_Informatics_2017_software.csv --description 'Software list mined from twitter feed for CSHL Genome Informatics meeting, 2017' -u https://gist.github.com/0b20c492527b234912ba2350a05cb10c GI2017_software.csv")
# Abstracts: in this case the abstract book was a PDF that had to be
# converted to text first. YMMV.
library(pdftools)
abstract_txt <- pdf_text("~/Downloads/Info2017_AbstractBook.pdf")

# Same tables as for the tweets, plus GitHub links found in the abstract
# text; the combined, de-duplicated result overwrites the CSV.
write.csv(
  unique(rbind(
    process_bioc(urls),
    process_cran(urls),
    process_ghpages(urls),
    process_bitbucket(urls),
    process_github(urls),
    process_github(abstract_txt)
  )),
  file = "GI2017_software.csv",
  quote = FALSE,
  row.names = FALSE
)

# Hand the refreshed CSV to the `gist` command-line tool.
system("gist --filename Genome_Informatics_2017_software.csv --description 'Software list mined from twitter feed for CSHL Genome Informatics meeting, 2017' -u https://gist.github.com/0b20c492527b234912ba2350a05cb10c GI2017_software.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment