Skip to content

Instantly share code, notes, and snippets.

@seandavi
Last active November 15, 2017 23:19
Show Gist options
  • Save seandavi/1308c15707d443f1771c3cadeef78547 to your computer and use it in GitHub Desktop.
Save seandavi/1308c15707d443f1771c3cadeef78547 to your computer and use it in GitHub Desktop.
script skeleton to mine tweets for software projects
# Mine tweets from a meeting using the meeting hashtag.
#
# Looks for URLs in tweets that match:
# - github
# - github pages (docs)
# - bitbucket
# - CRAN
# - Bioconductor
#
# Results in a tidy data.frame that can be further manipulated
# or saved as is.
library(rtweet)
library(dplyr)
library(purrr)
# Twitter API credentials. Replace the placeholders below with the values
# for your own Twitter app before running (see the rtweet package docs).
app <- 'TWITTER_APP_NAME'
consumer_key <- "SECRET_KEY_FROM_TWITTER"
consumer_secret <- "SECRET_FROM_TWITTER"

# Token passed to the search call further down.
twitter_token <- create_token(
  app = app,
  consumer_key = consumer_key,
  consumer_secret = consumer_secret
)
# Regular expressions used to "mine" the tweets. The capture groups hold the
# user/organisation and/or the project name, depending on the site. The
# patterns rely on Perl/ICU-style classes (`\\w`, `\\d`, `[\\w-]`).
re_github <- "http[s]?://github.com/(\\w+)/([\\w-]+).*"
re_ghpages <- "http[s]?://(.*).github.io/(\\w+)/.*"
# Previously "http[s]?s://", which could never match a plain http:// link.
re_bitbucket <- "http[s]?://bitbucket.org/(\\w+)/(\\w+).*"
# Capture the whole PubMed id rather than only its first digit.
re_pubmed <- "http[s]?://www.ncbi.nlm.nih.gov/pubmed/(\\d+)"
re_biorxiv <- "http[s]?://www.biorxiv.org/content/.*early/(\\d+)/(\\d+)/(\\d+)/(\\d+).*"
# Accept both http and https, like the other patterns.
re_bioconductor <- "http[s]?://bioconductor.org/packages.*/(\\w+).*"
re_cran <- "http[s]?://cran.*/packages/(\\w+).*"
# Extract CRAN packages from a character vector of URLs.
#
# urls: character vector of URLs (NA entries are ignored).
# re:   regular expression whose first capture group is the package name.
#
# Returns a de-duplicated data.frame with character columns `url`, `name`,
# `user` (always NA for CRAN) and `type`; zero rows when nothing matches.
process_cran <- function(urls, re = re_cran) {
  urls <- as.character(urls)
  urls <- urls[!is.na(urls)]
  # Base-R matching (first match per string) so no extra package is needed.
  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
  hits <- hits[lengths(hits) > 0L]
  pkg <- vapply(hits, `[`, character(1), 2L)
  # Constant columns are repeated explicitly so that an empty result is a
  # valid zero-row data.frame instead of a row-count mismatch error.
  unique(data.frame(
    url = sprintf('https://cran.rstudio.com/packages/%s', pkg),
    name = pkg,
    user = rep(NA_character_, length(pkg)),
    type = rep('CRAN', length(pkg)),
    stringsAsFactors = FALSE
  ))
}
# Extract Bioconductor packages from a character vector of URLs.
#
# urls: character vector of URLs (NA entries are ignored).
# re:   regular expression whose first capture group is the package name.
#
# Returns a de-duplicated data.frame with character columns `url`, `name`,
# `user` (always NA for Bioconductor) and `type`; zero rows when nothing
# matches.
process_bioc <- function(urls, re = re_bioconductor) {
  urls <- as.character(urls)
  urls <- urls[!is.na(urls)]
  # Base-R matching (first match per string) so no extra package is needed.
  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
  hits <- hits[lengths(hits) > 0L]
  pkg <- vapply(hits, `[`, character(1), 2L)
  # Constant columns are repeated explicitly so that an empty result is a
  # valid zero-row data.frame instead of a row-count mismatch error.
  unique(data.frame(
    url = sprintf('https://bioconductor.org/packages/%s', pkg),
    name = pkg,
    user = rep(NA_character_, length(pkg)),
    type = rep('Bioconductor', length(pkg)),
    stringsAsFactors = FALSE
  ))
}
# Extract GitHub repositories from a character vector of URLs.
#
# urls: character vector of URLs or text (NA entries are ignored). Only the
#       first match within each element is used.
# re:   regular expression whose capture groups are (1) the user or
#       organisation and (2) the repository name.
#
# Returns a de-duplicated data.frame with character columns `url`, `name`,
# `user` and `type`; zero rows when nothing matches.
process_github <- function(urls, re = re_github) {
  urls <- as.character(urls)
  urls <- urls[!is.na(urls)]
  # Base-R matching so no extra package is needed.
  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
  hits <- hits[lengths(hits) > 0L]
  owner <- vapply(hits, `[`, character(1), 2L)
  repo <- vapply(hits, `[`, character(1), 3L)
  # `type` is repeated explicitly so that an empty result is a valid
  # zero-row data.frame instead of a row-count mismatch error.
  unique(data.frame(
    url = sprintf('https://github.com/%s/%s', owner, repo),
    name = repo,
    user = owner,
    type = rep('github', length(repo)),
    stringsAsFactors = FALSE
  ))
}
# Map GitHub Pages URLs back to the GitHub repository they are served from.
#
# urls: character vector of URLs (NA entries are ignored).
# re:   regular expression whose capture groups are (1) the user or
#       organisation (the github.io subdomain) and (2) the repository name.
#
# Returns a de-duplicated data.frame with character columns `url`, `name`,
# `user` and `type` (reported as 'github', since the URL points at the
# repository); zero rows when nothing matches.
process_ghpages <- function(urls, re = re_ghpages) {
  urls <- as.character(urls)
  urls <- urls[!is.na(urls)]
  # Base-R matching (first match per string) so no extra package is needed.
  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
  hits <- hits[lengths(hits) > 0L]
  owner <- vapply(hits, `[`, character(1), 2L)
  repo <- vapply(hits, `[`, character(1), 3L)
  # `type` is repeated explicitly so that an empty result is a valid
  # zero-row data.frame instead of a row-count mismatch error.
  unique(data.frame(
    url = sprintf('https://github.com/%s/%s', owner, repo),
    name = repo,
    user = owner,
    type = rep('github', length(repo)),
    stringsAsFactors = FALSE
  ))
}
# Extract Bitbucket repositories from a character vector of URLs.
#
# urls: character vector of URLs (NA entries are ignored).
# re:   regular expression whose capture groups are (1) the user or team
#       and (2) the repository name.
#
# Returns a de-duplicated data.frame with character columns `url`, `name`,
# `user` and `type`; zero rows when nothing matches.
process_bitbucket <- function(urls, re = re_bitbucket) {
  urls <- as.character(urls)
  urls <- urls[!is.na(urls)]
  # Base-R matching (first match per string) so no extra package is needed.
  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
  hits <- hits[lengths(hits) > 0L]
  owner <- vapply(hits, `[`, character(1), 2L)
  repo <- vapply(hits, `[`, character(1), 3L)
  # `type` is repeated explicitly so that an empty result is a valid
  # zero-row data.frame instead of a row-count mismatch error.
  unique(data.frame(
    url = sprintf('https://bitbucket.org/%s/%s', owner, repo),
    name = repo,
    user = owner,
    type = rep('bitbucket', length(repo)),
    stringsAsFactors = FALSE
  ))
}
# Search for tweets carrying the meeting hashtag; change the hashtag in the
# query for a different meeting.
# Tweets are limited to the last 7 days, so this will not work forever
# after a meeting.
tweets <- search_tweets(
  '#GI2017 AND (github OR bitbucket OR bioconductor OR cran)',
  n = 5000,
  token = twitter_token,
  include_rts = FALSE
)
urls <- purrr::flatten_chr(tweets$urls_expanded_url)

# Run every extractor over the URLs and stack the per-site tables.
software_tables <- list(
  process_bioc(urls),
  process_cran(urls),
  process_ghpages(urls),
  process_bitbucket(urls),
  process_github(urls)
)
software <- unique(do.call(rbind, software_tables))

# Details of "saving" below should change
write.csv(software, 'GI2017_software.csv', row.names = FALSE, quote = FALSE)
system("gist --filename Genome_Informatics_2017_software.csv --description 'Software list mined from twitter feed for CSHL Genome Informatics meeting, 2017' -u https://gist.github.com/0b20c492527b234912ba2350a05cb10c GI2017_software.csv")
# And the stuff below is for abstracts. In this case, I
# had to process a PDF. YMMV.
library(pdftools)
abstract_txt <- pdf_text('~/Downloads/Info2017_AbstractBook.pdf')
# pdf_text() yields one long string per page, while the extractor only
# reports the first match in each string. Split the text into
# whitespace-separated tokens so every GitHub link on a page is seen.
abstract_tokens <- unlist(strsplit(abstract_txt, "[[:space:]]+"))

# Combine the tweet-derived tables with the repositories found in the
# abstract book, then overwrite the CSV and update the gist.
write.csv(unique(do.call(rbind, list(
  process_bioc(urls),
  process_cran(urls),
  process_ghpages(urls),
  process_bitbucket(urls),
  process_github(urls),
  process_github(abstract_tokens)))), 'GI2017_software.csv', row.names = FALSE, quote = FALSE)
system("gist --filename Genome_Informatics_2017_software.csv --description 'Software list mined from twitter feed for CSHL Genome Informatics meeting, 2017' -u https://gist.github.com/0b20c492527b234912ba2350a05cb10c GI2017_software.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment