seandavi · November 15, 2017 23:19
diff --git a/gistfile1.R b/gistfile1.R
 # Mine tweets from a meeting using the meeting hashtag.
 # 
 # Looks for URLs in tweets that match:
 #   - github
 #   - github pages (docs)
 #   - bitbucket
 #   - CRAN
 #   - Bioconductor
 #
 # Results in a tidy data.frame that can be further manipulated
 # or saved as is.

 library(rtweet)
 library(dplyr)
 library(purrr)

 #Need to set these up. See the rtweet package docs.
 # Twitter app credentials. Each value is read from an environment variable
 # so real secrets never have to be pasted into this (publicly shared) file;
 # when a variable is unset the original placeholder text is used instead.
 consumer_key    = Sys.getenv("TWITTER_CONSUMER_KEY", unset = "SECRET_KEY_FROM_TWITTER")
 consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET", unset = "SECRET_FROM_TWITTER")
 app             = Sys.getenv("TWITTER_APP_NAME", unset = 'TWITTER_APP_NAME')

 # Token handed to search_tweets() further down.
 twitter_token = create_token(app = app,
                              consumer_key = consumer_key,
                              consumer_secret = consumer_secret)


 # Just regex matching to "mine" the tweets.
 # URL patterns. Capture groups are consumed positionally by the process_*
 # functions, so the number and order of groups must stay as they are.
 #  - both http and https are accepted everywhere
 #  - owner / repository groups accept hyphens (e.g. "COMBINE-lab")
 #  - dots in host names are escaped so they only match a literal "."
 # Groups: (owner, repository)
 re_github = "http[s]?://github\\.com/([\\w-]+)/([\\w-]+).*"
 # Groups: (owner, repository); a "/" after the repository is required.
 re_ghpages = "http[s]?://([\\w-]+)\\.github\\.io/([\\w-]+)/.*"
 # Groups: (owner, repository). The stray "s" after "[s]?" is removed so
 # plain http:// links match too.
 re_bitbucket = "http[s]?://bitbucket\\.org/([\\w-]+)/([\\w-]+).*"
 # Group: the full numeric id (previously only its first digit).
 re_pubmed = "http[s]?://www\\.ncbi\\.nlm\\.nih\\.gov/pubmed/(\\d+)"
 # Groups: the four numeric path components that follow "early/".
 re_biorxiv = "http[s]?://www\\.biorxiv\\.org/content/.*early/(\\d+)/(\\d+)/(\\d+)/(\\d+).*"
 # Group: the word after the last "/" (the package name in package URLs).
 re_bioconductor = "http[s]?://(?:www\\.)?bioconductor\\.org/packages.*/(\\w+).*"
 # Group: the word that follows "/packages/".
 re_cran = "http[s]?://cran.*/packages/(\\w+).*"

 # Extract CRAN package links from a vector of URLs.
 #
 # urls: vector of URLs; NA and non-matching entries are ignored.
 # re:   regular expression whose first capture group is the package name.
 #
 # Returns a de-duplicated data.frame with columns url, name, user, type.
 # user is always NA and type is always 'CRAN'. When nothing matches the
 # result has zero rows (instead of raising an error).
 process_cran <- function(urls, re = re_cran) {
   urls <- as.character(urls)
   # Base-R matching: this script never attaches stringr, so str_match()
   # is not available in a fresh session.
   hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
   hits <- hits[lengths(hits) > 0]
   pkg <- vapply(hits, function(m) m[2], character(1))
   # rep() keeps the constant columns as long as pkg; bare scalars make
   # data.frame() fail with "differing number of rows" when pkg is empty.
   unique(data.frame(
     url = sprintf('https://cran.rstudio.com/packages/%s', pkg),
     name = pkg,
     user = rep(NA, length(pkg)),
     type = rep('CRAN', length(pkg))
   ))
 }

 # Extract Bioconductor package links from a vector of URLs.
 #
 # urls: vector of URLs; NA and non-matching entries are ignored.
 # re:   regular expression whose first capture group is the package name.
 #
 # Returns a de-duplicated data.frame with columns url, name, user, type.
 # user is always NA and type is always 'Bioconductor'. When nothing
 # matches the result has zero rows (instead of raising an error).
 process_bioc <- function(urls, re = re_bioconductor) {
   urls <- as.character(urls)
   # Base-R matching: this script never attaches stringr, so str_match()
   # is not available in a fresh session.
   hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
   hits <- hits[lengths(hits) > 0]
   pkg <- vapply(hits, function(m) m[2], character(1))
   # rep() keeps the constant columns as long as pkg; bare scalars make
   # data.frame() fail with "differing number of rows" when pkg is empty.
   unique(data.frame(
     url = sprintf('https://bioconductor.org/packages/%s', pkg),
     name = pkg,
     user = rep(NA, length(pkg)),
     type = rep('Bioconductor', length(pkg))
   ))
 }

 # Extract GitHub repository links from a vector of URLs or free text.
 #
 # urls: vector of strings; NA and non-matching entries are ignored.
 # re:   regular expression whose first capture group is the repository
 #       owner and whose second capture group is the repository name.
 #
 # Every match inside each element is reported, not just the first one,
 # because this function is also applied to long text (the abstract book)
 # where a single string can mention several repositories.
 #
 # Returns a de-duplicated data.frame with columns url, name, user, type
 # (type is always 'github'). When nothing matches the result has zero
 # rows (instead of raising an error).
 process_github <- function(urls, re = re_github) {
   urls <- as.character(urls)
   # Base-R matching: this script never attaches stringr, so str_match()
   # is not available in a fresh session.
   found <- as.character(unlist(regmatches(urls, gregexpr(re, urls, perl = TRUE))))
   # Re-match each hit on its own to split it into its capture groups.
   parts <- regmatches(found, regexec(re, found, perl = TRUE))
   parts <- parts[lengths(parts) > 0]
   owner <- vapply(parts, function(m) m[2], character(1))
   repo <- vapply(parts, function(m) m[3], character(1))
   # rep() keeps the constant column as long as the others; a bare scalar
   # makes data.frame() fail when there are no matches.
   unique(data.frame(
     url = sprintf('https://github.com/%s/%s', owner, repo),
     name = repo,
     user = owner,
     type = rep('github', length(repo))
   ))
 }

 # Map GitHub Pages links back to the GitHub repository they belong to.
 #
 # urls: vector of URLs; NA and non-matching entries are ignored.
 # re:   regular expression whose first capture group is the repository
 #       owner and whose second capture group is the repository name.
 #
 # Returns a de-duplicated data.frame with columns url, name, user, type.
 # url is the github.com address built from owner and name, and type is
 # always 'github'. When nothing matches the result has zero rows (instead
 # of raising an error).
 process_ghpages <- function(urls, re = re_ghpages) {
   urls <- as.character(urls)
   # Base-R matching: this script never attaches stringr, so str_match()
   # is not available in a fresh session.
   hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
   hits <- hits[lengths(hits) > 0]
   owner <- vapply(hits, function(m) m[2], character(1))
   repo <- vapply(hits, function(m) m[3], character(1))
   # rep() keeps the constant column as long as the others; a bare scalar
   # makes data.frame() fail when there are no matches.
   unique(data.frame(
     url = sprintf('https://github.com/%s/%s', owner, repo),
     name = repo,
     user = owner,
     type = rep('github', length(repo))
   ))
 }

 # Extract Bitbucket repository links from a vector of URLs.
 #
 # urls: vector of URLs; NA and non-matching entries are ignored.
 # re:   regular expression whose first capture group is the repository
 #       owner and whose second capture group is the repository name.
 #
 # Returns a de-duplicated data.frame with columns url, name, user, type
 # (type is always 'bitbucket'). When nothing matches the result has zero
 # rows (instead of raising an error).
 process_bitbucket <- function(urls, re = re_bitbucket) {
   urls <- as.character(urls)
   # Base-R matching: this script never attaches stringr, so str_match()
   # is not available in a fresh session.
   hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
   hits <- hits[lengths(hits) > 0]
   owner <- vapply(hits, function(m) m[2], character(1))
   repo <- vapply(hits, function(m) m[3], character(1))
   # rep() keeps the constant column as long as the others; a bare scalar
   # makes data.frame() fail when there are no matches.
   unique(data.frame(
     url = sprintf('https://bitbucket.org/%s/%s', owner, repo),
     name = repo,
     user = owner,
     type = rep('bitbucket', length(repo))
   ))
 }

 # Just change the hashtag below. 
 # Tweets limited to last 7 days, so this will not work forever after
 # a meeting.
 # Search for tweets (retweets excluded) that carry the meeting hashtag
 # together with at least one of the software-hosting keywords; 5000
 # results are requested.
 tweets <- search_tweets(
   '#GI2017 AND (github OR bitbucket OR bioconductor OR cran)',
   n = 5000,
   include_rts = FALSE,
   token = twitter_token
 )


 # Flatten the urls_expanded_url column into one character vector.
 urls = purrr::flatten_chr(tweets$urls_expanded_url)

 # Details of "saving" below should change

 # Stack the per-source tables, drop duplicate rows and save them as CSV.
 software_tbl <- unique(do.call(rbind, list(
   process_bioc(urls),
   process_cran(urls),
   process_ghpages(urls),
   process_bitbucket(urls),
   process_github(urls)
 )))
 write.csv(software_tbl, 'GI2017_software.csv', row.names = FALSE, quote = FALSE)

 # Upload the CSV with the `gist` command-line tool. system() returns the
 # command's exit status; a non-zero value is reported instead of being
 # silently dropped.
 gist_status <- system("gist --filename Genome_Informatics_2017_software.csv --description 'Software list mined from twitter feed for CSHL Genome Informatics meeting, 2017' -u https://gist.github.com/0b20c492527b234912ba2350a05cb10c GI2017_software.csv")
 if (gist_status != 0) {
   warning("gist upload failed with exit status ", gist_status, call. = FALSE)
 }

 # And the stuff below is for abstracts. In this case, I 
 # had to process a PDF. YMMV.
 library(pdftools)
 # Text extracted from the meeting's abstract book (path is machine-specific).
 abstract_txt = pdf_text('~/Downloads/Info2017_AbstractBook.pdf')

 # Same table as before, plus GitHub links found in the abstract text.
 # The CSV written earlier is overwritten.
 software_tbl <- unique(do.call(rbind, list(
   process_bioc(urls),
   process_cran(urls),
   process_ghpages(urls),
   process_bitbucket(urls),
   process_github(urls),
   process_github(abstract_txt)
 )))
 write.csv(software_tbl, 'GI2017_software.csv', row.names = FALSE, quote = FALSE)

 # Upload the CSV with the `gist` command-line tool. system() returns the
 # command's exit status; a non-zero value is reported instead of being
 # silently dropped.
 gist_status <- system("gist --filename Genome_Informatics_2017_software.csv --description 'Software list mined from twitter feed for CSHL Genome Informatics meeting, 2017' -u https://gist.github.com/0b20c492527b234912ba2350a05cb10c GI2017_software.csv")
 if (gist_status != 0) {
   warning("gist upload failed with exit status ", gist_status, call. = FALSE)
 }
	# Mine tweets from a meeting using the meeting hashtag.
	#
	# Looks for URLs in tweets that match:
	# - github
	# - github pages (docs)
	# - bitbucket
	# - CRAN
	# - Bioconductor
	#
	# Results in a tidy data.frame that can be further manipulated
	# or saved as is.

	library(rtweet)
	library(dplyr)
	library(purrr)

	#Need to set these up. See the rtweet package docs.
	# Twitter app credentials. Each value is read from an environment variable
	# so real secrets never have to be pasted into this (publicly shared) file;
	# when a variable is unset the original placeholder text is used instead.
	consumer_key = Sys.getenv("TWITTER_CONSUMER_KEY", unset = "SECRET_KEY_FROM_TWITTER")
	consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET", unset = "SECRET_FROM_TWITTER")
	app = Sys.getenv("TWITTER_APP_NAME", unset = 'TWITTER_APP_NAME')

	# Token handed to search_tweets() further down.
	twitter_token = create_token(app = app,
	  consumer_key = consumer_key,
	  consumer_secret = consumer_secret)


	# Just regex matching to "mine" the tweets.
	# URL patterns. Capture groups are consumed positionally by the process_*
	# functions, so the number and order of groups must stay as they are.
	#  - the "*" quantifiers that had gone missing from several patterns are
	#    restored (without them "(.)" matched one character and "cran./"
	#    matched almost nothing)
	#  - both http and https are accepted everywhere
	#  - owner / repository groups accept hyphens (e.g. "COMBINE-lab")
	#  - dots in host names are escaped so they only match a literal "."
	# Groups: (owner, repository)
	re_github = "http[s]?://github\\.com/([\\w-]+)/([\\w-]+).*"
	# Groups: (owner, repository); a "/" after the repository is required.
	re_ghpages = "http[s]?://([\\w-]+)\\.github\\.io/([\\w-]+)/.*"
	# Groups: (owner, repository). The stray "s" after "[s]?" is removed so
	# plain http:// links match too.
	re_bitbucket = "http[s]?://bitbucket\\.org/([\\w-]+)/([\\w-]+).*"
	# Group: the full numeric id (previously only its first digit).
	re_pubmed = "http[s]?://www\\.ncbi\\.nlm\\.nih\\.gov/pubmed/(\\d+)"
	# Groups: the four numeric path components that follow "early/".
	re_biorxiv = "http[s]?://www\\.biorxiv\\.org/content/.*early/(\\d+)/(\\d+)/(\\d+)/(\\d+).*"
	# Group: the word after the last "/" (the package name in package URLs).
	re_bioconductor = "http[s]?://(?:www\\.)?bioconductor\\.org/packages.*/(\\w+).*"
	# Group: the word that follows "/packages/".
	re_cran = "http[s]?://cran.*/packages/(\\w+).*"

	# Extract CRAN package links from a vector of URLs.
	#
	# urls: vector of URLs; NA and non-matching entries are ignored.
	# re:   regular expression whose first capture group is the package name.
	#
	# Returns a de-duplicated data.frame with columns url, name, user, type.
	# user is always NA and type is always 'CRAN'. When nothing matches the
	# result has zero rows (instead of raising an error).
	process_cran <- function(urls, re = re_cran) {
	  urls <- as.character(urls)
	  # Base-R matching: this script never attaches stringr, so str_match()
	  # is not available in a fresh session.
	  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
	  hits <- hits[lengths(hits) > 0]
	  pkg <- vapply(hits, function(m) m[2], character(1))
	  # rep() keeps the constant columns as long as pkg; bare scalars make
	  # data.frame() fail with "differing number of rows" when pkg is empty.
	  unique(data.frame(
	    url = sprintf('https://cran.rstudio.com/packages/%s', pkg),
	    name = pkg,
	    user = rep(NA, length(pkg)),
	    type = rep('CRAN', length(pkg))
	  ))
	}

	# Extract Bioconductor package links from a vector of URLs.
	#
	# urls: vector of URLs; NA and non-matching entries are ignored.
	# re:   regular expression whose first capture group is the package name.
	#
	# Returns a de-duplicated data.frame with columns url, name, user, type.
	# user is always NA and type is always 'Bioconductor'. When nothing
	# matches the result has zero rows (instead of raising an error).
	process_bioc <- function(urls, re = re_bioconductor) {
	  urls <- as.character(urls)
	  # Base-R matching: this script never attaches stringr, so str_match()
	  # is not available in a fresh session.
	  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
	  hits <- hits[lengths(hits) > 0]
	  pkg <- vapply(hits, function(m) m[2], character(1))
	  # rep() keeps the constant columns as long as pkg; bare scalars make
	  # data.frame() fail with "differing number of rows" when pkg is empty.
	  unique(data.frame(
	    url = sprintf('https://bioconductor.org/packages/%s', pkg),
	    name = pkg,
	    user = rep(NA, length(pkg)),
	    type = rep('Bioconductor', length(pkg))
	  ))
	}

	# Extract GitHub repository links from a vector of URLs or free text.
	#
	# urls: vector of strings; NA and non-matching entries are ignored.
	# re:   regular expression whose first capture group is the repository
	#       owner and whose second capture group is the repository name.
	#
	# Every match inside each element is reported, not just the first one,
	# because this function is also applied to long text (the abstract book)
	# where a single string can mention several repositories.
	#
	# Returns a de-duplicated data.frame with columns url, name, user, type
	# (type is always 'github'). When nothing matches the result has zero
	# rows (instead of raising an error).
	process_github <- function(urls, re = re_github) {
	  urls <- as.character(urls)
	  # Base-R matching: this script never attaches stringr, so str_match()
	  # is not available in a fresh session.
	  found <- as.character(unlist(regmatches(urls, gregexpr(re, urls, perl = TRUE))))
	  # Re-match each hit on its own to split it into its capture groups.
	  parts <- regmatches(found, regexec(re, found, perl = TRUE))
	  parts <- parts[lengths(parts) > 0]
	  owner <- vapply(parts, function(m) m[2], character(1))
	  repo <- vapply(parts, function(m) m[3], character(1))
	  # rep() keeps the constant column as long as the others; a bare scalar
	  # makes data.frame() fail when there are no matches.
	  unique(data.frame(
	    url = sprintf('https://github.com/%s/%s', owner, repo),
	    name = repo,
	    user = owner,
	    type = rep('github', length(repo))
	  ))
	}

	# Map GitHub Pages links back to the GitHub repository they belong to.
	#
	# urls: vector of URLs; NA and non-matching entries are ignored.
	# re:   regular expression whose first capture group is the repository
	#       owner and whose second capture group is the repository name.
	#
	# Returns a de-duplicated data.frame with columns url, name, user, type.
	# url is the github.com address built from owner and name, and type is
	# always 'github'. When nothing matches the result has zero rows (instead
	# of raising an error).
	process_ghpages <- function(urls, re = re_ghpages) {
	  urls <- as.character(urls)
	  # Base-R matching: this script never attaches stringr, so str_match()
	  # is not available in a fresh session.
	  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
	  hits <- hits[lengths(hits) > 0]
	  owner <- vapply(hits, function(m) m[2], character(1))
	  repo <- vapply(hits, function(m) m[3], character(1))
	  # rep() keeps the constant column as long as the others; a bare scalar
	  # makes data.frame() fail when there are no matches.
	  unique(data.frame(
	    url = sprintf('https://github.com/%s/%s', owner, repo),
	    name = repo,
	    user = owner,
	    type = rep('github', length(repo))
	  ))
	}

	# Extract Bitbucket repository links from a vector of URLs.
	#
	# urls: vector of URLs; NA and non-matching entries are ignored.
	# re:   regular expression whose first capture group is the repository
	#       owner and whose second capture group is the repository name.
	#
	# Returns a de-duplicated data.frame with columns url, name, user, type
	# (type is always 'bitbucket'). When nothing matches the result has zero
	# rows (instead of raising an error).
	process_bitbucket <- function(urls, re = re_bitbucket) {
	  urls <- as.character(urls)
	  # Base-R matching: this script never attaches stringr, so str_match()
	  # is not available in a fresh session.
	  hits <- regmatches(urls, regexec(re, urls, perl = TRUE))
	  hits <- hits[lengths(hits) > 0]
	  owner <- vapply(hits, function(m) m[2], character(1))
	  repo <- vapply(hits, function(m) m[3], character(1))
	  # rep() keeps the constant column as long as the others; a bare scalar
	  # makes data.frame() fail when there are no matches.
	  unique(data.frame(
	    url = sprintf('https://bitbucket.org/%s/%s', owner, repo),
	    name = repo,
	    user = owner,
	    type = rep('bitbucket', length(repo))
	  ))
	}

	# Just change the hashtag below.
	# Tweets limited to last 7 days, so this will not work forever after
	# a meeting.
	# Search for tweets (retweets excluded) that carry the meeting hashtag
	# together with at least one of the software-hosting keywords; 5000
	# results are requested.
	tweets <- search_tweets(
	  '#GI2017 AND (github OR bitbucket OR bioconductor OR cran)',
	  n = 5000,
	  include_rts = FALSE,
	  token = twitter_token
	)


	# Flatten the urls_expanded_url column into one character vector.
	urls = purrr::flatten_chr(tweets$urls_expanded_url)

	# Details of "saving" below should change

	# Stack the per-source tables, drop duplicate rows and save them as CSV.
	software_tbl <- unique(do.call(rbind, list(
	  process_bioc(urls),
	  process_cran(urls),
	  process_ghpages(urls),
	  process_bitbucket(urls),
	  process_github(urls)
	)))
	write.csv(software_tbl, 'GI2017_software.csv', row.names = FALSE, quote = FALSE)

	# Upload the CSV with the `gist` command-line tool. system() returns the
	# command's exit status; a non-zero value is reported instead of being
	# silently dropped.
	gist_status <- system("gist --filename Genome_Informatics_2017_software.csv --description 'Software list mined from twitter feed for CSHL Genome Informatics meeting, 2017' -u https://gist.github.com/0b20c492527b234912ba2350a05cb10c GI2017_software.csv")
	if (gist_status != 0) {
	  warning("gist upload failed with exit status ", gist_status, call. = FALSE)
	}

	# And the stuff below is for abstracts. In this case, I
	# had to process a PDF. YMMV.
	library(pdftools)
	# Text extracted from the meeting's abstract book (path is machine-specific).
	abstract_txt = pdf_text('~/Downloads/Info2017_AbstractBook.pdf')

	# Same table as before, plus GitHub links found in the abstract text.
	# The CSV written earlier is overwritten.
	software_tbl <- unique(do.call(rbind, list(
	  process_bioc(urls),
	  process_cran(urls),
	  process_ghpages(urls),
	  process_bitbucket(urls),
	  process_github(urls),
	  process_github(abstract_txt)
	)))
	write.csv(software_tbl, 'GI2017_software.csv', row.names = FALSE, quote = FALSE)

	# Upload the CSV with the `gist` command-line tool. system() returns the
	# command's exit status; a non-zero value is reported instead of being
	# silently dropped.
	gist_status <- system("gist --filename Genome_Informatics_2017_software.csv --description 'Software list mined from twitter feed for CSHL Genome Informatics meeting, 2017' -u https://gist.github.com/0b20c492527b234912ba2350a05cb10c GI2017_software.csv")
	if (gist_status != 0) {
	  warning("gist upload failed with exit status ", gist_status, call. = FALSE)
	}