ajdamico · March 11, 2018 18:08
diff --git a/top 25 words weighted by download.R b/top 25 words weighted by download.R

 # devtools::install_github( "ajdamico/lodown" )

 library(tm)
 library(tidyverse)
 library(rvest)

 # collect the href of every link on the cran logs index page
 cranlogs_html <- read_html( "http://cran-logs.rstudio.com/" )

 gz_files <- html_attr( html_nodes( cranlogs_html , "a" ) , "href" )

 # a single temporary file, overwritten by each daily download
 tf <- tempfile()

 # one summarized table per year; stacked once after the loop instead of
 # growing a data.frame with rbind() on every iteration
 yearly_summaries <- list()

 for( this_year in 2013:2017 ){

 	# links that start with this year, minus the ones matching -r.csv.gz
 	this_year_files <- grep( paste0( "^" , this_year ) , gz_files , value = TRUE )
 	this_year_files <- this_year_files[ !grepl( "-r\\.csv\\.gz" , this_year_files ) ]

 	# nothing listed for this year: move on rather than summarizing an empty table
 	if( length( this_year_files ) == 0 ) next

 	# one (date, package, count) table per daily file
 	daily_summaries <-
 		lapply(
 			this_year_files ,
 			function( this_day_file ){

 				# fetch this day's log into tf
 				# (lodown::cachaca is presumably a caching download wrapper -- confirm)
 				lodown::cachaca( paste0( "http://cran-logs.rstudio.com/" , this_day_file ) , tf , mode = 'wb' )

 				# number of log rows per package on this date
 				read_csv( gzfile( tf ) ) %>%
 					group_by( date , package ) %>%
 					summarize( count = n() ) %>%
 					ungroup()

 			}
 		)

 	day_package_counts <- bind_rows( daily_summaries )

 	# collapse the daily counts to one row per year and package;
 	# the year is the first four characters of the date
 	yearly_summaries[[ as.character( this_year ) ]] <-
 		day_package_counts %>%
 		group_by( year = substr( date , 1 , 4 ) , package ) %>%
 		summarize( count = sum( count ) ) %>%
 		ungroup()

 }

 year_package_counts <- bind_rows( yearly_summaries )


 # compute year-package weight:
 # one row per package, one download-count column per year.
 # later code reads these columns as X2013 ... X2017
 year_package_weights <- year_package_counts %>% spread( year , count ) %>% data.frame()

 # overwrite missings with zeroes, one year column at a time
 # (the first column is skipped, and columns without missings are left untouched)
 for( this_column in names( year_package_weights )[ -1 ] ){

 	missing_rows <- is.na( year_package_weights[[ this_column ]] )

 	if( any( missing_rows ) ) year_package_weights[ missing_rows , this_column ] <- 0

 }
 	
 	
 	
 # fetch the packages.rds metadata file into tf.
 # the call is namespace-qualified because this script never attaches lodown,
 # matching the lodown::cachaca() call used for the daily logs
 lodown::cachaca( "https://cloud.r-project.org/web/packages/packages.rds" , tf , mode = 'wb' )

 # keep only the Package, Title and Description columns
 package_title_description <- data.frame( readRDS( tf ) )[ c( 'Package' , 'Title' , 'Description' ) ]

 # lowercase `package` column as plain character
 package_title_description$package <- as.character( package_title_description$Package )

 # join title and description with a space so the last word of the title
 # does not fuse with the first word of the description
 package_title_description$text <- paste( package_title_description$Title , package_title_description$Description )

 # newlines and tabs become spaces, then everything is lowercased
 package_title_description$text <- tolower( gsub( "\\n|\\t" , " " , package_title_description$text ) )

 # drop english stopwords while the punctuation is still in place
 package_title_description$text <- removeWords( package_title_description$text , stopwords( "english" ) )

 # every punctuation character becomes a space
 package_title_description$text <- str_replace_all( package_title_description$text , "[[:punct:]]", " " )

 # collapse each run of spaces down to a single space
 package_title_description$text <- gsub( " +" , " " , package_title_description$text )

 # one single-column data.frame of non-empty words per package
 word_list <-
 	lapply(
 		strsplit( package_title_description$text , " " ) ,
 		function( these_words ) data.frame( word = these_words[ these_words != '' ] , stringsAsFactors = FALSE )
 	)

 # one single-row data.frame holding each package name, in the same order
 package_list <-
 	lapply(
 		package_title_description$package ,
 		function( this_package ) data.frame( package = this_package , stringsAsFactors = FALSE )
 	)

 # the two data.frames share no columns, so each merge() pairs
 # every word of a package with that package's name
 merged_list <- Map( merge , word_list , package_list )

 # stack the per-package results into one (word, package) table
 merged_df <- do.call( rbind , merged_list )


 # attach the yearly download counts to every (word, package) row
 weighted_df <- merge( x = merged_df , y = year_package_weights )

 # drop the first column, then total every remaining column within each word
 word_totals <- weighted_df[ , -1 ] %>% group_by( word ) %>% summarize_all( sum )

 word_weighted_df <- data.frame( word_totals )

 # remove some other words
 # NOTE(review): '<doi' cannot match once punctuation has been replaced by
 # spaces earlier in the script -- confirm whether 'doi' was intended
 ignored_words <- c( 'based' , 'also' , 'can' , 'including' , 'provides' , 'provided' , '<doi' , '1' , '10' , 'well' , 'using' , 'use' , 'used' , 'uses' )

 rows_to_keep <- !( word_weighted_df$word %in% ignored_words )

 word_weighted_df <- word_weighted_df[ rows_to_keep , , drop = FALSE ]
 	
 # years to rank; each one reads the matching X<year> column
 ranking_years <- 2013:2017

 # for every year, the 25 words with the largest download-weighted totals
 top_words_by_year <-
 	lapply(
 		ranking_years ,
 		function( this_year ){

 			year_totals <- word_weighted_df[[ paste0( "X" , this_year ) ]]

 			# sort descending by this year's total and keep the first 25 words
 			head( word_weighted_df[ order( -year_totals ) , 'word' ] , 25 )

 		}
 	)

 # output columns are named y2013 ... y2017
 names( top_words_by_year ) <- paste0( "y" , ranking_years )

 top_twenty_five <- do.call( data.frame , top_words_by_year )

 # print the result
 top_twenty_five

	# devtools::install_github( "ajdamico/lodown" )

	library(tm)
	library(tidyverse)
	library(rvest)

	# collect the href of every link on the cran logs index page
	cranlogs_html <- read_html( "http://cran-logs.rstudio.com/" )

	gz_files <- html_attr( html_nodes( cranlogs_html , "a" ) , "href" )

	# a single temporary file, overwritten by each daily download
	tf <- tempfile()

	# one summarized table per year; stacked once after the loop instead of
	# growing a data.frame with rbind() on every iteration
	yearly_summaries <- list()

	for( this_year in 2013:2017 ){

		# links that start with this year, minus the ones matching -r.csv.gz
		this_year_files <- grep( paste0( "^" , this_year ) , gz_files , value = TRUE )
		this_year_files <- this_year_files[ !grepl( "-r\\.csv\\.gz" , this_year_files ) ]

		# nothing listed for this year: move on rather than summarizing an empty table
		if( length( this_year_files ) == 0 ) next

		# one (date, package, count) table per daily file
		daily_summaries <-
			lapply(
				this_year_files ,
				function( this_day_file ){

					# fetch this day's log into tf
					# (lodown::cachaca is presumably a caching download wrapper -- confirm)
					lodown::cachaca( paste0( "http://cran-logs.rstudio.com/" , this_day_file ) , tf , mode = 'wb' )

					# number of log rows per package on this date
					read_csv( gzfile( tf ) ) %>%
						group_by( date , package ) %>%
						summarize( count = n() ) %>%
						ungroup()

				}
			)

		day_package_counts <- bind_rows( daily_summaries )

		# collapse the daily counts to one row per year and package;
		# the year is the first four characters of the date
		yearly_summaries[[ as.character( this_year ) ]] <-
			day_package_counts %>%
			group_by( year = substr( date , 1 , 4 ) , package ) %>%
			summarize( count = sum( count ) ) %>%
			ungroup()

	}

	year_package_counts <- bind_rows( yearly_summaries )


	# compute year-package weight:
	# one row per package, one download-count column per year.
	# later code reads these columns as X2013 ... X2017
	year_package_weights <- year_package_counts %>% spread( year , count ) %>% data.frame()

	# overwrite missings with zeroes, one year column at a time
	# (the first column is skipped, and columns without missings are left untouched)
	for( this_column in names( year_package_weights )[ -1 ] ){

		missing_rows <- is.na( year_package_weights[[ this_column ]] )

		if( any( missing_rows ) ) year_package_weights[ missing_rows , this_column ] <- 0

	}



	# fetch the packages.rds metadata file into tf.
	# the call is namespace-qualified because this script never attaches lodown,
	# matching the lodown::cachaca() call used for the daily logs
	lodown::cachaca( "https://cloud.r-project.org/web/packages/packages.rds" , tf , mode = 'wb' )

	# keep only the Package, Title and Description columns
	package_title_description <- data.frame( readRDS( tf ) )[ c( 'Package' , 'Title' , 'Description' ) ]

	# lowercase `package` column as plain character
	package_title_description$package <- as.character( package_title_description$Package )

	# join title and description with a space so the last word of the title
	# does not fuse with the first word of the description
	package_title_description$text <- paste( package_title_description$Title , package_title_description$Description )

	# newlines and tabs become spaces, then everything is lowercased
	# (the alternation bar must not be backslash-escaped: "\|" is not a valid
	# escape inside an R string literal)
	package_title_description$text <- tolower( gsub( "\\n|\\t" , " " , package_title_description$text ) )

	# drop english stopwords while the punctuation is still in place
	package_title_description$text <- removeWords( package_title_description$text , stopwords( "english" ) )

	# every punctuation character becomes a space
	package_title_description$text <- str_replace_all( package_title_description$text , "[[:punct:]]", " " )

	# collapse each run of spaces down to a single space
	package_title_description$text <- gsub( " +" , " " , package_title_description$text )

	# one single-column data.frame of non-empty words per package
	word_list <-
		lapply(
			strsplit( package_title_description$text , " " ) ,
			function( these_words ) data.frame( word = these_words[ these_words != '' ] , stringsAsFactors = FALSE )
		)

	# one single-row data.frame holding each package name, in the same order
	package_list <-
		lapply(
			package_title_description$package ,
			function( this_package ) data.frame( package = this_package , stringsAsFactors = FALSE )
		)

	# the two data.frames share no columns, so each merge() pairs
	# every word of a package with that package's name
	merged_list <- Map( merge , word_list , package_list )

	# stack the per-package results into one (word, package) table
	merged_df <- do.call( rbind , merged_list )


	# attach the yearly download counts to every (word, package) row
	weighted_df <- merge( x = merged_df , y = year_package_weights )

	# drop the first column, then total every remaining column within each word
	word_totals <- weighted_df[ , -1 ] %>% group_by( word ) %>% summarize_all( sum )

	word_weighted_df <- data.frame( word_totals )

	# remove some other words
	# NOTE(review): '<doi' cannot match once punctuation has been replaced by
	# spaces earlier in the script -- confirm whether 'doi' was intended
	ignored_words <- c( 'based' , 'also' , 'can' , 'including' , 'provides' , 'provided' , '<doi' , '1' , '10' , 'well' , 'using' , 'use' , 'used' , 'uses' )

	rows_to_keep <- !( word_weighted_df$word %in% ignored_words )

	word_weighted_df <- word_weighted_df[ rows_to_keep , , drop = FALSE ]

	# years to rank; each one reads the matching X<year> column
	ranking_years <- 2013:2017

	# for every year, the 25 words with the largest download-weighted totals
	top_words_by_year <-
		lapply(
			ranking_years ,
			function( this_year ){

				year_totals <- word_weighted_df[[ paste0( "X" , this_year ) ]]

				# sort descending by this year's total and keep the first 25 words
				head( word_weighted_df[ order( -year_totals ) , 'word' ] , 25 )

			}
		)

	# output columns are named y2013 ... y2017
	names( top_words_by_year ) <- paste0( "y" , ranking_years )

	top_twenty_five <- do.call( data.frame , top_words_by_year )

	# print the result
	top_twenty_five