Created
March 11, 2018 18:08
-
-
Save ajdamico/d2b0bf5e6e66ce6e605a1ce14dc995b5 to your computer and use it in GitHub Desktop.
twenty-five most common words in CRAN title + description fields, weighted by downloads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# devtools::install_github( "ajdamico/lodown" )
library(tm)
library(tidyverse)
library(rvest)

# read the cran-logs index page and pull the href of every link on it
cranlogs_html <- read_html( "http://cran-logs.rstudio.com/" )

gz_files <- html_attr( html_nodes( cranlogs_html , "a" ) , "href" )

# a single temporary file, overwritten by every download below
tf <- tempfile()

# one element per year, bound together once after the loop instead of
# growing a table with rbind() on every iteration
yearly_counts <- list()

for( this_year in 2013:2017 ){

  # links that start with this year, excluding any that contain "-r.csv.gz"
  this_year_files <- grep( paste0( "^" , this_year ) , gz_files , value = TRUE )
  this_year_files <- this_year_files[ !grepl( "-r\\.csv\\.gz" , this_year_files ) ]

  # nothing listed for this year: skip it rather than fail on an empty table
  if( length( this_year_files ) == 0 ) next

  # preallocate one slot per daily file
  daily_counts <- vector( "list" , length( this_year_files ) )

  for( i in seq_along( this_year_files ) ){

    lodown::cachaca( paste0( "http://cran-logs.rstudio.com/" , this_year_files[ i ] ) , tf , mode = 'wb' )

    this_gz_file <- read_csv( gzfile( tf ) )

    # number of log rows for each date-package pair in this file
    daily_counts[[ i ]] <-
      this_gz_file %>%
      group_by( date , package ) %>%
      summarize( count = n() ) %>%
      ungroup()

  }

  day_package_counts <- bind_rows( daily_counts )

  # collapse the daily counts to one row per year-package pair;
  # the year is the first four characters of the date
  yearly_counts[[ as.character( this_year ) ]] <-
    day_package_counts %>%
    group_by( year = substr( date , 1 , 4 ) , package ) %>%
    summarize( count = sum( count ) ) %>%
    ungroup()

}

# long table with columns year, package, count
year_package_counts <- bind_rows( yearly_counts )
# compute year-package weight:
# reshape to one row per package with one count column per year.
# data.frame() makes the year column names syntactic (2013 becomes X2013)
year_package_weights <-
  data.frame( year_package_counts %>% spread( year , count ) )

# overwrite missings with zeroes
# (a package with no rows for a given year has an NA in that year's column)
year_package_weights[ , -1 ][ is.na( year_package_weights[ , -1 ] ) ] <- 0
# download the CRAN package metadata table.
# lodown is never attached with library(), so cachaca must be
# namespace-qualified, as it is in the download loop
lodown::cachaca( "https://cloud.r-project.org/web/packages/packages.rds" , tf , mode = 'wb' )

package_title_description <- data.frame( readRDS( tf ) )[ c( 'Package' , 'Title' , 'Description' ) ]

# lowercase character copy of the package name, used later as the merge key
package_title_description$package <- as.character( package_title_description$Package )

# join title and description with a space so that the last word of the title
# is not fused to the first word of the description
package_title_description$text <- paste( package_title_description$Title , package_title_description$Description )

# newlines and tabs become spaces, then everything is lowercased
package_title_description$text <- tolower( gsub( "\\n|\\t" , " " , package_title_description$text ) )

# drop english stopwords before punctuation is stripped
package_title_description$text <- removeWords( package_title_description$text , stopwords( "english" ) )

# punctuation becomes a space so that adjacent words stay separated
package_title_description$text <- str_replace_all( package_title_description$text , "[[:punct:]]", " " )

# collapse the runs of spaces left behind by the removals above
package_title_description$text <- gsub( " +" , " " , package_title_description$text )
# split each package's cleaned text on single spaces and drop empty strings
package_words <- strsplit( package_title_description$text , " " )
package_words <- lapply( package_words , function( z ) z[ z != '' ] )

# one row per word occurrence, labelled with the package it came from;
# each package name is repeated once for every word in its text
merged_df <-
  data.frame(
    word = unlist( package_words ) ,
    package = rep( package_title_description$package , lengths( package_words ) ) ,
    stringsAsFactors = FALSE
  )

# attach the per-year counts; merge() joins on the column the two tables share
weighted_df <- merge( merged_df , year_package_weights )
# words excluded from the final ranking
ignored_words <-
  c( 'based' , 'also' , 'can' , 'including' , 'provides' , 'provided' , '<doi' , '1' , '10' , 'well' , 'using' , 'use' , 'used' , 'uses' )

# drop the first column of the merged table, then sum every remaining
# column within each word
word_weighted_df <-
  weighted_df[ , -1 ] %>%
  group_by( word ) %>%
  summarize_all( sum ) %>%
  data.frame()

# remove some other words
word_weighted_df <-
  word_weighted_df[ !( word_weighted_df$word %in% ignored_words ) , ]
# output column name -> weight column to rank by
year_columns <-
  c( y2013 = "X2013" , y2014 = "X2014" , y2015 = "X2015" , y2016 = "X2016" , y2017 = "X2017" )

# for each year, the 25 words with the largest weight, in descending order
top_words <-
  lapply(
    year_columns ,
    function( this_column ) {
      descending <- order( -word_weighted_df[[ this_column ]] )
      head( word_weighted_df[ descending , 'word' ] , 25 )
    }
  )

# one column per year, named after the list elements
top_twenty_five <- do.call( data.frame , top_words )

top_twenty_five
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment