Created
May 29, 2018 09:10
-
-
Save ajdamico/c24dd25c8b0cab3203c39b8a47d7b1a0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# compare the mean pageviews across every english wikipedia row in one daily
# pagecounts dump against the mean among articles drawn one at a time from
# wikipedia's Special:Random page

library(rvest)

tf <- tempfile()
tf2 <- tempfile()

# download and unzip
download.file(
  "https://dumps.wikimedia.org/other/pagecounts-ez/merged/2018/2018-05/pagecounts-2018-05-27.bz2",
  tf,
  mode = "wb"
)
R.utils::bunzip2(tf, tf2, remove = FALSE)

# import and restrict to english
# NOTE(review): `skip = 25` presumably steps over the dump's leading
# comment lines -- confirm against the downloaded file
x <- data.table::fread(tf2, skip = 25, header = FALSE)
y <- subset(x, V1 == "en.z")

# how many average pageviews overall?
average_pageviews <- mean(y$V3)

# sample fifty thousand articles
this_n <- 50000

# preallocate rather than growing the vector with c() on every iteration.
# slots start as NA so that an interrupted loop cannot be mistaken for a
# complete sample when the mean is taken below
sampled_pageviews <- rep(NA_real_, this_n)

for (i in seq_len(this_n)) {

  random_page <- read_html("https://en.wikipedia.org/wiki/Special:Random")

  # html_text() returns the decoded title text; serializing the node with
  # as.character() would leave entities such as `&amp;` escaped, so titles
  # containing `&` could never match the dump. only the first <title> node
  # is used in case the page contains more than one
  this_title <- html_text(html_nodes(random_page, "title"))[1]

  # drop the site suffix and switch to the underscore form used for lookup
  this_article_name <- gsub(" ", "_", sub(" - Wikipedia$", "", this_title))

  # look inside `y` for the sampled article's pageviews
  # which() drops any NA comparisons instead of propagating them
  this_pageviews <- y$V3[which(y$V2 == this_article_name)]

  if (length(this_pageviews) > 1) {
    stop(
      "more than one `en.z` row matches article: ", this_article_name,
      call. = FALSE
    )
  }

  # articles absent from the dump count as zero pageviews
  sampled_pageviews[i] <- if (length(this_pageviews) == 1) this_pageviews else 0
}

# these two numbers get close as this_n gets bigger
mean(sampled_pageviews)
average_pageviews
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment