ajdamico · May 29, 2018 09:10
diff --git a/wikipedia random articles are indeed random.R b/wikipedia random articles are indeed random.R
 library(rvest)

 # Compares the mean pageviews of articles served by Wikipedia's
 # Special:Random against the mean pageviews of every english row in one
 # daily pagecounts dump.  If the random-article feature samples uniformly,
 # the two means should converge as the sample size grows.

 tf <- tempfile()
 tf2 <- tempfile()

 # download and unzip
 download.file( "https://dumps.wikimedia.org/other/pagecounts-ez/merged/2018/2018-05/pagecounts-2018-05-27.bz2" , tf , mode = 'wb' )
 R.utils::bunzip2( tf , tf2 , remove = FALSE )

 # import and restrict to english
 # (the first 25 lines are skipped -- presumably the dump's comment header; confirm if the file format changes)
 x <- data.table::fread( tf2 , skip = 25 , header = FALSE )
 y <- subset( x , V1 == 'en.z' )

 # how many average pageviews overall?
 average_pageviews <- mean( y$V3 )


 # sample fifty thousand articles
 this_n <- 50000

 # preallocate with zeros: an article that is absent from `y` keeps its zero,
 # and the vector is never re-copied the way repeated c() appends would
 sampled_pageviews <- numeric( this_n )

 # seq_len() yields an empty loop when this_n is 0, where seq() would give c( 1 , 0 )
 for( i in seq_len( this_n ) ){

     # take the decoded text of the <title> node rather than its serialized markup,
     # so escaped entities such as `&amp;` do not end up inside the article name
     this_page <- html_text( html_node( read_html( "https://en.wikipedia.org/wiki/Special:Random" ) , "title" ) )

     # drop the trailing site name and use underscores like the `V2` column does
     this_article_name <- gsub( " " , "_" , sub( " - Wikipedia$" , "" , this_page ) )

     # look inside `y` for the sampled article's pageviews
     this_pageviews <- subset( y , V2 == this_article_name )$V3

     # more than one row for the same title would make the count ambiguous
     if( length( this_pageviews ) > 1 ) stop( "problem" )

     # exactly one match: record it (zero matches leaves the preallocated 0)
     if( length( this_pageviews ) == 1 ) sampled_pageviews[ i ] <- this_pageviews

 }

 # these two numbers get close as this_n gets bigger
 mean( sampled_pageviews )
 average_pageviews
	library(rvest)

	# Compares the mean pageviews of articles served by Wikipedia's
	# Special:Random against the mean pageviews of every english row in one
	# daily pagecounts dump.  If the random-article feature samples uniformly,
	# the two means should converge as the sample size grows.

	tf <- tempfile()
	tf2 <- tempfile()

	# download and unzip
	download.file( "https://dumps.wikimedia.org/other/pagecounts-ez/merged/2018/2018-05/pagecounts-2018-05-27.bz2" , tf , mode = 'wb' )
	R.utils::bunzip2( tf , tf2 , remove = FALSE )

	# import and restrict to english
	# (the first 25 lines are skipped -- presumably the dump's comment header; confirm if the file format changes)
	x <- data.table::fread( tf2 , skip = 25 , header = FALSE )
	y <- subset( x , V1 == 'en.z' )

	# how many average pageviews overall?
	average_pageviews <- mean( y$V3 )


	# sample fifty thousand articles
	this_n <- 50000

	# preallocate with zeros: an article that is absent from `y` keeps its zero,
	# and the vector is never re-copied the way repeated c() appends would
	sampled_pageviews <- numeric( this_n )

	# seq_len() yields an empty loop when this_n is 0, where seq() would give c( 1 , 0 )
	for( i in seq_len( this_n ) ){

		# take the decoded text of the <title> node and strip the site-name suffix.
		# the previous pattern "(.)>(.) - Wikipedia<(.*)" had no `*` quantifiers, so it
		# failed to match ordinary titles and nearly every lookup below came back empty
		this_page <- html_text( html_node( read_html( "https://en.wikipedia.org/wiki/Special:Random" ) , "title" ) )

		# use underscores like the `V2` column does
		this_article_name <- gsub( " " , "_" , sub( " - Wikipedia$" , "" , this_page ) )

		# look inside `y` for the sampled article's pageviews
		this_pageviews <- subset( y , V2 == this_article_name )$V3

		# more than one row for the same title would make the count ambiguous
		if( length( this_pageviews ) > 1 ) stop( "problem" )

		# exactly one match: record it (zero matches leaves the preallocated 0)
		if( length( this_pageviews ) == 1 ) sampled_pageviews[ i ] <- this_pageviews

	}

	# these two numbers get close as this_n gets bigger
	mean( sampled_pageviews )
	average_pageviews