Building a corpus of titles from Wikipedia
library(rvest)  # read_html(), html_nodes(), html_text(), html_attr()
library(dplyr)  # %>%, group_by(), summarize(), n()

base_beg <- "https://en.wikipedia.org/wiki/Category:"
base_end <- "th-century_novels"
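# The functions below expect two character vectors, `centuries` and `nations`,
# to exist in the global environment; the gist itself never defines them. The
# guarded assignments here are only a hypothetical sketch of their expected
# shape, not values from the original, and can be replaced freely.
if (!exists("centuries")) centuries <- c("19", "20")             # pasted into e.g. ".../Category:19th-century_novels"
if (!exists("nations"))   nations   <- c("American", "British")  # matched against subcategory names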
# Collect the URL of each national subcategory (e.g. "19th-century American
# novels") for every century in `centuries`, storing the results in the
# global data frame `categories`.
get_cat_pages <- function(){
  categories <<- data.frame(century = c(),
                            nation = c(),
                            url = c(),
                            stringsAsFactors = FALSE)
  for (century in centuries){
    cat_url <- paste0(base_beg, century, base_end)
    page <- read_html(cat_url)
    # List items in the "Subcategories" section of the century's category page
    cat_list <- html_nodes(page, "div#mw-subcategories > div.mw-content-ltr ul > li")
    for (nation in nations) {
      if (length(grep(nation, html_text(cat_list))) > 0) {
        this_url <-
          cat_list[[grep(nation, html_text(cat_list))]] %>%
          html_nodes("a") %>%
          html_attr("href")
        this_url <- paste0("https://en.wikipedia.org", this_url)
        categories <<- rbind(categories,
                             data.frame(century = century,
                                        nation = nation,
                                        url = this_url,
                                        stringsAsFactors = FALSE))
      }
    }
  }
}
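# Usage sketch (assuming `centuries` and `nations` are set): get_cat_pages()
# should leave `categories` with one row per century/nation pair and the
# columns century, nation, and url. For example:
#   get_cat_pages()
#   head(categories)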
# Visit each nation-level category page and gather the URLs of its year-level
# subcategories (e.g. "1850 American novels"), storing them in the global
# data frame `subcategories`.
get_subcat_urls <- function(){
  subcategories <<- data.frame(century = c(),
                               nation = c(),
                               year = c(),
                               url = c(),
                               stringsAsFactors = FALSE)
  for (row in 1:nrow(categories)){
    page <- read_html(categories[row, "url"])
    sub_list <- html_nodes(page, "div#mw-subcategories")
    if (length(sub_list) > 0){
      # Restrict the search to the Subcategories section so that links under
      # "Pages in category" are not picked up
      sub_list <- html_nodes(sub_list, "div.mw-content-ltr ul > li")}
    if (length(sub_list) > 0){
      # Keep only entries whose text contains a four-digit year
      index <- grep("[0-9]{4}", html_text(sub_list))
      if (length(index) > 0) {
        for (ind in index){
          the_url <-
            sub_list[[ind]] %>%
            html_nodes("a") %>%
            html_attr("href")
          the_url <- paste0("https://en.wikipedia.org", the_url)
          the_year <- gsub("[a-zA-Z ]", "",
                           sub_list[[ind]] %>%
                             html_nodes("a") %>%
                             html_text()) %>%
            as.numeric()
          subcategories <<-
            rbind(subcategories,
                  data.frame(century = categories[row, "century"],
                             nation = categories[row, "nation"],
                             year = the_year,
                             url = the_url,
                             stringsAsFactors = FALSE))
        }
      }
    }
  }
}
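# Usage sketch: after get_cat_pages() has filled `categories`, running
# get_subcat_urls() should leave `subcategories` with one row per nation-year
# combination (columns century, nation, year, url), e.g.:
#   get_subcat_urls()
#   dplyr::count(subcategories, nation)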
# Download the HTML of each year-level subcategory page into year_lists/,
# skipping files that are already cached and pausing between requests.
get_subcat_pages <- function() {
  if (!dir.exists("year_lists")) {dir.create("year_lists")}
  for (row in 1:nrow(subcategories)){
    id <- paste0(subcategories[row, "nation"],
                 subcategories[row, "year"])
    this_url <- subcategories[row, "url"]
    this_file <- paste0("year_lists/", id, ".html")
    if (!file.exists(this_file)){
      download.file(this_url, destfile = this_file)
      # Save the server! Wait before downloading more.
      randomtime <- sample(1:10, 1) * sample(c(0.5, 1, pi/2), 1)
      cat("Waiting for", randomtime, "seconds\n")
      Sys.sleep(randomtime)
    }
  }
}
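# Usage sketch: get_subcat_pages() writes files named like "American1850.html"
# into year_lists/, so an interrupted run can simply be restarted and only the
# missing files are fetched again. For example:
#   get_subcat_pages()
#   length(list.files("year_lists"))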
# Read each cached HTML file in year_lists/ and collect the novel titles
# listed under "Pages in category," building the global data frame
# `corpus_wikipedia` along with yearly summaries.
parse_subcat_pages <- function(){
  corpus_wikipedia <<- data.frame(titles = c(),
                                  year = c(),
                                  nation = c(),
                                  stringsAsFactors = FALSE)
  files <- list.files(path = "year_lists/")
  for (filename in files){
    id <- gsub("\\.html$", "", filename)
    this_length <- nchar(id)
    # The last four characters of the file name hold the year; the rest is the nation
    this_year <- substr(id, this_length - 3, this_length)
    if (!is.na(suppressWarnings(as.numeric(this_year)))) {
      this_nation <- substr(id, 1, this_length - 4)
      these_titles <- read_html(paste0("year_lists/", filename)) %>%
        html_nodes("div#mw-pages") %>%
        html_nodes("div.mw-content-ltr") %>%
        html_nodes("li") %>%
        html_text()
      this_data <- data.frame(titles = as.character(these_titles),
                              year = as.numeric(this_year),
                              nation = this_nation,
                              stringsAsFactors = FALSE)
      corpus_wikipedia <<- rbind(corpus_wikipedia,
                                 this_data)
    }
  }
  # Summaries: total titles per year, and titles per nation per year
  corpus_byyear <<- corpus_wikipedia %>%
    group_by(year) %>%
    summarize(count = n())
  nation_byyear <<- corpus_wikipedia %>%
    group_by(nation, year) %>%
    summarize(count = n())
}
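# Usage sketch: parse_subcat_pages() takes no arguments and works entirely from
# the cached files, producing corpus_wikipedia (one row per title) plus the
# summaries corpus_byyear and nation_byyear, e.g.:
#   parse_subcat_pages()
#   nrow(corpus_wikipedia)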
# Record a centered moving average of the yearly counts (window is assumed to
# be odd), storing the result in the global data frame `moving_avg`.
record_avg <- function(df = corpus_byyear,
                       window = 3){
  moving_avg <<- data.frame(avg = c(),
                            year = c(),
                            stringsAsFactors = FALSE)
  half <- floor(window / 2)
  start <- half + 1
  end <- nrow(df) - half
  for (row in start:end){
    # Average the counts for the `window` years centered on this row
    this_start <- row - half
    this_end <- row + half
    this_avg <- mean(df$count[this_start:this_end])
    if ("nation" %in% colnames(df)){
      moving_avg <<- rbind(moving_avg,
                           data.frame(avg = this_avg,
                                      year = df$year[row],
                                      nation = df$nation[row]))
    } else {
      moving_avg <<- rbind(moving_avg,
                           data.frame(avg = this_avg,
                                      year = df$year[row]))
    }
  }
}
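# A hypothetical end-to-end run (an assumption, not spelled out in the gist):
#   get_cat_pages()                        # nation-level category URLs
#   get_subcat_urls()                      # year-level subcategory URLs
#   get_subcat_pages()                     # cache the year pages locally
#   parse_subcat_pages()                   # build corpus_wikipedia and summaries
#   record_avg(corpus_byyear, window = 3)  # smooth the yearly counts into moving_avg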
These functions are explained further in a corresponding blog post here: https://jmclawson.net/blog/posts/selecting-a-better-corpus/
The resulting data is available to explore here: https://jmclawson.net/projects/wiki-corpus.html