Building a corpus of titles from Wikipedia
library(rvest)  # read_html(), html_nodes(), html_text(), html_attr()
library(dplyr)  # %>%, group_by(), summarize(), n()

base_beg <- "https://en.wikipedia.org/wiki/Category:"
base_end <- "th-century_novels"
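# The functions below expect two character vectors, `centuries` and `nations`,
# to exist in the global environment; the gist itself never defines them. The
# guarded assignments here are only a hypothetical sketch of their expected
# shape, not values from the original, and can be replaced freely.
if (!exists("centuries")) centuries <- c("19", "20")             # pasted into e.g. ".../Category:19th-century_novels"
if (!exists("nations"))   nations   <- c("American", "British")  # matched against subcategory names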
# Collect the URL of each national subcategory (e.g. "19th-century American
# novels") for every century in `centuries`, storing the results in the
# global data frame `categories`.
get_cat_pages <- function(){
  categories <<- data.frame(century = c(),
                            nation = c(),
                            url = c(),
                            stringsAsFactors = FALSE)
  for (century in centuries){
    cat_url <- paste0(base_beg, century, base_end)
    page <- read_html(cat_url)
    # List items in the "Subcategories" section of the century's category page
    cat_list <- html_nodes(page, "div#mw-subcategories > div.mw-content-ltr ul > li")
    for (nation in nations) {
      if (length(grep(nation, html_text(cat_list))) > 0) {
        this_url <-
          cat_list[[grep(nation, html_text(cat_list))]] %>%
          html_nodes("a") %>%
          html_attr("href")
        this_url <- paste0("https://en.wikipedia.org", this_url)
        categories <<- rbind(categories,
                             data.frame(century = century,
                                        nation = nation,
                                        url = this_url,
                                        stringsAsFactors = FALSE))
      }
    }
  }
}
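# Usage sketch (assuming `centuries` and `nations` are set): get_cat_pages()
# should leave `categories` with one row per century/nation pair and the
# columns century, nation, and url. For example:
#   get_cat_pages()
#   head(categories)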
# Visit each nation-level category page and gather the URLs of its year-level
# subcategories (e.g. "1850 American novels"), storing them in the global
# data frame `subcategories`.
get_subcat_urls <- function(){
  subcategories <<- data.frame(century = c(),
                               nation = c(),
                               year = c(),
                               url = c(),
                               stringsAsFactors = FALSE)
  for (row in 1:nrow(categories)){
    page <- read_html(categories[row, "url"])
    sub_list <- html_nodes(page, "div#mw-subcategories")
    if (length(sub_list) > 0){
      # Restrict the search to the Subcategories section so that links under
      # "Pages in category" are not picked up
      sub_list <- html_nodes(sub_list, "div.mw-content-ltr ul > li")}
    if (length(sub_list) > 0){
      # Keep only entries whose text contains a four-digit year
      index <- grep("[0-9]{4}", html_text(sub_list))
      if (length(index) > 0) {
        for (ind in index){
          the_url <-
            sub_list[[ind]] %>%
            html_nodes("a") %>%
            html_attr("href")
          the_url <- paste0("https://en.wikipedia.org", the_url)
          the_year <- gsub("[a-zA-Z ]", "",
                           sub_list[[ind]] %>%
                             html_nodes("a") %>%
                             html_text()) %>%
            as.numeric()
          subcategories <<-
            rbind(subcategories,
                  data.frame(century = categories[row, "century"],
                             nation = categories[row, "nation"],
                             year = the_year,
                             url = the_url,
                             stringsAsFactors = FALSE))
        }
      }
    }
  }
}
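# Usage sketch: after get_cat_pages() has filled `categories`, running
# get_subcat_urls() should leave `subcategories` with one row per nation-year
# combination (columns century, nation, year, url), e.g.:
#   get_subcat_urls()
#   dplyr::count(subcategories, nation)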
# Download the HTML of each year-level subcategory page into year_lists/,
# skipping files that are already cached and pausing between requests.
get_subcat_pages <- function() {
  if (!dir.exists("year_lists")) {dir.create("year_lists")}
  for (row in 1:nrow(subcategories)){
    id <- paste0(subcategories[row, "nation"],
                 subcategories[row, "year"])
    this_url <- subcategories[row, "url"]
    this_file <- paste0("year_lists/", id, ".html")
    if (!file.exists(this_file)){
      download.file(this_url, destfile = this_file)
      # Save the server! Wait before downloading more.
      randomtime <- sample(1:10, 1) * sample(c(0.5, 1, pi/2), 1)
      cat("Waiting for", randomtime, "seconds\n")
      Sys.sleep(randomtime)
    }
  }
}
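# Usage sketch: get_subcat_pages() writes files named like "American1850.html"
# into year_lists/, so an interrupted run can simply be restarted and only the
# missing files are fetched again. For example:
#   get_subcat_pages()
#   length(list.files("year_lists"))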
# Read each cached HTML file in year_lists/ and collect the novel titles
# listed under "Pages in category," building the global data frame
# `corpus_wikipedia` along with yearly summaries.
parse_subcat_pages <- function(){
  corpus_wikipedia <<- data.frame(titles = c(),
                                  year = c(),
                                  nation = c(),
                                  stringsAsFactors = FALSE)
  files <- list.files(path = "year_lists/")
  for (filename in files){
    id <- gsub("\\.html$", "", filename)
    this_length <- nchar(id)
    # The last four characters of the file name hold the year; the rest is the nation
    this_year <- substr(id, this_length - 3, this_length)
    if (!is.na(suppressWarnings(as.numeric(this_year)))) {
      this_nation <- substr(id, 1, this_length - 4)
      these_titles <- read_html(paste0("year_lists/", filename)) %>%
        html_nodes("div#mw-pages") %>%
        html_nodes("div.mw-content-ltr") %>%
        html_nodes("li") %>%
        html_text()
      this_data <- data.frame(titles = as.character(these_titles),
                              year = as.numeric(this_year),
                              nation = this_nation,
                              stringsAsFactors = FALSE)
      corpus_wikipedia <<- rbind(corpus_wikipedia,
                                 this_data)
    }
  }
  # Summaries: total titles per year, and titles per nation per year
  corpus_byyear <<- corpus_wikipedia %>%
    group_by(year) %>%
    summarize(count = n())
  nation_byyear <<- corpus_wikipedia %>%
    group_by(nation, year) %>%
    summarize(count = n())
}
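# Usage sketch: parse_subcat_pages() takes no arguments and works entirely from
# the cached files, producing corpus_wikipedia (one row per title) plus the
# summaries corpus_byyear and nation_byyear, e.g.:
#   parse_subcat_pages()
#   nrow(corpus_wikipedia)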
# Record a centered moving average of the yearly counts (window is assumed to
# be odd), storing the result in the global data frame `moving_avg`.
record_avg <- function(df = corpus_byyear,
                       window = 3){
  moving_avg <<- data.frame(avg = c(),
                            year = c(),
                            stringsAsFactors = FALSE)
  half <- floor(window / 2)
  start <- half + 1
  end <- nrow(df) - half
  for (row in start:end){
    # Average the counts for the `window` years centered on this row
    this_start <- row - half
    this_end <- row + half
    this_avg <- mean(df$count[this_start:this_end])
    if ("nation" %in% colnames(df)){
      moving_avg <<- rbind(moving_avg,
                           data.frame(avg = this_avg,
                                      year = df$year[row],
                                      nation = df$nation[row]))
    } else {
      moving_avg <<- rbind(moving_avg,
                           data.frame(avg = this_avg,
                                      year = df$year[row]))
    }
  }
}
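# A hypothetical end-to-end run (an assumption, not spelled out in the gist):
#   get_cat_pages()                        # nation-level category URLs
#   get_subcat_urls()                      # year-level subcategory URLs
#   get_subcat_pages()                     # cache the year pages locally
#   parse_subcat_pages()                   # build corpus_wikipedia and summaries
#   record_avg(corpus_byyear, window = 3)  # smooth the yearly counts into moving_avg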
These functions are explained further in a corresponding blog post here: https://jmclawson.net/blog/posts/selecting-a-better-corpus/
The resulting data is available to explore here: https://jmclawson.net/projects/wiki-corpus.html