jeffeaton · December 8, 2020 09:50
diff --git a/malawi-census-2018-district-population-projections.R b/malawi-census-2018-district-population-projections.R
 library(dplyr)
 library(pdftools)
 library(readr)
 library(stringr)
 library(tidyr)

 url <- "http://www.nsomalawi.mw/images/stories/data_on_line/demography/census_2018/Thematic_Reports/Population%20Projections%202018-2050.pdf"

 file <- tempfile(fileext = ".pdf")
 download.file(url, file)
                      
 raw <- pdf_text(file)

 #' ## Annex 1 in pages 32-38 contains Malawi national projections
 #' for five year age groups and "special age groups" to 2050

 #' ## Annex 2 in pages numbered 39-263 contains population projections
 #' for each district 2018-2043.
 #' * Corresponds to actual pages 49:273

 text <- str_split(raw[49:273], "\n")

 parse_page <- function(x) {

  page_number <- str_subset(str_squish(x), "^[0-9]{2,3}$")
  print(paste("Parsing page number:", page_number))
  
  ## Extract district name from the page if exists
  district <- x %>%
    str_extract(", ([^,]+) (District|City|Rural)") %>%
    str_replace(", ", "")

  if (all(is.na(district))) {
    district <- NA_character_
  } else {
    district <- district[!is.na(district)]
  }

  if (length(district) > 1) {
    stop("Multiple districts parsed on page: ", paste(district, collapse = ", "))
  }

  ## Page 242 has Neno district label but no data.
  ## Return just the district name
  if (page_number == 242) {
    return( tibble(district = district) )
  }

  
  ## Extract year headers

  years <- str_squish(x) %>%
    str_extract("^20[0-9]{2} 20[0-9]{2}$")
  years <- years[!is.na(years)]
  years <- unlist(str_split(years, " "))

  ## Page 164 is missing year headers. Manually add
  if (page_number == 164) {
    years <- c("2042", "2043")
  }

  if (!length(years) %in% c(2, 4)) {
    stop("Something gone wrong with years extraction")
  }

  ## Get data rows
  data <- str_subset(str_trim(x), "^(Total|[0-9]{1,2}[-+])")

  ## Identify indices of column breaks based on column of spaces
  ## Note: If everything is well formatted we don't need this.
  ##       Keeping it commented b/c it's clever code that maybe handy later.
  ## is_space <- str_split(data, "", simplify = TRUE) == " "
  ## empty_col <- apply(is_space, 2, all)
  ## colsep_idx <- which(empty_col & !lag(empty_col))

  data <- str_split(str_squish(data), " ", simplify = TRUE)

  if (!ncol(data) == 8 ||
      any(data == "") ||
      !nrow(data) %in% c(21, 42)) {
    stop("Something has gone wrong with data parsing")
  }

  ## Note: Not checking the columb headers; assuming no deviation 
  ## from "Both sexes", "Male", "Female". This could be more robust.

  block1 <- tibble(
    district = district,
    year = years[1],
    age_group = data[1:21, 1],
    both = data[1:21, 2],
    female = data[1:21, 3],
    male = data[1:21, 4]
  )

  block2 <- tibble(
    district = district,
    year = years[2],
    age_group = data[1:21, 5],
    both = data[1:21, 6],
    female = data[1:21, 7],
    male = data[1:21, 8]
  )

  val <- bind_rows(block1, block2)

  if (nrow(data) == 42) {
    block3 <- tibble(
      district = district,
      year = years[3],
      age_group = data[22:42, 1],
      both = data[22:42, 2],
      female = data[22:42, 3],
      male = data[22:42, 4]
  )
    
    block4 <- tibble(
      district = district,
      year = years[4],
      age_group = data[22:42, 5],
      both = data[22:42, 6],
      female = data[22:42, 7],
      male = data[22:42, 8]
    )
    val <- bind_rows(val, block3, block4)
  }

  val
 }
    

 ## Process pages and tidy up

 dfl <- lapply(text, parse_page)
 df <- bind_rows(dfl)

 df <- df %>%
  fill(district, .direction = "down") %>%  
  filter(!is.na(year)) %>%
  mutate(year = as.integer(year),
         both = as.integer(str_remove_all(both, ",")),
         female = as.integer(str_remove_all(female, ",")),
         male = as.integer(str_remove_all(male, ",")))

 ## Check that all combinations of district / year / age_group represented
 ## (There is some tidyr function for this, but I can't remember what it is)

 all_combos <- crossing(district = df$district,
                       year = 2018:2043,
                       age_group = c("Total", paste0(0:18*5, "-", 0:18*5+4), "95+"))

 stopifnot(nrow(anti_join(all_combos, df)) == 0)
 stopifnot(nrow(anti_join(df, all_combos)) == 0)

 ## Check no duplicated district / year / age_group
 stopifnot(!duplicated(df[c("district", "year", "age_group")]))

 ## Check no missing values
 stopifnot(!is.na(df))


 ## Save results
 write_csv(df, "malawi-census-2018-district-population-projections.csv")
	library(dplyr)
	library(pdftools)
	library(readr)
	library(stringr)
	library(tidyr)

	url <- "http://www.nsomalawi.mw/images/stories/data_on_line/demography/census_2018/Thematic_Reports/Population%20Projections%202018-2050.pdf"

	file <- tempfile(fileext = ".pdf")
	download.file(url, file)

	raw <- pdf_text(file)

	#' ## Annex 1 in pages 32-38 contains Malawi national projections
	#' for five year age groups and "special age groups" to 2050

	#' ## Annex 2 in pages numbered 39-263 contains population projections
	#' for each district 2018-2043.
	#' * Corresponds to actual pages 49:273

	text <- str_split(raw[49:273], "\n")

	parse_page <- function(x) {

	page_number <- str_subset(str_squish(x), "^[0-9]{2,3}$")
	print(paste("Parsing page number:", page_number))

	## Extract district name from the page if exists
	district <- x %>%
	str_extract(", ([^,]+) (District\|City\|Rural)") %>%
	str_replace(", ", "")

	if (all(is.na(district))) {
	district <- NA_character_
	} else {
	district <- district[!is.na(district)]
	}

	if (length(district) > 1) {
	stop("Multiple districts parsed on page: ", paste(district, collapse = ", "))
	}

	## Page 242 has Neno district label but no data.
	## Return just the district name
	if (page_number == 242) {
	return( tibble(district = district) )
	}


	## Extract year headers

	years <- str_squish(x) %>%
	str_extract("^20[0-9]{2} 20[0-9]{2}$")
	years <- years[!is.na(years)]
	years <- unlist(str_split(years, " "))

	## Page 164 is missing year headers. Manually add
	if (page_number == 164) {
	years <- c("2042", "2043")
	}

	if (!length(years) %in% c(2, 4)) {
	stop("Something gone wrong with years extraction")
	}

	## Get data rows
	data <- str_subset(str_trim(x), "^(Total\|[0-9]{1,2}[-+])")

	## Identify indices of column breaks based on column of spaces
	## Note: If everything is well formatted we don't need this.
	## Keeping it commented b/c it's clever code that maybe handy later.
	## is_space <- str_split(data, "", simplify = TRUE) == " "
	## empty_col <- apply(is_space, 2, all)
	## colsep_idx <- which(empty_col & !lag(empty_col))

	data <- str_split(str_squish(data), " ", simplify = TRUE)

	if (!ncol(data) == 8 \|\|
	any(data == "") \|\|
	!nrow(data) %in% c(21, 42)) {
	stop("Something has gone wrong with data parsing")
	}

	## Note: Not checking the columb headers; assuming no deviation
	## from "Both sexes", "Male", "Female". This could be more robust.

	block1 <- tibble(
	district = district,
	year = years[1],
	age_group = data[1:21, 1],
	both = data[1:21, 2],
	female = data[1:21, 3],
	male = data[1:21, 4]
	)

	block2 <- tibble(
	district = district,
	year = years[2],
	age_group = data[1:21, 5],
	both = data[1:21, 6],
	female = data[1:21, 7],
	male = data[1:21, 8]
	)

	val <- bind_rows(block1, block2)

	if (nrow(data) == 42) {
	block3 <- tibble(
	district = district,
	year = years[3],
	age_group = data[22:42, 1],
	both = data[22:42, 2],
	female = data[22:42, 3],
	male = data[22:42, 4]
	)

	block4 <- tibble(
	district = district,
	year = years[4],
	age_group = data[22:42, 5],
	both = data[22:42, 6],
	female = data[22:42, 7],
	male = data[22:42, 8]
	)
	val <- bind_rows(val, block3, block4)
	}

	val
	}


	## Process pages and tidy up

	dfl <- lapply(text, parse_page)
	df <- bind_rows(dfl)

	df <- df %>%
	fill(district, .direction = "down") %>%
	filter(!is.na(year)) %>%
	mutate(year = as.integer(year),
	both = as.integer(str_remove_all(both, ",")),
	female = as.integer(str_remove_all(female, ",")),
	male = as.integer(str_remove_all(male, ",")))

	## Check that all combinations of district / year / age_group represented
	## (There is some tidyr function for this, but I can't remember what it is)

	all_combos <- crossing(district = df$district,
	year = 2018:2043,
	age_group = c("Total", paste0(0:185, "-", 0:185+4), "95+"))

	stopifnot(nrow(anti_join(all_combos, df)) == 0)
	stopifnot(nrow(anti_join(df, all_combos)) == 0)

	## Check no duplicated district / year / age_group
	stopifnot(!duplicated(df[c("district", "year", "age_group")]))

	## Check no missing values
	stopifnot(!is.na(df))


	## Save results
	write_csv(df, "malawi-census-2018-district-population-projections.csv")