Last active
December 8, 2020 09:50
-
-
Save jeffeaton/46d771a3d11fc3449ef2884f147a070a to your computer and use it in GitHub Desktop.
Malawi Census 2018 District population projections (2018-2043)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(pdftools) | |
library(readr) | |
library(stringr) | |
library(tidyr) | |
url <- "http://www.nsomalawi.mw/images/stories/data_on_line/demography/census_2018/Thematic_Reports/Population%20Projections%202018-2050.pdf" | |
file <- tempfile(fileext = ".pdf") | |
download.file(url, file) | |
raw <- pdf_text(file) | |
#' ## Annex 1 in pages 32-38 contains Malawi national projections | |
#' for five year age groups and "special age groups" to 2050 | |
#' ## Annex 2 in pages numbered 39-263 contains population projections | |
#' for each district 2018-2043. | |
#' * Corresponds to actual pages 49:273 | |
text <- str_split(raw[49:273], "\n") | |
parse_page <- function(x) { | |
page_number <- str_subset(str_squish(x), "^[0-9]{2,3}$") | |
print(paste("Parsing page number:", page_number)) | |
## Extract district name from the page if exists | |
district <- x %>% | |
str_extract(", ([^,]+) (District|City|Rural)") %>% | |
str_replace(", ", "") | |
if (all(is.na(district))) { | |
district <- NA_character_ | |
} else { | |
district <- district[!is.na(district)] | |
} | |
if (length(district) > 1) { | |
stop("Multiple districts parsed on page: ", paste(district, collapse = ", ")) | |
} | |
## Page 242 has Neno district label but no data. | |
## Return just the district name | |
if (page_number == 242) { | |
return( tibble(district = district) ) | |
} | |
## Extract year headers | |
years <- str_squish(x) %>% | |
str_extract("^20[0-9]{2} 20[0-9]{2}$") | |
years <- years[!is.na(years)] | |
years <- unlist(str_split(years, " ")) | |
## Page 164 is missing year headers. Manually add | |
if (page_number == 164) { | |
years <- c("2042", "2043") | |
} | |
if (!length(years) %in% c(2, 4)) { | |
stop("Something gone wrong with years extraction") | |
} | |
## Get data rows | |
data <- str_subset(str_trim(x), "^(Total|[0-9]{1,2}[-+])") | |
## Identify indices of column breaks based on column of spaces | |
## Note: If everything is well formatted we don't need this. | |
## Keeping it commented b/c it's clever code that maybe handy later. | |
## is_space <- str_split(data, "", simplify = TRUE) == " " | |
## empty_col <- apply(is_space, 2, all) | |
## colsep_idx <- which(empty_col & !lag(empty_col)) | |
data <- str_split(str_squish(data), " ", simplify = TRUE) | |
if (!ncol(data) == 8 || | |
any(data == "") || | |
!nrow(data) %in% c(21, 42)) { | |
stop("Something has gone wrong with data parsing") | |
} | |
## Note: Not checking the columb headers; assuming no deviation | |
## from "Both sexes", "Male", "Female". This could be more robust. | |
block1 <- tibble( | |
district = district, | |
year = years[1], | |
age_group = data[1:21, 1], | |
both = data[1:21, 2], | |
female = data[1:21, 3], | |
male = data[1:21, 4] | |
) | |
block2 <- tibble( | |
district = district, | |
year = years[2], | |
age_group = data[1:21, 5], | |
both = data[1:21, 6], | |
female = data[1:21, 7], | |
male = data[1:21, 8] | |
) | |
val <- bind_rows(block1, block2) | |
if (nrow(data) == 42) { | |
block3 <- tibble( | |
district = district, | |
year = years[3], | |
age_group = data[22:42, 1], | |
both = data[22:42, 2], | |
female = data[22:42, 3], | |
male = data[22:42, 4] | |
) | |
block4 <- tibble( | |
district = district, | |
year = years[4], | |
age_group = data[22:42, 5], | |
both = data[22:42, 6], | |
female = data[22:42, 7], | |
male = data[22:42, 8] | |
) | |
val <- bind_rows(val, block3, block4) | |
} | |
val | |
} | |
## Process pages and tidy up | |
dfl <- lapply(text, parse_page) | |
df <- bind_rows(dfl) | |
df <- df %>% | |
fill(district, .direction = "down") %>% | |
filter(!is.na(year)) %>% | |
mutate(year = as.integer(year), | |
both = as.integer(str_remove_all(both, ",")), | |
female = as.integer(str_remove_all(female, ",")), | |
male = as.integer(str_remove_all(male, ","))) | |
## Check that all combinations of district / year / age_group represented | |
## (There is some tidyr function for this, but I can't remember what it is) | |
all_combos <- crossing(district = df$district, | |
year = 2018:2043, | |
age_group = c("Total", paste0(0:18*5, "-", 0:18*5+4), "95+")) | |
stopifnot(nrow(anti_join(all_combos, df)) == 0) | |
stopifnot(nrow(anti_join(df, all_combos)) == 0) | |
## Check no duplicated district / year / age_group | |
stopifnot(!duplicated(df[c("district", "year", "age_group")])) | |
## Check no missing values | |
stopifnot(!is.na(df)) | |
## Save results | |
write_csv(df, "malawi-census-2018-district-population-projections.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment