Created
May 21, 2025 15:40
-
-
Save kathsherratt/7132c9aed36bef7f7e0a8b15832bd023 to your computer and use it in GitHub Desktop.
Deduplicating UK school pupil households
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Last modified June 6 2020 | |
| # ------------------------- | |
| library(data.table) | |
| library(tidyverse) | |
| # Aim: find duplicate residences (multiple students in the same house) | |
| # # Read in raw data | |
| # autumn_20 <- fread("Autumn_Census_Addresses_2020.txt") | |
| # | |
| # # Filter students | |
| # autumn_20 <- as.data.table(autumn_20) | |
| autumn_20[, unfiltered_rowkey := .I] | |
| # | |
| # address_dt <- autumn_20[EnrolStatus_AUT20 %in% c("C", "M", "F")] | |
| # address_dt$date <- lubridate::ymd(address_dt$EntryDate_AUT20) | |
| # address_dt <- address_dt[address_dt[, .I[which.max(date)], by=PupilMatchingRefAnonymous_AUT20]$V1] | |
| # | |
| # # Original data set: 8,281,864 | |
| # # Filtered by enrolment: 8,262,658 | |
| # # Filtered to latest enrol date by student: 8,260,919 | |
| # # Check: 2 pupils with same ID, different entry dates | |
| # # example_multi_id <- address_dt[PupilMatchingRefAnonymous_AUT20 %in% c("CCA55EC13DD3B9F9CA", "CCA55FC033DEBCF1CD")] | |
| # | |
| # | |
| # # Address matching -------------------------------------------------------- | |
| # address <- as_tibble(address_dt) | |
| # | |
| # # Set to missing any addresses same as school address | |
| # address <- address %>% | |
| # select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, UniquePropertyReferenceNumber_AUT20, LAEstab_AUT20, postcode_school = SchoolPostcode_AUT20, postcode_resi = Postcode_AUT20, address_full = Address_Adjusted_AUT20) %>% | |
| # mutate(postcode_adj = ifelse(postcode_resi == postcode_school, NA, postcode_resi)) | |
| # | |
| # | |
| # # Single vs multiple student postcodes ----------------------------------------------------- | |
| # # Step 1 Find unique postcodes with only 1 line in them - these are unique 1 student households | |
| # # Step 2 Find unique postcodes with multiple lines in them - these need sorting to whether multiple addresses or not | |
| # | |
| # n_per_postcode <- address %>% | |
| # group_by(postcode_adj) %>% | |
| # summarise(n_students = n()) | |
| # | |
| # single_student_postcode <- filter(n_per_postcode, n_students == 1) %>% | |
| # rename(n_students_hh = n_students) | |
| # | |
| # multi_postcode <- filter(n_per_postcode, n_students > 1) | |
| # | |
| # # Now dealing only with multiple students living in one postcode | |
| # multi_postcode_address <- filter(address, postcode_adj %in% multi_postcode$postcode_adj) | |
| # | |
| # # Find most common words for building names to remove | |
| # # token_address <- address %>% | |
| # # tidytext::unnest_tokens(address_line, address_full) %>% | |
| # # count(address_line, sort = T) | |
| # | |
| # # Clean address to create unique match for each household | |
| # multi_postcode_address <- multi_postcode_address %>% | |
| # mutate( | |
| # address_trim = str_to_lower(address_full), | |
| # address_trim = str_remove_all(address_trim, "flat"), | |
| # address_trim = str_remove_all(address_trim, "house"), | |
| # address_trim = str_remove_all(address_trim, "court"), | |
| # address_trim = str_remove_all(address_trim, "estate"), | |
| # address_trim = str_remove_all(address_trim, "block"), | |
| # address_trim = str_remove_all(address_trim, "apartment+"), | |
| # address_trim = str_trim(address_trim), | |
| # # All numbers | |
| # number_only = str_remove_all(address_trim, "[[:alpha:]]"), | |
| # number_only = str_remove_all(number_only, "[[:punct:]]"), | |
| # number_only = str_replace_na(number_only, "_"), | |
| # # Number then letter e.g. Flat 8B | |
| # number_letter_locate = str_locate_all(address_trim, "[[:digit:]][[:digit:]][[:alpha:]]"), | |
| # number_letter_start = as.numeric(str_sub(number_letter_locate, start = 1, end = 1)), | |
| # number_letter_end = as.numeric(str_sub(number_letter_locate, start = 3, end = 3)), | |
| # number_letter = str_sub(address_trim, start = number_letter_start, end = number_letter_end), | |
| # number_letter = str_replace_na(number_letter, "_"), | |
| # # House name only | |
| # name_only = str_to_lower(address_trim), | |
| # name_only = str_remove_all(name_only, "[[:digit:]]"), | |
| # name_only = word(address_trim), | |
| # name_only = ifelse(name_only == "the", word(address_trim, 1,2), name_only), | |
| # name_only = str_remove_all(name_only, "[[:punct:]]"), | |
| # name_only = str_replace_na(name_only, "_"), | |
| # # Combine all above | |
| # hh_address = str_c(postcode_adj, "_", number_only, "_", number_letter, "_", name_only), | |
| # hh_address = str_remove_all(hh_address, "[[:space:]]") | |
| # ) | |
| # | |
| # # Group by household identifier | |
| # multi_students_by_hh <- multi_postcode_address %>% | |
| # group_by(postcode_adj, hh_address) %>% | |
| # summarise(n_students_hh = n()) %>% | |
| # ungroup() %>% | |
| # mutate(hh_id = paste0("hh_", 1:n())) | |
| # | |
| # | |
| # # Matching back to data --------------------------------------------------- | |
| # | |
| # Match multi-student postcodes back to student records | |
| multi_key <- left_join(multi_postcode_address, multi_students_by_hh, by = c("hh_address" = "hh_address")) %>% | |
| select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, hh_id) | |
| # Key for single student postcodes | |
| single_key <- single_student_postcode %>% | |
| mutate(hh_id = paste0("hh_", nrow(multi_students_by_hh)+1:n())) | |
| # Attach all data | |
| address_with_id <- address %>% | |
| left_join(multi_key, by = c("unfiltered_rowkey" = "unfiltered_rowkey")) %>% | |
| left_join(single_key, by = c("postcode_adj" = "postcode_adj")) %>% | |
| mutate(hh_id = ifelse(is.na(hh_id.x), hh_id.y, hh_id.x)) %>% | |
| select(unfiltered_rowkey, hh_id) | |
| # Write keys to csv | |
| final_dt <- address %>% | |
| select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, postcode_adj) %>% | |
| left_join(address_with_id, by = "unfiltered_rowkey") | |
| readr::write_csv(final_dt, path = here::here("aut_hh_v2.csv"), col_names = TRUE) | |
| # check mean students per household ----------------------------------------------------------------- | |
| # address_by_hh <- address_with_id %>% | |
| # group_by(hh_id) %>% | |
| # summarise(n_students = n()) %>% | |
| # summary(n_students) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment