Skip to content

Instantly share code, notes, and snippets.

@kathsherratt
Created May 21, 2025 15:40
Show Gist options
  • Select an option

  • Save kathsherratt/7132c9aed36bef7f7e0a8b15832bd023 to your computer and use it in GitHub Desktop.

Select an option

Save kathsherratt/7132c9aed36bef7f7e0a8b15832bd023 to your computer and use it in GitHub Desktop.
Deduplicating UK school pupil households
# Last modified June 6 2020
# -------------------------
library(data.table)
library(tidyverse)
# Aim: find duplicate residences (multiple students in the same house)
# # Read in raw data
# autumn_20 <- fread("Autumn_Census_Addresses_2020.txt")
#
# # Filter students
# autumn_20 <- as.data.table(autumn_20)
autumn_20[, unfiltered_rowkey := .I]
#
# address_dt <- autumn_20[EnrolStatus_AUT20 %in% c("C", "M", "F")]
# address_dt$date <- lubridate::ymd(address_dt$EntryDate_AUT20)
# address_dt <- address_dt[address_dt[, .I[which.max(date)], by=PupilMatchingRefAnonymous_AUT20]$V1]
#
# # Original data set: 8,281,864
# # Filtered by enrolment: 8,262,658
# # Filtered to latest enrol date by student: 8,260,919
# # Check: 2 pupils with same ID, different entry dates
# # example_multi_id <- address_dt[PupilMatchingRefAnonymous_AUT20 %in% c("CCA55EC13DD3B9F9CA", "CCA55FC033DEBCF1CD")]
#
#
# # Address matching --------------------------------------------------------
# address <- as_tibble(address_dt)
#
# # Set to missing any addresses same as school address
# address <- address %>%
# select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, UniquePropertyReferenceNumber_AUT20, LAEstab_AUT20, postcode_school = SchoolPostcode_AUT20, postcode_resi = Postcode_AUT20, address_full = Address_Adjusted_AUT20) %>%
# mutate(postcode_adj = ifelse(postcode_resi == postcode_school, NA, postcode_resi))
#
#
# # Single vs multiple student postcodes -----------------------------------------------------
# # Step 1 Find unique postcodes with only 1 line in them - these are unique 1 student households
# # Step 2 Find unique postcodes with multiple lines in them - these need sorting to whether multiple addresses or not
#
# n_per_postcode <- address %>%
# group_by(postcode_adj) %>%
# summarise(n_students = n())
#
# single_student_postcode <- filter(n_per_postcode, n_students == 1) %>%
# rename(n_students_hh = n_students)
#
# multi_postcode <- filter(n_per_postcode, n_students > 1)
#
# # Now dealing only with multiple students living in one postcode
# multi_postcode_address <- filter(address, postcode_adj %in% multi_postcode$postcode_adj)
#
# # Find most common words for building names to remove
# # token_address <- address %>%
# # tidytext::unnest_tokens(address_line, address_full) %>%
# # count(address_line, sort = T)
#
# # Clean address to create unique match for each household
# multi_postcode_address <- multi_postcode_address %>%
# mutate(
# address_trim = str_to_lower(address_full),
# address_trim = str_remove_all(address_trim, "flat"),
# address_trim = str_remove_all(address_trim, "house"),
# address_trim = str_remove_all(address_trim, "court"),
# address_trim = str_remove_all(address_trim, "estate"),
# address_trim = str_remove_all(address_trim, "block"),
# address_trim = str_remove_all(address_trim, "apartment+"),
# address_trim = str_trim(address_trim),
# # All numbers
# number_only = str_remove_all(address_trim, "[[:alpha:]]"),
# number_only = str_remove_all(number_only, "[[:punct:]]"),
# number_only = str_replace_na(number_only, "_"),
# # Number then letter e.g. Flat 8B
# number_letter_locate = str_locate_all(address_trim, "[[:digit:]][[:digit:]][[:alpha:]]"),
# number_letter_start = as.numeric(str_sub(number_letter_locate, start = 1, end = 1)),
# number_letter_end = as.numeric(str_sub(number_letter_locate, start = 3, end = 3)),
# number_letter = str_sub(address_trim, start = number_letter_start, end = number_letter_end),
# number_letter = str_replace_na(number_letter, "_"),
# # House name only
# name_only = str_to_lower(address_trim),
# name_only = str_remove_all(name_only, "[[:digit:]]"),
# name_only = word(address_trim),
# name_only = ifelse(name_only == "the", word(address_trim, 1,2), name_only),
# name_only = str_remove_all(name_only, "[[:punct:]]"),
# name_only = str_replace_na(name_only, "_"),
# # Combine all above
# hh_address = str_c(postcode_adj, "_", number_only, "_", number_letter, "_", name_only),
# hh_address = str_remove_all(hh_address, "[[:space:]]")
# )
#
# # Group by household identifier
# multi_students_by_hh <- multi_postcode_address %>%
# group_by(postcode_adj, hh_address) %>%
# summarise(n_students_hh = n()) %>%
# ungroup() %>%
# mutate(hh_id = paste0("hh_", 1:n()))
#
#
# # Matching back to data ---------------------------------------------------
#
# Match multi-student postcodes back to student records
multi_key <- left_join(multi_postcode_address, multi_students_by_hh, by = c("hh_address" = "hh_address")) %>%
select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, hh_id)
# Key for single student postcodes
single_key <- single_student_postcode %>%
mutate(hh_id = paste0("hh_", nrow(multi_students_by_hh)+1:n()))
# Attach all data
address_with_id <- address %>%
left_join(multi_key, by = c("unfiltered_rowkey" = "unfiltered_rowkey")) %>%
left_join(single_key, by = c("postcode_adj" = "postcode_adj")) %>%
mutate(hh_id = ifelse(is.na(hh_id.x), hh_id.y, hh_id.x)) %>%
select(unfiltered_rowkey, hh_id)
# Write keys to csv
final_dt <- address %>%
select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, postcode_adj) %>%
left_join(address_with_id, by = "unfiltered_rowkey")
readr::write_csv(final_dt, path = here::here("aut_hh_v2.csv"), col_names = TRUE)
# check mean students per household -----------------------------------------------------------------
# address_by_hh <- address_with_id %>%
# group_by(hh_id) %>%
# summarise(n_students = n()) %>%
# summary(n_students)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment