kathsherratt · May 21, 2025 15:40
diff --git a/household-matching-v2.R b/household-matching-v2.R
 # Last modified June 6 2020
 # -------------------------

 library(data.table)
 library(tidyverse)
 # Aim: find duplicate residences (multiple students in the same house)

 # # Read in raw data
 # autumn_20 <- fread("Autumn_Census_Addresses_2020.txt")
 # 
 # # Filter students
 # autumn_20 <- as.data.table(autumn_20)
 autumn_20[, unfiltered_rowkey := .I]
 # 
 # address_dt <- autumn_20[EnrolStatus_AUT20 %in% c("C", "M", "F")]
 # address_dt$date <- lubridate::ymd(address_dt$EntryDate_AUT20)
 # address_dt <- address_dt[address_dt[, .I[which.max(date)], by=PupilMatchingRefAnonymous_AUT20]$V1]
 # 
 # # Original data set: 8,281,864
 # # Filtered by enrolment: 8,262,658
 # # Filtered to latest enrol date by student: 8,260,919
 # # Check: 2 pupils with same ID, different entry dates
 # # example_multi_id <- address_dt[PupilMatchingRefAnonymous_AUT20 %in% c("CCA55EC13DD3B9F9CA", "CCA55FC033DEBCF1CD")]
 # 
 # 
 # # Address matching --------------------------------------------------------
 # address <- as_tibble(address_dt)
 # 
 # # Set to missing any addresses same as school address
 # address <- address %>%
 #   select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, UniquePropertyReferenceNumber_AUT20, LAEstab_AUT20, postcode_school = SchoolPostcode_AUT20, postcode_resi = Postcode_AUT20, address_full = Address_Adjusted_AUT20) %>%
 #   mutate(postcode_adj = ifelse(postcode_resi == postcode_school, NA, postcode_resi))
 # 
 # 
 # # Single vs multiple student postcodes -----------------------------------------------------
 # # Step 1 Find unique postcodes with only 1 line in them - these are unique 1 student households
 # # Step 2 Find unique postcodes with multiple lines in them - these need sorting to whether multiple addresses or not
 # 
 # n_per_postcode <- address %>%
 #   group_by(postcode_adj) %>%
 #   summarise(n_students = n())
 # 
 # single_student_postcode <- filter(n_per_postcode, n_students == 1) %>%
 #   rename(n_students_hh = n_students)
 # 
 # multi_postcode <- filter(n_per_postcode, n_students > 1)
 # 
 # # Now dealing only with multiple students living in one postcode 
 # multi_postcode_address <- filter(address, postcode_adj %in% multi_postcode$postcode_adj)
 # 
 # # Find most common words for building names to remove
 # # token_address <- address %>%
 # #   tidytext::unnest_tokens(address_line, address_full) %>%
 # #   count(address_line, sort = T)
 # 
 # # Clean address to create unique match for each household
 # multi_postcode_address <- multi_postcode_address %>%
 #   mutate(
 #     address_trim = str_to_lower(address_full),
 #     address_trim = str_remove_all(address_trim, "flat"),
 #     address_trim = str_remove_all(address_trim, "house"),
 #     address_trim = str_remove_all(address_trim, "court"),
 #     address_trim = str_remove_all(address_trim, "estate"),
 #     address_trim = str_remove_all(address_trim, "block"),
 #     address_trim = str_remove_all(address_trim, "apartment+"),
 #     address_trim = str_trim(address_trim),
 #     # All numbers
 #     number_only = str_remove_all(address_trim, "[[:alpha:]]"),
 #     number_only = str_remove_all(number_only, "[[:punct:]]"),
 #     number_only = str_replace_na(number_only, "_"),
 #     # Number then letter e.g. Flat 8B
 #     number_letter_locate = str_locate_all(address_trim, "[[:digit:]][[:digit:]][[:alpha:]]"),
 #     number_letter_start = as.numeric(str_sub(number_letter_locate, start = 1, end = 1)),
 #     number_letter_end = as.numeric(str_sub(number_letter_locate, start = 3, end = 3)),
 #     number_letter = str_sub(address_trim, start = number_letter_start, end = number_letter_end),
 #     number_letter = str_replace_na(number_letter, "_"),
 #     # House name only
 #     name_only = str_to_lower(address_trim),
 #     name_only = str_remove_all(name_only, "[[:digit:]]"),
 #     name_only = word(address_trim),
 #     name_only = ifelse(name_only == "the", word(address_trim, 1,2), name_only),
 #     name_only = str_remove_all(name_only, "[[:punct:]]"),
 #     name_only = str_replace_na(name_only, "_"),
 #     # Combine all above
 #     hh_address = str_c(postcode_adj, "_", number_only, "_", number_letter, "_", name_only),
 #     hh_address = str_remove_all(hh_address, "[[:space:]]")
 #   )
 # 
 # # Group by household identifier
 # multi_students_by_hh <- multi_postcode_address %>%
 #   group_by(postcode_adj, hh_address) %>%
 #   summarise(n_students_hh = n()) %>%
 #   ungroup() %>%
 #   mutate(hh_id = paste0("hh_", 1:n()))
 # 
 # 
 # # Matching back to data ---------------------------------------------------
 # 
 # Match multi-student postcodes back to student records
 multi_key <- left_join(multi_postcode_address, multi_students_by_hh, by = c("hh_address" = "hh_address")) %>%
  select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, hh_id)

 # Key for single student postcodes
 single_key <- single_student_postcode %>%
  mutate(hh_id = paste0("hh_", nrow(multi_students_by_hh)+1:n()))

 # Attach all data
 address_with_id <- address %>%
  left_join(multi_key, by = c("unfiltered_rowkey" = "unfiltered_rowkey")) %>%
  left_join(single_key, by = c("postcode_adj" = "postcode_adj")) %>%
  mutate(hh_id = ifelse(is.na(hh_id.x), hh_id.y, hh_id.x)) %>%
  select(unfiltered_rowkey, hh_id)

 # Write keys to csv
 final_dt <- address %>%
  select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, postcode_adj) %>%
  left_join(address_with_id, by = "unfiltered_rowkey")

 readr::write_csv(final_dt, path = here::here("aut_hh_v2.csv"), col_names = TRUE)


 # check mean students per household -----------------------------------------------------------------
 # address_by_hh <- address_with_id %>%
 #   group_by(hh_id) %>%
 #   summarise(n_students = n()) %>%
 #   summary(n_students)
	# Last modified June 6 2020
	# -------------------------

	library(data.table)
	library(tidyverse)
	# Aim: find duplicate residences (multiple students in the same house)

	# # Read in raw data
	# autumn_20 <- fread("Autumn_Census_Addresses_2020.txt")
	#
	# # Filter students
	# autumn_20 <- as.data.table(autumn_20)
	autumn_20[, unfiltered_rowkey := .I]
	#
	# address_dt <- autumn_20[EnrolStatus_AUT20 %in% c("C", "M", "F")]
	# address_dt$date <- lubridate::ymd(address_dt$EntryDate_AUT20)
	# address_dt <- address_dt[address_dt[, .I[which.max(date)], by=PupilMatchingRefAnonymous_AUT20]$V1]
	#
	# # Original data set: 8,281,864
	# # Filtered by enrolment: 8,262,658
	# # Filtered to latest enrol date by student: 8,260,919
	# # Check: 2 pupils with same ID, different entry dates
	# # example_multi_id <- address_dt[PupilMatchingRefAnonymous_AUT20 %in% c("CCA55EC13DD3B9F9CA", "CCA55FC033DEBCF1CD")]
	#
	#
	# # Address matching --------------------------------------------------------
	# address <- as_tibble(address_dt)
	#
	# # Set to missing any addresses same as school address
	# address <- address %>%
	# select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, UniquePropertyReferenceNumber_AUT20, LAEstab_AUT20, postcode_school = SchoolPostcode_AUT20, postcode_resi = Postcode_AUT20, address_full = Address_Adjusted_AUT20) %>%
	# mutate(postcode_adj = ifelse(postcode_resi == postcode_school, NA, postcode_resi))
	#
	#
	# # Single vs multiple student postcodes -----------------------------------------------------
	# # Step 1 Find unique postcodes with only 1 line in them - these are unique 1 student households
	# # Step 2 Find unique postcodes with multiple lines in them - these need sorting to whether multiple addresses or not
	#
	# n_per_postcode <- address %>%
	# group_by(postcode_adj) %>%
	# summarise(n_students = n())
	#
	# single_student_postcode <- filter(n_per_postcode, n_students == 1) %>%
	# rename(n_students_hh = n_students)
	#
	# multi_postcode <- filter(n_per_postcode, n_students > 1)
	#
	# # Now dealing only with multiple students living in one postcode
	# multi_postcode_address <- filter(address, postcode_adj %in% multi_postcode$postcode_adj)
	#
	# # Find most common words for building names to remove
	# # token_address <- address %>%
	# # tidytext::unnest_tokens(address_line, address_full) %>%
	# # count(address_line, sort = T)
	#
	# # Clean address to create unique match for each household
	# multi_postcode_address <- multi_postcode_address %>%
	# mutate(
	# address_trim = str_to_lower(address_full),
	# address_trim = str_remove_all(address_trim, "flat"),
	# address_trim = str_remove_all(address_trim, "house"),
	# address_trim = str_remove_all(address_trim, "court"),
	# address_trim = str_remove_all(address_trim, "estate"),
	# address_trim = str_remove_all(address_trim, "block"),
	# address_trim = str_remove_all(address_trim, "apartment+"),
	# address_trim = str_trim(address_trim),
	# # All numbers
	# number_only = str_remove_all(address_trim, "[[:alpha:]]"),
	# number_only = str_remove_all(number_only, "[[:punct:]]"),
	# number_only = str_replace_na(number_only, "_"),
	# # Number then letter e.g. Flat 8B
	# number_letter_locate = str_locate_all(address_trim, "[[:digit:]][[:digit:]][[:alpha:]]"),
	# number_letter_start = as.numeric(str_sub(number_letter_locate, start = 1, end = 1)),
	# number_letter_end = as.numeric(str_sub(number_letter_locate, start = 3, end = 3)),
	# number_letter = str_sub(address_trim, start = number_letter_start, end = number_letter_end),
	# number_letter = str_replace_na(number_letter, "_"),
	# # House name only
	# name_only = str_to_lower(address_trim),
	# name_only = str_remove_all(name_only, "[[:digit:]]"),
	# name_only = word(address_trim),
	# name_only = ifelse(name_only == "the", word(address_trim, 1,2), name_only),
	# name_only = str_remove_all(name_only, "[[:punct:]]"),
	# name_only = str_replace_na(name_only, "_"),
	# # Combine all above
	# hh_address = str_c(postcode_adj, "_", number_only, "_", number_letter, "_", name_only),
	# hh_address = str_remove_all(hh_address, "[[:space:]]")
	# )
	#
	# # Group by household identifier
	# multi_students_by_hh <- multi_postcode_address %>%
	# group_by(postcode_adj, hh_address) %>%
	# summarise(n_students_hh = n()) %>%
	# ungroup() %>%
	# mutate(hh_id = paste0("hh_", 1:n()))
	#
	#
	# # Matching back to data ---------------------------------------------------
	#
	# Match multi-student postcodes back to student records
	multi_key <- left_join(multi_postcode_address, multi_students_by_hh, by = c("hh_address" = "hh_address")) %>%
	select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, hh_id)

	# Key for single student postcodes
	single_key <- single_student_postcode %>%
	mutate(hh_id = paste0("hh_", nrow(multi_students_by_hh)+1:n()))

	# Attach all data
	address_with_id <- address %>%
	left_join(multi_key, by = c("unfiltered_rowkey" = "unfiltered_rowkey")) %>%
	left_join(single_key, by = c("postcode_adj" = "postcode_adj")) %>%
	mutate(hh_id = ifelse(is.na(hh_id.x), hh_id.y, hh_id.x)) %>%
	select(unfiltered_rowkey, hh_id)

	# Write keys to csv
	final_dt <- address %>%
	select(unfiltered_rowkey, PupilMatchingRefAnonymous_AUT20, postcode_adj) %>%
	left_join(address_with_id, by = "unfiltered_rowkey")

	readr::write_csv(final_dt, path = here::here("aut_hh_v2.csv"), col_names = TRUE)


	# check mean students per household -----------------------------------------------------------------
	# address_by_hh <- address_with_id %>%
	# group_by(hh_id) %>%
	# summarise(n_students = n()) %>%
	# summary(n_students)
No results found