Last active
August 22, 2024 23:36
-
-
Save geotheory/0ba06870950d0fb2a0f26b6343228550 to your computer and use it in GitHub Desktop.
Extract root domains from URLs in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Vectorised workflow to extract the root domains from a list of URLs where possible,
# based on identifying the URL segment immediately to the left of its public suffix.
# Handles domains in use that are also registered public suffixes.
# Also handles domains unmatched in the public suffix list (inc. punycode URLs, and URLs
# of top-level domains that are not registered as public suffixes — notably .bd, .ck,
# .er, .fk, .jm, .kh, .mm, .np, .pg, .za). Unrecognised domains are returned as-is.
# Latest official list of public suffixes, lower-cased with comment
# lines ("// ...") and blank lines removed.
psl_url <- "https://publicsuffix.org/list/public_suffix_list.dat"
psl_raw <- readLines(psl_url)
keep <- !grepl("^(//|\\s*$)", psl_raw)
psl_data <- tolower(psl_raw[keep])
# Extract the root (registrable) domain from each URL, where possible.
#
# Each URL is reduced to its hostname, split into dot-separated segments,
# and the cumulative suffixes (longest first) are matched against the
# public suffix list; the root domain is the candidate immediately to the
# left of the longest matching public suffix.
#
# Args:
#   urls:     character vector of URLs (scheme, port and path are stripped).
#   psl_data: character vector of public suffixes in lower case, e.g. the
#             filtered contents of the publicsuffix.org list.
# Returns:
#   Character vector of root domains, same length as `urls`. Domains that
#   are themselves a registered public suffix are returned whole; domains
#   with no public-suffix match are returned as-is.
get_root_domain <- function(urls, psl_data) {
  domains <- gsub("https?://", "", urls)
  # Handle unusual www prefixes (www, www2, ..., www12); {0,2} is the
  # portable spelling of the TRE-only {,2} bound
  domains <- gsub("^www[0-9]{0,2}\\.", "", domains)
  # Truncate at the first character invalid in a domain (port, path, ...)
  domains <- gsub("[^a-zA-Z0-9.-].*", "", domains)
  segments <- strsplit(domains, "\\.")
  # All cumulative suffixes per domain, ordered longest first,
  # e.g. "a.b.co.uk" -> c("a.b.co.uk", "b.co.uk", "co.uk", "uk")
  combined_segments <- segments |>
    lapply(rev) |>
    lapply(\(segs) Reduce(\(x, y) paste(y, x, sep = "."), segs, accumulate = TRUE)) |>
    lapply(rev)
  # Main process: candidate left of the longest public-suffix match.
  # Index 0 (domain itself is a suffix) yields character(0); an NA index
  # (no match at all) yields NA -- both are repaired below.
  root_doms <- combined_segments |> lapply(\(x) x[which(x %in% psl_data)[1] - 1])
  # Case: the domain in use is itself a registered public suffix
  ps_roots <- lengths(root_doms) == 0
  if (any(ps_roots)) {
    root_doms[ps_roots] <- combined_segments[ps_roots] |>
      lapply(\(x) x[which(x %in% psl_data)[1]])
  }
  # Unmatched: accept the whole domain as the root, as there is no way
  # to identify subdomains without a public-suffix match
  nas <- is.na(root_doms)
  if (any(nas)) {
    root_doms[nas] <- combined_segments[nas] |> vapply(\(x) x[1], character(1))
  }
  unlist(root_doms)
}
## Example usage: each URL below exercises one of the handling paths
urls <- c("https://www.example.com",
          "http://example.org/path",
          "https://subdomain.example.co.uk/path",            # multi-part public suffix
          "http://www2.another.subdomain.example.jp:443/path", # rare www prefix and added port
          "https://www.africa.com/path",       # case where used root domain is a registered public suffix
          "https://thenational.com.pg/path",   # top-level domain unregistered as public suffix
          "https://subdom.thenational.com.pg/path",
          "https://xn--j1aidcn.org/path",      # punycode url that contains a registered public suffix
          "https://xn--b1aga5aadd.xn--p1ai/path", # punycode containing no registered public suffix
          "https://www12.madeup.nonsense/path")   # wholly unrecognised domain, returned as-is
get_root_domain(urls, psl_data)
# Expected output:
# [1] "example.com"               "example.org"               "example.co.uk"
# [4] "example.jp"                "africa.com"                "thenational.com.pg"
# [7] "subdom.thenational.com.pg" "xn--j1aidcn.org"           "xn--b1aga5aadd.xn--p1ai"
# [10] "madeup.nonsense"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment