Last active
August 22, 2024 23:36
-
-
Save geotheory/0ba06870950d0fb2a0f26b6343228550 to your computer and use it in GitHub Desktop.
Extract root domains from URLs in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Vectorised workflow to extract the root domains from a list of URLs where possible,
# based on identifying the URL segment immediately to the left of its public suffix.
# Handles domains in use that are also registered public suffixes.
# Also handles domains unmatched in the public suffix list (inc. punycode URLs, and URLs
# of top-level domains that are not registered as public suffixes — notably .bd, .ck,
# .er, .fk, .jm, .kh, .mm, .np, .pg, .za). Unrecognised domains are returned as-is.
# Latest official list of public suffixes, lower-cased with comment
# lines ("// ...") and blank lines removed.
psl_url <- "https://publicsuffix.org/list/public_suffix_list.dat"
psl_raw <- readLines(psl_url)
keep <- !grepl("^(//|\\s*$)", psl_raw)
psl_data <- tolower(psl_raw[keep])
# Extract the root (registrable) domain from each URL, where possible.
#
# Each URL is reduced to its hostname, split into dot-separated segments,
# and the cumulative suffixes (longest first) are matched against the
# public suffix list; the root domain is the candidate immediately to the
# left of the longest matching public suffix.
#
# Args:
#   urls:     character vector of URLs (scheme, port and path are stripped).
#   psl_data: character vector of public suffixes in lower case, e.g. the
#             filtered contents of the publicsuffix.org list.
# Returns:
#   Character vector of root domains, same length as `urls`. Domains that
#   are themselves a registered public suffix are returned whole; domains
#   with no public-suffix match are returned as-is.
get_root_domain <- function(urls, psl_data) {
  domains <- gsub("https?://", "", urls)
  # Handle unusual www prefixes (www, www2, ..., www12); {0,2} is the
  # portable spelling of the TRE-only {,2} bound
  domains <- gsub("^www[0-9]{0,2}\\.", "", domains)
  # Truncate at the first character invalid in a domain (port, path, ...)
  domains <- gsub("[^a-zA-Z0-9.-].*", "", domains)
  segments <- strsplit(domains, "\\.")
  # All cumulative suffixes per domain, ordered longest first,
  # e.g. "a.b.co.uk" -> c("a.b.co.uk", "b.co.uk", "co.uk", "uk")
  combined_segments <- segments |>
    lapply(rev) |>
    lapply(\(segs) Reduce(\(x, y) paste(y, x, sep = "."), segs, accumulate = TRUE)) |>
    lapply(rev)
  # Main process: candidate left of the longest public-suffix match.
  # Index 0 (domain itself is a suffix) yields character(0); an NA index
  # (no match at all) yields NA -- both are repaired below.
  root_doms <- combined_segments |> lapply(\(x) x[which(x %in% psl_data)[1] - 1])
  # Case: the domain in use is itself a registered public suffix
  ps_roots <- lengths(root_doms) == 0
  if (any(ps_roots)) {
    root_doms[ps_roots] <- combined_segments[ps_roots] |>
      lapply(\(x) x[which(x %in% psl_data)[1]])
  }
  # Unmatched: accept the whole domain as the root, as there is no way
  # to identify subdomains without a public-suffix match
  nas <- is.na(root_doms)
  if (any(nas)) {
    root_doms[nas] <- combined_segments[nas] |> vapply(\(x) x[1], character(1))
  }
  unlist(root_doms)
}
## Example usage: each URL below exercises one of the handling paths
urls <- c("https://www.example.com",
          "http://example.org/path",
          "https://subdomain.example.co.uk/path",            # multi-part public suffix
          "http://www2.another.subdomain.example.jp:443/path", # rare www prefix and added port
          "https://www.africa.com/path",       # case where used root domain is a registered public suffix
          "https://thenational.com.pg/path",   # top-level domain unregistered as public suffix
          "https://subdom.thenational.com.pg/path",
          "https://xn--j1aidcn.org/path",      # punycode url that contains a registered public suffix
          "https://xn--b1aga5aadd.xn--p1ai/path", # punycode containing no registered public suffix
          "https://www12.madeup.nonsense/path")   # wholly unrecognised domain, returned as-is
get_root_domain(urls, psl_data)
# Expected output:
# [1] "example.com"               "example.org"               "example.co.uk"
# [4] "example.jp"                "africa.com"                "thenational.com.pg"
# [7] "subdom.thenational.com.pg" "xn--j1aidcn.org"           "xn--b1aga5aadd.xn--p1ai"
# [10] "madeup.nonsense"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment