Created
September 29, 2022 13:54
-
-
Save alekrutkowski/819a1625a41d7040078f6fec22e9c8d2 to your computer and use it in GitHub Desktop.
R function: email address ➜ institution name, quick and dirty (just the 1st hit in Bing)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Usage example (disclaimer: fictitious email addresses, any resemblance to real ones is coincidental): | |
| ## emailAddressToInstitutionName(c('[email protected]', | |
| ## '[email protected]')) | |
| ##> "Ministère du Travail, du Plein emploi et de l'Insertion" | |
| ##> "Ministerie van Sociale Zaken en Werkgelegenheid | Rijksoverheid.nl" | |
| # Required packages: `magrittr`, `rvest` | |
| library(magrittr) # for the %>% operator | |
#' Email address -> institution name, quick and dirty (1st matching hit in Bing)
#'
#' For each address, takes the domain (everything after the last `@`), runs a
#' Bing search for that domain and returns the text of the first result link
#' matched by the CSS selector used below.
#'
#' @param charvec_of_email_addresses Non-empty character vector. Every element
#'   must have the form `<non-whitespace>@<non-whitespace>`; `NA` is rejected.
#' @param wait_time_before_retry Single positive number: seconds to sleep
#'   before re-querying a domain whose result page had no matching link.
#' @param max_retries Single non-negative number: how many additional attempts
#'   are made per domain after the first one. Use `Inf` to retry indefinitely.
#' @return An unnamed character vector of the same length as the input.
#'   Elements whose domain produced no matching link within the allowed
#'   attempts are `NA_character_` (a warning is raised for each such domain).
#'   Errors raised by `rvest::read_html()` (e.g. network failures) are not
#'   caught and propagate to the caller.
emailAddressToInstitutionName <- function(charvec_of_email_addresses,
                                          wait_time_before_retry = 3, # in seconds
                                          max_retries = 10) {
  # All validation happens before any network access.
  stopifnot(is.character(charvec_of_email_addresses),
            length(charvec_of_email_addresses) > 0,
            all(grepl('^\\S+@\\S+$', charvec_of_email_addresses)),
            is.numeric(wait_time_before_retry),
            length(wait_time_before_retry) == 1,
            wait_time_before_retry > 0,
            is.numeric(max_retries),
            length(max_retries) == 1,
            max_retries >= 0)

  # Fetch the first matching result title for one domain, retrying a bounded
  # number of times. A loop is used instead of recursion so that a domain that
  # never matches cannot exhaust R's expression-nesting limit.
  first_hit_title <- function(domain) {
    # The validation regex only excludes whitespace, so the domain may contain
    # characters that are reserved in a query string; encode them.
    url <- paste0('https://www.bing.com/search?q=',
                  utils::URLencode(domain, reserved = TRUE))
    retries_done <- 0
    repeat {
      page <- rvest::read_html(url)
      links <- rvest::html_nodes(
        page, '#b_results > li.b_algo.b_vtl_deeplinks > h2 > a'
      )
      titles <- rvest::html_text(links)
      if (length(titles) > 0) {
        return(titles[[1]]) # only the 1st hit is wanted
      }
      if (retries_done >= max_retries) {
        warning('No matching Bing result for domain "', domain, '" after ',
                retries_done + 1, ' attempt(s); returning NA.',
                call. = FALSE)
        return(NA_character_)
      }
      retries_done <- retries_done + 1
      Sys.sleep(wait_time_before_retry)
    }
  }

  # Greedy `.+@` keeps only what follows the last `@`.
  domains <- sub('.+@(.+)', '\\1', charvec_of_email_addresses)

  # Query each distinct domain once, then map the answers back to the inputs.
  unique_domains <- unique(domains)
  unique_titles <- vapply(unique_domains, first_hit_title, character(1),
                          USE.NAMES = FALSE)
  unique_titles[match(domains, unique_domains)]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment