Skip to content

Instantly share code, notes, and snippets.

@benjaminrobinson
Created November 3, 2021 02:25
Show Gist options
  • Save benjaminrobinson/7e78dd8c3c4b88d6b922d4d447f99f1e to your computer and use it in GitHub Desktop.
Save benjaminrobinson/7e78dd8c3c4b88d6b922d4d447f99f1e to your computer and use it in GitHub Desktop.
library(tidyverse)
library(rvest)
#' Scrape one page of the NFLPA agent search results.
#'
#' @param x Page number, appended to the `page` query parameter of the
#'   agent search URL.
#' @return A tibble with one row per table row on the page. Column names are
#'   converted to snake_case, the `full_profile` column is dropped, every
#'   column is whitespace-squished character data with empty strings turned
#'   into `NA`, and an `agent_id` column (taken from the link in each row's
#'   first cell) is added. Rows are sorted by `agent_id`.
#' @details Signals an error when the page has no `Name` column or every
#'   `Name` is `NA`, so callers iterating over pages can detect the end.
getAgentList <- function(x) {
  message(paste("Scraping Page", x))
  web <- read_html(paste0('https://nflpa.com/search/agents?page=', x))

  # Parse the page's tables once; the result feeds both the emptiness check
  # and the returned data.
  listing <- web %>%
    html_table() %>%
    bind_rows()

  # A page without a table yields a tibble with no `Name` column at all;
  # treat that the same as a table whose names are all missing.
  if (!"Name" %in% names(listing) || all(is.na(listing[["Name"]]))) {
    stop("\n Nothing to see here.")
  }

  # Tidyverse-only snake_case conversion ("Full Profile" -> "full_profile").
  # The file loads only tidyverse and rvest, so no extra package is required
  # for the renaming step.
  to_snake_case <- function(nms) {
    cleaned <- nms %>%
      str_to_lower() %>%
      str_replace_all("[^a-z0-9]+", "_") %>%
      str_remove_all("^_+|_+$")
    # Header cells with no usable text would otherwise give empty or
    # duplicated column names, which tibbles reject.
    cleaned[cleaned == ""] <- "x"
    make.unique(cleaned, sep = "_")
  }

  # The agent id is the tail of the profile link in each row's first cell.
  agent_ids <- web %>%
    html_nodes('table') %>%
    html_nodes(xpath = ".//td[1]/a") %>%
    html_attr("href") %>%
    gsub("/profile/agent/", "", .)

  listing %>%
    rename_with(to_snake_case) %>%
    select(-full_profile) %>%
    mutate(agent_id = agent_ids) %>%
    as_tibble() %>%
    # Normalise whitespace, then turn empty cells into NA.
    mutate(across(everything(), ~ na_if(str_squish(.x), ""))) %>%
    arrange(agent_id)
}
#' Scrape a single agent's NFLPA profile page.
#'
#' @param x Agent id, appended to `https://nflpa.com/profile/agent/`.
#' @return A one-row tibble with columns `agent_id`, `name`, `company`,
#'   `certified_since`, `phone_number`, `address`, `website`, `school` and
#'   `services`. `name`, `company`, `certified_since`, `phone_number`,
#'   `address` and `website` are `NA` when the corresponding page element is
#'   not found; `school` and `services` are comma-separated strings (empty
#'   when nothing matches).
getAgentInfo <- function(x) {
  message(paste("Scraping Agent Page", x))
  web <- read_html(paste0('https://nflpa.com/profile/agent/', x))

  # Whitespace-squished text of every node matching a CSS selector.
  node_text <- function(css) {
    web %>%
      html_nodes(css) %>%
      html_text() %>%
      str_squish()
  }

  # The icon list is read once: its first entry holds the address followed
  # by the phone number, its second entry the website.
  icon_text <- node_text('.profile__icon-list')
  contact <- icon_text[1]

  # Only build a phone number when the contact line really contains one;
  # otherwise the result would be the literal "(NA" or "(" + the address.
  has_phone <- !is.na(contact) && grepl("(", contact, fixed = TRUE)

  tibble(
    agent_id = x,
    # `[1]` gives the first match, or NA when there is none, so a missing
    # element can never shrink the result to zero rows.
    name = node_text('.profile__header-name')[1],
    company = node_text('.profile__company')[1],
    certified_since = node_text('.items-center') %>%
      .[. != 'Accept and Close'] %>%
      gsub("NFLPA Certified since ", "", .) %>%
      as.numeric() %>%
      .[1],
    phone_number = if (has_phone) {
      # Keep everything after the last "(" and restore the parenthesis.
      paste0("(", sub(".*[(]", "", contact))
    } else {
      NA_character_
    },
    # Everything before the " (" that starts the phone number.
    address = sub(" [(].*", "", contact),
    website = icon_text[2],
    school = node_text('.flex') %>%
      .[. != 'Accept and Close'
        & . != 'Search NFLPA.com'
        & !grepl("Certified", .)] %>%
      unique() %>%
      paste(collapse = ', '),
    services = web %>%
      html_nodes('div') %>%
      html_children() %>%
      html_nodes("li") %>%
      html_text() %>%
      str_squish() %>%
      .[!grepl("http", .)] %>%
      paste(collapse = ', ') %>%
      # NOTE(review): `[instgram]` is a character class, so this cuts the
      # string at the first ", " followed by any ONE of the letters
      # i/n/s/t/g/r/a/m. Presumably meant to drop trailing social-media
      # entries -- confirm the intended pattern before changing it.
      gsub(", [instgram].*", "", .) %>%
      # Drop everything up to and including a trailing "ddd-dddd, " phone
      # fragment.
      sub(".*[0-9][0-9][0-9][-][0-9][0-9][0-9][0-9], ", "", .)
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment