For example, this request to the CDC Atlas endpoint:
library(httr)
library(tibble)
library(tidyr)
library(dplyr)
library(janitor)
# Browser-like request headers attached to the POST request below.
# NOTE(review): `Accept-Encoding` advertises `br`; this presumably relies on
# the local libcurl being able to decode Brotli responses -- confirm.
headers <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
  "Accept" = "*/*",
  "Accept-Language" = "en-US,en;q=0.5",
  "Accept-Encoding" = "gzip, deflate, br",
  "Content-Type" = "application/json",
  "Origin" = "https://nccd.cdc.gov",
  "Connection" = "keep-alive",
  "Referer" = "https://nccd.cdc.gov/",
  "Sec-Fetch-Dest" = "empty",
  "Sec-Fetch-Mode" = "cors",
  "Sec-Fetch-Site" = "same-site",
  "TE" = "trailers"
)
# JSON request body for the POST request below. glue's delimiters are switched
# to `<<<` / `>>>` so the JSON's own braces are kept literally; the template
# contains no `<<<...>>>` placeholders, so nothing is interpolated.
d <- glue::glue(
  '
{
"ThemeId": 29,
"ThemeFilterOptions": [
{"FilterId": 9, "FilterOptionId": "1"},
{"FilterId": 2, "FilterOptionId": "1"},
{"FilterId": 3, "FilterOptionId": "1"},
{"FilterId": 4, "FilterOptionId": "1"},
{"FilterId": 7, "FilterOptionId": "1"}
]
}',
  .close = ">>>",
  .open = "<<<"
)
# POST the JSON payload `d` to the `report/national` endpoint, sending the
# custom request headers defined above.
resp <- POST(
  url = "https://nccd-proxy.services.cdc.gov/DHDSP_ATLAS/report/national",
  add_headers(.headers = headers),
  body = d
)
# Turn a 4xx/5xx reply into an R error here, instead of letting an error
# payload flow into the parsing step and fail there with an unrelated message.
stop_for_status(resp)
# Parse the response body, then turn its `Data` element into a tibble:
# one row per element, one column per field, with snake_case column names.
cont <- content(resp)
enframe(cont$Data, name = NULL) |>
  unnest_wider(value) |>
  clean_names()
#> # A tibble: 6 × 2
#> national_value race_name
#> <dbl> <chr>
#> 1 422. All Races/Ethnicities
#> 2 572. Black (Non-Hispanic)
#> 3 426. White (Non-Hispanic)
#> 4 316. Hispanic
#> 5 370. American Indian and Alaskan Native
#> 6 253. Asian and Pacific Islander
library(httr)
library(rvest)
library(purrr)
# Cookies sent with the GET request below.
# NOTE(review): these look like values captured from a single browser session
# (e.g. `ASP.NET_SessionId`); presumably they expire -- confirm whether the
# request still works with fresh values or without them.
cookies <- c(
  "ASP.NET_SessionId" = "0lfkmqevy1fymmcucrhip0tu",
  "TS01ae363a" = "015d0abe875df1ee3644d53e0c7025c28236ce4c0bf3cb4f2df5b44dc7ba48945e400ebeebc6c48215ce3bd84cba71c554def7c9e2428264e01720bc67091cd95231959a78",
  "s_fid" = "0A331B0A56C1652E-3937C02126313DE7",
  "s_vnum" = "1669874400052%26vn%3D2",
  "s_lv" = "1668899711399",
  "s_ppvl" = "Interactive%2520Atlas%2520of%2520Heart%2520Disease%2520and%2520Stroke%2520Tables%2C41%2C41%2C1209%2C1718%2C767%2C3440%2C1440%2C1%2CL",
  "s_ppv" = "Interactive%2520Atlas%2520of%2520Heart%2520Disease%2520and%2520Stroke%2520Tables%2C29%2C39%2C1154%2C1718%2C848%2C3440%2C1440%2C1%2CL",
  "s_ptc" = "0.04%5E%5E0.00%5E%5E0.00%5E%5E0.00%5E%5E0.09%5E%5E0.01%5E%5E0.89%5E%5E0.02%5E%5E1.05",
  "s_cc" = "true",
  "_ga" = "GA1.2.1503614998.1668874414",
  "_gid" = "GA1.2.434784005.1668874414",
  "s_tps" = "151",
  "s_pvs" = "1130",
  "s_ria" = "Flash%20Not%20Detected%7C",
  "gpv_c54" = "https%3A%2F%2Fnccd.cdc.gov%2FDHDSPAtlas%2FReports.aspx",
  "s_invisit" = "true",
  "s_lv_s" = "Less%20than%201%20day",
  "s_visit" = "1",
  "gpv_v45" = "Interactive%20Atlas%20of%20Heart%20Disease%20and%20Stroke%20Tables"
)
# Browser-like request headers for the GET request below; `X-Requested-With`
# marks the call as an XMLHttpRequest.
# NOTE(review): `Accept-Encoding` advertises `br`; this presumably relies on
# the local libcurl being able to decode Brotli responses -- confirm.
headers <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
  "Accept" = "*/*",
  "Accept-Language" = "en-US,en;q=0.5",
  "Accept-Encoding" = "gzip, deflate, br",
  "Content-Type" = "application/json",
  "X-Requested-With" = "XMLHttpRequest",
  "Connection" = "keep-alive",
  "Referer" = "https://nccd.cdc.gov/DHDSPAtlas/Reports.aspx",
  "Sec-Fetch-Dest" = "empty",
  "Sec-Fetch-Mode" = "cors",
  "Sec-Fetch-Site" = "same-origin"
)
# GET the thematic data handler. The part of the URL after `?` is a
# percent-encoded JSON object (`%22` is a double quote) carrying the theme id,
# filter options and geography type; it is passed verbatim as part of the URL.
resp <- httr::GET(
  url = "https://nccd.cdc.gov/DHDSPAtlas/ThematicDataHandler.ashx?{%22CountyVersion%22:%222018%22,%22ThemeId%22:14,%22ThemeFilterOptions%22:[{%22FilterId%22:9,%22FilterOptionId%22:%223%22},{%22FilterId%22:2,%22FilterOptionId%22:%221%22},{%22FilterId%22:3,%22FilterOptionId%22:%221%22},{%22FilterId%22:4,%22FilterOptionId%22:%221%22},{%22FilterId%22:7,%22FilterOptionId%22:%222%22}],%22GeographyType%22:%22state%22,%22ParentGeographyFilter%22:%22%22}",
  httr::add_headers(.headers = headers),
  httr::set_cookies(.cookies = cookies)
)
# Turn a 4xx/5xx reply into an R error here, rather than handing an error
# page to the HTML-table parsing step.
httr::stop_for_status(resp)
# Take the body as plain text instead of relying on content()'s MIME-based
# auto-parsing: when the server labels the reply `text/html`, the auto-parsed
# result is already an xml_document, which read_html() cannot accept. As text,
# the same pipeline works whichever content type the server reports.
cont <- content(resp, as = "text")
# Parse the HTML and keep the first table found in it.
cont |>
  read_html() |>
  html_table() |>
  pluck(1)
For example, the quick profile for Texas:
library(rvest)
library(purrr)
# Quick-profile page for Texas.
url <- "https://statecancerprofiles.cancer.gov/quick-profiles/index.php?statename=texas"
# Download the page and parse every HTML table on it into a list of tibbles.
all_tables <- html_table(read_html(url))
# Pick whichever table is needed by position; here, the first one.
pluck(all_tables, 1)
#> # A tibble: 23 × 9
#> Age-Adjusted In…¹ Texas…² USA R…³ Map Table CI*Ra…⁴ Rate …⁵ Histo…⁶ 5-Yea…⁷
#> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 All Cancer Sites 415. 449. Map Table Rank G… Rate G… Histor… 5-Year…
#> 2 Bladder 14.8 19.4 Map Table Rank G… Rate G… Histor… 5-Year…
#> 3 Brain & ONS 6.1 6.4 Map Table Rank G… Rate G… Histor… 5-Year…
#> 4 Breast (Female) 117 128. Map Table Rank G… Rate G… Histor… 5-Year…
#> 5 Breast (in situ)… 22.4 29.4 Map Table Rank G… Rate G… Histor… 5-Year…
#> 6 Cervix (Female) 9.4 7.7 Map Table Rank G… Rate G… Histor… 5-Year…
#> 7 Childhood (Ages … 17.9 17.3 Map Table Rank G… Rate G… Histor… N/A
#> 8 Childhood (Ages … 19.3 19 Map Table Rank G… Rate G… Histor… N/A
#> 9 Colon & Rectum 38 37.7 Map Table Rank G… Rate G… Histor… 5-Year…
#> 10 Esophagus 4 4.6 Map Table Rank G… Rate G… Histor… 5-Year…
#> # … with 13 more rows, and abbreviated variable names
#> # ¹`Age-Adjusted Incidence Rates by Cancer Site, All Stages (2015-2019)`,
#> # ²`Texas Rate`, ³`USA Rate`, ⁴`CI*Rank Graph`, ⁵`Rate Graph`,
#> # ⁶`Historical Trends Graph`, ⁷`5-Year Rate Change Graph`
# One quick-profile URL per US state: lower-case each entry of the built-in
# `state.name` vector, remove its spaces, and append it as the `statename`
# query value.
all_urls <- paste0(
  "https://statecancerprofiles.cancer.gov/quick-profiles/index.php?statename=",
  gsub(" ", "", tolower(state.name), fixed = TRUE)
)
library(httr)
library(rvest)
library(purrr)
# Cookies sent with the GET request below. `_ga` and `_gid` each appear twice
# with different values; both entries are kept in the vector.
# NOTE(review): these look like values captured from a single browser session;
# presumably they go stale -- confirm whether the request needs them at all.
cookies <- c(
  "SERVERID" = "web-dmzst-02",
  "_ga_XGHGG2QGBZ" = "GS1.1.1668899965.2.1.1668900305.0.0.0",
  "_ga" = "GA1.2.1276294231.1668874894",
  "s_cc" = "true",
  "s_fid" = "28212AD53953279A-002BE259493E9BE0",
  "s_ppv" = "98%7C0",
  "s_sq" = "ncienterprise%3D%2526pid%253Dstatecancerprofiles.cancer.gov%25252Frisk%25252Findex.php%2526pidt%253D1%2526oid%253Dfunctiononclick%252528event%252529%25257BsubmitForm%252528%252529%25253B%25257D%2526oidt%253D2%2526ot%253DBUTTON",
  "_gid" = "GA1.2.1329675980.1668874895",
  "_ga" = "GA1.3.1276294231.1668874894",
  "_gid" = "GA1.3.1329675980.1668874895",
  "s_vi" = "[CS]v1|31BC8147497EB470-60001F36C34E43E6[CE]",
  "gpv_pn" = "statecancerprofiles.cancer.gov%2Frisk%2Findex.php",
  "_gat_gtag_UA_112281461_1" = "1",
  "_gat_GSA_ENOR0" = "1"
)
# Browser-like request headers for the GET request below, as a page
# navigation coming from the `risk/index.php` page (see `Referer`).
# NOTE(review): `Accept-Encoding` advertises `br`; this presumably relies on
# the local libcurl being able to decode Brotli responses -- confirm.
headers <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "Accept-Language" = "en-US,en;q=0.5",
  "Accept-Encoding" = "gzip, deflate, br",
  "Connection" = "keep-alive",
  "Referer" = "https://statecancerprofiles.cancer.gov/risk/index.php",
  "Upgrade-Insecure-Requests" = "1",
  "Sec-Fetch-Dest" = "document",
  "Sec-Fetch-Mode" = "navigate",
  "Sec-Fetch-Site" = "same-origin",
  "Sec-Fetch-User" = "?1"
)
# Query-string parameters for the GET request below; every value is kept as a
# string so codes such as "00" keep their leading zeros.
params <- list(
  "topic" = "colorec",
  "risk" = "v09",
  "race" = "00",
  "sex" = "0",
  "datatype" = "0",
  "type" = "risk",
  "sortVariableName" = "default",
  "sortOrder" = "default"
)
# GET the risk page with `params` as the query string, sending the custom
# headers and cookies defined above.
resp <- httr::GET(
  url = "https://statecancerprofiles.cancer.gov/risk/index.php#results",
  httr::add_headers(.headers = headers),
  query = params,
  httr::set_cookies(.cookies = cookies)
)
# Turn a 4xx/5xx reply into an R error here, rather than handing an error
# page to the HTML-table parsing step.
httr::stop_for_status(resp)
# Parse the response body, extract its HTML tables and show the first one.
cont <- content(resp)
pluck(html_table(cont), 1)
library(httr)
library(rvest)
library(purrr)
library(dplyr)
library(stringr)
# Cookies sent with the GET request below. `_ga` and `_gid` each appear twice
# with different values; both entries are kept in the vector.
# NOTE(review): these look like values captured from a single browser session;
# presumably they go stale -- confirm whether the request needs them at all.
cookies <- c(
  "SERVERID" = "web-dmzst-02",
  "_ga_XGHGG2QGBZ" = "GS1.1.1668899965.2.1.1668900521.0.0.0",
  "_ga" = "GA1.2.1276294231.1668874894",
  "s_cc" = "true",
  "s_fid" = "28212AD53953279A-002BE259493E9BE0",
  "s_ppv" = "95%7C20",
  "s_sq" = "ncienterprise%3D%2526pid%253Dstatecancerprofiles.cancer.gov%25252Fincidencerates%25252Findex.php%2526pidt%253D1%2526oid%253Dfunctiononclick%252528event%252529%25257BsubmitForm%252528%252529%25253B%25257D%2526oidt%253D2%2526ot%253DBUTTON",
  "_gid" = "GA1.2.1329675980.1668874895",
  "_ga" = "GA1.3.1276294231.1668874894",
  "_gid" = "GA1.3.1329675980.1668874895",
  "s_vi" = "[CS]v1|31BC8147497EB470-60001F36C34E43E6[CE]",
  "gpv_pn" = "statecancerprofiles.cancer.gov%2Fincidencerates%2Findex.php",
  "_gat_gtag_UA_112281461_1" = "1",
  "_gat_GSA_ENOR0" = "1"
)
# Browser-like request headers for the GET request below, as a page
# navigation coming from the `incidencerates/index.php` page (see `Referer`).
# NOTE(review): `Accept-Encoding` advertises `br`; this presumably relies on
# the local libcurl being able to decode Brotli responses -- confirm.
headers <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "Accept-Language" = "en-US,en;q=0.5",
  "Accept-Encoding" = "gzip, deflate, br",
  "Connection" = "keep-alive",
  "Referer" = "https://statecancerprofiles.cancer.gov/incidencerates/index.php",
  "Upgrade-Insecure-Requests" = "1",
  "Sec-Fetch-Dest" = "document",
  "Sec-Fetch-Mode" = "navigate",
  "Sec-Fetch-Site" = "same-origin",
  "Sec-Fetch-User" = "?1",
  "TE" = "trailers"
)
# Query-string parameters for the GET request below; every value is kept as a
# string so codes such as "00" and "001" keep their leading zeros.
params <- list(
  "stateFIPS" = "00",
  "areatype" = "state",
  "cancer" = "001",
  "race" = "00",
  "sex" = "0",
  "age" = "001",
  "stage" = "999",
  "year" = "0",
  "type" = "incd",
  "sortVariableName" = "rate",
  "sortOrder" = "default",
  "output" = "0"
)
# GET the incidence-rates page with `params` as the query string, sending the
# custom headers and cookies defined above.
resp <- httr::GET(
  url = "https://statecancerprofiles.cancer.gov/incidencerates/index.php#results",
  httr::add_headers(.headers = headers),
  query = params,
  httr::set_cookies(.cookies = cookies)
)
# Turn a 4xx/5xx reply into an R error here, rather than handing an error
# page to the HTML-table parsing step.
httr::stop_for_status(resp)
# Parse the response body, extract its HTML tables and keep the first one.
tb <- pluck(html_table(content(resp)), 1)
# Rewrite `State`: where a value has the form <prefix>-<middle>-<suffix>, keep
# only the text captured by the second group; values that do not match the
# pattern are left unchanged.
mutate(
  tb,
  State = State |>
    str_replace_all(pattern = "(^.*)\\-(.*)-(.*$)", replacement = "\\2")
)