Skip to content

Instantly share code, notes, and snippets.

@tonyelhabr
Last active November 19, 2022 23:38
Show Gist options
  • Save tonyelhabr/f95dc800505e718661ef42191ff54a2b to your computer and use it in GitHub Desktop.
Save tonyelhabr/f95dc800505e718661ef42191ff54a2b to your computer and use it in GitHub Desktop.
cdc get rekt

Heart Disease and Stroke

For example, this request fetches the national values by race/ethnicity.

library(httr)
library(tibble)
library(tidyr)
library(dplyr)
library(janitor)

# Fetch the national report from the CDC Heart Disease and Stroke atlas proxy
# and flatten the returned records into a tibble with snake_case column names.

# Request headers mimicking a Firefox session on nccd.cdc.gov.
# NOTE(review): `Accept-Encoding` advertises brotli ("br"); confirm the local
# libcurl build can decode it, otherwise drop that entry.
headers <- c(
  `User-Agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
  `Accept` = "*/*",
  `Accept-Language` = "en-US,en;q=0.5",
  `Accept-Encoding` = "gzip, deflate, br",
  `Content-Type` = "application/json",
  `Origin` = "https://nccd.cdc.gov",
  `Connection` = "keep-alive",
  `Referer` = "https://nccd.cdc.gov/",
  `Sec-Fetch-Dest` = "empty",
  `Sec-Fetch-Mode` = "cors",
  `Sec-Fetch-Site` = "same-site",
  `TE` = "trailers"
)

# JSON request body. glue's delimiters are switched to <<< and >>> so that the
# literal braces of the JSON are not treated as interpolation markers; nothing
# is interpolated at the moment.
# NOTE(review): the meaning of each ThemeId / FilterId / FilterOptionId is not
# visible here -- look them up in the atlas UI before changing them.
d <- glue::glue(
  '
  {
    "ThemeId": 29,
    "ThemeFilterOptions": [
      {"FilterId": 9, "FilterOptionId": "1"},
      {"FilterId": 2, "FilterOptionId": "1"},
      {"FilterId": 3, "FilterOptionId": "1"},
      {"FilterId": 4, "FilterOptionId": "1"},
      {"FilterId": 7, "FilterOptionId": "1"}
    ]
  }',
  .open = "<<<",
  .close = ">>>"
)

resp <- POST(
  url = "https://nccd-proxy.services.cdc.gov/DHDSP_ATLAS/report/national",
  add_headers(.headers = headers),
  body = d
)
# Fail loudly on a 4xx/5xx reply instead of feeding an error payload into the
# tidying pipeline below.
stop_for_status(resp)
cont <- content(resp)

# `cont$Data` is handled as a list of records: enframe() puts them in a `value`
# list-column, unnest_wider() spreads each record's fields into columns, and
# clean_names() converts the column names to snake_case.
cont$Data |>
  enframe() |>
  select(value) |>
  unnest_wider(value) |>
  clean_names()
  #> # A tibble: 6 × 2
#>   national_value race_name                         
#>            <dbl> <chr>                             
#> 1           422. All Races/Ethnicities             
#> 2           572. Black (Non-Hispanic)              
#> 3           426. White (Non-Hispanic)              
#> 4           316. Hispanic                          
#> 5           370. American Indian and Alaskan Native
#> 6           253. Asian and Pacific Islander

Heart Disease and Stroke State Table (NEW)

library(httr)
library(rvest)
library(purrr)

# Fetch the state-level table from the CDC atlas "ThematicDataHandler" endpoint
# and return the first HTML table found in the reply.

# Cookies copied from a browser session.
# NOTE(review): these look session-specific (e.g. ASP.NET_SessionId) and will
# presumably expire -- refresh them from the browser if the request stops
# working.
cookies <- c(
  "ASP.NET_SessionId" = "0lfkmqevy1fymmcucrhip0tu",
  "TS01ae363a" = "015d0abe875df1ee3644d53e0c7025c28236ce4c0bf3cb4f2df5b44dc7ba48945e400ebeebc6c48215ce3bd84cba71c554def7c9e2428264e01720bc67091cd95231959a78",
  "s_fid" = "0A331B0A56C1652E-3937C02126313DE7",
  "s_vnum" = "1669874400052%26vn%3D2",
  "s_lv" = "1668899711399",
  "s_ppvl" = "Interactive%2520Atlas%2520of%2520Heart%2520Disease%2520and%2520Stroke%2520Tables%2C41%2C41%2C1209%2C1718%2C767%2C3440%2C1440%2C1%2CL",
  "s_ppv" = "Interactive%2520Atlas%2520of%2520Heart%2520Disease%2520and%2520Stroke%2520Tables%2C29%2C39%2C1154%2C1718%2C848%2C3440%2C1440%2C1%2CL",
  "s_ptc" = "0.04%5E%5E0.00%5E%5E0.00%5E%5E0.00%5E%5E0.09%5E%5E0.01%5E%5E0.89%5E%5E0.02%5E%5E1.05",
  "s_cc" = "true",
  "_ga" = "GA1.2.1503614998.1668874414",
  "_gid" = "GA1.2.434784005.1668874414",
  "s_tps" = "151",
  "s_pvs" = "1130",
  "s_ria" = "Flash%20Not%20Detected%7C",
  "gpv_c54" = "https%3A%2F%2Fnccd.cdc.gov%2FDHDSPAtlas%2FReports.aspx",
  "s_invisit" = "true",
  "s_lv_s" = "Less%20than%201%20day",
  "s_visit" = "1",
  "gpv_v45" = "Interactive%20Atlas%20of%20Heart%20Disease%20and%20Stroke%20Tables"
)

# Request headers mimicking the XHR call made by the Reports.aspx page.
headers <- c(
  `User-Agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
  `Accept` = "*/*",
  `Accept-Language` = "en-US,en;q=0.5",
  `Accept-Encoding` = "gzip, deflate, br",
  `Content-Type` = "application/json",
  `X-Requested-With` = "XMLHttpRequest",
  `Connection` = "keep-alive",
  `Referer` = "https://nccd.cdc.gov/DHDSPAtlas/Reports.aspx",
  `Sec-Fetch-Dest` = "empty",
  `Sec-Fetch-Mode` = "cors",
  `Sec-Fetch-Site` = "same-origin"
)

# The query string is a JSON object whose double quotes are percent-encoded as
# %22. The literal is only split across lines for readability; the pieces
# concatenate to exactly the URL captured from the browser.
resp <- httr::GET(
  url = paste0(
    "https://nccd.cdc.gov/DHDSPAtlas/ThematicDataHandler.ashx?",
    "{%22CountyVersion%22:%222018%22,%22ThemeId%22:14,",
    "%22ThemeFilterOptions%22:[",
    "{%22FilterId%22:9,%22FilterOptionId%22:%223%22},",
    "{%22FilterId%22:2,%22FilterOptionId%22:%221%22},",
    "{%22FilterId%22:3,%22FilterOptionId%22:%221%22},",
    "{%22FilterId%22:4,%22FilterOptionId%22:%221%22},",
    "{%22FilterId%22:7,%22FilterOptionId%22:%222%22}",
    "],%22GeographyType%22:%22state%22,%22ParentGeographyFilter%22:%22%22}"
  ),
  httr::add_headers(.headers = headers),
  httr::set_cookies(.cookies = cookies)
)
# Stop on a 4xx/5xx reply rather than trying to scrape an error page.
httr::stop_for_status(resp)

# Ask for the body as text explicitly instead of relying on httr's MIME-type
# auto-parsing: read_html() below needs raw markup, not an already-parsed
# document.
cont <- content(resp, as = "text")

# Parse the markup, convert every <table> to a tibble, and keep the first one.
cont |>
  read_html() |>
  html_table() |>
  pluck(1)

State Cancer Profiles

For example, the quick-profile tables for Texas:

library(rvest)
library(purrr)
# Quick-profile page for one state; change the `statename` value for others.
url <- "https://statecancerprofiles.cancer.gov/quick-profiles/index.php?statename=texas"
# Download the page and convert every <table> on it into a list of tables.
all_tables <- html_table(read_html(url))
# Select a table by position -- the first one is shown here.
pluck(all_tables, 1)
#> # A tibble: 23 × 9
#>    Age-Adjusted In…¹ Texas…² USA R…³ Map   Table CI*Ra…⁴ Rate …⁵ Histo…⁶ 5-Yea…⁷
#>    <chr>               <dbl>   <dbl> <chr> <chr> <chr>   <chr>   <chr>   <chr>  
#>  1 All Cancer Sites    415.    449.  Map   Table Rank G… Rate G… Histor… 5-Year…
#>  2 Bladder              14.8    19.4 Map   Table Rank G… Rate G… Histor… 5-Year…
#>  3 Brain & ONS           6.1     6.4 Map   Table Rank G… Rate G… Histor… 5-Year…
#>  4 Breast (Female)     117     128.  Map   Table Rank G… Rate G… Histor… 5-Year…
#>  5 Breast (in situ)…    22.4    29.4 Map   Table Rank G… Rate G… Histor… 5-Year…
#>  6 Cervix (Female)       9.4     7.7 Map   Table Rank G… Rate G… Histor… 5-Year…
#>  7 Childhood (Ages …    17.9    17.3 Map   Table Rank G… Rate G… Histor… N/A    
#>  8 Childhood (Ages …    19.3    19   Map   Table Rank G… Rate G… Histor… N/A    
#>  9 Colon & Rectum       38      37.7 Map   Table Rank G… Rate G… Histor… 5-Year…
#> 10 Esophagus             4       4.6 Map   Table Rank G… Rate G… Histor… 5-Year…
#> # … with 13 more rows, and abbreviated variable names
#> #   ¹​`Age-Adjusted Incidence Rates by Cancer Site, All Stages (2015-2019)`,
#> #   ²​`Texas Rate`, ³​`USA Rate`, ⁴​`CI*Rank Graph`, ⁵​`Rate Graph`,
#> #   ⁶​`Historical Trends Graph`, ⁷​`5-Year Rate Change Graph`

# One quick-profile URL per entry of base R's `state.name`, lower-cased and
# with spaces removed (e.g. "New York" -> "newyork").
all_urls <- paste0(
  "https://statecancerprofiles.cancer.gov/quick-profiles/index.php?statename=",
  gsub(" ", "", tolower(state.name))
)

Screening and Risk Factors (NEW)

Example

library(httr)
library(rvest)
library(purrr)

# Fetch the "Screening and Risk Factors" page from State Cancer Profiles for
# one set of form parameters and return the first HTML table on it.

# Cookies copied from a browser session; `_ga` and `_gid` each appear twice,
# exactly as captured.
# NOTE(review): presumably these expire -- refresh from the browser if the
# request starts failing.
cookies <- c(
  "SERVERID" = "web-dmzst-02",
  "_ga_XGHGG2QGBZ" = "GS1.1.1668899965.2.1.1668900305.0.0.0",
  "_ga" = "GA1.2.1276294231.1668874894",
  "s_cc" = "true",
  "s_fid" = "28212AD53953279A-002BE259493E9BE0",
  "s_ppv" = "98%7C0",
  "s_sq" = "ncienterprise%3D%2526pid%253Dstatecancerprofiles.cancer.gov%25252Frisk%25252Findex.php%2526pidt%253D1%2526oid%253Dfunctiononclick%252528event%252529%25257BsubmitForm%252528%252529%25253B%25257D%2526oidt%253D2%2526ot%253DBUTTON",
  "_gid" = "GA1.2.1329675980.1668874895",
  "_ga" = "GA1.3.1276294231.1668874894",
  "_gid" = "GA1.3.1329675980.1668874895",
  "s_vi" = "[CS]v1|31BC8147497EB470-60001F36C34E43E6[CE]",
  "gpv_pn" = "statecancerprofiles.cancer.gov%2Frisk%2Findex.php",
  "_gat_gtag_UA_112281461_1" = "1",
  "_gat_GSA_ENOR0" = "1"
)

# Request headers mimicking a top-level browser navigation from the risk page.
headers <- c(
  `User-Agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
  `Accept` = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  `Accept-Language` = "en-US,en;q=0.5",
  `Accept-Encoding` = "gzip, deflate, br",
  `Connection` = "keep-alive",
  `Referer` = "https://statecancerprofiles.cancer.gov/risk/index.php",
  `Upgrade-Insecure-Requests` = "1",
  `Sec-Fetch-Dest` = "document",
  `Sec-Fetch-Mode` = "navigate",
  `Sec-Fetch-Site` = "same-origin",
  `Sec-Fetch-User` = "?1"
)

# Query-string parameters of the form submission.
# NOTE(review): the codes ("colorec", "v09", "00", ...) are site-specific and
# their meaning is not visible here -- check the form on risk/index.php before
# changing them.
params <- list(
  topic = "colorec",
  risk = "v09",
  race = "00",
  sex = "0",
  datatype = "0",
  type = "risk",
  sortVariableName = "default",
  sortOrder = "default"
)

resp <- httr::GET(
  url = "https://statecancerprofiles.cancer.gov/risk/index.php#results",
  httr::add_headers(.headers = headers),
  query = params,
  httr::set_cookies(.cookies = cookies)
)
# Stop on a 4xx/5xx reply; otherwise an error page would be scraped and
# pluck(1) could silently return NULL or an unrelated table.
httr::stop_for_status(resp)

# content() hands the parsed page to html_table(); keep the first table.
cont <- content(resp)
cont |>
  html_table() |>
  pluck(1)

Incidence Rate Table (NEW)

Example

library(httr)
library(rvest)
library(purrr)
library(dplyr)
library(stringr)

# Fetch the "Incidence Rate Table" page from State Cancer Profiles for one set
# of form parameters, take the first HTML table, and tidy its `State` column.

# Cookies copied from a browser session; `_ga` and `_gid` each appear twice,
# exactly as captured.
# NOTE(review): presumably these expire -- refresh from the browser if the
# request starts failing.
cookies <- c(
  "SERVERID" = "web-dmzst-02",
  "_ga_XGHGG2QGBZ" = "GS1.1.1668899965.2.1.1668900521.0.0.0",
  "_ga" = "GA1.2.1276294231.1668874894",
  "s_cc" = "true",
  "s_fid" = "28212AD53953279A-002BE259493E9BE0",
  "s_ppv" = "95%7C20",
  "s_sq" = "ncienterprise%3D%2526pid%253Dstatecancerprofiles.cancer.gov%25252Fincidencerates%25252Findex.php%2526pidt%253D1%2526oid%253Dfunctiononclick%252528event%252529%25257BsubmitForm%252528%252529%25253B%25257D%2526oidt%253D2%2526ot%253DBUTTON",
  "_gid" = "GA1.2.1329675980.1668874895",
  "_ga" = "GA1.3.1276294231.1668874894",
  "_gid" = "GA1.3.1329675980.1668874895",
  "s_vi" = "[CS]v1|31BC8147497EB470-60001F36C34E43E6[CE]",
  "gpv_pn" = "statecancerprofiles.cancer.gov%2Fincidencerates%2Findex.php",
  "_gat_gtag_UA_112281461_1" = "1",
  "_gat_GSA_ENOR0" = "1"
)

# Request headers mimicking a top-level browser navigation from the
# incidence-rates page.
headers <- c(
  `User-Agent` = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
  `Accept` = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  `Accept-Language` = "en-US,en;q=0.5",
  `Accept-Encoding` = "gzip, deflate, br",
  `Connection` = "keep-alive",
  `Referer` = "https://statecancerprofiles.cancer.gov/incidencerates/index.php",
  `Upgrade-Insecure-Requests` = "1",
  `Sec-Fetch-Dest` = "document",
  `Sec-Fetch-Mode` = "navigate",
  `Sec-Fetch-Site` = "same-origin",
  `Sec-Fetch-User` = "?1",
  `TE` = "trailers"
)

# Query-string parameters of the form submission.
# NOTE(review): the codes ("00", "001", "999", ...) are site-specific and their
# meaning is not visible here -- check the form on incidencerates/index.php
# before changing them.
params <- list(
  stateFIPS = "00",
  areatype = "state",
  cancer = "001",
  race = "00",
  sex = "0",
  age = "001",
  stage = "999",
  year = "0",
  type = "incd",
  sortVariableName = "rate",
  sortOrder = "default",
  output = "0"
)

resp <- httr::GET(
  url = "https://statecancerprofiles.cancer.gov/incidencerates/index.php#results",
  httr::add_headers(.headers = headers),
  query = params,
  httr::set_cookies(.cookies = cookies)
)
# Stop on a 4xx/5xx reply; otherwise an error page would be scraped and the
# mutate() below would fail on a missing table or `State` column.
httr::stop_for_status(resp)

# Parse the page and keep the first HTML table.
tb <- content(resp) |>
  html_table() |>
  pluck(1)

# For `State` values containing at least two hyphens, keep only the text
# between the last two hyphens; other values are left unchanged.
tb |>
  mutate(
    State = str_replace_all(State, "(^.*)\\-(.*)-(.*$)", "\\2")
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment