Skip to content

Instantly share code, notes, and snippets.

@seandavi
Last active October 19, 2023 23:32
Show Gist options
  • Save seandavi/3d3136aaf66b0b58e2f1906085415312 to your computer and use it in GitHub Desktop.
Save seandavi/3d3136aaf66b0b58e2f1906085415312 to your computer and use it in GitHub Desktop.
Scrape the cancer incidence from https://statecancerprofiles.gov
"""This script scrapes the state cancer profiles website
and saves the results as csv files.
The website is a bit of a mess, so this script is a bit of a mess.
It requires scraping the select options from the website
and then iterating over all the possible combinations of
select options to get the data.
The script is designed to be run from the command line
with the following command:
```
python scrape_statecancerprofiles.py
```
The script will create two csv files, one for incidence
and one for death. Each of these files is about 700k lines.
The script will also print out the url for each request
it makes. Note that the script will make a lot of requests
and take a long time to run (about 30 minutes or so, depending
on bandwidth). We use a messy try-except block a lot
because the website is not very robust and some of the
options are not valid for some of the other options.
"""
import httpx
import pandas as pd
from bs4 import BeautifulSoup
def get_select_options():
soup = BeautifulSoup(
httpx.get(
"https://statecancerprofiles.cancer.gov/incidencerates/index.php"
).text,
"html.parser",
)
select_dict = {}
for s in soup.find_all("select"):
option_dict = {}
field = s.attrs.get("id") # age, stage, ....
for o in s.find_all("option"):
txt = o.get_text()
val = o.attrs.get("value")
if not txt.startswith("---"):
option_dict[o.attrs.get("value")] = o.get_text()
if field == "age":
# website is missing some age groups
# since the developers tried to be clever
# with javascript, apparently.
# This is a hack to add the missing age groups
# to the select options for pediatrics.
option_dict["016"] = "Age < 15"
option_dict["015"] = "Age < 20"
select_dict[s.attrs.get("id")] = option_dict
return select_dict
def column_text_replace(txt: str):
return (
txt.strip()
.replace("[", "")
.replace("]", "")
.replace("(", "")
.replace(")", "")
.replace("*", "_")
.replace(" ", "_")
.replace("%", "pct")
.replace("-", "_")
.replace(",", "_")
.replace(".", "_")
.replace("?", "")
.lower()
)
def get_table(
year: str = "0",
stateFIPS: str = "00", # 00 includes all states
sex: str = "0",
stage: str = "999",
race: str = "00",
cancer: str = "001",
areatype: str = "county",
age: str = "001",
_type: str = "incd",
):
if _type == "incd":
rate_col = "age_adjusted_incidence_raterate_note___cases_per_100_000"
url_insert = "incidencerates"
else:
rate_col = "age_adjusted_death_raterate_note___deaths_per_100_000"
url_insert = "deathrates"
url = (
f"https://statecancerprofiles.cancer.gov/{url_insert}/index.php?stateFIPS={stateFIPS}"
f"&areatype={areatype}&cancer={cancer}&race={race}"
f"&stage={stage}&year={year}"
f"&sex={sex}&age={age}&type={_type}&output=1"
)
print(url)
import pandas as pd
df = pd.read_csv(
url,
skiprows=8,
low_memory=False,
na_values=["*", "N/A", " N/A", "N/A ", " N/A "],
skipinitialspace=True,
dtype={"FIPS": str},
)
df.columns = [column_text_replace(c) for c in df.columns]
select_opts = get_select_options()
def get_text_from_select_id(group, id):
return select_opts[group][id]
df["year"] = get_text_from_select_id("year", year)
df["sex"] = get_text_from_select_id("sex", sex)
df["stage"] = get_text_from_select_id("stage", stage)
df["race"] = get_text_from_select_id("race", race)
df["cancer"] = get_text_from_select_id("cancer", cancer)
df["areatype"] = get_text_from_select_id("areatype", areatype)
df["age"] = get_text_from_select_id("age", age)
df["state_fips"] = df["fips"].str[:2]
df["measurement"] = _type
df["locale_type"] = "other"
df.loc[df["fips"].isna(), "fips"] = ""
df.loc[df["fips"].str.endswith("000"), "locale_type"] = "state"
df.loc[df["fips"].str.startswith("00"), "locale_type"] = "national"
df.loc[df["county"].str.contains("County"), "locale_type"] = "county"
df["_extracted_at"] = pd.Timestamp.now().isoformat()
df["url"] = url.replace("&output=1", "")
for numeric_column in [
rate_col,
"lower_95pct_confidence_interval",
"upper_95pct_confidence_interval",
"lower_ci_ci_rank",
"upper_ci_ci_rank",
"average_annual_count",
"recent_5_year_trend_trend_note_in_incidence_rates",
"lower_95pct_confidence_interval_1",
"upper_95pct_confidence_interval_1",
]:
try:
df[numeric_column] = pd.to_numeric(df[numeric_column], errors="coerce")
except:
pass
return df[df[rate_col].notna()]
# This function uses argparse to collect the command line arguments
# and then calls get_table() with those arguments.
# use last five years of data (year=0), all states (stateFIPS=00)
def master_table(year: str = "0", stateFIPS="00", _type="incd"):
select_options = get_select_options()
print(select_options)
dflist = []
for cancer in list(select_options["cancer"].keys())[6:]:
print(cancer)
# The state cancer profiles folks in their
# infinite wisdom decided to make the age
# groups for cancer 515 and 516 (pediatrics) different
# than the other cancers. So we have to
# handle them separately.
if cancer == "516":
ages = ["016"]
elif cancer == "515":
ages = ["015"]
else:
ages = select_options["age"].keys()
for age in ages:
for sex in select_options["sex"].keys():
for race in select_options["race"].keys():
for stage in select_options["stage"].keys():
try:
df = get_table(
cancer=cancer,
age=age,
sex=sex,
race=race,
stage=stage,
_type=_type,
)
dflist.append(df)
except KeyboardInterrupt:
raise
except Exception as e:
print(e)
pass
df = pd.concat(dflist)
return df
def main():
df = master_table(_type="incd")
df.to_csv("state_cancer_profiles_incidence.csv.gz", index=False, compression="gzip")
df = master_table(_type="death")
df.to_csv("state_cancer_profiles_death.csv.gz", index=False, compression="gzip")
if __name__ == "__main__":
print(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment