Created
August 7, 2023 12:25
-
-
Save davidad/7234bfd0a2f82aa5c09ebd953ba53477 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# Load the Supplementary Data S1 from the lead poisoning PNAS paper | |
# https://www.pnas.org/doi/full/10.1073/pnas.2118631119 | |
df = pd.read_excel('pnas.2118631119.sd01.xlsx') | |
# Load the Supplementary Data S1 from the ADDM/CDDS paper | |
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6223814/ | |
file_path_s1 = 'autism_S1.xls' | |
autism_s1 = pd.read_excel(file_path_s1) | |
# Load the Supplementary Data S4 from the ADDM/CDDS paper | |
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6223814/ | |
asd_s4_path = 'autism_S4.xlsx' | |
asd_df = pd.read_excel(asd_s4_path) | |
import matplotlib.pyplot as plt | |
import matplotlib.ticker as mtick | |
# Filter the data for estimated lead poisoning at age 1 and remove forecast | |
df_filtered = df[(df['AGE'] == 1) & (df['YEAR'] <= 2023)] | |
# Create a new column 'condition_min' to hold the minimum value of each range | |
df_filtered['condition_min'] = df_filtered['condition'].apply(lambda x: 30 if x=='30+ (ud/dL)' else float(x.split('-')[0])) | |
# Fix typo in units | |
df_filtered['condition'] = df_filtered['condition'].str.replace('ud/dL', 'ug/dL') | |
# Pivot the data to get years as index, conditions as columns and leadpop as values | |
df_pivot = df_filtered.pivot_table(index='COHORT', columns=['condition_min', 'condition'], values='leadpop', aggfunc='sum') | |
# Sort the columns by 'condition_min' | |
df_pivot = df_pivot.sort_index(axis=1, level='condition_min', ascending=True) | |
# Normalize by population | |
df_youth_pop = df_pivot.sum(axis=1) | |
df_pivot = df_pivot.div(df_youth_pop, axis=0) | |
# Drop the 'condition_min' level in the column index | |
df_pivot.columns = df_pivot.columns.droplevel('condition_min') | |
# Calculate the weighted sum of lead concentrations from each bin | |
# capped at 10, and assuming 1.25 average for the lowest bin | |
mid_points = { | |
'0-4.9 (ug/dL)': 1.25, | |
'5-9.9 (ug/dL)': 7.5, | |
'10-14.9 (ug/dL)': 10, | |
'15-19.9 (ug/dL)': 10, | |
'20-24.9 (ug/dL)': 10, | |
'25-29.9 (ug/dL)': 10, | |
'30+ (ug/dL)': 10 | |
} | |
weighted_lead_sum = df_filtered.groupby('COHORT').apply( | |
lambda group: sum(group[group['condition'] == condition]['leadpop'].sum() * mid_points[condition] for condition in mid_points) | |
) | |
# Calculate the weighted sum of lead concentrations from each bin | |
# capped at 10, and assuming 2 average for the lowest bin | |
mid_points2 = { | |
'0-4.9 (ug/dL)': 2, | |
'5-9.9 (ug/dL)': 7.5, | |
'10-14.9 (ug/dL)': 10, | |
'15-19.9 (ug/dL)': 10, | |
'20-24.9 (ug/dL)': 10, | |
'25-29.9 (ug/dL)': 10, | |
'30+ (ug/dL)': 10 | |
} | |
weighted_lead_sum2 = df_filtered.groupby('COHORT').apply( | |
lambda group: sum(group[group['condition'] == condition]['leadpop'].sum() * mid_points2[condition] for condition in mid_points) | |
) | |
# Divide by the total population for each cohort to get the average lead concentration | |
average_lead_concentration = weighted_lead_sum / df_filtered.groupby('COHORT')['leadpop'].sum() | |
average_lead_concentration2 = weighted_lead_sum2 / df_filtered.groupby('COHORT')['leadpop'].sum() | |
# Convert to g/dL | |
average_lead_concentration = average_lead_concentration * 1e-6 | |
average_lead_concentration2 = average_lead_concentration2 * 1e-6 | |
# Manually defining the birth years | |
birth_years = [1992, 1994, 1996, 1998, 2000, 2002, 2004] | |
# Removing unnecessary rows and columns | |
asd_df = asd_df.iloc[1:16, 2:9] | |
# Renaming the columns to birth years | |
asd_df.columns = birth_years | |
# Transposing the data to have birth years as the index | |
asd_df = asd_df.transpose() | |
# Converting the values to numeric | |
asd_df = asd_df.apply(pd.to_numeric, errors='coerce') | |
# Averaging the prevalence values across all states for each birth year | |
asd_avg_prevalence = asd_df.mean(axis=1) | |
# Add the latest data from https://www.cdc.gov/ncbddd/autism/data.html | |
asd_avg_prevalence[2006] = 16.8 | |
asd_avg_prevalence[2008] = 18.5 | |
asd_avg_prevalence[2010] = 23.0 | |
asd_avg_prevalence[2012] = 27.6 | |
# Divide the asd prevalence values by 10 to convert to percentages | |
asd_avg_prevalence /= 10 | |
# Extract the prevalence percentages and ages for the 2017 report | |
autism_age_resolved_2017 = autism_s1[['2017.1', '2017.2']].dropna().rename(columns={'2017.1': 'Prevalence (%)', '2017.2': 'Age'}) | |
# Convert 'Age' column to numeric | |
autism_age_resolved_2017['Age'] = pd.to_numeric(autism_age_resolved_2017['Age'], errors='coerce') | |
# Remove rows containing NaN values | |
autism_age_resolved_2017 = autism_age_resolved_2017.dropna() | |
# Convert the prevalence numbers into percentages | |
autism_age_resolved_2017['Prevalence (%)'] = autism_age_resolved_2017['Prevalence (%)'] / 100 | |
# Calculate the birth year by subtracting the age from the report year | |
autism_age_resolved_2017['Birth Year'] = 2017 - autism_age_resolved_2017['Age'] | |
# Filter to age of at least 3 | |
autism_age_resolved_2017 = autism_age_resolved_2017[(autism_age_resolved_2017['Birth Year'] < 2014)] | |
# Filter to people who were not already over 3 years old at initial DSM recognition of autism in 1968 | |
autism_age_resolved_2017 = autism_age_resolved_2017[(autism_age_resolved_2017['Birth Year'] > 1965)] | |
# Divide the prevalence values by 100 to convert to percentages | |
asd_avg_prevalence /= 100 | |
# Set the index to the birth year | |
autism_age_resolved_2017.set_index('Birth Year', inplace=True) | |
# Sort the index to have the birth years in ascending order | |
autism_age_resolved_2017 = autism_age_resolved_2017.sort_index() | |
# Create the plot with the calculated average lead concentrations | |
fig, ax1 = plt.subplots(figsize=(14, 8)) | |
# Line plot for average lead concentrations (inverted y-axis) | |
ax1.plot(average_lead_concentration.index, average_lead_concentration.values, '--', color='green', label='Average Blood Lead Concentration at Age 1 (capped at 10 µg/dL, baseline 1.25 µg/dL)') | |
ax1.plot(average_lead_concentration2.index, average_lead_concentration2.values, color='green', label='Average Blood Lead Concentration at Age 1 (capped at 10 µg/dL, baseline 2.0 µg/dL)') | |
ax1.set_ylabel('Blood Lead Concentration') | |
ax1.set_ylabel('Blood Lead Concentration') | |
ax1.legend(loc='upper left') | |
ax1.set_ylim(top=0.8e-6, bottom=10.8e-6) # Invert y-axis with 0 at the top and set upper limit | |
ax1.set_yscale('log') # Log scale for primary y-axis | |
ax1.yaxis.set_major_formatter(mtick.EngFormatter(unit='g/dL')) | |
ax1.yaxis.set_minor_locator(mtick.FixedLocator([2e-6,3e-6,4e-6,5e-6,6e-6,7e-6,8e-6,9e-6,10e-6])) | |
ax1.yaxis.set_minor_formatter(mtick.EngFormatter(unit='g/dL')) | |
# Create a secondary y-axis | |
ax2 = ax1.twinx() | |
# Line plot for ASD prevalence | |
ax2.plot(asd_avg_prevalence.index, asd_avg_prevalence.values, color='blue', label='ASD Prevalence (ADDM)') # Divide by 100 for PercentFormatter | |
ax2.set_ylabel('Prevalence') | |
ax2.set_ylim(bottom=0.0004,top=0.06) # Set lower limit to 0 | |
ax2.set_yscale('log') # Log scale for secondary y-axis | |
ax2.yaxis.set_major_formatter(mtick.PercentFormatter(1.0)) | |
ax2.yaxis.set_minor_locator(mtick.FixedLocator([0.001, 0.002, 0.003, 0.004, 0.006, 0.008, 0.01, 0.012, 0.014, 0.0175, 0.02, 0.025, 0.03])) | |
ax2.yaxis.set_minor_formatter(mtick.PercentFormatter(1.0)) | |
# Line plot for Code 1 autism prevalence (S1 data, 2017 report) | |
ax2.plot(autism_age_resolved_2017.index, autism_age_resolved_2017['Prevalence (%)'] / 100, color='red', label='Code 1 Autism Prevalence (CDDS)') | |
ax2.legend(loc='upper right') | |
# Set labels and title | |
ax1.set_title('Childhood Lead Poisoning (Reversed!) and Autism/ASD Prevalence') | |
ax1.set_xlabel('Birth Year') | |
ax1.xaxis.set_major_locator(mtick.MultipleLocator(5)) | |
plt.tight_layout() | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment