davidad · August 7, 2023 12:25
diff --git a/lead_autism.py b/lead_autism.py
 import pandas as pd

 # Load the Supplementary Data S1 from the lead poisoning PNAS paper
 # https://www.pnas.org/doi/full/10.1073/pnas.2118631119
 df = pd.read_excel('pnas.2118631119.sd01.xlsx')

 # Load the Supplementary Data S1 from the ADDM/CDDS paper
 # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6223814/
 file_path_s1 = 'autism_S1.xls'
 autism_s1 = pd.read_excel(file_path_s1)

 # Load the Supplementary Data S4 from the ADDM/CDDS paper
 # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6223814/
 asd_s4_path = 'autism_S4.xlsx'
 asd_df = pd.read_excel(asd_s4_path)

 import matplotlib.pyplot as plt
 import matplotlib.ticker as mtick

 # Filter the data for estimated lead poisoning at age 1 and remove forecast
 df_filtered = df[(df['AGE'] == 1) & (df['YEAR'] <= 2023)]

 # Create a new column 'condition_min' to hold the minimum value of each range
 df_filtered['condition_min'] = df_filtered['condition'].apply(lambda x: 30 if x=='30+ (ud/dL)' else float(x.split('-')[0]))

 # Fix typo in units
 df_filtered['condition'] = df_filtered['condition'].str.replace('ud/dL', 'ug/dL')

 # Pivot the data to get years as index, conditions as columns and leadpop as values
 df_pivot = df_filtered.pivot_table(index='COHORT', columns=['condition_min', 'condition'], values='leadpop', aggfunc='sum')

 # Sort the columns by 'condition_min'
 df_pivot = df_pivot.sort_index(axis=1, level='condition_min', ascending=True)

 # Normalize by population
 df_youth_pop = df_pivot.sum(axis=1)
 df_pivot = df_pivot.div(df_youth_pop, axis=0)

 # Drop the 'condition_min' level in the column index
 df_pivot.columns = df_pivot.columns.droplevel('condition_min')

 # Calculate the weighted sum of lead concentrations from each bin
 # capped at 10, and assuming 1.25 average for the lowest bin
 mid_points = {
    '0-4.9 (ug/dL)': 1.25,
    '5-9.9 (ug/dL)': 7.5,
    '10-14.9 (ug/dL)': 10,
    '15-19.9 (ug/dL)': 10,
    '20-24.9 (ug/dL)': 10,
    '25-29.9 (ug/dL)': 10,
    '30+ (ug/dL)': 10
 }
 weighted_lead_sum = df_filtered.groupby('COHORT').apply(
    lambda group: sum(group[group['condition'] == condition]['leadpop'].sum() * mid_points[condition] for condition in mid_points)
 )

 # Calculate the weighted sum of lead concentrations from each bin
 # capped at 10, and assuming 2 average for the lowest bin
 mid_points2 = {
    '0-4.9 (ug/dL)': 2,
    '5-9.9 (ug/dL)': 7.5,
    '10-14.9 (ug/dL)': 10,
    '15-19.9 (ug/dL)': 10,
    '20-24.9 (ug/dL)': 10,
    '25-29.9 (ug/dL)': 10,
    '30+ (ug/dL)': 10
 }
 weighted_lead_sum2 = df_filtered.groupby('COHORT').apply(
    lambda group: sum(group[group['condition'] == condition]['leadpop'].sum() * mid_points2[condition] for condition in mid_points)
 )

 # Divide by the total population for each cohort to get the average lead concentration
 average_lead_concentration = weighted_lead_sum / df_filtered.groupby('COHORT')['leadpop'].sum()
 average_lead_concentration2 = weighted_lead_sum2 / df_filtered.groupby('COHORT')['leadpop'].sum()

 # Convert to g/dL
 average_lead_concentration = average_lead_concentration * 1e-6
 average_lead_concentration2 = average_lead_concentration2 * 1e-6

 # Manually defining the birth years
 birth_years = [1992, 1994, 1996, 1998, 2000, 2002, 2004]

 # Removing unnecessary rows and columns
 asd_df = asd_df.iloc[1:16, 2:9]

 # Renaming the columns to birth years
 asd_df.columns = birth_years

 # Transposing the data to have birth years as the index
 asd_df = asd_df.transpose()

 # Converting the values to numeric
 asd_df = asd_df.apply(pd.to_numeric, errors='coerce')

 # Averaging the prevalence values across all states for each birth year
 asd_avg_prevalence = asd_df.mean(axis=1)

 # Add the latest data from https://www.cdc.gov/ncbddd/autism/data.html
 asd_avg_prevalence[2006] = 16.8
 asd_avg_prevalence[2008] = 18.5
 asd_avg_prevalence[2010] = 23.0
 asd_avg_prevalence[2012] = 27.6

 # Divide the asd prevalence values by 10 to convert to percentages
 asd_avg_prevalence /= 10

 # Extract the prevalence percentages and ages for the 2017 report
 autism_age_resolved_2017 = autism_s1[['2017.1', '2017.2']].dropna().rename(columns={'2017.1': 'Prevalence (%)', '2017.2': 'Age'})

 # Convert 'Age' column to numeric
 autism_age_resolved_2017['Age'] = pd.to_numeric(autism_age_resolved_2017['Age'], errors='coerce')

 # Remove rows containing NaN values
 autism_age_resolved_2017 = autism_age_resolved_2017.dropna()

 # Convert the prevalence numbers into percentages
 autism_age_resolved_2017['Prevalence (%)'] = autism_age_resolved_2017['Prevalence (%)'] / 100

 # Calculate the birth year by subtracting the age from the report year
 autism_age_resolved_2017['Birth Year'] = 2017 - autism_age_resolved_2017['Age']

 # Filter to age of at least 3
 autism_age_resolved_2017 = autism_age_resolved_2017[(autism_age_resolved_2017['Birth Year'] < 2014)]

 # Filter to people who were not already over 3 years old at initial DSM recognition of autism in 1968
 autism_age_resolved_2017 = autism_age_resolved_2017[(autism_age_resolved_2017['Birth Year'] > 1965)]

 # Divide the prevalence values by 100 to convert to percentages
 asd_avg_prevalence /= 100

 # Set the index to the birth year
 autism_age_resolved_2017.set_index('Birth Year', inplace=True)

 # Sort the index to have the birth years in ascending order
 autism_age_resolved_2017 = autism_age_resolved_2017.sort_index()

 # Create the plot with the calculated average lead concentrations
 fig, ax1 = plt.subplots(figsize=(14, 8))

 # Line plot for average lead concentrations (inverted y-axis)
 ax1.plot(average_lead_concentration.index, average_lead_concentration.values, '--', color='green', label='Average Blood Lead Concentration at Age 1 (capped at 10 µg/dL, baseline 1.25 µg/dL)')
 ax1.plot(average_lead_concentration2.index, average_lead_concentration2.values, color='green', label='Average Blood Lead Concentration at Age 1 (capped at 10 µg/dL, baseline 2.0 µg/dL)')
 ax1.set_ylabel('Blood Lead Concentration')
 ax1.set_ylabel('Blood Lead Concentration')
 ax1.legend(loc='upper left')
 ax1.set_ylim(top=0.8e-6, bottom=10.8e-6) # Invert y-axis with 0 at the top and set upper limit
 ax1.set_yscale('log') # Log scale for primary y-axis
 ax1.yaxis.set_major_formatter(mtick.EngFormatter(unit='g/dL'))
 ax1.yaxis.set_minor_locator(mtick.FixedLocator([2e-6,3e-6,4e-6,5e-6,6e-6,7e-6,8e-6,9e-6,10e-6]))
 ax1.yaxis.set_minor_formatter(mtick.EngFormatter(unit='g/dL'))

 # Create a secondary y-axis
 ax2 = ax1.twinx()

 # Line plot for ASD prevalence
 ax2.plot(asd_avg_prevalence.index, asd_avg_prevalence.values, color='blue', label='ASD Prevalence (ADDM)') # Divide by 100 for PercentFormatter
 ax2.set_ylabel('Prevalence')
 ax2.set_ylim(bottom=0.0004,top=0.06) # Set lower limit to 0
 ax2.set_yscale('log') # Log scale for secondary y-axis
 ax2.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
 ax2.yaxis.set_minor_locator(mtick.FixedLocator([0.001, 0.002, 0.003, 0.004, 0.006, 0.008, 0.01, 0.012, 0.014, 0.0175, 0.02, 0.025, 0.03]))
 ax2.yaxis.set_minor_formatter(mtick.PercentFormatter(1.0))

 # Line plot for Code 1 autism prevalence (S1 data, 2017 report)
 ax2.plot(autism_age_resolved_2017.index, autism_age_resolved_2017['Prevalence (%)'] / 100, color='red', label='Code 1 Autism Prevalence (CDDS)')
 ax2.legend(loc='upper right')


 # Set labels and title
 ax1.set_title('Childhood Lead Poisoning (Reversed!) and Autism/ASD Prevalence')
 ax1.set_xlabel('Birth Year')
 ax1.xaxis.set_major_locator(mtick.MultipleLocator(5))

 plt.tight_layout()
 plt.show()
	import pandas as pd

	# Load the Supplementary Data S1 from the lead poisoning PNAS paper
	# https://www.pnas.org/doi/full/10.1073/pnas.2118631119
	df = pd.read_excel('pnas.2118631119.sd01.xlsx')

	# Load the Supplementary Data S1 from the ADDM/CDDS paper
	# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6223814/
	file_path_s1 = 'autism_S1.xls'
	autism_s1 = pd.read_excel(file_path_s1)

	# Load the Supplementary Data S4 from the ADDM/CDDS paper
	# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6223814/
	asd_s4_path = 'autism_S4.xlsx'
	asd_df = pd.read_excel(asd_s4_path)

	import matplotlib.pyplot as plt
	import matplotlib.ticker as mtick

	# Filter the data for estimated lead poisoning at age 1 and remove forecast
	df_filtered = df[(df['AGE'] == 1) & (df['YEAR'] <= 2023)]

	# Create a new column 'condition_min' to hold the minimum value of each range
	df_filtered['condition_min'] = df_filtered['condition'].apply(lambda x: 30 if x=='30+ (ud/dL)' else float(x.split('-')[0]))

	# Fix typo in units
	df_filtered['condition'] = df_filtered['condition'].str.replace('ud/dL', 'ug/dL')

	# Pivot the data to get years as index, conditions as columns and leadpop as values
	df_pivot = df_filtered.pivot_table(index='COHORT', columns=['condition_min', 'condition'], values='leadpop', aggfunc='sum')

	# Sort the columns by 'condition_min'
	df_pivot = df_pivot.sort_index(axis=1, level='condition_min', ascending=True)

	# Normalize by population
	df_youth_pop = df_pivot.sum(axis=1)
	df_pivot = df_pivot.div(df_youth_pop, axis=0)

	# Drop the 'condition_min' level in the column index
	df_pivot.columns = df_pivot.columns.droplevel('condition_min')

	# Calculate the weighted sum of lead concentrations from each bin
	# capped at 10, and assuming 1.25 average for the lowest bin
	mid_points = {
	'0-4.9 (ug/dL)': 1.25,
	'5-9.9 (ug/dL)': 7.5,
	'10-14.9 (ug/dL)': 10,
	'15-19.9 (ug/dL)': 10,
	'20-24.9 (ug/dL)': 10,
	'25-29.9 (ug/dL)': 10,
	'30+ (ug/dL)': 10
	}
	weighted_lead_sum = df_filtered.groupby('COHORT').apply(
	lambda group: sum(group[group['condition'] == condition]['leadpop'].sum() * mid_points[condition] for condition in mid_points)
	)

	# Calculate the weighted sum of lead concentrations from each bin
	# capped at 10, and assuming 2 average for the lowest bin
	mid_points2 = {
	'0-4.9 (ug/dL)': 2,
	'5-9.9 (ug/dL)': 7.5,
	'10-14.9 (ug/dL)': 10,
	'15-19.9 (ug/dL)': 10,
	'20-24.9 (ug/dL)': 10,
	'25-29.9 (ug/dL)': 10,
	'30+ (ug/dL)': 10
	}
	weighted_lead_sum2 = df_filtered.groupby('COHORT').apply(
	lambda group: sum(group[group['condition'] == condition]['leadpop'].sum() * mid_points2[condition] for condition in mid_points)
	)

	# Divide by the total population for each cohort to get the average lead concentration
	average_lead_concentration = weighted_lead_sum / df_filtered.groupby('COHORT')['leadpop'].sum()
	average_lead_concentration2 = weighted_lead_sum2 / df_filtered.groupby('COHORT')['leadpop'].sum()

	# Convert to g/dL
	average_lead_concentration = average_lead_concentration * 1e-6
	average_lead_concentration2 = average_lead_concentration2 * 1e-6

	# Manually defining the birth years
	birth_years = [1992, 1994, 1996, 1998, 2000, 2002, 2004]

	# Removing unnecessary rows and columns
	asd_df = asd_df.iloc[1:16, 2:9]

	# Renaming the columns to birth years
	asd_df.columns = birth_years

	# Transposing the data to have birth years as the index
	asd_df = asd_df.transpose()

	# Converting the values to numeric
	asd_df = asd_df.apply(pd.to_numeric, errors='coerce')

	# Averaging the prevalence values across all states for each birth year
	asd_avg_prevalence = asd_df.mean(axis=1)

	# Add the latest data from https://www.cdc.gov/ncbddd/autism/data.html
	asd_avg_prevalence[2006] = 16.8
	asd_avg_prevalence[2008] = 18.5
	asd_avg_prevalence[2010] = 23.0
	asd_avg_prevalence[2012] = 27.6

	# Divide the asd prevalence values by 10 to convert to percentages
	asd_avg_prevalence /= 10

	# Extract the prevalence percentages and ages for the 2017 report
	autism_age_resolved_2017 = autism_s1[['2017.1', '2017.2']].dropna().rename(columns={'2017.1': 'Prevalence (%)', '2017.2': 'Age'})

	# Convert 'Age' column to numeric
	autism_age_resolved_2017['Age'] = pd.to_numeric(autism_age_resolved_2017['Age'], errors='coerce')

	# Remove rows containing NaN values
	autism_age_resolved_2017 = autism_age_resolved_2017.dropna()

	# Convert the prevalence numbers into percentages
	autism_age_resolved_2017['Prevalence (%)'] = autism_age_resolved_2017['Prevalence (%)'] / 100

	# Calculate the birth year by subtracting the age from the report year
	autism_age_resolved_2017['Birth Year'] = 2017 - autism_age_resolved_2017['Age']

	# Filter to age of at least 3
	autism_age_resolved_2017 = autism_age_resolved_2017[(autism_age_resolved_2017['Birth Year'] < 2014)]

	# Filter to people who were not already over 3 years old at initial DSM recognition of autism in 1968
	autism_age_resolved_2017 = autism_age_resolved_2017[(autism_age_resolved_2017['Birth Year'] > 1965)]

	# Divide the prevalence values by 100 to convert to percentages
	asd_avg_prevalence /= 100

	# Set the index to the birth year
	autism_age_resolved_2017.set_index('Birth Year', inplace=True)

	# Sort the index to have the birth years in ascending order
	autism_age_resolved_2017 = autism_age_resolved_2017.sort_index()

	# Create the plot with the calculated average lead concentrations
	fig, ax1 = plt.subplots(figsize=(14, 8))

	# Line plot for average lead concentrations (inverted y-axis)
	ax1.plot(average_lead_concentration.index, average_lead_concentration.values, '--', color='green', label='Average Blood Lead Concentration at Age 1 (capped at 10 µg/dL, baseline 1.25 µg/dL)')
	ax1.plot(average_lead_concentration2.index, average_lead_concentration2.values, color='green', label='Average Blood Lead Concentration at Age 1 (capped at 10 µg/dL, baseline 2.0 µg/dL)')
	ax1.set_ylabel('Blood Lead Concentration')
	ax1.set_ylabel('Blood Lead Concentration')
	ax1.legend(loc='upper left')
	ax1.set_ylim(top=0.8e-6, bottom=10.8e-6) # Invert y-axis with 0 at the top and set upper limit
	ax1.set_yscale('log') # Log scale for primary y-axis
	ax1.yaxis.set_major_formatter(mtick.EngFormatter(unit='g/dL'))
	ax1.yaxis.set_minor_locator(mtick.FixedLocator([2e-6,3e-6,4e-6,5e-6,6e-6,7e-6,8e-6,9e-6,10e-6]))
	ax1.yaxis.set_minor_formatter(mtick.EngFormatter(unit='g/dL'))

	# Create a secondary y-axis
	ax2 = ax1.twinx()

	# Line plot for ASD prevalence
	ax2.plot(asd_avg_prevalence.index, asd_avg_prevalence.values, color='blue', label='ASD Prevalence (ADDM)') # Divide by 100 for PercentFormatter
	ax2.set_ylabel('Prevalence')
	ax2.set_ylim(bottom=0.0004,top=0.06) # Set lower limit to 0
	ax2.set_yscale('log') # Log scale for secondary y-axis
	ax2.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
	ax2.yaxis.set_minor_locator(mtick.FixedLocator([0.001, 0.002, 0.003, 0.004, 0.006, 0.008, 0.01, 0.012, 0.014, 0.0175, 0.02, 0.025, 0.03]))
	ax2.yaxis.set_minor_formatter(mtick.PercentFormatter(1.0))

	# Line plot for Code 1 autism prevalence (S1 data, 2017 report)
	ax2.plot(autism_age_resolved_2017.index, autism_age_resolved_2017['Prevalence (%)'] / 100, color='red', label='Code 1 Autism Prevalence (CDDS)')
	ax2.legend(loc='upper right')


	# Set labels and title
	ax1.set_title('Childhood Lead Poisoning (Reversed!) and Autism/ASD Prevalence')
	ax1.set_xlabel('Birth Year')
	ax1.xaxis.set_major_locator(mtick.MultipleLocator(5))

	plt.tight_layout()
	plt.show()