hotelzululima · August 6, 2023 19:03
diff --git a/lead_crime.py b/lead_crime.py
 import pandas as pd

 # Load the data
 df = pd.read_excel('pnas.2118631119.sd01.xlsx')

 from bs4 import BeautifulSoup

 # Load the HTML file
 with open('crime.html', 'r') as f:
    contents = f.read()

 # Parse the HTML with BeautifulSoup
 soup = BeautifulSoup(contents, 'html.parser')

 # Find the table in the HTML
 table = soup.find('table')

 # Parse the table into a DataFrame
 df_crime = pd.read_html(str(table), header=0)[0]

 # Identify the rows that contain the headers
 header_rows = df_crime[df_crime['Unnamed: 0'] == 'Year'].index

 # Remove the header rows
 df_crime = df_crime.drop(header_rows)

 # Rename the columns based on the first row of headers
 df_crime.columns = ['Year', 'Population', 'Total', 'Violent', 'Property', 'Murder', 'Rape', 'Robbery', 'Aggravated assault', 'Burglary', 'Larceny', 'Vehicle theft']

 # Remove rows with NaN in the 'Year' column
 df_crime = df_crime.dropna(subset=['Year'])

 # Convert the Year and Violent columns to the appropriate data types
 df_crime['Year'] = df_crime['Year'].astype(int)
 df_crime['Violent'] = df_crime['Violent'].astype(float)

 # Extract the time series for "Violent Crimes"
 df_crime = df_crime[['Year', 'Violent']].dropna()
 df_crime['Year'] = df_crime['Year'].astype(int)
 df_crime.set_index('Year', inplace=True)

 import matplotlib.pyplot as plt
 import matplotlib.ticker as mtick

 # Filter the data for ages
 df_filtered = df[(df['AGE'] >= 14) & (df['AGE'] <= 30) & (df['YEAR'] >= 1955) & (df['YEAR'] <= 2020)]

 # Create a new column 'condition_min' to hold the minimum value of each range
 df_filtered['condition_min'] = df_filtered['condition'].apply(lambda x: 30 if x=='30+ (ud/dL)' else float(x.split('-')[0]))

 # Fix typo in units
 df_filtered['condition'] = df_filtered['condition'].str.replace('ud/dL', 'ug/dL')

 # Pivot the data to get years as index, conditions as columns and leadpop as values
 df_pivot = df_filtered.pivot_table(index='YEAR', columns=['condition_min', 'condition'], values='leadpop', aggfunc='sum')

 # Sort the columns by 'condition_min'
 df_pivot = df_pivot.sort_index(axis=1, level='condition_min', ascending=False)

 # Normalize by youth population
 df_youth_pop = df_pivot.sum(axis=1)
 df_pivot = df_pivot.div(df_youth_pop, axis=0)

 # Drop the 'condition_min' level in the column index
 df_pivot.columns = df_pivot.columns.droplevel('condition_min')

 # Ensure the dataframes have the same index by reindexing df_crime to match df_pivot
 df_crime_reindexed = df_crime.reindex(df_pivot.index)
 df_crime_reindexed = df_crime_reindexed.div(df_youth_pop,axis=0)

 # Create a function to format the x-axis labels
 def format_year_labels(year):
    if year % 5 == 0:
        return str(year)
    else:
        return ''

 # Define color palette
 colors = ['#eaf2f0', '#fae5c6', '#f0c2a3', '#e19578', '#cb624c', '#af211f', '#8d0000']

 # Update color palette
 colors[0] = '#cbe3ce'

 # Create the normalized stacked bar chart
 fig, ax1 = plt.subplots(figsize=(14, 8))

 # Bar plot
 bar_plot = df_pivot.plot(kind='bar', stacked=True, ax=ax1, color=colors[::-1])

 # Set labels and title
 ax1.set_title('U.S. Childhood Lead Poisoning in 14-30 Age Group and Violent Crime Rates')
 ax1.set_xlabel('Year')
 ax1.set_ylabel('fraction of the 14-30 U.S. population')
 ax1.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
 ax1.get_legend().remove()

 # Create a secondary y-axis
 ax2 = ax1.twinx()

 # Get the x-axis coordinates for the bar plot
 x_coords = range(len(df_pivot.index))

 # Line plot for violent crime rates using ax.plot with x_coords
 ax2.plot(x_coords, df_crime_reindexed['Violent'], color='black', linewidth=3, label='U.S. Violent Crimes per youth')
 ax2.set_ylabel('violent crimes per annum per total population age 14-30')

 # Adjust the y-axis limit
 ax2.set_ylim(0, 0.05)

 # Show the legend
 lines, labels = ax1.get_legend_handles_labels()
 lines2, labels2 = ax2.get_legend_handles_labels()
 ax2.legend(lines[::-1] + lines2, labels[::-1] + labels2, loc='upper left', bbox_to_anchor=(1.05, 0.8))

 plt.tight_layout()
 plt.show()
	import pandas as pd

	# Load the data
	df = pd.read_excel('pnas.2118631119.sd01.xlsx')

	from bs4 import BeautifulSoup

	# Load the HTML file
	with open('crime.html', 'r') as f:
	contents = f.read()

	# Parse the HTML with BeautifulSoup
	soup = BeautifulSoup(contents, 'html.parser')

	# Find the table in the HTML
	table = soup.find('table')

	# Parse the table into a DataFrame
	df_crime = pd.read_html(str(table), header=0)[0]

	# Identify the rows that contain the headers
	header_rows = df_crime[df_crime['Unnamed: 0'] == 'Year'].index

	# Remove the header rows
	df_crime = df_crime.drop(header_rows)

	# Rename the columns based on the first row of headers
	df_crime.columns = ['Year', 'Population', 'Total', 'Violent', 'Property', 'Murder', 'Rape', 'Robbery', 'Aggravated assault', 'Burglary', 'Larceny', 'Vehicle theft']

	# Remove rows with NaN in the 'Year' column
	df_crime = df_crime.dropna(subset=['Year'])

	# Convert the Year and Violent columns to the appropriate data types
	df_crime['Year'] = df_crime['Year'].astype(int)
	df_crime['Violent'] = df_crime['Violent'].astype(float)

	# Extract the time series for "Violent Crimes"
	df_crime = df_crime[['Year', 'Violent']].dropna()
	df_crime['Year'] = df_crime['Year'].astype(int)
	df_crime.set_index('Year', inplace=True)

	import matplotlib.pyplot as plt
	import matplotlib.ticker as mtick

	# Filter the data for ages
	df_filtered = df[(df['AGE'] >= 14) & (df['AGE'] <= 30) & (df['YEAR'] >= 1955) & (df['YEAR'] <= 2020)]

	# Create a new column 'condition_min' to hold the minimum value of each range
	df_filtered['condition_min'] = df_filtered['condition'].apply(lambda x: 30 if x=='30+ (ud/dL)' else float(x.split('-')[0]))

	# Fix typo in units
	df_filtered['condition'] = df_filtered['condition'].str.replace('ud/dL', 'ug/dL')

	# Pivot the data to get years as index, conditions as columns and leadpop as values
	df_pivot = df_filtered.pivot_table(index='YEAR', columns=['condition_min', 'condition'], values='leadpop', aggfunc='sum')

	# Sort the columns by 'condition_min'
	df_pivot = df_pivot.sort_index(axis=1, level='condition_min', ascending=False)

	# Normalize by youth population
	df_youth_pop = df_pivot.sum(axis=1)
	df_pivot = df_pivot.div(df_youth_pop, axis=0)

	# Drop the 'condition_min' level in the column index
	df_pivot.columns = df_pivot.columns.droplevel('condition_min')

	# Ensure the dataframes have the same index by reindexing df_crime to match df_pivot
	df_crime_reindexed = df_crime.reindex(df_pivot.index)
	df_crime_reindexed = df_crime_reindexed.div(df_youth_pop,axis=0)

	# Create a function to format the x-axis labels
	def format_year_labels(year):
	if year % 5 == 0:
	return str(year)
	else:
	return ''

	# Define color palette
	colors = ['#eaf2f0', '#fae5c6', '#f0c2a3', '#e19578', '#cb624c', '#af211f', '#8d0000']

	# Update color palette
	colors[0] = '#cbe3ce'

	# Create the normalized stacked bar chart
	fig, ax1 = plt.subplots(figsize=(14, 8))

	# Bar plot
	bar_plot = df_pivot.plot(kind='bar', stacked=True, ax=ax1, color=colors[::-1])

	# Set labels and title
	ax1.set_title('U.S. Childhood Lead Poisoning in 14-30 Age Group and Violent Crime Rates')
	ax1.set_xlabel('Year')
	ax1.set_ylabel('fraction of the 14-30 U.S. population')
	ax1.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
	ax1.get_legend().remove()

	# Create a secondary y-axis
	ax2 = ax1.twinx()

	# Get the x-axis coordinates for the bar plot
	x_coords = range(len(df_pivot.index))

	# Line plot for violent crime rates using ax.plot with x_coords
	ax2.plot(x_coords, df_crime_reindexed['Violent'], color='black', linewidth=3, label='U.S. Violent Crimes per youth')
	ax2.set_ylabel('violent crimes per annum per total population age 14-30')

	# Adjust the y-axis limit
	ax2.set_ylim(0, 0.05)

	# Show the legend
	lines, labels = ax1.get_legend_handles_labels()
	lines2, labels2 = ax2.get_legend_handles_labels()
	ax2.legend(lines[::-1] + lines2, labels[::-1] + labels2, loc='upper left', bbox_to_anchor=(1.05, 0.8))

	plt.tight_layout()
	plt.show()