-
-
Save hotelzululima/eac478cf83809f668322e166911a7e3c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# Load the data | |
df = pd.read_excel('pnas.2118631119.sd01.xlsx') | |
from bs4 import BeautifulSoup | |
# Load the HTML file | |
with open('crime.html', 'r') as f: | |
contents = f.read() | |
# Parse the HTML with BeautifulSoup | |
soup = BeautifulSoup(contents, 'html.parser') | |
# Find the table in the HTML | |
table = soup.find('table') | |
# Parse the table into a DataFrame | |
df_crime = pd.read_html(str(table), header=0)[0] | |
# Identify the rows that contain the headers | |
header_rows = df_crime[df_crime['Unnamed: 0'] == 'Year'].index | |
# Remove the header rows | |
df_crime = df_crime.drop(header_rows) | |
# Rename the columns based on the first row of headers | |
df_crime.columns = ['Year', 'Population', 'Total', 'Violent', 'Property', 'Murder', 'Rape', 'Robbery', 'Aggravated assault', 'Burglary', 'Larceny', 'Vehicle theft'] | |
# Remove rows with NaN in the 'Year' column | |
df_crime = df_crime.dropna(subset=['Year']) | |
# Convert the Year and Violent columns to the appropriate data types | |
df_crime['Year'] = df_crime['Year'].astype(int) | |
df_crime['Violent'] = df_crime['Violent'].astype(float) | |
# Extract the time series for "Violent Crimes" | |
df_crime = df_crime[['Year', 'Violent']].dropna() | |
df_crime['Year'] = df_crime['Year'].astype(int) | |
df_crime.set_index('Year', inplace=True) | |
import matplotlib.pyplot as plt | |
import matplotlib.ticker as mtick | |
# Filter the data for ages | |
df_filtered = df[(df['AGE'] >= 14) & (df['AGE'] <= 30) & (df['YEAR'] >= 1955) & (df['YEAR'] <= 2020)] | |
# Create a new column 'condition_min' to hold the minimum value of each range | |
df_filtered['condition_min'] = df_filtered['condition'].apply(lambda x: 30 if x=='30+ (ud/dL)' else float(x.split('-')[0])) | |
# Fix typo in units | |
df_filtered['condition'] = df_filtered['condition'].str.replace('ud/dL', 'ug/dL') | |
# Pivot the data to get years as index, conditions as columns and leadpop as values | |
df_pivot = df_filtered.pivot_table(index='YEAR', columns=['condition_min', 'condition'], values='leadpop', aggfunc='sum') | |
# Sort the columns by 'condition_min' | |
df_pivot = df_pivot.sort_index(axis=1, level='condition_min', ascending=False) | |
# Normalize by youth population | |
df_youth_pop = df_pivot.sum(axis=1) | |
df_pivot = df_pivot.div(df_youth_pop, axis=0) | |
# Drop the 'condition_min' level in the column index | |
df_pivot.columns = df_pivot.columns.droplevel('condition_min') | |
# Ensure the dataframes have the same index by reindexing df_crime to match df_pivot | |
df_crime_reindexed = df_crime.reindex(df_pivot.index) | |
df_crime_reindexed = df_crime_reindexed.div(df_youth_pop,axis=0) | |
# Create a function to format the x-axis labels | |
def format_year_labels(year): | |
if year % 5 == 0: | |
return str(year) | |
else: | |
return '' | |
# Define color palette | |
colors = ['#eaf2f0', '#fae5c6', '#f0c2a3', '#e19578', '#cb624c', '#af211f', '#8d0000'] | |
# Update color palette | |
colors[0] = '#cbe3ce' | |
# Create the normalized stacked bar chart | |
fig, ax1 = plt.subplots(figsize=(14, 8)) | |
# Bar plot | |
bar_plot = df_pivot.plot(kind='bar', stacked=True, ax=ax1, color=colors[::-1]) | |
# Set labels and title | |
ax1.set_title('U.S. Childhood Lead Poisoning in 14-30 Age Group and Violent Crime Rates') | |
ax1.set_xlabel('Year') | |
ax1.set_ylabel('fraction of the 14-30 U.S. population') | |
ax1.yaxis.set_major_formatter(mtick.PercentFormatter(1.0)) | |
ax1.get_legend().remove() | |
# Create a secondary y-axis | |
ax2 = ax1.twinx() | |
# Get the x-axis coordinates for the bar plot | |
x_coords = range(len(df_pivot.index)) | |
# Line plot for violent crime rates using ax.plot with x_coords | |
ax2.plot(x_coords, df_crime_reindexed['Violent'], color='black', linewidth=3, label='U.S. Violent Crimes per youth') | |
ax2.set_ylabel('violent crimes per annum per total population age 14-30') | |
# Adjust the y-axis limit | |
ax2.set_ylim(0, 0.05) | |
# Show the legend | |
lines, labels = ax1.get_legend_handles_labels() | |
lines2, labels2 = ax2.get_legend_handles_labels() | |
ax2.legend(lines[::-1] + lines2, labels[::-1] + labels2, loc='upper left', bbox_to_anchor=(1.05, 0.8)) | |
plt.tight_layout() | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment