Last active
September 14, 2025 12:26
-
-
Save Frank-Buss/34e62e1024265481b7089d8ba9cc4cee to your computer and use it in GitHub Desktop.
gender bias in language selection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Analyze gender bias in programming language choice using Stack Overflow Developer Survey data. | |
""" | |
import pandas as pd | |
import numpy as np | |
import matplotlib | |
matplotlib.use('Agg') # Use non-interactive backend | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from pathlib import Path | |
import requests | |
import zipfile | |
import io | |
from typing import Dict, List, Tuple | |
import warnings | |
warnings.filterwarnings('ignore') | |
class GenderLanguageBiasAnalyzer:
    """Analyze gender patterns in programming language preferences.

    Typical use: ``load_data()`` -> ``analyze_languages()`` ->
    ``calculate_bias_metrics()`` -> ``print_summary()`` /
    ``plot_gender_distribution()``.

    Sign conventions of the derived metrics:
        * ``gender_ratio``: ``100 * ln(men% / women%)``; positive means the
          language is used relatively more by men, negative by women.
        * ``simple_bias``: ``women% - men%`` in percentage points; positive
          means relatively more women.
    """

    # Download locations of the survey archives, keyed by survey year.
    SURVEY_URLS = {
        2023: "https://cdn.stackoverflow.co/files/jo7n4k8s/production/49915bfd46d0902c3564fd9a06b509d08a20488c.zip",
        2022: "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip",
        2021: "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2021.zip",
        2020: "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2020.zip",
    }

    # Seconds to wait for the server before giving up on a download.
    DOWNLOAD_TIMEOUT = 60

    # Candidate column names; they vary by survey year, first match wins.
    GENDER_COLUMNS = ('Gender', 'GenderSelect', 'GenderIdentity')
    LANGUAGE_COLUMNS = ('LanguageWorkedWith', 'LanguageHaveWorkedWith', 'ProgrammingLanguages')

    # Raw survey answer -> standard category. Anything not listed here
    # (including multi-select answers such as "Man;Woman") becomes 'Other'.
    GENDER_MAPPING = {
        'Man': 'Man',
        'Woman': 'Woman',
        'Male': 'Man',
        'Female': 'Woman',
        'Non-binary': 'Non-binary',
        # NOTE(review): believed to be the full option text used by the
        # 2020-2022 surveys -- confirm against the survey schema.
        'Non-binary, genderqueer, or gender non-conforming': 'Non-binary',
        'Or, in your own words:': 'Other',
        'Prefer not to say': 'Prefer not to say',
    }

    # Genders shown in the stacked bar chart, each with a fixed colour so a
    # missing category never shifts the colours of the remaining ones.
    GENDER_COLORS = {
        'Man': '#1f77b4',
        'Woman': '#ff7f0e',
        'Non-binary': '#2ca02c',
        'Other': '#d62728',
    }

    def __init__(self, year: int = 2022):
        """
        Initialize analyzer with survey year.

        Creates the local ``data`` directory if it does not exist yet.

        Args:
            year: Survey year. Download URLs are configured for 2020-2023;
                other years only work if ``data/survey_<year>.csv`` already
                exists locally.
        """
        self.year = year
        self.data_dir = Path('data')
        self.data_dir.mkdir(exist_ok=True)
        self.df = None
        self.languages_df = None

    def _csv_path(self) -> Path:
        """Return the local CSV path for the currently selected year."""
        return self.data_dir / f'survey_{self.year}.csv'

    def download_data(self) -> bool:
        """
        Download Stack Overflow survey data.

        If ``self.year`` has no configured URL, ``self.year`` is switched to
        2023 before downloading. The survey CSV is extracted from the zip
        archive and stored unchanged at ``data/survey_<year>.csv``.

        Returns:
            bool: True if the CSV is available locally afterwards
        """
        if self.year not in self.SURVEY_URLS:
            print(f"Year {self.year} not configured. Using 2023 data.")
            self.year = 2023
        url = self.SURVEY_URLS[self.year]
        csv_path = self._csv_path()
        if csv_path.exists():
            print(f"Data already exists at {csv_path}")
            return True

        print(f"Downloading Stack Overflow {self.year} Developer Survey...")
        try:
            # A timeout keeps a stalled connection from hanging forever.
            response = requests.get(url, timeout=self.DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
                for member in archive.namelist():
                    lowered = member.lower()
                    if 'survey_results_public' in lowered and lowered.endswith('.csv'):
                        print(f"Extracting {member}...")
                        # Copy the bytes as-is; parsing happens in load_data().
                        csv_path.write_bytes(archive.read(member))
                        print(f"Saved to {csv_path}")
                        return True
            print("Could not find survey results in zip file")
            return False
        except Exception as e:
            # Deliberate best-effort boundary: report the problem and let the
            # caller decide what to do with the False result.
            print(f"Error downloading data: {e}")
            print("You can manually download from: https://insights.stackoverflow.com/survey")
            return False

    def load_data(self) -> pd.DataFrame:
        """
        Load survey data from CSV, downloading it first if necessary.

        Returns:
            The loaded DataFrame (also stored in ``self.df``), with a
            ``gender_clean`` column added when a gender column exists.

        Raises:
            FileNotFoundError: if the CSV is missing and cannot be downloaded
        """
        if not self._csv_path().exists() and not self.download_data():
            raise FileNotFoundError(f"Could not download/find data at {self._csv_path()}")
        # Resolve the path only now: download_data() may have switched
        # self.year to the fallback year.
        csv_path = self._csv_path()
        print(f"Loading data from {csv_path}...")
        self.df = pd.read_csv(csv_path, low_memory=False)
        print(f"Loaded {len(self.df):,} responses")
        self._clean_gender_data()
        return self.df

    @classmethod
    def _standardize_gender(cls, value):
        """Map one raw gender answer to a standard category (NaN stays NaN)."""
        if pd.isna(value):
            return np.nan
        return cls.GENDER_MAPPING.get(str(value).strip(), 'Other')

    def _clean_gender_data(self) -> None:
        """
        Clean and standardize gender column.

        Adds ``self.df['gender_clean']``. When no known gender column is
        present, prints a warning and leaves ``self.df`` unchanged.
        """
        gender_col = next((col for col in self.GENDER_COLUMNS if col in self.df.columns), None)
        if gender_col is None:
            print("Warning: No gender column found")
            return
        self.df['gender_clean'] = self.df[gender_col].map(self._standardize_gender)
        print(f"Gender distribution:\n{self.df['gender_clean'].value_counts()}")

    def analyze_languages(self) -> pd.DataFrame:
        """
        Analyze programming language usage by gender.

        Only respondents with both a cleaned gender and a language answer are
        counted. Each gender column holds the percentage of that gender's
        respondents who listed the language; ``total_users`` holds the number
        of respondents (all genders) who listed it.

        Returns:
            DataFrame indexed by language, sorted by ``total_users``
            descending (also stored in ``self.languages_df``). Empty when the
            required columns or any usable rows are missing.
        """
        # Store an empty frame up front so every early return leaves
        # self.languages_df in a usable (non-None) state.
        self.languages_df = pd.DataFrame()

        lang_col = next((col for col in self.LANGUAGE_COLUMNS if col in self.df.columns), None)
        if lang_col is None:
            print("Warning: No language column found")
            return self.languages_df
        if 'gender_clean' not in self.df.columns:
            print("Warning: No gender column found")
            return self.languages_df

        answered = (
            self.df[['gender_clean', lang_col]]
            .dropna()
            .reset_index(drop=True)
            .rename(columns={'gender_clean': 'gender', lang_col: 'language'})
        )
        if answered.empty:
            return self.languages_df

        # One row per (respondent, language); the fresh index serves as the
        # respondent id after exploding the ';'-separated answers.
        answered['language'] = answered['language'].astype(str).str.split(';')
        mentions = answered.explode('language').rename_axis('respondent').reset_index()
        mentions['language'] = mentions['language'].str.strip()
        mentions = mentions[~mentions['language'].isin(['', 'nan'])]
        mentions = mentions.drop_duplicates(['respondent', 'language'])
        if mentions.empty:
            return self.languages_df

        # Denominator is respondents per gender (not language mentions), so
        # each value really is "% of this gender group using the language".
        respondents_by_gender = mentions.groupby('gender')['respondent'].nunique()
        users = mentions.groupby(['language', 'gender']).size().unstack(fill_value=0)

        lang_pct = users.div(respondents_by_gender, axis=1) * 100
        lang_pct['total_users'] = users.sum(axis=1)
        lang_pct = lang_pct.sort_values('total_users', ascending=False)

        self.languages_df = lang_pct
        return lang_pct

    def calculate_bias_metrics(self, min_users: int = 100) -> pd.DataFrame:
        """
        Calculate gender bias metrics for each language.

        Adds two columns when both 'Man' and 'Woman' columns exist:
            * ``gender_ratio``: ``100 * ln(men% / women%)``. Positive = used
              relatively more by men. It is ``+inf`` when no woman uses the
              language, ``-inf`` when no man does, and NaN when neither does.
            * ``simple_bias``: ``women% - men%`` (percentage points).

        Args:
            min_users: Minimum number of users to include language

        Returns:
            DataFrame with bias metrics, sorted by ``gender_ratio``
            descending; empty when no language statistics are available.
        """
        if self.languages_df is None:
            self.analyze_languages()
        stats = self.languages_df
        if stats is None or 'total_users' not in stats.columns:
            return pd.DataFrame()

        popular_langs = stats[stats['total_users'] >= min_users].copy()
        if 'Man' in popular_langs.columns and 'Woman' in popular_langs.columns:
            men_pct = popular_langs['Man'].astype(float)
            women_pct = popular_langs['Woman'].astype(float)
            # Log scale makes the metric symmetric: "2x men" and "2x women"
            # have the same magnitude with opposite signs. Zero shares are
            # expected here, so silence the resulting numpy warnings.
            with np.errstate(divide='ignore', invalid='ignore'):
                popular_langs['gender_ratio'] = np.log(men_pct / women_pct) * 100
            popular_langs['simple_bias'] = women_pct - men_pct
            popular_langs = popular_langs.sort_values('gender_ratio', ascending=False)
        return popular_langs

    def _plot_bias_panel(self, ax, top_n: int) -> None:
        """Draw the horizontal women-vs-men usage chart on ``ax``."""
        ax.set_xlabel('Women vs Men Usage (0% = equal likelihood, +100% = 2x more women)')
        ax.set_title('Gender Bias by Language')

        bias_df = self.calculate_bias_metrics()
        if 'Man' not in bias_df.columns or 'Woman' not in bias_df.columns:
            print("Warning: No Man/Woman data available for bias plot")
            return

        men_pct = bias_df['Man']
        women_pct = bias_df['Woman']
        # 0% = equal, +100% = women 2x as likely, -50% = half as likely.
        # Languages without male users have no finite ratio and are drawn as 0.
        relative = (women_pct / men_pct - 1) * 100
        bias_df['gender_bias'] = relative.where(men_pct > 0, 0)
        bias_df['abs_bias'] = bias_df['gender_bias'].abs()

        # barh draws the first row at the bottom, so ascending order puts the
        # most female-biased language at the top.
        top_bias = bias_df.nlargest(top_n, 'abs_bias').sort_values('gender_bias', ascending=True)
        values = top_bias['gender_bias'].values
        colors = ['#ff7f0e' if value > 0 else '#1f77b4' for value in values]
        positions = range(len(top_bias))
        bars = ax.barh(positions, values, color=colors)
        ax.set_yticks(positions)
        ax.set_yticklabels(top_bias.index)
        ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
        ax.grid(axis='x', alpha=0.3)

        for bar in bars:
            width = bar.get_width()
            if abs(width) <= 5:  # Only label if bar is big enough
                continue
            label_x_pos = width + (5 if width > 0 else -5)
            # Show as multiplier for clarity
            multiplier = 1 + width / 100
            label = f'{multiplier:.1f}x' if multiplier >= 1 else f'{multiplier:.2f}x'
            ax.text(label_x_pos, bar.get_y() + bar.get_height() / 2,
                    label, ha='left' if width > 0 else 'right',
                    va='center', fontsize=8)

    def plot_gender_distribution(self, top_n: int = 20):
        """
        Create visualization of gender distribution across languages.

        Left panel: stacked bars of per-gender usage percentages for the
        ``top_n`` most used languages. Right panel: relative women-vs-men
        usage for the ``top_n`` most skewed languages. The figure is saved to
        ``gender_bias_languages_<year>.png``.

        Args:
            top_n: Number of top languages to show

        Returns:
            The matplotlib figure, or None when there is nothing to plot.
        """
        if self.languages_df is None:
            self.analyze_languages()
        stats = self.languages_df
        if stats is None or 'total_users' not in stats.columns:
            print("Warning: No language data available to plot")
            return None

        top_langs = stats.nlargest(top_n, 'total_users')
        # Only use gender columns that exist in the data.
        shown = [gender for gender in self.GENDER_COLORS if gender in top_langs.columns]
        plot_data = top_langs[shown].fillna(0)

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))

        if not plot_data.empty:
            plot_data.plot(kind='bar', stacked=True, ax=ax1,
                           color=[self.GENDER_COLORS[gender] for gender in shown])
        ax1.set_title(f'Gender Distribution by Programming Language ({self.year})')
        ax1.set_xlabel('Programming Language')
        ax1.set_ylabel('Percentage of Gender Group Using Language')
        ax1.legend(title='Gender')
        ax1.grid(axis='y', alpha=0.3)
        plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45, ha='right')

        self._plot_bias_panel(ax2, top_n)

        plt.tight_layout()
        plt.savefig(f'gender_bias_languages_{self.year}.png', dpi=300, bbox_inches='tight')
        print(f"Visualization saved to gender_bias_languages_{self.year}.png")
        return fig

    def print_summary(self):
        """Print summary statistics.

        Prints the overall gender distribution followed by the five most
        male-dominated, most female-dominated and most balanced languages
        according to ``gender_ratio`` (most extreme first).
        """
        print("\n" + "="*60)
        print(f"Stack Overflow {self.year} Developer Survey Analysis")
        print("="*60)

        print("\nOverall Gender Distribution:")
        if 'gender_clean' in self.df.columns:
            for gender, count in self.df['gender_clean'].value_counts().items():
                pct = count / len(self.df) * 100
                print(f"  {gender}: {count:,} ({pct:.1f}%)")

        bias_df = self.calculate_bias_metrics()
        if 'gender_ratio' not in bias_df.columns:
            print("\nNo gender bias metrics available.")
            return
        # NaN ratios (no male and no female users) cannot be ranked.
        ratios = bias_df['gender_ratio'].dropna()

        print("\nMost Male-Dominated Languages:")
        for lang, ratio in ratios.head(5).items():
            print(f"  {lang}: {ratio:+.1f}% bias")

        print("\nMost Female-Dominated Languages:")
        # ratios is sorted descending, so reverse the tail to list the most
        # female-dominated language first.
        for lang, ratio in ratios.tail(5).iloc[::-1].items():
            print(f"  {lang}: {ratio:+.1f}% bias")

        print("\nMost Gender-Balanced Languages:")
        for lang in ratios.abs().nsmallest(5).index:
            print(f"  {lang}: {ratios.loc[lang]:+.1f}% bias")
def main():
    """Run the full analysis for the 2022 survey.

    Loads the data, computes per-language gender statistics, writes the bias
    metrics to ``language_gender_bias_<year>.csv``, prints a summary and saves
    the visualization.

    Returns:
        The analyzer instance, so interactive callers can keep exploring.
    """
    analyzer = GenderLanguageBiasAnalyzer(year=2022)

    print("Loading Stack Overflow survey data...")
    analyzer.load_data()

    print("\nAnalyzing language preferences by gender...")
    # The result is cached on the analyzer; no local copy is needed here.
    analyzer.analyze_languages()

    print("\nCalculating bias metrics...")
    bias_metrics = analyzer.calculate_bias_metrics()

    # The file name uses analyzer.year because load_data() may have fallen
    # back to a different survey year.
    output_file = f'language_gender_bias_{analyzer.year}.csv'
    bias_metrics.to_csv(output_file)
    print(f"\nResults saved to {output_file}")

    analyzer.print_summary()

    print("\nGenerating visualizations...")
    analyzer.plot_gender_distribution(top_n=20)
    return analyzer
# Run the analysis only when executed as a script; the analyzer is kept in a
# module-level name so it stays available in an interactive session.
if __name__ == "__main__":
    analyzer = main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment