Skip to content

Instantly share code, notes, and snippets.

@Frank-Buss
Last active September 14, 2025 12:26
Show Gist options
  • Save Frank-Buss/34e62e1024265481b7089d8ba9cc4cee to your computer and use it in GitHub Desktop.
Save Frank-Buss/34e62e1024265481b7089d8ba9cc4cee to your computer and use it in GitHub Desktop.
gender bias in language selection
#!/usr/bin/env python3
"""
Analyze gender bias in programming language choice using Stack Overflow Developer Survey data.
"""
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg') # Use non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import requests
import zipfile
import io
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')
class GenderLanguageBiasAnalyzer:
    """Analyze gender patterns in programming language preferences.

    Typical use: ``load_data()`` -> ``analyze_languages()`` ->
    ``calculate_bias_metrics()`` -> ``print_summary()`` /
    ``plot_gender_distribution()``.

    Attributes:
        year: Survey year being analyzed (may be changed by ``download_data``
            when the requested year has no configured download URL).
        data_dir: Directory where the survey CSV is cached.
        df: Raw survey responses plus a ``gender_clean`` column, or ``None``
            until ``load_data`` has run.
        languages_df: Per-language statistics from ``analyze_languages``, or
            ``None`` until it has run.
    """

    # Download locations of the survey archives, keyed by survey year.
    _SURVEY_URLS = {
        2023: "https://cdn.stackoverflow.co/files/jo7n4k8s/production/49915bfd46d0902c3564fd9a06b509d08a20488c.zip",
        2022: "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2022.zip",
        2021: "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2021.zip",
        2020: "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2020.zip",
    }
    # Seconds to wait for the server before giving up on the download.
    _DOWNLOAD_TIMEOUT = 60
    # Candidate column names (they vary by survey year); the first match wins.
    _GENDER_COLUMNS = ('Gender', 'GenderSelect', 'GenderIdentity')
    _LANGUAGE_COLUMNS = (
        'LanguageWorkedWith',
        'LanguageHaveWorkedWith',
        'ProgrammingLanguages',
    )
    # Raw answer -> standard category. Any other non-missing answer
    # (including multi-select answers such as "Man;Woman") becomes 'Other'.
    _GENDER_MAPPING = {
        'Man': 'Man',
        'Woman': 'Woman',
        'Male': 'Man',
        'Female': 'Woman',
        'Non-binary': 'Non-binary',
        # NOTE(review): believed to be the label the 2019-2022 surveys use for
        # this answer -- confirm against the survey schema file.
        'Non-binary, genderqueer, or gender non-conforming': 'Non-binary',
        'Or, in your own words:': 'Other',
        'Prefer not to say': 'Prefer not to say',
    }
    # Plot colour per gender category; the key order is the stacking order.
    _GENDER_COLORS = {
        'Man': '#1f77b4',
        'Woman': '#ff7f0e',
        'Non-binary': '#2ca02c',
        'Other': '#d62728',
    }

    def __init__(self, year: int = 2022):
        """
        Initialize analyzer with survey year.

        Creates the ``data`` cache directory if it does not exist yet.

        Args:
            year: Survey year. Download URLs are configured for 2020-2023;
                any other year falls back to 2023 when downloading.
        """
        self.year = year
        self.data_dir = Path('data')
        self.data_dir.mkdir(exist_ok=True)
        self.df = None
        self.languages_df = None

    def _first_existing_column(self, candidates):
        """Return the first name in *candidates* that is a column of ``self.df``, else ``None``."""
        return next((col for col in candidates if col in self.df.columns), None)

    @staticmethod
    def _empty_language_stats() -> pd.DataFrame:
        """Return an empty statistics frame that still has a ``total_users`` column."""
        return pd.DataFrame({'total_users': pd.Series(dtype='int64')})

    def download_data(self) -> bool:
        """
        Download Stack Overflow survey data into ``data_dir``.

        If ``self.year`` has no configured URL, ``self.year`` is switched to
        2023. An already cached CSV is reused without any network access.

        Returns:
            bool: True if the CSV is available afterwards, False if the
            download failed or the archive held no survey results file.
        """
        if self.year not in self._SURVEY_URLS:
            print(f"Year {self.year} not configured. Using 2023 data.")
            self.year = 2023
        url = self._SURVEY_URLS[self.year]
        csv_path = self.data_dir / f'survey_{self.year}.csv'
        if csv_path.exists():
            print(f"Data already exists at {csv_path}")
            return True

        print(f"Downloading Stack Overflow {self.year} Developer Survey...")
        try:
            # A timeout keeps a stalled connection from hanging the script.
            response = requests.get(url, timeout=self._DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
                for member in archive.namelist():
                    name = member.lower()
                    if 'survey_results_public' in name and name.endswith('.csv'):
                        print(f"Extracting {member}...")
                        # Write to a temporary name first so an interrupted
                        # write never leaves a truncated file that later runs
                        # would mistake for a complete cache.
                        partial_path = csv_path.with_name(csv_path.name + '.part')
                        partial_path.write_bytes(archive.read(member))
                        partial_path.replace(csv_path)
                        print(f"Saved to {csv_path}")
                        return True
            print("Could not find survey results in zip file")
            return False
        except (requests.RequestException, zipfile.BadZipFile, OSError) as e:
            print(f"Error downloading data: {e}")
            print("You can manually download from: https://insights.stackoverflow.com/survey")
            return False

    def load_data(self) -> pd.DataFrame:
        """
        Load survey data from the cached CSV, downloading it first if needed.

        Returns:
            The loaded responses, with a standardized ``gender_clean`` column.

        Raises:
            FileNotFoundError: If the CSV is missing and cannot be downloaded.
        """
        csv_path = self.data_dir / f'survey_{self.year}.csv'
        if not csv_path.exists():
            if not self.download_data():
                raise FileNotFoundError(f"Could not download/find data at {csv_path}")
            # download_data may have switched self.year, so rebuild the path.
            csv_path = self.data_dir / f'survey_{self.year}.csv'
        print(f"Loading data from {csv_path}...")
        self.df = pd.read_csv(csv_path, low_memory=False)
        print(f"Loaded {len(self.df):,} responses")
        self._clean_gender_data()
        return self.df

    def _clean_gender_data(self):
        """
        Add a standardized ``gender_clean`` column to ``self.df``.

        Missing answers stay missing; answers not listed in the mapping become
        'Other'. When the survey has no gender column at all, ``gender_clean``
        is created entirely missing so that later steps find the column and
        simply see no usable responses.
        """
        gender_col = self._first_existing_column(self._GENDER_COLUMNS)
        if gender_col is None:
            print("Warning: No gender column found")
            self.df['gender_clean'] = np.nan
            return

        mapping = self._GENDER_MAPPING

        def standardize(value):
            if pd.isna(value):
                return np.nan
            # str() keeps this working even if the column was not parsed as text.
            return mapping.get(str(value).strip(), 'Other')

        self.df['gender_clean'] = self.df[gender_col].map(standardize)
        print(f"Gender distribution:\n{self.df['gender_clean'].value_counts()}")

    def analyze_languages(self) -> pd.DataFrame:
        """
        Analyze programming language usage by gender.

        Only responses with both a gender and a language answer are used.
        The language answer is split on ';'.

        Returns:
            DataFrame indexed by language, sorted by ``total_users``
            descending. Each gender column holds the percentage of that
            gender's respondents (those who listed at least one language)
            who use the language; ``total_users`` is the number of respondents
            of any gender using it. The frame is empty (with only a
            ``total_users`` column) when no usable data exists.
        """
        self.languages_df = self._empty_language_stats()

        lang_col = self._first_existing_column(self._LANGUAGE_COLUMNS)
        if lang_col is None:
            print("Warning: No language column found")
            return self.languages_df
        if 'gender_clean' not in self.df.columns:
            print("Warning: No gender column found")
            return self.languages_df

        has_both = self.df['gender_clean'].notna() & self.df[lang_col].notna()
        answered = self.df.loc[has_both, ['gender_clean', lang_col]].reset_index(drop=True)
        if answered.empty:
            print("Warning: No responses with both gender and language data")
            return self.languages_df

        # One row per (respondent, language) pair.
        mentions = pd.DataFrame({
            'respondent': answered.index,
            'gender': answered['gender_clean'],
            'language': answered[lang_col].astype(str).str.split(';'),
        }).explode('language')
        mentions['language'] = mentions['language'].str.strip()
        mentions = mentions[(mentions['language'] != '') & (mentions['language'] != 'nan')]
        # A respondent counts at most once per language.
        mentions = mentions.drop_duplicates()
        if mentions.empty:
            print("Warning: No responses with both gender and language data")
            return self.languages_df

        # The denominator is respondents, not language mentions: otherwise a
        # group that lists more languages per person gets deflated shares and
        # the result is not "percentage of the gender group using it".
        respondents_by_gender = mentions.groupby('gender')['respondent'].nunique()
        users_by_gender = (
            mentions.groupby(['language', 'gender']).size().unstack(fill_value=0)
        )
        lang_pct = users_by_gender.div(
            respondents_by_gender.reindex(users_by_gender.columns), axis=1
        ) * 100
        lang_pct['total_users'] = users_by_gender.sum(axis=1)
        lang_pct = lang_pct.sort_values('total_users', ascending=False)

        self.languages_df = lang_pct
        return lang_pct

    def calculate_bias_metrics(self, min_users: int = 100) -> pd.DataFrame:
        """
        Calculate gender bias metrics for each language.

        Adds two columns to the language statistics:

        * ``gender_ratio``: ``100 * ln(Man% / Woman%)``. Dividing the usage
          percentages already normalizes by the overall men:women ratio, and
          the log makes the scale symmetric. Positive means more
          male-dominated than the baseline, negative more female-dominated.
          It is ``+inf`` when no women use the language, ``-inf`` when no men
          do, and NaN when neither does.
        * ``simple_bias``: ``Woman% - Man%`` in percentage points.

        Both columns are NaN when the data has no 'Man' or no 'Woman' column.

        Args:
            min_users: Minimum number of users to include language

        Returns:
            DataFrame with bias metrics, sorted by ``gender_ratio`` descending
            (most male-dominated first) when that metric could be computed.
        """
        if self.languages_df is None:
            self.analyze_languages()
        stats = self.languages_df
        if stats is None or 'total_users' not in stats.columns:
            stats = self._empty_language_stats()

        popular_langs = stats[stats['total_users'] >= min_users].copy()

        if 'Man' in popular_langs.columns and 'Woman' in popular_langs.columns:
            usage_ratio = popular_langs['Man'] / popular_langs['Woman']
            # log(0) -> -inf and log(inf) -> +inf are the intended results
            # for one-sided languages, so silence numpy's warnings for them.
            with np.errstate(divide='ignore', invalid='ignore'):
                popular_langs['gender_ratio'] = np.log(usage_ratio) * 100
            popular_langs['simple_bias'] = popular_langs['Woman'] - popular_langs['Man']
            popular_langs = popular_langs.sort_values('gender_ratio', ascending=False)
        else:
            # Keep the output schema stable so callers can rely on the columns.
            missing = pd.Series(np.nan, index=popular_langs.index, dtype='float64')
            popular_langs['gender_ratio'] = missing
            popular_langs['simple_bias'] = missing
        return popular_langs

    def plot_gender_distribution(self, top_n: int = 20):
        """
        Create visualization of gender distribution across languages.

        The left panel is a stacked bar chart of usage percentages for the
        most used languages; the right panel shows, for the languages with
        the largest gap, how much more or less likely women are than men to
        use them. The figure is saved as
        ``gender_bias_languages_<year>.png``.

        Args:
            top_n: Number of top languages to show

        Returns:
            The matplotlib figure, or ``None`` when there is nothing to plot.
        """
        if self.languages_df is None:
            self.analyze_languages()
        if self.languages_df is None or self.languages_df.empty:
            print("Warning: No language data to plot")
            return None

        top_langs = self.languages_df.nlargest(top_n, 'total_users')
        # Only plot the gender columns that actually exist in the data.
        available_genders = [g for g in self._GENDER_COLORS if g in top_langs.columns]
        if not available_genders:
            print("Warning: No language data to plot")
            return None
        plot_data = top_langs[available_genders].fillna(0)

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))

        # Stacked bar chart, one fixed colour per gender category.
        plot_data.plot(kind='bar', stacked=True, ax=ax1,
                       color=[self._GENDER_COLORS[g] for g in available_genders])
        ax1.set_title(f'Gender Distribution by Programming Language ({self.year})')
        ax1.set_xlabel('Programming Language')
        ax1.set_ylabel('Percentage of Gender Group Using Language')
        ax1.legend(title='Gender')
        ax1.grid(axis='y', alpha=0.3)
        plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45, ha='right')

        ax2.set_title('Gender Bias by Language')
        bias_df = self.calculate_bias_metrics()
        if {'Man', 'Woman'}.issubset(bias_df.columns):
            # The ratio is undefined for languages no man uses; leave those
            # out rather than drawing them as perfectly balanced.
            comparable = bias_df[bias_df['Man'] > 0].copy()
            # 0% = equal, +100% = women 2x as likely, -50% = half as likely.
            comparable['gender_bias'] = (comparable['Woman'] / comparable['Man'] - 1) * 100
            comparable['abs_bias'] = comparable['gender_bias'].abs()
            # barh draws the first row at the bottom, so ascending order puts
            # the most female-biased language at the top.
            top_bias = comparable.nlargest(top_n, 'abs_bias').sort_values('gender_bias')

            colors = ['#ff7f0e' if x > 0 else '#1f77b4' for x in top_bias['gender_bias'].values]
            bars = ax2.barh(range(len(top_bias)), top_bias['gender_bias'].values, color=colors)
            ax2.set_yticks(range(len(top_bias)))
            ax2.set_yticklabels(top_bias.index)
            ax2.set_xlabel('Women vs Men Usage (0% = equal likelihood, +100% = 2x more women)')
            ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
            ax2.grid(axis='x', alpha=0.3)

            for bar in bars:
                width = bar.get_width()
                if abs(width) > 5:  # Only label if bar is big enough
                    label_x_pos = width + (5 if width > 0 else -5)
                    # Show as multiplier for clarity
                    multiplier = 1 + width / 100
                    label = f'{multiplier:.1f}x' if multiplier >= 1 else f'{multiplier:.2f}x'
                    ax2.text(label_x_pos, bar.get_y() + bar.get_height() / 2,
                             label, ha='left' if width > 0 else 'right',
                             va='center', fontsize=8)
        else:
            ax2.text(0.5, 0.5, 'No Man/Woman data available',
                     ha='center', va='center', transform=ax2.transAxes)

        plt.tight_layout()
        plt.savefig(f'gender_bias_languages_{self.year}.png', dpi=300, bbox_inches='tight')
        print(f"Visualization saved to gender_bias_languages_{self.year}.png")
        return fig

    def print_summary(self):
        """
        Print summary statistics.

        Shows the overall gender distribution and the five most
        male-dominated, most female-dominated and most balanced languages.
        The printed "bias" figure is the ``gender_ratio`` metric from
        ``calculate_bias_metrics`` (100 x log ratio), not a plain percentage.
        """
        print("\n" + "="*60)
        print(f"Stack Overflow {self.year} Developer Survey Analysis")
        print("="*60)

        print("\nOverall Gender Distribution:")
        gender_counts = self.df['gender_clean'].value_counts()
        for gender, count in gender_counts.items():
            pct = count / len(self.df) * 100
            print(f" {gender}: {count:,} ({pct:.1f}%)")

        # Languages whose ratio is undefined cannot be ranked.
        ranked = self.calculate_bias_metrics().dropna(subset=['gender_ratio'])

        print("\nMost Male-Dominated Languages:")
        for lang, ratio in ranked['gender_ratio'].head(5).items():
            print(f" {lang}: {ratio:+.1f}% bias")

        print("\nMost Female-Dominated Languages:")
        for lang, ratio in ranked['gender_ratio'].tail(5).items():
            print(f" {lang}: {ratio:+.1f}% bias")

        print("\nMost Gender-Balanced Languages:")
        balanced = ranked.assign(abs_bias=ranked['gender_ratio'].abs()).nsmallest(5, 'abs_bias')
        for lang, ratio in balanced['gender_ratio'].items():
            print(f" {lang}: {ratio:+.1f}% bias")
def main():
    """Run the whole analysis for the 2022 survey and return the analyzer.

    Steps, in order: load the survey data, compute per-language usage by
    gender, write the bias metrics to ``language_gender_bias_<year>.csv``,
    print the summary and generate the visualization.
    """
    analyzer = GenderLanguageBiasAnalyzer(year=2022)

    print("Loading Stack Overflow survey data...")
    analyzer.load_data()

    print("\nAnalyzing language preferences by gender...")
    analyzer.analyze_languages()

    print("\nCalculating bias metrics...")
    metrics = analyzer.calculate_bias_metrics()

    # Persist the metrics next to the script, named after the survey year.
    output_file = f'language_gender_bias_{analyzer.year}.csv'
    metrics.to_csv(output_file)
    print(f"\nResults saved to {output_file}")

    analyzer.print_summary()

    print("\nGenerating visualizations...")
    analyzer.plot_gender_distribution(top_n=20)

    return analyzer


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    analyzer = main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment