Skip to content

Instantly share code, notes, and snippets.

@jskherman
Last active April 4, 2025 00:50
Show Gist options
  • Save jskherman/c27813f559af85a46e41b249ee15e655 to your computer and use it in GitHub Desktop.
Save jskherman/c27813f559af85a46e41b249ee15e655 to your computer and use it in GitHub Desktop.
A Python script for exporting a GitHub user's daily commit counts to a CSV file
import requests
import datetime
import re
import pandas as pd
from bs4 import BeautifulSoup
import time
from typing import List, Dict, Optional
def get_available_years(github_username: str, headers: dict) -> List[int]:
    """
    Fetch a user's GitHub profile page and extract the contribution years
    offered by the contribution-graph year selector.

    Args:
        github_username: GitHub login to look up.
        headers: HTTP headers to send with the request (User-Agent etc.).

    Returns:
        Years with potential contribution data, sorted descending.
        Always contains at least the current year as a fallback.
    """
    print("Attempting to find available contribution years...")
    # FIX: the original URL's domain was mangled by a link shortener
    # ("github.com"); the profile page lives on github.com.
    base_url = f"https://github.com/{github_username}"
    years = set()
    current_year = datetime.date.today().year
    years.add(current_year)  # Add current year by default
    try:
        _r = requests.get(base_url, headers=headers, timeout=15)
        _r.raise_for_status()
        html = BeautifulSoup(_r.text, "html.parser")
        # --- Find Year Selection Buttons/Links ---
        # Educated guess based on common page structure: year links carry
        # the 'js-year-link' class and a 'from=YYYY-...' query parameter.
        year_elements = html.find_all('a', class_='js-year-link', href=re.compile(r'from=\d{4}-'))
        # Fallback if the above class changed
        if not year_elements:
            print("Could not find years using 'a.js-year-link'. Trying broader search for year buttons...")
            # Look for buttons/links whose text is exactly a 4-digit year,
            # scoped to the contribution-graph container if we can find it.
            graph_container = html.find('div', class_='js-yearly-contributions')  # Example container class
            if graph_container:
                potential_year_elements = graph_container.find_all(['a', 'button'], string=re.compile(r'^\s*\d{4}\s*$'))
                for el in potential_year_elements:
                    year_text = el.get_text(strip=True)
                    if year_text.isdigit() and len(year_text) == 4:
                        years.add(int(year_text))
            else:
                print("Could not find graph container. Using only current year as fallback.")
        for link in year_elements:
            href = link.get('href', '')
            # Extract year from 'from=YYYY-MM-DD'
            match = re.search(r'from=(\d{4})-\d{2}-\d{2}', href)
            if match:
                try:
                    years.add(int(match.group(1)))
                except ValueError:
                    print(f"Warning: Could not parse year from href: {href}")
    except requests.exceptions.RequestException as e:
        print(f"Warning: Error fetching profile page to find years: {e}. Using only current year.")
    except Exception as e:
        print(f"Warning: Error parsing years from profile page: {e}. Using only current year.")
    if not years:
        print("Warning: No years found. Defaulting to current year.")
        years.add(current_year)
    sorted_years = sorted(years, reverse=True)
    print(f"Found years: {sorted_years}")
    return sorted_years
def get_contribution_fragment(github_username: str, year: int, headers: dict) -> Optional[str]:
    """
    Fetch the HTML fragment containing the contribution graph for one year.

    Args:
        github_username: GitHub login to look up.
        year: Calendar year to request (from Jan 1 to Dec 31).
        headers: HTTP headers to send with the request.

    Returns:
        The fragment HTML as text, or None when the fetch failed or the
        response looked like non-HTML / an empty placeholder.
    """
    # FIX: the original URL's domain was mangled by a link shortener
    # ("github.com"); the fragment is served from github.com.
    fragment_url = f"https://github.com/{github_username}?tab=contributions&from={year}-01-01&to={year}-12-31"
    # Alternative observed pattern if the above fails:
    # fragment_url = f"https://github.com/users/{github_username}/contributions?from={year}-01-01&to={year}-12-31"
    print(f"Fetching contribution fragment for {year} from {fragment_url}...")
    try:
        time.sleep(1.5)  # Rate-limit ourselves between yearly requests
        _r = requests.get(fragment_url, headers=headers, timeout=15)
        _r.raise_for_status()
        # Check if we received HTML content
        if 'html' not in _r.headers.get('Content-Type', '').lower():
            print(f"Warning: Received non-HTML content type '{_r.headers.get('Content-Type')}' for year {year}. Skipping.")
            return None
        # Basic check for empty or placeholder response
        if len(_r.text) < 500:  # Arbitrary small length check
            print(f"Warning: Received very short response for year {year}. May be empty or placeholder. Skipping.")
            return None
        return _r.text
    except requests.exceptions.RequestException as e:
        print(f"Warning: Error fetching fragment for year {year}: {e}. Skipping this year.")
        return None
    except Exception as e:
        print(f"Warning: An unexpected error occurred fetching fragment for year {year}: {e}")
        return None
def parse_contributions_from_fragment(fragment_html: str, year: int) -> List[Dict]:
    """
    Extract per-day contribution counts from a yearly contribution-graph
    HTML fragment, reading each day cell's associated <tool-tip> element.

    Args:
        fragment_html: Raw HTML of the fragment (may be empty/None).
        year: The year this fragment is expected to cover; dates outside
            it are ignored.

    Returns:
        A list of {"Date": "YYYY-MM-DD", "Contributions": int} dicts for
        every past-or-today cell belonging to `year`.
    """
    results: List[Dict] = []
    if not fragment_html:
        return results

    doc = BeautifulSoup(fragment_html, "html.parser")
    today = datetime.date.today()

    # Day cells carry both a data-date and an id; the id ties each cell
    # to its <tool-tip>. Fall back to <rect> elements if no match.
    cells = doc.find_all(attrs={"data-date": True, "id": True})
    if not cells:
        print(f"Warning: No elements with 'data-date' and 'id' found in fragment for year {year}.")
        cells = doc.find_all('rect', attrs={"data-date": True, "id": True})
        if cells:
            print(f"Found 'rect' elements instead of 'td' for year {year}.")
        else:
            print(f"Warning: Also failed to find 'rect' elements for year {year}.")
            return results

    for cell in cells:
        date_str = cell.get("data-date")
        cell_id = cell.get("id")
        if not date_str or not cell_id:
            continue

        # Validate the date; skip future days and days bleeding in from a
        # neighbouring year at the graph's boundaries.
        try:
            cell_date = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
        except ValueError:
            print(f"Warning: Could not parse date {date_str} in fragment for year {year}")
            continue
        if cell_date > today or cell_date.year != year:
            continue

        count = 0  # Default when the tooltip is absent or unparseable
        tooltip = doc.find('tool-tip', attrs={'for': cell_id})
        if tooltip is None:
            # Tooltip missing entirely: keep 0, but flag cells whose
            # data-level hints there was real activity.
            level = cell.get("data-level", "0")
            if level != '0':
                print(f"Warning: Tooltip not found for cell {cell_id} (Date: {date_str}, Level: {level}) in year {year}. Defaulting count to 0.")
        else:
            tooltip_text = tooltip.get_text(strip=True)
            # Tooltip reads "No contributions ..." or "X contribution(s) ..."
            m = re.match(r"(No|\d+)\s+contribution", tooltip_text, re.IGNORECASE)
            if m is None:
                level = cell.get("data-level", "0")
                if level != '0':
                    print(f"Warning: Tooltip text '{tooltip_text}' for {date_str} (Level: {level}) in year {year} did not match expected pattern. Defaulting count to 0.")
            elif m.group(1).lower() != 'no':
                try:
                    count = int(m.group(1))
                except ValueError:
                    print(f"Warning: Could not convert count '{m.group(1)}' to int for {date_str} in year {year}.")
                    count = 0  # Keep default if conversion fails

        results.append({"Date": date_str, "Contributions": count})
    return results
def get_gh_contribution_data(github_username: str) -> list:
    """
    Collect a user's GitHub contribution data by fetching and parsing the
    contribution-graph fragment of every available year.

    Args:
        github_username: GitHub login to collect data for.

    Returns:
        A flat list of per-day contribution dicts across all years
        (empty when no years could be determined).
    """
    # Browser-like headers; X-Requested-With mimics the AJAX requests
    # sometimes used to load graph fragments.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'X-Requested-With': 'XMLHttpRequest',
    }

    years = get_available_years(github_username, headers)
    if not years:
        print("Error: Could not determine available contribution years.")
        return []

    collected: list = []
    for year in years:
        fragment = get_contribution_fragment(github_username, year, headers)
        if not fragment:
            print(f"Skipping year {year} due to fetch error or empty fragment.")
            continue
        parsed = parse_contributions_from_fragment(fragment, year)
        print(f"Parsed {len(parsed)} contributions for {year}.")
        collected.extend(parsed)

    print(f"Finished processing all years. Found {len(collected)} contribution entries overall.")
    return collected
def main(github_username: Optional[str] = None):
    """
    Fetch GitHub contribution data for a user and save it to
    '<username>_contribution_data.csv' in the current directory.

    Args:
        github_username: GitHub login; prompted for interactively when None.
    """
    try:
        # FIX: resolve the username *before* announcing it, so interactive
        # runs never print "Starting script for GitHub user: None".
        if github_username is None:
            github_username = input("Enter GitHub username: ")
        print(f"Starting script for GitHub user: {github_username}")

        data = get_gh_contribution_data(github_username)
        if not data:
            print("No contribution data was retrieved. Exiting.")
            return

        # Create DataFrame
        df = pd.DataFrame(data)
        # --- Data Cleaning and Preparation ---
        if df.empty:
            print("DataFrame is empty after collecting data. Exiting.")
            return

        # Convert 'Date' column to datetime objects
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.dropna(subset=['Date'])  # Remove rows where date conversion failed
        # Convert 'Contributions' to numeric
        df['Contributions'] = pd.to_numeric(df['Contributions'], errors='coerce').fillna(0).astype(int)

        # Remove duplicate date entries (important!)
        # Keep the 'last' entry, which corresponds to the most recent fetch
        # for that date when year fragments overlapped.
        initial_rows = len(df)
        df = df.drop_duplicates(subset='Date', keep='last')
        if len(df) < initial_rows:
            print(f"Removed {initial_rows - len(df)} duplicate date entries.")

        # Sort by date
        df = df.sort_values(by="Date", ascending=True)

        # Define output filename
        output_filename = f"{github_username}_contribution_data.csv"
        # Save the contribution data to a CSV file
        df.to_csv(output_filename, index=False)
        print(f"Successfully saved data for {len(df)} days to {output_filename}")
    except requests.exceptions.RequestException as req_e:
        print(f"Failed to fetch data due to network issues: {req_e}")
    except Exception as e:
        print(f"An unexpected error occurred in main: {e}")
        import traceback
        traceback.print_exc()
# Entry point: run interactively (prompting for a username) only when this
# file is executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
# Do `pip install requests beautifulsoup4 pandas`
requests
beautifulsoup4
pandas
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment