Last active
April 4, 2025 00:50
-
-
Save jskherman/c27813f559af85a46e41b249ee15e655 to your computer and use it in GitHub Desktop.
A Python script for exporting a GitHub user's daily commit counts to a CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import datetime | |
import re | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
import time | |
from typing import List, Dict, Optional | |
def get_available_years(github_username: str, headers: dict) -> List[int]:
    """
    Fetches the main profile page and extracts the available contribution years.

    Args:
        github_username: GitHub login whose profile page is scraped.
        headers: HTTP headers (User-Agent etc.) forwarded to requests.

    Returns:
        Contribution years sorted newest-first. Always contains at least
        the current year, even when fetching or parsing fails.
    """
    print("Attempting to find available contribution years...")
    # Fix: the profile lives on github.com — the previous host was a
    # link-redirector artifact and never serves the profile page.
    base_url = f"https://github.com/{github_username}"
    years = set()
    current_year = datetime.date.today().year
    years.add(current_year)  # Always have a fallback year available.
    try:
        _r = requests.get(base_url, headers=headers, timeout=15)
        _r.raise_for_status()
        html = BeautifulSoup(_r.text, "html.parser")
        # --- Find Year Selection Buttons/Links ---
        # Educated guess based on common GitHub markup; inspect the live
        # page if this fails. Year links carry href="...from=YYYY-...".
        year_elements = html.find_all('a', class_='js-year-link', href=re.compile(r'from=\d{4}-'))
        # Fallback if the above class changed.
        if not year_elements:
            print("Could not find years using 'a.js-year-link'. Trying broader search for year buttons...")
            # Look for buttons/links whose visible text is exactly a 4-digit
            # year, scoped to the contribution-graph container.
            graph_container = html.find('div', class_='js-yearly-contributions')  # Example container class
            if graph_container:
                potential_year_elements = graph_container.find_all(['a', 'button'], string=re.compile(r'^\s*\d{4}\s*$'))
                for el in potential_year_elements:
                    year_text = el.get_text(strip=True)
                    if year_text.isdigit() and len(year_text) == 4:
                        years.add(int(year_text))
            else:
                print("Could not find graph container. Using only current year as fallback.")
        for link in year_elements:
            href = link.get('href', '')
            # Extract year from 'from=YYYY-MM-DD'. The capture group is
            # exactly four digits, so int() cannot raise here.
            match = re.search(r'from=(\d{4})-\d{2}-\d{2}', href)
            if match:
                years.add(int(match.group(1)))
    except requests.exceptions.RequestException as e:
        print(f"Warning: Error fetching profile page to find years: {e}. Using only current year.")
    except Exception as e:
        print(f"Warning: Error parsing years from profile page: {e}. Using only current year.")
    if not years:
        print("Warning: No years found. Defaulting to current year.")
        years.add(current_year)
    sorted_years = sorted(years, reverse=True)
    print(f"Found years: {sorted_years}")
    return sorted_years
def get_contribution_fragment(github_username: str, year: int, headers: dict) -> Optional[str]:
    """
    Fetches the HTML fragment containing the contribution graph for a specific year.

    Args:
        github_username: GitHub login to query.
        year: Calendar year whose contribution graph is requested.
        headers: HTTP headers forwarded to requests.

    Returns:
        The fragment HTML as a string, or None when the request fails,
        the response is not HTML, or the body is suspiciously short.
    """
    # Fix: the fragment is served from github.com — the previous host was
    # a link-redirector artifact left over from page scraping.
    # Note: GitHub might simplify this, but this matches the observed pattern.
    fragment_url = f"https://github.com/{github_username}?tab=contributions&from={year}-01-01&to={year}-12-31"
    # Alternative observed pattern if the above fails:
    # fragment_url = f"https://github.com/users/{github_username}/contributions?from={year}-01-01&to={year}-12-31"
    print(f"Fetching contribution fragment for {year} from {fragment_url}...")
    try:
        time.sleep(1.5)  # Throttle successive per-year requests to be polite.
        _r = requests.get(fragment_url, headers=headers, timeout=15)
        _r.raise_for_status()
        # Check if we received HTML content.
        if 'html' not in _r.headers.get('Content-Type', '').lower():
            print(f"Warning: Received non-HTML content type '{_r.headers.get('Content-Type')}' for year {year}. Skipping.")
            return None
        # Basic check for an empty or placeholder response.
        if len(_r.text) < 500:  # Arbitrary small length check
            print(f"Warning: Received very short response for year {year}. May be empty or placeholder. Skipping.")
            return None
        return _r.text
    except requests.exceptions.RequestException as e:
        print(f"Warning: Error fetching fragment for year {year}: {e}. Skipping this year.")
        return None
    except Exception as e:
        print(f"Warning: An unexpected error occurred fetching fragment for year {year}: {e}")
        return None
def parse_contributions_from_fragment(fragment_html: str, year: int) -> List[Dict]:
    """
    Parses the contribution data from the fetched HTML fragment using tooltips.

    Each day cell carries a data-date and an id; the per-day count is read
    from the matching <tool-tip for=...> element. Days with unparseable
    tooltips default to a count of 0.
    """
    results: List[Dict] = []
    if not fragment_html:
        return results
    doc = BeautifulSoup(fragment_html, "html.parser")
    today = datetime.date.today()
    # --- Find Contribution Cells (likely <td> in loaded fragment) ---
    # data-date + id together identify a day cell within the fragment.
    cells = doc.find_all(attrs={"data-date": True, "id": True})
    if not cells:
        print(f"Warning: No elements with 'data-date' and 'id' found in fragment for year {year}.")
        # Fallback: some fragments render <rect> elements instead of <td>.
        cells = doc.find_all('rect', attrs={"data-date": True, "id": True})
        if not cells:
            print(f"Warning: Also failed to find 'rect' elements for year {year}.")
            return results
        print(f"Found 'rect' elements instead of 'td' for year {year}.")
    for day_cell in cells:
        date_str = day_cell.get("data-date")
        element_id = day_cell.get("id")
        if not (date_str and element_id):
            continue
        # Validate the date string before using it.
        try:
            parsed_date = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
        except ValueError:
            print(f"Warning: Could not parse date {date_str} in fragment for year {year}")
            continue
        # Skip future dates, and dates that spill over from adjacent years
        # (year-boundary weeks appear in two fragments).
        if parsed_date > today or parsed_date.year != year:
            continue
        count = 0  # Default when no tooltip / unrecognized tooltip text.
        # --- Find the corresponding tooltip using the cell's ID ---
        tooltip = doc.find('tool-tip', attrs={'for': element_id})
        if tooltip is None:
            # Tooltip missing entirely; only worth a warning if the cell's
            # activity level suggests there were contributions that day.
            level = day_cell.get("data-level", "0")
            if level != '0':
                print(f"Warning: Tooltip not found for cell {element_id} (Date: {date_str}, Level: {level}) in year {year}. Defaulting count to 0.")
        else:
            tooltip_text = tooltip.get_text(strip=True)
            # Matches "No contributions ..." or "<N> contribution(s) ...".
            matched = re.match(r"(No|\d+)\s+contribution", tooltip_text, re.IGNORECASE)
            if matched is None:
                level = day_cell.get("data-level", "0")
                if level != '0':
                    print(f"Warning: Tooltip text '{tooltip_text}' for {date_str} (Level: {level}) in year {year} did not match expected pattern. Defaulting count to 0.")
            elif matched.group(1).lower() != 'no':
                try:
                    count = int(matched.group(1))
                except ValueError:
                    print(f"Warning: Could not convert count '{matched.group(1)}' to int for {date_str} in year {year}.")
                    count = 0
        results.append({"Date": date_str, "Contributions": count})
    return results
def get_gh_contribution_data(github_username: str) -> list:
    """
    Get GitHub contribution data for a given username by fetching and parsing
    contribution graph fragments for each available year.
    """
    collected = []
    # Browser-like headers; X-Requested-With can mimic the AJAX requests
    # sometimes used to load contribution fragments.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'X-Requested-With': 'XMLHttpRequest'
    }
    years = get_available_years(github_username, request_headers)
    if not years:
        print("Error: Could not determine available contribution years.")
        return []
    for yr in years:
        fragment = get_contribution_fragment(github_username, yr, request_headers)
        if not fragment:
            print(f"Skipping year {yr} due to fetch error or empty fragment.")
            continue
        parsed = parse_contributions_from_fragment(fragment, yr)
        print(f"Parsed {len(parsed)} contributions for {yr}.")
        collected.extend(parsed)
    print(f"Finished processing all years. Found {len(collected)} contribution entries overall.")
    return collected
def main(github_username: Optional[str] = None):
    """
    Main function to get GitHub contribution data and save it to a CSV file.

    Args:
        github_username: GitHub login to export. When None, the user is
            prompted for it interactively on stdin.
    """
    try:
        # Fix: resolve the username *before* announcing it, so the banner
        # never prints "GitHub user: None". Annotation fixed to Optional[str]
        # to match the None default.
        if github_username is None:
            github_username = input("Enter GitHub username: ")
        print(f"Starting script for GitHub user: {github_username}")
        data = get_gh_contribution_data(github_username)
        if not data:
            print("No contribution data was retrieved. Exiting.")
            return
        # Create DataFrame
        df = pd.DataFrame(data)
        # --- Data Cleaning and Preparation ---
        if df.empty:
            print("DataFrame is empty after collecting data. Exiting.")
            return
        # Convert 'Date' column to datetime objects; drop rows that fail.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.dropna(subset=['Date'])
        # Convert 'Contributions' to numeric, coercing bad values to 0.
        df['Contributions'] = pd.to_numeric(df['Contributions'], errors='coerce').fillna(0).astype(int)
        # Remove duplicate date entries (important!). Keep the 'last' entry,
        # which corresponds to the most recent fetch when fragments overlap.
        initial_rows = len(df)
        df = df.drop_duplicates(subset='Date', keep='last')
        if len(df) < initial_rows:
            print(f"Removed {initial_rows - len(df)} duplicate date entries.")
        # Sort chronologically before writing out.
        df = df.sort_values(by="Date", ascending=True)
        output_filename = f"{github_username}_contribution_data.csv"
        # Save the contribution data to a CSV file.
        df.to_csv(output_filename, index=False)
        print(f"Successfully saved data for {len(df)} days to {output_filename}")
    except requests.exceptions.RequestException as req_e:
        print(f"Failed to fetch data due to network issues: {req_e}")
    except Exception as e:
        print(f"An unexpected error occurred in main: {e}")
        import traceback
        traceback.print_exc()
# Check if the script is being run directly and run the main function
# (main() prompts for the username interactively when run this way).
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Do `pip install requests beautifulsoup4 pandas`
requests
beautifulsoup4
pandas
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment