Last active
April 4, 2025 00:50
-
-
Save jskherman/c27813f559af85a46e41b249ee15e655 to your computer and use it in GitHub Desktop.
A Python script for exporting a GitHub user's daily commit counts to a CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import datetime | |
import re | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
import time | |
from typing import List, Dict, Optional | |
def get_available_years(github_username: str, headers: dict) -> List[int]:
    """
    Fetches the main profile page and extracts the available contribution years.

    Args:
        github_username: GitHub login whose profile page is scraped.
        headers: HTTP headers (User-Agent etc.) forwarded to requests.

    Returns:
        Contribution years sorted newest-first. Always contains at least
        the current year, even when fetching or parsing fails.
    """
    print("Attempting to find available contribution years...")
    # Fix: the profile lives on github.com — the previous host was a
    # link-redirector artifact and never serves the profile page.
    base_url = f"https://github.com/{github_username}"
    years = set()
    current_year = datetime.date.today().year
    years.add(current_year)  # Always have a fallback year available.
    try:
        _r = requests.get(base_url, headers=headers, timeout=15)
        _r.raise_for_status()
        html = BeautifulSoup(_r.text, "html.parser")
        # --- Find Year Selection Buttons/Links ---
        # Educated guess based on common GitHub markup; inspect the live
        # page if this fails. Year links carry href="...from=YYYY-...".
        year_elements = html.find_all('a', class_='js-year-link', href=re.compile(r'from=\d{4}-'))
        # Fallback if the above class changed.
        if not year_elements:
            print("Could not find years using 'a.js-year-link'. Trying broader search for year buttons...")
            # Look for buttons/links whose visible text is exactly a 4-digit
            # year, scoped to the contribution-graph container.
            graph_container = html.find('div', class_='js-yearly-contributions')  # Example container class
            if graph_container:
                potential_year_elements = graph_container.find_all(['a', 'button'], string=re.compile(r'^\s*\d{4}\s*$'))
                for el in potential_year_elements:
                    year_text = el.get_text(strip=True)
                    if year_text.isdigit() and len(year_text) == 4:
                        years.add(int(year_text))
            else:
                print("Could not find graph container. Using only current year as fallback.")
        for link in year_elements:
            href = link.get('href', '')
            # Extract year from 'from=YYYY-MM-DD'. The capture group is
            # exactly four digits, so int() cannot raise here.
            match = re.search(r'from=(\d{4})-\d{2}-\d{2}', href)
            if match:
                years.add(int(match.group(1)))
    except requests.exceptions.RequestException as e:
        print(f"Warning: Error fetching profile page to find years: {e}. Using only current year.")
    except Exception as e:
        print(f"Warning: Error parsing years from profile page: {e}. Using only current year.")
    if not years:
        print("Warning: No years found. Defaulting to current year.")
        years.add(current_year)
    sorted_years = sorted(years, reverse=True)
    print(f"Found years: {sorted_years}")
    return sorted_years
def get_contribution_fragment(github_username: str, year: int, headers: dict) -> Optional[str]:
    """
    Fetches the HTML fragment containing the contribution graph for a specific year.

    Args:
        github_username: GitHub login to query.
        year: Calendar year whose contribution graph is requested.
        headers: HTTP headers forwarded to requests.

    Returns:
        The fragment HTML as a string, or None when the request fails,
        the response is not HTML, or the body is suspiciously short.
    """
    # Fix: the fragment is served from github.com — the previous host was
    # a link-redirector artifact left over from page scraping.
    # Note: GitHub might simplify this, but this matches the observed pattern.
    fragment_url = f"https://github.com/{github_username}?tab=contributions&from={year}-01-01&to={year}-12-31"
    # Alternative observed pattern if the above fails:
    # fragment_url = f"https://github.com/users/{github_username}/contributions?from={year}-01-01&to={year}-12-31"
    print(f"Fetching contribution fragment for {year} from {fragment_url}...")
    try:
        time.sleep(1.5)  # Throttle successive per-year requests to be polite.
        _r = requests.get(fragment_url, headers=headers, timeout=15)
        _r.raise_for_status()
        # Check if we received HTML content.
        if 'html' not in _r.headers.get('Content-Type', '').lower():
            print(f"Warning: Received non-HTML content type '{_r.headers.get('Content-Type')}' for year {year}. Skipping.")
            return None
        # Basic check for an empty or placeholder response.
        if len(_r.text) < 500:  # Arbitrary small length check
            print(f"Warning: Received very short response for year {year}. May be empty or placeholder. Skipping.")
            return None
        return _r.text
    except requests.exceptions.RequestException as e:
        print(f"Warning: Error fetching fragment for year {year}: {e}. Skipping this year.")
        return None
    except Exception as e:
        print(f"Warning: An unexpected error occurred fetching fragment for year {year}: {e}")
        return None
def parse_contributions_from_fragment(fragment_html: str, year: int) -> List[Dict]:
    """
    Parses the contribution data from the fetched HTML fragment using tooltips.

    Each day cell carries a data-date and an id; the per-day count is read
    from the matching <tool-tip for=...> element. Days with unparseable
    tooltips default to a count of 0.
    """
    results: List[Dict] = []
    if not fragment_html:
        return results
    doc = BeautifulSoup(fragment_html, "html.parser")
    today = datetime.date.today()
    # --- Find Contribution Cells (likely <td> in loaded fragment) ---
    # data-date + id together identify a day cell within the fragment.
    cells = doc.find_all(attrs={"data-date": True, "id": True})
    if not cells:
        print(f"Warning: No elements with 'data-date' and 'id' found in fragment for year {year}.")
        # Fallback: some fragments render <rect> elements instead of <td>.
        cells = doc.find_all('rect', attrs={"data-date": True, "id": True})
        if not cells:
            print(f"Warning: Also failed to find 'rect' elements for year {year}.")
            return results
        print(f"Found 'rect' elements instead of 'td' for year {year}.")
    for day_cell in cells:
        date_str = day_cell.get("data-date")
        element_id = day_cell.get("id")
        if not (date_str and element_id):
            continue
        # Validate the date string before using it.
        try:
            parsed_date = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
        except ValueError:
            print(f"Warning: Could not parse date {date_str} in fragment for year {year}")
            continue
        # Skip future dates, and dates that spill over from adjacent years
        # (year-boundary weeks appear in two fragments).
        if parsed_date > today or parsed_date.year != year:
            continue
        count = 0  # Default when no tooltip / unrecognized tooltip text.
        # --- Find the corresponding tooltip using the cell's ID ---
        tooltip = doc.find('tool-tip', attrs={'for': element_id})
        if tooltip is None:
            # Tooltip missing entirely; only worth a warning if the cell's
            # activity level suggests there were contributions that day.
            level = day_cell.get("data-level", "0")
            if level != '0':
                print(f"Warning: Tooltip not found for cell {element_id} (Date: {date_str}, Level: {level}) in year {year}. Defaulting count to 0.")
        else:
            tooltip_text = tooltip.get_text(strip=True)
            # Matches "No contributions ..." or "<N> contribution(s) ...".
            matched = re.match(r"(No|\d+)\s+contribution", tooltip_text, re.IGNORECASE)
            if matched is None:
                level = day_cell.get("data-level", "0")
                if level != '0':
                    print(f"Warning: Tooltip text '{tooltip_text}' for {date_str} (Level: {level}) in year {year} did not match expected pattern. Defaulting count to 0.")
            elif matched.group(1).lower() != 'no':
                try:
                    count = int(matched.group(1))
                except ValueError:
                    print(f"Warning: Could not convert count '{matched.group(1)}' to int for {date_str} in year {year}.")
                    count = 0
        results.append({"Date": date_str, "Contributions": count})
    return results
def get_gh_contribution_data(github_username: str) -> list:
    """
    Get GitHub contribution data for a given username by fetching and parsing
    contribution graph fragments for each available year.
    """
    collected = []
    # Browser-like headers; X-Requested-With can mimic the AJAX requests
    # sometimes used to load contribution fragments.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'X-Requested-With': 'XMLHttpRequest'
    }
    years = get_available_years(github_username, request_headers)
    if not years:
        print("Error: Could not determine available contribution years.")
        return []
    for yr in years:
        fragment = get_contribution_fragment(github_username, yr, request_headers)
        if not fragment:
            print(f"Skipping year {yr} due to fetch error or empty fragment.")
            continue
        parsed = parse_contributions_from_fragment(fragment, yr)
        print(f"Parsed {len(parsed)} contributions for {yr}.")
        collected.extend(parsed)
    print(f"Finished processing all years. Found {len(collected)} contribution entries overall.")
    return collected
def main(github_username: Optional[str] = None):
    """
    Main function to get GitHub contribution data and save it to a CSV file.

    Args:
        github_username: GitHub login to export. When None, the user is
            prompted for it interactively on stdin.
    """
    try:
        # Fix: resolve the username *before* announcing it, so the banner
        # never prints "GitHub user: None". Annotation fixed to Optional[str]
        # to match the None default.
        if github_username is None:
            github_username = input("Enter GitHub username: ")
        print(f"Starting script for GitHub user: {github_username}")
        data = get_gh_contribution_data(github_username)
        if not data:
            print("No contribution data was retrieved. Exiting.")
            return
        # Create DataFrame
        df = pd.DataFrame(data)
        # --- Data Cleaning and Preparation ---
        if df.empty:
            print("DataFrame is empty after collecting data. Exiting.")
            return
        # Convert 'Date' column to datetime objects; drop rows that fail.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.dropna(subset=['Date'])
        # Convert 'Contributions' to numeric, coercing bad values to 0.
        df['Contributions'] = pd.to_numeric(df['Contributions'], errors='coerce').fillna(0).astype(int)
        # Remove duplicate date entries (important!). Keep the 'last' entry,
        # which corresponds to the most recent fetch when fragments overlap.
        initial_rows = len(df)
        df = df.drop_duplicates(subset='Date', keep='last')
        if len(df) < initial_rows:
            print(f"Removed {initial_rows - len(df)} duplicate date entries.")
        # Sort chronologically before writing out.
        df = df.sort_values(by="Date", ascending=True)
        output_filename = f"{github_username}_contribution_data.csv"
        # Save the contribution data to a CSV file.
        df.to_csv(output_filename, index=False)
        print(f"Successfully saved data for {len(df)} days to {output_filename}")
    except requests.exceptions.RequestException as req_e:
        print(f"Failed to fetch data due to network issues: {req_e}")
    except Exception as e:
        print(f"An unexpected error occurred in main: {e}")
        import traceback
        traceback.print_exc()
# Check if the script is being run directly and run the main function
# (main() prompts for the username interactively when run this way).
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Do `pip install requests beautifulsoup4 pandas`
requests
beautifulsoup4
pandas
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment