Created
March 13, 2025 04:02
-
-
Save naufalso/ae2d1dd3dcbbdbd726e6c51b6d116332 to your computer and use it in GitHub Desktop.
a simple python function to fetch the git readme
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Naufal Suryanto | |
import requests | |
import re | |
from markdownify import markdownify as md | |
from urllib.parse import urlparse | |
def fetch_url_readme_from_git(url):
    """
    Fetch the README file content from a Git repository given a URL.

    The repository page is downloaded and converted to markdown, the first
    markdown link whose label is "readme" (case-insensitive) is located, the
    link is resolved against the page URL, a ``/blob/`` path segment is
    rewritten to ``/raw/``, and the resulting URL is downloaded.

    Args:
        url (str): The URL of the Git repository page.

    Returns:
        str: The content of the README file if found, otherwise an empty string.
        Any error raised while fetching or parsing is printed and also results
        in an empty string (best-effort contract; nothing is re-raised).
    """
    # Imported locally because the module header only brings in `urlparse`.
    from urllib.parse import urljoin

    if not url:
        return ""

    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/133.0.0.0 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }

    try:
        # Fetch the original URL and convert its HTML to markdown
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        markdown_content = md(response.text, heading_style="ATX")
        # Collapse runs of three or more blank-ish lines into one blank line
        markdown_content = re.sub(r'(\n\s*){3,}', '\n\n', markdown_content).strip()

        # Search for a markdown link labeled "readme"
        readme_match = re.search(
            r'\[readme\]\(([^)]+)\)', markdown_content, re.IGNORECASE
        )
        if not readme_match:
            return ""

        # A markdown link target may carry a title: (href "title").
        # Keep only the href; tolerate leading/trailing whitespace.
        target_parts = readme_match.group(1).split()
        if not target_parts:
            return ""

        # Resolve against the page URL. urljoin handles absolute URLs,
        # root-relative paths, document-relative paths and protocol-relative
        # ("//host/...") links uniformly.
        readme_url = urljoin(url, target_parts[0])

        # Convert a blob URL to the raw URL. Only the first "/blob/" path
        # segment is rewritten so that owner, repository or file names that
        # merely contain the substring "blob" are left intact.
        readme_url = readme_url.replace("/blob/", "/raw/", 1)

        # Fetch and return the README file content
        readme_response = requests.get(readme_url, headers=headers, timeout=10)
        readme_response.raise_for_status()
        return readme_response.text
    except Exception as e:
        # Top-level best-effort boundary: report the failure and fall back to
        # the documented empty-string result.
        print(f"Error fetching README from {url}: {e}")
        return ""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment