cpfiffer · July 14, 2025 21:02 · cpfiffer · Jul 14, 2025
diff --git a/fetch_llms_txt.py b/fetch_llms_txt.py
 def fetch_llms_txt(url: str, timeout: int = 10):
     """
     Fetch content from URLs ending in 'llms.txt'.

     The URL is validated before anything else happens, so malformed input
     is rejected with a ValueError without touching the network.

     Args:
         url (str): The URL to fetch (must end with 'llms.txt', compared
             case-insensitively, and must include a scheme and a host)
         timeout (int): Request timeout in seconds (default: 10)

     Returns:
         str: Text content of the llms.txt file

     Raises:
         ValueError: If URL doesn't end with 'llms.txt' or is invalid
         Exception: For network errors, HTTP errors, or other failures.
             Wrapped errors keep the underlying exception as __cause__.
     """
     import urllib.parse

     # Validate URL ends with llms.txt
     if not url.lower().endswith('llms.txt'):
         raise ValueError("URL must end with 'llms.txt'")

     # Validate URL format
     parsed_url = urllib.parse.urlparse(url)
     if not parsed_url.scheme or not parsed_url.netloc:
         raise ValueError("Invalid URL format")

     # Imported after validation so bad input fails fast with ValueError
     # regardless of whether the HTTP dependency can be loaded.
     import requests

     # Set default headers
     default_headers = {
         'User-Agent': 'llms.txt-fetcher/1.0',
         'Accept': 'text/plain, text/*, */*',
     }

     # Only the request itself lives in the try block. The HTTP status check
     # happens afterwards, so our own HTTP error can never be caught by the
     # handlers below and no message sniffing is needed to tell them apart.
     try:
         response = requests.get(
             url,
             timeout=timeout,
             headers=default_headers,
             allow_redirects=True,
         )
     # Timeout is handled before ConnectionError: a connect timeout in
     # requests is both, and the timeout hint is the more useful message.
     except requests.exceptions.Timeout as e:
         raise Exception(f"Request timed out after {timeout} seconds. Try increasing the `timeout` argument for slower servers.") from e
     except requests.exceptions.ConnectionError as e:
         raise Exception("Connection error - could not reach the server. Check the URL and your internet connection.") from e
     except requests.exceptions.TooManyRedirects as e:
         raise Exception("Too many redirects. The server may be misconfigured or the URL may be invalid.") from e
     except requests.exceptions.RequestException as e:
         raise Exception(f"Request failed: {str(e)}. Check the URL format and network connectivity.") from e
     except Exception as e:
         raise Exception(f"Unexpected error: {str(e)}. This may be due to network issues or server problems.") from e

     # Check if request was successful
     if response.status_code == 200:
         return response.text
     raise Exception(f"HTTP {response.status_code}: {response.reason}. The llms.txt file may not exist at this URL or the server is returning an error.")
	def fetch_llms_txt(url: str, timeout: int = 10):
	    """
	    Fetch content from URLs ending in 'llms.txt'.

	    The URL is validated before anything else happens, so malformed input
	    is rejected with a ValueError without touching the network.

	    Args:
	        url (str): The URL to fetch (must end with 'llms.txt', compared
	            case-insensitively, and must include a scheme and a host)
	        timeout (int): Request timeout in seconds (default: 10)

	    Returns:
	        str: Text content of the llms.txt file

	    Raises:
	        ValueError: If URL doesn't end with 'llms.txt' or is invalid
	        Exception: For network errors, HTTP errors, or other failures.
	            Wrapped errors keep the underlying exception as __cause__.
	    """
	    import urllib.parse

	    # Validate URL ends with llms.txt
	    if not url.lower().endswith('llms.txt'):
	        raise ValueError("URL must end with 'llms.txt'")

	    # Validate URL format
	    parsed_url = urllib.parse.urlparse(url)
	    if not parsed_url.scheme or not parsed_url.netloc:
	        raise ValueError("Invalid URL format")

	    # Imported after validation so bad input fails fast with ValueError
	    # regardless of whether the HTTP dependency can be loaded.
	    import requests

	    # Set default headers. The wildcards matter: 'text/*' accepts any text
	    # subtype and '*/*' is the catch-all fallback.
	    default_headers = {
	        'User-Agent': 'llms.txt-fetcher/1.0',
	        'Accept': 'text/plain, text/*, */*',
	    }

	    # Only the request itself lives in the try block. The HTTP status check
	    # happens afterwards, so our own HTTP error can never be caught by the
	    # handlers below and no message sniffing is needed to tell them apart.
	    try:
	        response = requests.get(
	            url,
	            timeout=timeout,
	            headers=default_headers,
	            allow_redirects=True,
	        )
	    # Timeout is handled before ConnectionError: a connect timeout in
	    # requests is both, and the timeout hint is the more useful message.
	    except requests.exceptions.Timeout as e:
	        raise Exception(f"Request timed out after {timeout} seconds. Try increasing the `timeout` argument for slower servers.") from e
	    except requests.exceptions.ConnectionError as e:
	        raise Exception("Connection error - could not reach the server. Check the URL and your internet connection.") from e
	    except requests.exceptions.TooManyRedirects as e:
	        raise Exception("Too many redirects. The server may be misconfigured or the URL may be invalid.") from e
	    except requests.exceptions.RequestException as e:
	        raise Exception(f"Request failed: {str(e)}. Check the URL format and network connectivity.") from e
	    except Exception as e:
	        raise Exception(f"Unexpected error: {str(e)}. This may be due to network issues or server problems.") from e

	    # Check if request was successful
	    if response.status_code == 200:
	        return response.text
	    raise Exception(f"HTTP {response.status_code}: {response.reason}. The llms.txt file may not exist at this URL or the server is returning an error.")