Created
May 6, 2025 12:07
-
-
Save PatrickChoDev/150b2635568596ff897f563d791d00d2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__) | |
def download_google_drive_file(file_id, chunk_size=32768):
    """
    Download a file from Google Drive.

    Handles the HTML virus-scan interstitial page by extracting and
    submitting its form with a GET request, falling back to the legacy
    ``confirm`` parameter or a direct download link found in the page.

    Args:
        file_id (str): The Google Drive file ID.
        chunk_size (int): Size in bytes of chunks to read while downloading.

    Returns:
        io.BytesIO: Buffer containing the downloaded file, seeked to 0.

    Raises:
        RuntimeError: If Google keeps returning HTML instead of file data.
        requests.HTTPError: If any request returns an HTTP error status.
    """
    base_url = "https://docs.google.com/uc?export=download"
    session = requests.Session()

    # Initial request: may return the file directly, or an HTML
    # virus-scan / confirmation page for large files.
    response = session.get(base_url, params={"id": file_id}, stream=True)
    response.raise_for_status()  # fail fast on HTTP errors (403/404 pages)

    # Check if we got HTML instead of file content
    if "text/html" in response.headers.get("Content-Type", ""):
        logger.info("Received HTML page, extracting download form")
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the form and extract the download URL
        form = soup.find('form')
        if form:
            # The form action may be relative (Google serves the confirm
            # form from another host) — resolve it against the final URL.
            action_url = urljoin(response.url, form.get('action', base_url))
            # Collect all form inputs so the confirmation token is kept.
            params = {}
            for input_tag in form.find_all('input'):
                if input_tag.get('name'):
                    params[input_tag.get('name')] = input_tag.get('value', '')
            logger.info("Submitting form with params: %s", params)
            response = session.get(action_url, params=params, stream=True)
        else:
            # No form found: retry with the legacy "confirm" parameter.
            logger.info("No form found, trying direct download with confirm parameter")
            response = session.get(
                base_url,
                params={"id": file_id, "confirm": "t"},
                stream=True,
            )
        response.raise_for_status()

        # Double-check that we now have the file and not HTML.
        if "text/html" in response.headers.get("Content-Type", ""):
            # Try to extract a direct download link as a last resort.
            soup = BeautifulSoup(response.content, 'html.parser')
            download_link = None
            for a in soup.find_all('a', href=True):
                if 'download' in a['href'] and file_id in a['href']:
                    download_link = a['href']
                    break
            if download_link:
                # Resolve relative hrefs and reuse the session so cookies
                # set during the exchange are preserved.
                download_link = urljoin(response.url, download_link)
                logger.info("Found direct download link: %s", download_link)
                response = session.get(download_link, stream=True)
                response.raise_for_status()
                # Final verification: the link itself may serve HTML.
                if "text/html" in response.headers.get("Content-Type", ""):
                    raise RuntimeError(
                        "Failed to download file — received HTML page instead of file data."
                    )
            else:
                raise RuntimeError(
                    "Failed to download file — received HTML page instead of file data."
                )

    # Stream the payload into an in-memory buffer.
    buffer = io.BytesIO()
    for chunk in response.iter_content(chunk_size):
        if chunk:  # filter out keep-alive chunks
            buffer.write(chunk)
    buffer.seek(0)
    return buffer
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment