@PatrickChoDev
Created May 6, 2025 12:07
import io
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def download_google_drive_file(file_id, chunk_size=32768):
    """
    Download a file from Google Drive.

    Handles the HTML virus-scan interstitial page by extracting its form
    and resubmitting it with a GET request.

    Args:
        file_id (str): The Google Drive file ID.
        chunk_size (int): Size of the download chunks, in bytes.

    Returns:
        io.BytesIO: Buffer containing the downloaded file.
    """
    base_url = "https://docs.google.com/uc?export=download"
    session = requests.Session()

    # Initial request: returns either the file itself or an HTML page
    response = session.get(base_url, params={"id": file_id}, stream=True)

    # Check if we got HTML instead of file content
    if "text/html" in response.headers.get("Content-Type", ""):
        logger.info("Received HTML page, extracting download form")

        # Parse the HTML content
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the form and extract the download URL
        form = soup.find("form")
        if form:
            action_url = form.get("action", base_url)

            # Collect all form inputs to submit
            params = {}
            for input_tag in form.find_all("input"):
                if input_tag.get("name"):
                    params[input_tag.get("name")] = input_tag.get("value", "")

            # Submit the form using GET
            logger.info(f"Submitting form with params: {params}")
            response = session.get(action_url, params=params, stream=True)
        else:
            # If no form is found, try a direct download with the "confirm" parameter
            logger.info("No form found, trying direct download with confirm parameter")
            response = session.get(
                base_url,
                params={"id": file_id, "confirm": "t"},
                stream=True,
            )

    # Double-check that we now have the file and not HTML
    if "text/html" in response.headers.get("Content-Type", ""):
        # Try to extract a direct download link as a fallback
        soup = BeautifulSoup(response.content, "html.parser")

        # Look for download links in the HTML
        download_link = None
        for a in soup.find_all("a", href=True):
            if "download" in a["href"] and file_id in a["href"]:
                download_link = a["href"]
                break

        if download_link:
            logger.info(f"Found direct download link: {download_link}")
            # Resolve a possibly relative link and reuse the session so any
            # cookies set by the interstitial page carry over
            download_link = urljoin(response.url, download_link)
            response = session.get(download_link, stream=True)
        else:
            raise RuntimeError(
                "Failed to download file: received an HTML page instead of file data."
            )

    # Download the file in chunks
    buffer = io.BytesIO()
    for chunk in response.iter_content(chunk_size):
        if chunk:  # filter out keep-alive chunks
            buffer.write(chunk)
    buffer.seek(0)
    return buffer
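

# A minimal usage sketch, assuming the file is shared publicly ("anyone with
# the link"); the file ID and output filename below are placeholders, not
# values from the original gist.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Placeholder ID; replace with the ID of a real, publicly shared file
    buffer = download_google_drive_file("YOUR_FILE_ID")

    # Persist the in-memory buffer to disk
    with open("downloaded_file.bin", "wb") as f:
        f.write(buffer.getvalue())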