Created
May 6, 2025 12:07
-
-
Save PatrickChoDev/150b2635568596ff897f563d791d00d2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import logging
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__) | |
def download_google_drive_file(file_id, chunk_size=32768):
    """
    Download a file from Google Drive.

    Handles the HTML virus-scan interstitial page by extracting and
    submitting its form with a GET request, falling back to the legacy
    ``confirm`` parameter or a direct download link found in the page.

    Args:
        file_id (str): The Google Drive file ID.
        chunk_size (int): Size in bytes of chunks to read while downloading.

    Returns:
        io.BytesIO: Buffer containing the downloaded file, seeked to 0.

    Raises:
        RuntimeError: If Google keeps returning HTML instead of file data.
        requests.HTTPError: If any request returns an HTTP error status.
    """
    base_url = "https://docs.google.com/uc?export=download"
    session = requests.Session()

    # Initial request: may return the file directly, or an HTML
    # virus-scan / confirmation page for large files.
    response = session.get(base_url, params={"id": file_id}, stream=True)
    response.raise_for_status()  # fail fast on HTTP errors (403/404 pages)

    # Check if we got HTML instead of file content
    if "text/html" in response.headers.get("Content-Type", ""):
        logger.info("Received HTML page, extracting download form")
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the form and extract the download URL
        form = soup.find('form')
        if form:
            # The form action may be relative (Google serves the confirm
            # form from another host) — resolve it against the final URL.
            action_url = urljoin(response.url, form.get('action', base_url))
            # Collect all form inputs so the confirmation token is kept.
            params = {}
            for input_tag in form.find_all('input'):
                if input_tag.get('name'):
                    params[input_tag.get('name')] = input_tag.get('value', '')
            logger.info("Submitting form with params: %s", params)
            response = session.get(action_url, params=params, stream=True)
        else:
            # No form found: retry with the legacy "confirm" parameter.
            logger.info("No form found, trying direct download with confirm parameter")
            response = session.get(
                base_url,
                params={"id": file_id, "confirm": "t"},
                stream=True,
            )
        response.raise_for_status()

        # Double-check that we now have the file and not HTML.
        if "text/html" in response.headers.get("Content-Type", ""):
            # Try to extract a direct download link as a last resort.
            soup = BeautifulSoup(response.content, 'html.parser')
            download_link = None
            for a in soup.find_all('a', href=True):
                if 'download' in a['href'] and file_id in a['href']:
                    download_link = a['href']
                    break
            if download_link:
                # Resolve relative hrefs and reuse the session so cookies
                # set during the exchange are preserved.
                download_link = urljoin(response.url, download_link)
                logger.info("Found direct download link: %s", download_link)
                response = session.get(download_link, stream=True)
                response.raise_for_status()
                # Final verification: the link itself may serve HTML.
                if "text/html" in response.headers.get("Content-Type", ""):
                    raise RuntimeError(
                        "Failed to download file — received HTML page instead of file data."
                    )
            else:
                raise RuntimeError(
                    "Failed to download file — received HTML page instead of file data."
                )

    # Stream the payload into an in-memory buffer.
    buffer = io.BytesIO()
    for chunk in response.iter_content(chunk_size):
        if chunk:  # filter out keep-alive chunks
            buffer.write(chunk)
    buffer.seek(0)
    return buffer
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment