Last active
September 7, 2022 14:43
-
-
Save giuliano-macedo/993934a39f78b64d6aa5d3794062eca9 to your computer and use it in GitHub Desktop.
Shallow download of the files in a Google Drive folder using requests, bs4, and regex
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import json
import re
import sys

import gdown
import requests
from bs4 import BeautifulSoup
# Pretty dumb single-quoted string regex; it would fail on escaped quotes.
string_regex = re.compile(r"\'([^\']+)\'")

# Substring used to spot the script tag that carries the folder listing.
DRIVE_MARKER = "_DRIVE_ivd"


def find_encoded_payload(script_texts):
    """Return the escaped payload from the first usable marker script.

    Args:
        script_texts: iterable of script-tag text contents.

    Returns:
        The second single-quoted literal of the first script that contains
        the marker (the first literal is the marker key itself).

    Raises:
        RuntimeError: if no script contains the marker together with at
            least two single-quoted literals.
    """
    for text in script_texts:
        if DRIVE_MARKER not in text:  # hacky script tag search
            continue
        literals = string_regex.findall(text)
        # Skip a marker script without a payload instead of dying with a
        # bare IndexError on literals[1].
        if len(literals) >= 2:
            return literals[1]
    raise RuntimeError("Didn't found script tag")


def decode_payload(encoded):
    """Resolve the backslash escapes in *encoded* and parse it as JSON.

    Encoding with latin-1/backslashreplace before the ``unicode_escape``
    decode keeps non-ASCII characters intact; going through UTF-8 bytes
    would turn them into mojibake, because ``unicode_escape`` reads raw
    bytes as latin-1.
    """
    raw = encoded.encode("latin-1", "backslashreplace")
    return json.loads(raw.decode("unicode_escape"))


def extract_file_ids(data):
    """Return the first element of every entry in ``data[0]``.

    NOTE(review): the layout (entries under index 0, id at index 0 of each
    entry) is taken from the observed page and is undocumented; confirm if
    the page format changes. A falsy ``data[0]`` yields an empty list.
    """
    entries = data[0] or []
    return [entry[0] for entry in entries]


def main(argv=None):
    """Download every file listed on the Google Drive folder page at ``url``.

    Args:
        argv: argument list for argparse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 when every download call returned a value, 1 otherwise.

    Raises:
        RuntimeError: if the folder listing cannot be located in the page.
        requests.HTTPError: if the folder page request fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("url")
    args = parser.parse_args(argv)

    response = requests.get(args.url, timeout=30)
    # Fail on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    encoded = find_encoded_payload(
        script.text for script in soup.select("script")
    )
    file_ids = extract_file_ids(decode_payload(encoded))

    failures = 0
    for file_id in file_ids:
        # NOTE(review): some gdown releases return None on a failed download
        # rather than raising; confirm against the installed version.
        if gdown.download("https://drive.google.com/uc?id=" + file_id) is None:
            failures += 1
    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment