# created by Gábor Bodnár 2020-12-24
# Downloads SQLSaturday session files, which do not require authentication.
# Files are fetched in batches, ordered from the newest suid to older ones; the latest is 28874.
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=28865
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=16007
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=28134 (you can download this one from the logged-in site)
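# A minimal one-off download of a single material file (a sketch; suid 28865 is
# taken from the example URLs above, and "material.pdf" is a placeholder name,
# since the real filename arrives in the Content-Disposition header handled below):
#
#   import requests
#   r = requests.get("https://www.sqlsaturday.com/SessionDownload.aspx?suid=28865")
#   with open("material.pdf", "wb") as f:
#       f.write(r.content)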
import os
import re
import logging
import requests
from concurrent.futures import ThreadPoolExecutor
from typing import List
#  _ __   __ _ _ __ __ _ _ __ ___  ___
# | '_ \ / _` | '__/ _` | '_ ` _ \/ __|
# | |_) | (_| | | | (_| | | | | | \__ \
# | .__/ \__,_|_|  \__,_|_| |_| |_|___/
# |_|
event_id_start = 1000
event_id_end = 1001
# headers used for auth; values can be found in your browser's developer tools
browser_headers = {
    # XXX: you'll need to fill this in with your own cookie
    "cookie": r"...",
}
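# The cookie string can be copied verbatim from the request headers shown in
# the browser's Network tab; for an ASP.NET site it typically looks something
# like "ASP.NET_SessionId=<value>; ..." (an assumption, not verified against
# sqlsaturday.com).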
max_concurrent_downloads = 10
root_download_folder = "/tmp/sql"
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(message)s')
#  _ __  _ __ ___   __ _ _ __ __ _ _ __ ___
# | '_ \| '__/ _ \ / _` | '__/ _` | '_ ` _ \
# | |_) | | | (_) | (_| | | | (_| | | | | | |
# | .__/|_|  \___/ \__, |_|  \__,_|_| |_| |_|
# |_|              |___/
# TODO(jeffg): might be faster for the customer to browse by 10121-2020-vienna
# TODO: these are stubs; event and session names are not resolved yet
def get_event_name(event_id: int) -> str:
    return ""

def get_session_name(event_id: int, session_id: int) -> str:
    return ""
# returns the session directory name
def create_directory_for_session(event_id: int, session_id: int) -> str:
    event_name = get_event_name(event_id)
    session_name = get_session_name(event_id, session_id)
    download_folder = os.path.join(root_download_folder, f"{event_id}-{event_name}", f"{session_id}-{session_name}")
    # exist_ok avoids a check-then-create race when worker threads collide on the same session
    os.makedirs(download_folder, exist_ok=True)
    return download_folder
def get_session_ids(event_id: int) -> List[int]:
    # get webpage
    try:
        page = requests.get(f"https://www.sqlsaturday.com/{event_id}/Sessions/Schedule", headers=browser_headers)
        if page.status_code != 200:
            logging.warning(f"non-200 status code for event_id {event_id}")
            return []
    except Exception as e:
        logging.warning(f"failed to download event_id {event_id}")
        logging.exception(e)
        return []  # bail out: `page` is undefined past this point
    # regex the sid pattern; the sid is stored in group 1, as group 0 is the whole match
    sids = []
    for m in re.finditer(rf"https://www\.sqlsaturday\.com/{event_id}/Sessions/Details\?sid=(\d+)", page.text):
        sids.append(int(m.group(1)))
    logging.debug(f"session ids: {sids}")
    return sids
def get_session_material_ids(event_id: int, session_id: int) -> List[int]:
    # get webpage
    try:
        page = requests.get(f"https://www.sqlsaturday.com/{event_id}/Sessions/Details/sid/{session_id}", headers=browser_headers)
        if page.status_code != 200:
            logging.warning(f"non-200 status code for event_id {event_id}")
            return []
    except Exception as e:
        logging.warning(f"failed to download event_id {event_id}")
        logging.exception(e)
        return []  # bail out: `page` is undefined past this point
    # regex the material id pattern; the id is stored in group 1, as group 0 is the whole match
    material_ids = []
    for m in re.finditer(r"https://www\.sqlsaturday\.com/SessionDownload\.aspx\?suid=(\d+)", page.text):
        material_ids.append(int(m.group(1)))
    logging.debug(f"material_ids: {material_ids}")
    return material_ids
def download_material(event_id: int, session_id: int, material_id: int) -> None:
    # get webpage
    try:
        page = requests.get(f"https://www.sqlsaturday.com/SessionDownload.aspx?suid={material_id}", headers=browser_headers, stream=True)
        if page.status_code != 200:
            logging.warning(f"non-200 status code for material_id {material_id}")
            return
    except Exception as e:
        logging.warning(f"failed to download material_id {material_id} for event_id {event_id}")
        logging.exception(e)
        return  # bail out: `page` is undefined past this point
    # copied from https://stackoverflow.com/a/37060758
    session_directory_name = create_directory_for_session(event_id, session_id)
    content_disposition_header = page.headers.get("content-disposition")
    logging.debug(f"content-disposition header: {content_disposition_header}")
    if content_disposition_header is None:
        logging.warning(f"no content-disposition header for material_id {material_id}")
        return
    # for some reason, (\S+) was not matching "-" nor the ".pdf" extension...
    match = re.search(r"filename=(.+)", content_disposition_header)
    if not match:
        logging.warning(f"failed to parse filename for material_id {material_id}")
        return
    filename = match.group(1)
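    # NOTE: a hedged hardening sketch, not part of the original flow: the header
    # value is server-controlled, so stripping quotes and any path components
    # before writing would be safer, e.g.:
    #   filename = os.path.basename(filename.strip('"'))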
    # filename = rfc6266.parse_requests_response(page).filename_unsafe
    logging.debug(f"filename: {filename}")
    with open(os.path.join(session_directory_name, filename), 'wb') as fd:
        for chunk in page.iter_content(chunk_size=128):
            fd.write(chunk)
# download loop
logging.info("starting")
executor = ThreadPoolExecutor(max_workers=max_concurrent_downloads)
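# NOTE: exceptions raised inside the worker threads are captured by the futures
# that submit() returns and are never surfaced here, because no future's
# result() is ever inspected; download_material does its own logging instead.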
for event_id in range(event_id_start, event_id_end):
    session_ids = get_session_ids(event_id)
    for session_id in session_ids:
        # NOTE(jeffg): uncommenting only the next line (and disabling the download below)
        # will just create the directories for the assets
        # create_directory_for_session(event_id, session_id)
        session_material_ids = get_session_material_ids(event_id, session_id)
        for material_id in session_material_ids:
            # download_material(event_id, session_id, material_id)
            executor.submit(download_material, event_id, session_id, material_id)
executor.shutdown(wait=True)
logging.info("done")