Skip to content

Instantly share code, notes, and snippets.

@jgensler8
Last active January 5, 2021 23:24
Show Gist options
  • Save jgensler8/19c8fc263af7fb8ac1d42d8abb58daba to your computer and use it in GitHub Desktop.
Save jgensler8/19c8fc263af7fb8ac1d42d8abb58daba to your computer and use it in GitHub Desktop.
# created by Gábor Bodnár 2020-12-24
# you can download SQLSaturday session files, which do not require authentication
# ordered from the newest to the oldest files in batches. The latest is: 28874
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=28865
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=16007
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=28134 (you can download from logged in site)
import sys
import os
import ntpath
import requests
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from shutil import move
from typing import List
# _ __ __ _ _ __ __ _ _ __ ___ ___
# | '_ \ / _` | '__/ _` | '_ ` _ \/ __|
# | |_) | (_| | | | (_| | | | | | \__ \
# | .__/ \__,_|_| \__,_|_| |_| |_|___/
# |_| 0
# Inclusive start / exclusive end of the event-id range to crawl
# (fed straight into range() in the download loop below).
event_id_start = 1000
event_id_end = 1001
# headers used for auth, can be found from your browser's developer tools
browser_headers = {
    # XXX: you'll need to fill this in with your own cookie
    "cookie": r"...",
}
# Upper bound on simultaneous downloads (ThreadPoolExecutor worker count).
max_concurrent_downloads = 10
# All per-event/per-session folders are created under this directory.
root_download_folder = "/tmp/sql"
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(message)s',)
# _ __ _ __ ___ __ _ _ __ __ _ _ __ ___
# | '_ \| '__/ _ \ / _` | '__/ _` | '_ ` _ \
# | |_) | | | (_) | (_| | | | (_| | | | | | |
# | .__/|_| \___/ \__, |_| \__,_|_| |_| |_|
# |_| |___/
# TODO(jeffg): might be faster for customer to browse by 10121-2020-vienna
def get_event_name(event_id: int) -> str:
    """Return the human-readable name of the event.

    Currently a stub that always yields the empty string, so download
    folders end up named like "<event_id>-".
    """
    # TODO: scrape the event page for its real name.
    name = str()
    return name
def get_session_name(even_id: int, session_id: int) -> str:
    """Return the human-readable name of one session of an event.

    Currently a stub that always yields the empty string, so session
    folders end up named like "<session_id>-".

    NOTE(review): first parameter name looks like a typo for `event_id`;
    kept as-is so keyword callers are not broken.
    """
    # TODO: scrape the session detail page for its real title.
    name = str()
    return name
# returns the session_directory name
# returns the session_directory name
def create_directory_for_session(event_id: int, session_id: int) -> str:
    """Create (if needed) and return the download folder for one session.

    Layout: <root_download_folder>/<event_id>-<event_name>/<session_id>-<session_name>

    Args:
        event_id: SQLSaturday event number.
        session_id: session number within the event.

    Returns:
        Path of the (now existing) session directory.
    """
    event_name = get_event_name(event_id)
    session_name = get_session_name(event_id, session_id)
    download_folder = os.path.join(
        root_download_folder,
        f"{event_id}-{event_name}",
        f"{session_id}-{session_name}",
    )
    # BUGFIX: exists()-then-makedirs() is a check-then-act race; several
    # download threads can create the same session folder concurrently.
    # exist_ok=True makes the creation idempotent and race-free.
    os.makedirs(download_folder, exist_ok=True)
    return download_folder
def get_session_ids(event_id: int) -> List[int]:
    """Return all session ids linked from an event's schedule page.

    Args:
        event_id: SQLSaturday event number.

    Returns:
        List of session ids; empty on any download failure.
    """
    # get webpage
    try:
        page = requests.get(
            f"https://www.sqlsaturday.com/{event_id}/Sessions/Schedule",
            headers=browser_headers,
        )
        if page.status_code != 200:
            logging.warning(f"non-200 status code for event_id {event_id}")
            return []
    except Exception as e:
        logging.warning(f"failed to download event_id {event_id}")
        logging.exception(e)
        # BUGFIX: the original fell through here with `page` unbound,
        # crashing with NameError on the re.finditer line below.
        return []
    # regex the sid pattern; sid is stored in group 1 (group 0 is the whole
    # match). Raw string, with the host-name dots escaped so they match
    # literally instead of "any character".
    pattern = rf"https://www\.sqlsaturday\.com/{event_id}/Sessions/Details\?sid=(\d+)"
    sids = [int(m.group(1)) for m in re.finditer(pattern, page.text)]
    logging.debug(f"session ids: {sids}")
    return sids
def get_session_material_ids(event_id: int, session_id: int) -> List[int]:
    """Return the material (download) ids linked from a session detail page.

    Args:
        event_id: SQLSaturday event number.
        session_id: session number within the event.

    Returns:
        List of material `suid`s; empty on any download failure.
    """
    # get webpage
    try:
        page = requests.get(
            f"https://www.sqlsaturday.com/{event_id}/Sessions/Details/sid/{session_id}",
            headers=browser_headers,
        )
        if page.status_code != 200:
            logging.warning(f"non-200 status code for event_id {event_id}")
            return []
    except Exception as e:
        logging.warning(f"failed to download event_id {event_id}")
        logging.exception(e)
        # BUGFIX: the original fell through here with `page` unbound,
        # crashing with NameError on the re.finditer line below.
        return []
    # regex the material id pattern; the id is stored in group 1 (group 0 is
    # the whole match). Raw string, with dots escaped to match literally.
    pattern = r"https://www\.sqlsaturday\.com/SessionDownload\.aspx\?suid=(\d+)"
    material_ids = [int(m.group(1)) for m in re.finditer(pattern, page.text)]
    logging.debug(f"material_ids: {material_ids}")
    return material_ids
def download_material(event_id: int, session_id: int, material_id: int) -> None:
    """Stream one session attachment into the session's download folder.

    The file name is taken from the Content-Disposition response header.

    Args:
        event_id: SQLSaturday event number (used for the folder name only).
        session_id: session number within the event.
        material_id: the attachment's `suid`.
    """
    # get webpage
    try:
        page = requests.get(
            f"https://www.sqlsaturday.com/SessionDownload.aspx?suid={material_id}",
            headers=browser_headers,
            stream=True,
        )
        if page.status_code != 200:
            logging.warning(f"non-200 status code for material_id {material_id}")
            # BUGFIX: was `return []`, contradicting the -> None annotation.
            return
    except Exception as e:
        logging.warning(f"failed to download event_id {event_id}")
        logging.exception(e)
        # BUGFIX: the original fell through here with `page` unbound,
        # crashing with NameError below.
        return
    # copied from https://stackoverflow.com/a/37060758
    session_directory_name = create_directory_for_session(event_id, session_id)
    content_disposition_header = page.headers.get("content-disposition")
    logging.debug(f"content-disposition header: {content_disposition_header}")
    # BUGFIX: the header may be absent, and re.search(None) raises TypeError.
    if not content_disposition_header:
        logging.warning(f"no content-disposition header for material_id {material_id}")
        return
    # for some reason, (\S+) was not matching "-" nor ".pdf" extension...
    match = re.search(r"filename=(.+)", content_disposition_header)
    if not match:
        logging.warning(f"failed to parse filename for material_id {material_id}")
        return
    # Strip surrounding quotes some servers add, and keep only the basename so
    # a crafted header cannot write outside the session folder. ntpath handles
    # both "/" and "\" separators regardless of the local OS.
    filename = ntpath.basename(match.group(1).strip('"'))
    # filename = rfc6266.parse_requests_response(page).filename_unsafe
    logging.debug(f"filename: {filename}")
    with open(os.path.join(session_directory_name, filename), 'wb') as fd:
        for chunk in page.iter_content(chunk_size=128):
            fd.write(chunk)
# download loop: fan session-material downloads out onto a thread pool,
# crawling one event at a time.
logging.info("starting")
# BUGFIX: using the executor as a context manager guarantees
# shutdown(wait=True) runs even if the crawl loop raises; the original
# leaked worker threads on an uncaught exception.
with ThreadPoolExecutor(max_workers=max_concurrent_downloads) as executor:
    for event_id in range(event_id_start, event_id_end):
        session_ids = get_session_ids(event_id)
        for session_id in session_ids:
            # NOTE(jeffg): leaving uncommented will only create directories for assets
            # create_directory_for_session(event_id, session_id)
            session_material_ids = get_session_material_ids(event_id, session_id)
            for material_id in session_material_ids:
                # download_material(event_id, session_id, material_id)
                executor.submit(download_material, event_id, session_id, material_id)
logging.info("done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment