# created by Gábor Bodnár 2020-12-24
# Downloads SQLSaturday session files, which do not require authentication.
# Files are fetched in batches, ordered from the newest suid to older ones; the latest is 28874.
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=28865
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=16007
# https://www.sqlsaturday.com/SessionDownload.aspx?suid=28134 (you can download this one from the logged-in site)
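# A minimal one-off download of a single material file (a sketch; suid 28865 is
# taken from the example URLs above, and "material.pdf" is a placeholder name,
# since the real filename arrives in the Content-Disposition header handled below):
#
#   import requests
#   r = requests.get("https://www.sqlsaturday.com/SessionDownload.aspx?suid=28865")
#   with open("material.pdf", "wb") as f:
#       f.write(r.content)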
import os
import re
import logging
import requests
from concurrent.futures import ThreadPoolExecutor
from typing import List
#  _ __   __ _ _ __ __ _ _ __ ___  ___
# | '_ \ / _` | '__/ _` | '_ ` _ \/ __|
# | |_) | (_| | | | (_| | | | | | \__ \
# | .__/ \__,_|_|  \__,_|_| |_| |_|___/
# |_|
event_id_start = 1000
event_id_end = 1001
# headers used for auth; values can be found in your browser's developer tools
browser_headers = {
    # XXX: you'll need to fill this in with your own cookie
    "cookie": r"...",
}
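# The cookie string can be copied verbatim from the request headers shown in
# the browser's Network tab; for an ASP.NET site it typically looks something
# like "ASP.NET_SessionId=<value>; ..." (an assumption, not verified against
# sqlsaturday.com).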
max_concurrent_downloads = 10
root_download_folder = "/tmp/sql"
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(message)s')
#  _ __  _ __ ___   __ _ _ __ __ _ _ __ ___
# | '_ \| '__/ _ \ / _` | '__/ _` | '_ ` _ \
# | |_) | | | (_) | (_| | | | (_| | | | | | |
# | .__/|_|  \___/ \__, |_|  \__,_|_| |_| |_|
# |_|              |___/
# TODO(jeffg): might be faster for the customer to browse by 10121-2020-vienna
# TODO: these are stubs; event and session names are not resolved yet
def get_event_name(event_id: int) -> str:
    return ""

def get_session_name(event_id: int, session_id: int) -> str:
    return ""
# returns the session directory name
def create_directory_for_session(event_id: int, session_id: int) -> str:
    event_name = get_event_name(event_id)
    session_name = get_session_name(event_id, session_id)
    download_folder = os.path.join(root_download_folder, f"{event_id}-{event_name}", f"{session_id}-{session_name}")
    # exist_ok avoids a check-then-create race when worker threads collide on the same session
    os.makedirs(download_folder, exist_ok=True)
    return download_folder
def get_session_ids(event_id: int) -> List[int]:
    # get webpage
    try:
        page = requests.get(f"https://www.sqlsaturday.com/{event_id}/Sessions/Schedule", headers=browser_headers)
        if page.status_code != 200:
            logging.warning(f"non-200 status code for event_id {event_id}")
            return []
    except Exception as e:
        logging.warning(f"failed to download event_id {event_id}")
        logging.exception(e)
        return []  # bail out: `page` is undefined past this point
    # regex the sid pattern; the sid is stored in group 1, as group 0 is the whole match
    sids = []
    for m in re.finditer(rf"https://www\.sqlsaturday\.com/{event_id}/Sessions/Details\?sid=(\d+)", page.text):
        sids.append(int(m.group(1)))
    logging.debug(f"session ids: {sids}")
    return sids
def get_session_material_ids(event_id: int, session_id: int) -> List[int]:
    # get webpage
    try:
        page = requests.get(f"https://www.sqlsaturday.com/{event_id}/Sessions/Details/sid/{session_id}", headers=browser_headers)
        if page.status_code != 200:
            logging.warning(f"non-200 status code for event_id {event_id}")
            return []
    except Exception as e:
        logging.warning(f"failed to download event_id {event_id}")
        logging.exception(e)
        return []  # bail out: `page` is undefined past this point
    # regex the material id pattern; the id is stored in group 1, as group 0 is the whole match
    material_ids = []
    for m in re.finditer(r"https://www\.sqlsaturday\.com/SessionDownload\.aspx\?suid=(\d+)", page.text):
        material_ids.append(int(m.group(1)))
    logging.debug(f"material_ids: {material_ids}")
    return material_ids
def download_material(event_id: int, session_id: int, material_id: int) -> None:
    # get webpage
    try:
        page = requests.get(f"https://www.sqlsaturday.com/SessionDownload.aspx?suid={material_id}", headers=browser_headers, stream=True)
        if page.status_code != 200:
            logging.warning(f"non-200 status code for material_id {material_id}")
            return
    except Exception as e:
        logging.warning(f"failed to download material_id {material_id} for event_id {event_id}")
        logging.exception(e)
        return  # bail out: `page` is undefined past this point
    # copied from https://stackoverflow.com/a/37060758
    session_directory_name = create_directory_for_session(event_id, session_id)
    content_disposition_header = page.headers.get("content-disposition")
    logging.debug(f"content-disposition header: {content_disposition_header}")
    if content_disposition_header is None:
        logging.warning(f"no content-disposition header for material_id {material_id}")
        return
    # for some reason, (\S+) was not matching "-" nor the ".pdf" extension...
    match = re.search(r"filename=(.+)", content_disposition_header)
    if not match:
        logging.warning(f"failed to parse filename for material_id {material_id}")
        return
    filename = match.group(1)
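    # NOTE: a hedged hardening sketch, not part of the original flow: the header
    # value is server-controlled, so stripping quotes and any path components
    # before writing would be safer, e.g.:
    #   filename = os.path.basename(filename.strip('"'))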
    # filename = rfc6266.parse_requests_response(page).filename_unsafe
    logging.debug(f"filename: {filename}")
    with open(os.path.join(session_directory_name, filename), 'wb') as fd:
        for chunk in page.iter_content(chunk_size=128):
            fd.write(chunk)
# download loop
logging.info("starting")
executor = ThreadPoolExecutor(max_workers=max_concurrent_downloads)
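# NOTE: exceptions raised inside the worker threads are captured by the futures
# that submit() returns and are never surfaced here, because no future's
# result() is ever inspected; download_material does its own logging instead.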
for event_id in range(event_id_start, event_id_end):
    session_ids = get_session_ids(event_id)
    for session_id in session_ids:
        # NOTE(jeffg): uncommenting only the next line (and disabling the download below)
        # will just create the directories for the assets
        # create_directory_for_session(event_id, session_id)
        session_material_ids = get_session_material_ids(event_id, session_id)
        for material_id in session_material_ids:
            # download_material(event_id, session_id, material_id)
            executor.submit(download_material, event_id, session_id, material_id)
executor.shutdown(wait=True)
logging.info("done")