Last active
June 28, 2021 19:39
-
-
Save bachhuberdesign/0dfc11cb4a5f959d65dcb914ee7c4dcf to your computer and use it in GitHub Desktop.
Scraper for Laracasts videos -- requires active Laracasts subscription (see comments in code).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######################################################
#              Laracasts Video Scraper               #
#                                                    #
#  Author: Eric Bachhuber                            #
#                                                    #
#  Video files will be output to folder 'Laracasts'  #
#  wherever the script is run from.                  #
#                                                    #
#  Possibly violates the terms of service, so use    #
#  at your own risk.                                 #
#                                                    #
#  Required: Active Laracasts subscription           #
#  Required: Python 3                                #
#  Required: BeautifulSoup (install with pip3)       #
#  Required: requests (install with pip3)            #
######################################################
from bs4 import BeautifulSoup | |
import urllib.request | |
import json | |
import requests | |
import re | |
import os | |
# Version string for this script.
SCRIPT_VERSION = '0.0.1'
def main():
    """Download every episode of every series listed by the Laracasts series API.

    The API response groups series by category. For each series, episode pages
    are requested by number, starting at 1, until the site answers with a
    redirect (its response to an episode number that does not exist). Each
    video is saved as ``Laracasts/<series>/Episode <n> - <name>.mp4`` relative
    to the current working directory; files that already exist are skipped.

    Raises:
        RuntimeError: If an episode page contains no download link
            (presumably the ``laravel_session`` cookie is missing, expired, or
            belongs to an account without an active subscription).
        requests.HTTPError: If the series API, an episode page, or a video
            download answers with an HTTP error status.
    """
    # Login to Laracasts via web and paste your laravel_session cookie here (inspect request header to find)
    # Must be logged into a Laracasts account with an active subscription
    cookies = {
        'laravel_session': 'PASTE_YOUR_LARAVEL_SESSION_HERE'
    }
    # Seconds to wait on the server, so a stalled connection cannot hang the script forever.
    request_timeout = 60

    # Get list of all categories with series via API (auth not required for this API call)
    # Only non-archived series are returned from this endpoint.
    # As of 10/9/2019, there are 79 "current" series and 19 "archived" series
    series_response = requests.get('https://laracasts.com/api/series', timeout=request_timeout)
    series_response.raise_for_status()
    series_json = series_response.json()

    for category in series_json:
        # As of 10/9/2019, Laracasts has 5 categories:
        # Laravel, PHP, Testing, JavaScript, and Tooling
        print("\nDownloading category: " + category)
        for series in series_json[category]:
            series_title = sanitize_for_file_name(series['title'])
            slug = series['slug']
            series_dir = 'Laracasts/' + series_title
            print("\nPreparing to download series: " + series_title)

            episode_counter = 1
            while True:
                episode_request = requests.get(
                    url='https://laracasts.com/series/' + slug + "/episodes/" + str(episode_counter),
                    cookies=cookies,
                    # If an invalid episode number is used, Laracasts redirects to the series landing page
                    allow_redirects=False,
                    timeout=request_timeout,
                )
                if 300 <= episode_request.status_code < 400:
                    # Redirected, no remaining episodes for this series.
                    break
                # Any other non-success status means the page cannot be parsed; fail loudly.
                episode_request.raise_for_status()

                page = episode_request.text
                _, marker, after_marker = page.partition('download-link="')
                if not marker:
                    raise RuntimeError(
                        "No download link found for episode " + str(episode_counter)
                        + " of series '" + slug + "'. Check that the laravel_session cookie"
                        + " belongs to a logged-in account with an active subscription."
                    )
                download_link = after_marker.split('"')[0]

                # Grab episode title from <title> tag (a page without one yields an empty name)
                soup = BeautifulSoup(page, features="html.parser")
                page_title = soup.title.string if soup.title is not None else None
                episode_name = sanitize_for_file_name((page_title or '').replace(series_title, ''))
                episode_path = series_dir + "/Episode " + str(episode_counter) + " - " + episode_name + ".mp4"

                # Creates 'Laracasts' and the series folder in one step; no error if they already exist.
                os.makedirs(series_dir, exist_ok=True)

                if not os.path.exists(episode_path):
                    # Download episode and write to file
                    print("Downloading episode " + str(episode_counter) + ": " + episode_name)
                    _download_to_file(
                        "https://www.laracasts.com" + download_link,
                        cookies,
                        episode_path,
                        request_timeout,
                    )
                else:
                    # Episode already exists, skip download
                    print('Episode ' + episode_name + ' already exists, skipping.')

                episode_counter = episode_counter + 1


def _download_to_file(url, cookies, destination, timeout):
    """Stream *url* into *destination*, creating it only after a complete download."""
    # Write to a temporary sibling first: an interrupted download must not leave a
    # truncated file at the final path, which a later run would skip as "already exists".
    partial_path = destination + '.part'
    with requests.get(url=url, cookies=cookies, allow_redirects=True,
                      stream=True, timeout=timeout) as response:
        # Refuse to save an HTML error page under an .mp4 name.
        response.raise_for_status()
        with open(partial_path, 'wb') as video_file:
            # Chunked writes keep memory use flat regardless of video size.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                video_file.write(chunk)
    os.replace(partial_path, destination)
def sanitize_for_file_name(toSanitize):
    """Return *toSanitize* with characters that are unsafe in file names removed.

    Every backslash, slash, asterisk, question mark, colon, double quote,
    angle bracket, pipe, and single quote is deleted, then leading and
    trailing whitespace is stripped. Interior whitespace is left untouched.
    """
    # Third argument of str.maketrans lists characters to delete outright.
    removal_table = str.maketrans('', '', '\\/*?:"<>|\'')
    return toSanitize.translate(removal_table).strip()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment