Last active
November 14, 2021 14:58
-
-
Save APadierna/d5a12a301b318397a7ed to your computer and use it in GitHub Desktop.
Script to download The dilbert comic strips
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Simple script to download the Dilbert comic strips in a defined period of time.

If no arguments are passed to the script, it will download all the Dilbert comic
strips in the current folder (it may take a while).

Acknowledgments
---------------
This script is strongly based on the work from:
https://community.spiceworks.com/scripts/show/982-download-all-dilbert-comics
"""
import datetime | |
import os | |
import re | |
import sys | |
import time | |
import argparse | |
from dateutil import rrule, parser | |
# Python 3 exposes urlopen/urlretrieve in urllib.request, Python 2 in urllib
# itself; alias whichever applies as ``ul`` so the rest of the script does not
# need to care about the interpreter version.
if sys.version_info[0] <= 2:
    import urllib as ul
else:
    import urllib.request as ul
def main():
    """Parse the command line, enter the output folder and download the strips.

    The output folder is created when it does not exist yet. If it cannot be
    created or entered, a warning is written to stderr and the strips are
    saved in the current folder instead.
    """
    args = parse_input_arguments()

    # If a dump folder has been defined, create it (if it does not already
    # exist) and move to it.
    if args.output != '.':
        try:
            if not os.path.isdir(args.output):
                os.makedirs(args.output)
            os.chdir(args.output)
        except OSError as error:
            # Best effort: keep going in the current folder, but say so
            # instead of silently ignoring the problem.
            sys.stderr.write(
                'warning: cannot use output folder %r (%s); '
                'saving to the current folder instead\n'
                % (args.output, error))

    download_strips(args.start_date, args.end_date)
def parse_input_arguments(argv=None):
    """Parse the command-line options of the script.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. When omitted, argparse reads them from
        ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        ``start_date`` and ``end_date`` as ``datetime.datetime`` objects and
        ``output`` as the dump folder path. A date that cannot be parsed ends
        the program through ``ArgumentParser.error``.
    """
    argp = argparse.ArgumentParser(description='Dilbert strips download script.')
    argp.add_argument("-s", "--start",
                      help="start date (1989-04-17, 1st published strip).",
                      dest="start_date",
                      default='1989-04-17')
    argp.add_argument("-e", "--end",
                      dest="end_date",
                      help="End date (default, today)",
                      default=None)
    argp.add_argument("-o", "--output",
                      dest="output",
                      help="Comics dump folder",
                      default='.')
    args = argp.parse_args(argv)

    try:
        args.start_date = parser.parse(args.start_date)
        if args.end_date is None:
            # Midnight of today as a datetime, so that start_date and
            # end_date always share the same type.
            args.end_date = datetime.datetime.combine(datetime.date.today(),
                                                      datetime.time.min)
        else:
            args.end_date = parser.parse(args.end_date)
    except (ValueError, OverflowError) as error:
        # Report an unparseable date as a usage error rather than a traceback.
        argp.error('invalid date: %s' % error)

    return args
def download_strips(start_date, end_date):
    """Download one strip per day from start_date up to end_date.

    Each strip is saved in the current folder as ``YYYY-MM-DD.jpg``. A strip
    that cannot be located or fetched is reported on stderr and skipped, so a
    single bad day does not abort the whole run.

    Parameters
    ----------
    start_date, end_date
        Bounds of the period, handed to ``rrule`` as ``dtstart`` and
        ``until`` respectively.
    """
    # Iterate the rule lazily; there is no need to build the full list first.
    for day in rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date):
        comic_date = '%04d-%02d-%02d' % (day.year, day.month, day.day)
        url = 'http://dilbert.com/strip/' + comic_date
        comic_name = comic_date + '.jpg'

        # A single pre-formatted string prints the same text on Python 2
        # (where print is a statement) and on Python 3.
        print('getting comic from %s' % comic_date)

        try:
            ul.urlretrieve(get_true_comic_url(url), comic_name)
        except (IOError, ValueError, AttributeError) as error:
            # IOError covers network, HTTP and file-system failures;
            # ValueError/AttributeError cover a page in which no image URL
            # could be found.
            sys.stderr.write('could not download %s: %s\n'
                             % (comic_date, error))

        # Short pause between requests.
        time.sleep(0.01)
def get_true_comic_url(comic_url, comic_name='comic'):
    """Return the true comic strip URL found in a strip web-page.

    It looks like Scott Adams has protected himself against pointy haired
    pirates by hiding his comic strips within the assets.amuniversal domain.
    This function digs into the comic strip web-page, finds (and returns)
    the URL where the original image lives.

    Parameters
    ----------
    comic_url : str
        Address of the strip page, e.g. ``http://dilbert.com/strip/<date>``.
    comic_name : str, optional
        Unused; kept so that existing callers keep working.

    Returns
    -------
    str
        The first ``https://assets.amuniversal.com/...`` URL in the page.

    Raises
    ------
    ValueError
        If the page contains no such URL.
    """
    response = ul.urlopen(comic_url)
    try:
        raw_page = response.read()
    finally:
        # Always release the connection, even when reading fails.
        response.close()

    # Decode the payload instead of searching the repr of a bytes object;
    # undecodable bytes are irrelevant to the (pure ASCII) URL being sought.
    html = raw_page.decode('utf-8', 'replace')

    # Raw string: '\.' and '\d' are regex escapes, not string escapes.
    comic_strip_pattern = r'https://assets\.amuniversal\.com/[a-zA-Z\d]+'
    match = re.search(comic_strip_pattern, html)
    if match is None:
        raise ValueError('no comic strip image found in %s' % comic_url)
    return match.group()
# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()
I'm afraid the script got a bit out of date since it was written. Thanks for pointing out the change from http to https !! :-D
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
line 92 should be:
comic_strip_pattern = 'https://assets\.amuniversal\.com/[a-zA-Z\d]+'