Last active
June 14, 2018 09:58
-
-
Save impshum/73b4fae7375d05588e47f7e4a26fa0dd to your computer and use it in GitHub Desktop.
Download all xkcd comics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download xkcd comics into ./comics, one image file per comic.
#
# For every comic number the script fetches https://xkcd.com/<n>/info.0.json,
# reads the 'num', 'safe_title' and 'img' fields, and stores the image as
# comics/<num>-<title><ext>.  The comics directory is created if it is missing.
import itertools
import os
from urllib.parse import urlsplit

import requests

# NOTE(review): 933 looks like a leftover from resuming an interrupted run;
# set this to 1 to fetch the whole archive -- confirm the intended start.
FIRST_COMIC = 933

os.makedirs('comics', exist_ok=True)

for n in itertools.count(FIRST_COMIC):
    if n == 404:
        # Number 404 is skipped on purpose: its HTTP 404 answer would
        # otherwise be mistaken for the end of the archive below.
        continue

    r = requests.get('https://xkcd.com/{}/info.0.json'.format(n))
    if r.status_code == 404:
        break  # ran past the newest comic
    r.raise_for_status()  # any other HTTP error: fail loudly, not in r.json()

    d = r.json()
    num, title, img_url = d['num'], d['safe_title'], d['img']
    print('{}: {}'.format(num, title))

    # Take the extension from the URL itself.  The previous png/jpg/gif
    # checks left `ext` unset (NameError) or stale from the previous comic
    # whenever the URL ended in anything else.
    ext = os.path.splitext(urlsplit(img_url).path)[1]
    if not ext:
        print('  skipped: image URL has no file extension ({})'.format(img_url))
        continue

    # Download before opening the target so a failed request does not leave
    # an empty or bogus file behind.
    image = requests.get(img_url)
    if not image.ok:
        print('  skipped: image download returned HTTP {}'.format(image.status_code))
        continue

    path = 'comics/{}-{}{}'.format(
        num, title.replace(' ', '_').replace('/', '-'), ext)
    with open(path, 'wb') as f:
        f.write(image.content)
@sdegutis Did you do a full run?
@impshum I did not :/
Test it out, man. I had to fix the 404 error/comic thing and the file types. I'll post a shorter version if I find a way to do it. For now it works from start to finish... BAM!
8 LOC:
# Compact xkcd downloader (still 8 lines of code): fetch each comic's JSON, save its
# image into an existing ./comics directory, and stop after two 404 responses in a row.
import requests
n, prev, cur = 1, 200, 200  # comic number + statuses of the last two metadata requests (200 = "not a 404 yet")
while (prev, cur) != (404, 404):  # a single 404 may be a gap in the numbering; two in a row end the run
    r = requests.get('https://xkcd.com/{}/info.0.json'.format(n))
    d = r.json() if r.status_code != 404 else {'img': ''}  # parse the JSON once; empty stub when the comic is missing
    if d['img'][-4:-3] == '.':  # only URLs ending in a dot plus three characters are saved; the slice cannot raise on short strings
        with open('comics/{}-{}.{}'.format(d['num'], ''.join('_' if c in '\\/`*{}[]()<>#+!?:' else c for c in d['safe_title']), d['img'][-3:]), "wb") as f: f.write(requests.get(d['img']).content)
    prev, cur, n = cur, r.status_code, n + 1  # previously the stop test ignored the current status, so an odd image URL right after a 404 ended the run early
Long version, with makedir:
# Long version of the xkcd downloader, including creation of the target directory.
#
# Walks the comic numbers upwards, reads each comic's info.0.json and stores
# the image as comics/<num>-<sanitised title>.<ext>.  The run ends when two
# consecutive metadata requests answer with HTTP 404.
import os  # only needed to create the target directory

import requests

# Characters replaced by '_' when the title becomes part of the file name.
UNSAFE_CHARS = '\\/`*{}[]()<>#+!?:'

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs('comics', exist_ok=True)

last_status = 0  # status code of the previous metadata request
n = 0  # original start value; presumably /0/ answers 404 and is tolerated by
#        the two-in-a-row rule -- TODO confirm, or simply start at 1
while True:
    r = requests.get('https://xkcd.com/{}/info.0.json'.format(n))  # get page
    if r.status_code == 404:
        # Stop only on the second 404 in a row.  The old test looked at the
        # previous status alone, so a comic with an unusual image URL right
        # after a 404 would have ended the run early.
        if last_status == 404:
            break
    else:
        d = r.json()  # parse once and reuse
        img_url = d['img']
        # A dot four characters from the end indicates a typical three-letter
        # image extension; slicing (unlike [-4]) cannot raise on short strings.
        if img_url[-4:-3] == '.':
            print('{}: {}'.format(d['num'], d['safe_title']))  # progress output (optional)
            safe_title = ''.join('_' if c in UNSAFE_CHARS else c
                                 for c in d['safe_title'])
            # Download first so a failed request leaves no empty file behind.
            image = requests.get(img_url)
            path = 'comics/{}-{}.{}'.format(d['num'], safe_title, img_url[-3:])
            with open(path, "wb") as f:
                f.write(image.content)
    last_status = r.status_code  # remember for the next iteration's stop test
    n += 1  # next one, please
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Super cool! You inspired me to give it a try too! Here's my take in Node.js, same #LOC: