Created
January 26, 2018 18:00
-
-
Save chenzhuoyu/cecfea5e0b46753884ec1c909aaebf50 to your computer and use it in GitHub Desktop.
Pixiv downloader without login, requires BeautifulSoup4 and requests to work properly
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Pixiv downloader that works without logging in.

Usage: ./get_pixiv.py <illustration_url>

The script downloads the illustration page, takes the preview image found
inside a ``<div class="img-container">``, rewrites that preview URL into the
corresponding ``img-original`` URL (trying ``.jpg`` first, then ``.png``) and
writes the image to a file in the current working directory.

Requires BeautifulSoup4 and requests.
"""
import re
import sys

import bs4
import requests

# Passed as ``timeout`` to every requests call so that a stalled server
# cannot hang the script forever.
REQUEST_TIMEOUT = 30

# Only preview images served from this prefix are accepted.
PREVIEW_PREFIX = 'https://i.pximg.net/'

# Captures eight groups from the preview URL: six fixed-width numeric path
# segments (presumably the upload timestamp -- not verified here), then the
# leading number of the file name and the word that follows it.
PREVIEW_URL_RE = re.compile(
    r'.+/(\d{4})/(\d{2})/(\d{2})/(\d{2})/(\d{2})/(\d{2})/(\d+)_(\w+)_.*\..*$')

# Filled with the eight captured groups plus the file extension to try.
ORIGINAL_URL_FMT = (
    'https://i.pximg.net/img-original/img/%s/%s/%s/%s/%s/%s/%s_%s.%s')

# Extensions tried, in order, for the original image.
CANDIDATE_EXTENSIONS = ('jpg', 'png')

# Size of the chunks streamed from the response into the output file.
CHUNK_SIZE = 16384


def _fail(message):
    """Write *message* to stderr and terminate with exit status 1."""
    # sys.exit instead of the bare ``exit`` helper: the latter is injected by
    # the ``site`` module and is not guaranteed to exist in every run mode.
    sys.stderr.write('%s\n' % message)
    sys.exit(1)


def _find_preview_url(soup):
    """Return the first acceptable preview image URL in *soup*, or None.

    Only ``<div>`` elements whose class list is exactly ``img-container`` are
    searched, and only ``<img>`` tags whose ``src`` starts with
    ``PREVIEW_PREFIX`` are accepted.
    """
    for div in soup.find_all('div'):
        if not div.has_attr('class') or div['class'] != ['img-container']:
            continue
        for img in div.find_all('img'):
            if img.has_attr('src') and img['src'].startswith(PREVIEW_PREFIX):
                # Return right away: the search is over once a URL is found,
                # so later containers must not overwrite the result.
                return img['src']
    return None


def _request_original(parts, referer):
    """Request the original image, trying each candidate extension in turn.

    *parts* is the tuple of eight groups captured by ``PREVIEW_URL_RE`` and
    *referer* is the illustration page URL, sent as the ``Referer`` header.

    Returns ``(file_name, url, response)`` for the first attempt answered
    with HTTP 200, or for the last attempt when none succeeded. The response
    is opened in streaming mode; the caller is responsible for closing it.
    """
    headers = {
        'Host': 'i.pximg.net',
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; rv:32.0) Gecko/20100101 Firefox/32.0",
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Referer': referer,
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
    }

    file_name = url = resp = None
    for ext in CANDIDATE_EXTENSIONS:
        if resp is not None:
            # Release the connection held by the previous failed attempt.
            resp.close()
        file_name = '%s_%s.%s' % (parts[-2:] + (ext,))
        url = ORIGINAL_URL_FMT % (parts + (ext,))
        # stream=True defers the body download until iter_content() is used,
        # so the image is never held in memory as a whole.
        resp = requests.get(url, headers=headers, stream=True,
                            timeout=REQUEST_TIMEOUT)
        if resp.status_code == 200:
            break
    return file_name, url, resp


def main():
    """Run the downloader for the illustration URL given on the command line."""
    if len(sys.argv) != 2:
        _fail('usage: ./get_pixiv.py <illustration_url>')
    page_url = sys.argv[1]

    page = requests.get(page_url, timeout=REQUEST_TIMEOUT).text
    soup = bs4.BeautifulSoup(page, 'html.parser')

    preview_url = _find_preview_url(soup)
    if preview_url is None:
        _fail('Sorry, but no valid illustration URLs found.')

    match = PREVIEW_URL_RE.match(preview_url)
    if not match:
        _fail('Sorry, but no valid illustration URLs found.')

    file_name, url, resp = _request_original(match.groups(), page_url)
    try:
        if resp.status_code != 200:
            _fail('Sorry, but server returned %d' % resp.status_code)

        # Single pre-formatted argument: prints the same text on Python 2
        # and Python 3.
        print('File Name: %s' % file_name)
        print('Real Image URL: %s' % url)

        with open(file_name, 'wb') as f:
            for chunk in resp.iter_content(CHUNK_SIZE):
                f.write(chunk)
    finally:
        resp.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment