Created
April 7, 2016 11:52
-
-
Save HakurouKen/e036160d54ed1420e7a2c6b114ae4d16 to your computer and use it in GitHub Desktop.
Get the original pixiv picture(s) for an illustration ID without logging in.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re

import requests
from pyquery import PyQuery

try:
    from urllib2 import HTTPError  # Python 2
except ImportError:
    from urllib.error import HTTPError  # Python 3
class Picture(object):
    '''
    Resolve the original-size image URL(s) of a pixiv illustration.

    The illustration page is fetched anonymously, the thumbnail URL(s) found
    in it are rewritten into "img-original" URLs, and the real file extension
    is discovered by probing the image server with HEAD requests.

    @Note: HTML changed after login.
    '''
    PAGE_URL = 'http://www.pixiv.net/member_illust.php?mode={mode}&illust_id={illust_id}'
    MEDIUM_PAGE_URL = PAGE_URL.format(mode='medium',illust_id='{}')
    MANGA_PAGE_URL = PAGE_URL.format(mode='manga',illust_id='{}')
    FULL_PIC_URL = 'http://{domain}/img-original/img/{time}/{id}_p{index}.{suffix}'
    SMALL_PIC_PATTERN = re.compile(r'https?:\/\/(\w+\.pixiv\.net)\/.*\/(\d{4}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/\d{2})\/(\d+)_p(\d+)\w+\.(.*)$')
    # Extensions probed, in this order, when looking for the original file.
    SUFFIXES = ('png', 'jpg', 'gif')

    def __init__(self, illust_id, multiple=None):
        '''
        @param illust_id: pixiv illustration id.
        @param multiple: True/False to force multi/single picture handling,
            or None to detect it from the illustration page on first use.
        '''
        self.illust_id = illust_id
        self._content = None
        self._headers = {'Referer': 'http://www.pixiv.net/'}
        self.multiple = multiple

    @property
    def images(self):
        '''
        List of original-size image URLs of this illustration.
        '''
        if self.multiple is None:
            page = self._get_content()
            self.multiple = bool(page('.img-container ._work').hasClass('multiple'))
        if self.multiple:
            return self._get_multiple()
        return self._get_single()

    @property
    def author(self):
        '''
        Text of the `.userdata .name` element of the illustration page.
        '''
        return self._get_content()('.userdata .name').text()

    @property
    def title(self):
        '''
        Text of the `.userdata .title` element of the illustration page.
        '''
        return self._get_content()('.userdata .title').text()

    def info(self):
        '''
        Return a dict with the keys 'author', 'title' and 'images'.
        '''
        return {
            'author': self.author,
            'title': self.title,
            'images': self.images
        }

    def download(self, folder='.'):
        '''
        Download every image of the illustration into `folder`.

        Files are named "<author> - <title><n><suffix>", where <n> is empty
        for the first image and the 1-based position for the following ones.
        Images the server does not answer with status 200 are skipped
        (best effort) and no file is created for them.

        @return: list of the file paths that were written.
        '''
        images = self.images
        author = self.author
        title = self.title
        saved = []
        for i, image in enumerate(images):
            _, suffix = os.path.splitext(image)
            numbering = '' if i == 0 else str(i + 1)
            filename = self._safe_filename(author + ' - ' + title + numbering + suffix)
            # stream=True lets iter_content() fetch the body chunk by chunk
            # instead of loading the whole image into memory first.
            resp = requests.get(image, headers=self._headers, stream=True)
            try:
                if resp.status_code != 200:
                    continue
                path = os.path.join(folder, filename)
                with open(path, 'wb') as f:
                    for chunk in resp.iter_content(1024*1024):
                        f.write(chunk)
                saved.append(path)
            finally:
                resp.close()
        return saved

    @staticmethod
    def _safe_filename(name):
        '''
        Replace path separators so that `name` stays a single file name.
        '''
        for separator in ('/', '\\'):
            name = name.replace(separator, '_')
        return name

    def _build_error(self, resp=None, status=None, msg=None):
        '''
        Build (not raise) the exception describing a failed request.

        @param resp: the failed response, if any; its url, status code and
            reason are used.
        @param status: HTTP status to report when there is no response.
        @param msg: custom message; a default one is derived otherwise.
        @return: an HTTPError when a positive status is known, otherwise a
            RuntimeError.
        '''
        if resp is not None:
            url = resp.url
            status = resp.status_code
            default_msg = 'Error "{}" happend at ID {}'.format(resp.reason, self.illust_id)
        else:
            url = ''
            status = status or -1
            default_msg = ''
        if status > 0:
            return HTTPError(url, status, msg or default_msg, None, None)
        return RuntimeError(msg or "Unknown error happend at ID {}".format(self.illust_id))

    def _fetch_page(self, url):
        '''
        GET `url` and return the parsed document; raise on non-200 answers.
        '''
        resp = requests.get(url, headers=self._headers)
        if resp.status_code == 200:
            return PyQuery(resp.content)
        if resp.status_code == 404:
            raise self._build_error(resp, msg='ID {} does not exists'.format(self.illust_id))
        raise self._build_error(resp)

    def _get_content(self):
        '''
        Return the parsed "medium" page, downloading it only once.
        '''
        if self._content is None:
            self._content = self._fetch_page(self.MEDIUM_PAGE_URL.format(self.illust_id))
        return self._content

    def _get_multiple_content(self):
        '''
        Return the parsed "manga" page, which lists every picture of a
        multi-picture illustration.
        '''
        return self._fetch_page(self.MANGA_PAGE_URL.format(self.illust_id))

    def _original_url_template(self, small_url):
        '''
        Turn a thumbnail URL into the matching original-picture URL.

        The small img url is something like:
        http://{domain}/c/{size}/img-master/img/{date-with-slash}/{id}_p{num-of-img}_master{xxx}.{suffix}

        @return: (template, suffix) where template is the original URL with
            '{}' in place of the extension and suffix is the thumbnail's
            extension.
        @raise HTTPError: (404) when `small_url` is missing or unrecognised.
        '''
        match = self.SMALL_PIC_PATTERN.match(small_url) if small_url else None
        if match is None:
            raise self._build_error(status=404, msg='Cannot find picture of ID {}'.format(self.illust_id))
        domain, time, pic_id, index, suffix = match.groups()
        template = self.FULL_PIC_URL.format(
            domain=domain, time=time, id=pic_id, index=index, suffix='{}')
        return template, suffix

    def _get_single(self):
        '''
        Get picture url from page which has only one picture.
        '''
        small = self._get_content()('.img-container img').attr('src')
        template, small_suffix = self._original_url_template(small)
        suffix = self._get_img_suffix(template, default_suffix=small_suffix)
        return [template.format(suffix)]

    def _get_multiple(self):
        '''
        Get picture urls from page which has more than one picture.
        '''
        page = self._get_multiple_content()
        urls = [img.attr('data-src') for img in page('.item-container img').items()]
        if not urls:
            raise self._build_error(status=404, msg='ID {} do not have picture.'.format(self.illust_id))
        parsed = [self._original_url_template(url) for url in urls]
        templates = [template for template, _ in parsed]
        default_suffix = next((suffix for _, suffix in parsed if suffix), None)
        # Only the first picture is probed; the extension found for it is
        # applied to every picture of the illustration.
        suffix = self._get_img_suffix(templates[0], default_suffix=default_suffix)
        return [template.format(suffix) for template in templates]

    def _check_img_suffix(self, url_part, suffix):
        '''
        Return `suffix` if a HEAD request for `url_part` completed with it
        answers 200, otherwise None.
        '''
        resp = requests.head(url_part.format(suffix), headers=self._headers)
        if resp.status_code == 200:
            return suffix
        return None

    def _get_img_suffix(self, url_part, default_suffix=None):
        '''
        Guess the image suffix.
        Original picture's suffix may vary from small one.

        `default_suffix` (if given) is tried first, then the remaining
        entries of SUFFIXES.

        @raise HTTPError: (404) when no candidate suffix exists on the server.
        '''
        candidates = [default_suffix] if default_suffix else []
        candidates.extend(s for s in self.SUFFIXES if s != default_suffix)
        for candidate in candidates:
            if self._check_img_suffix(url_part, candidate):
                return candidate
        raise self._build_error(status=404, msg='Cannot find picture of ID {}'.format(self.illust_id))
if __name__ == '__main__':
    # Command-line entry point: the first argument is the illustration id;
    # the resulting author/title/images dict is printed.
    import sys

    args = sys.argv[1:]
    if not args:
        raise ValueError('Need an illust id.')
    illust_id = int(args[0])
    # A parenthesised single argument prints identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(Picture(illust_id).info())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment