Created
June 24, 2018 04:03
-
-
Save MisakaMikoto-35c5/3fdcc26bf05795cfcd4f34ddb99568bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import urllib
import urllib.error
from urllib import request
class PixivSpider:
    """Fetch a pixiv illustration page and regex-extract its metadata.

    No cookies or credentials are sent; only a browser-like ``User-Agent``
    header is added to each request.
    """

    # Matches the ``src="..." alt="..."`` attributes of the 600x600 "master"
    # preview image.
    # NOTE(review): ``http[s]`` is a one-character class, so only ``https``
    # URLs match -- confirm whether ``https?`` was intended.
    IMAGE_PATTERN = re.compile(r'src="http[s]://[\w\d\-\_\.]{4,255}/\w/600x600/img-master/img/\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}/\d+_p0_master1200.jpg" alt="[^"]+"')
    # Matches the whole inline tag list (greedy, single line).
    TAGS_PATTERN = re.compile(r'<ul class="inline-list"><li class="tag">.+</li></ul>')
    # Matches one tag link's text inside the tag list.
    TAG_PATTERN = re.compile(r'class="text">[^<>]+</a>')
    DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

    def get_web_content(self, url, post_data=None, headers=None):
        """Perform an HTTP request and return its body, status and headers.

        Args:
            url: URL to open.
            post_data: Optional request body passed to ``urlopen`` as ``data``.
            headers: Optional mapping of extra request headers; entries here
                override the default ``User-Agent``.

        Returns:
            A dict with keys ``'content'`` (``str`` when the response
            ``Content-Type`` starts with ``text``, otherwise ``bytes``),
            ``'code'`` (the response's ``code`` attribute) and ``'headers'``.
            HTTP error responses are returned the same way rather than raised.
        """
        request_headers = {'User-Agent': self.DEFAULT_USER_AGENT}
        if headers is not None:
            request_headers.update(headers)
        req = urllib.request.Request(url, headers=request_headers)
        try:
            response = urllib.request.urlopen(req, data=post_data)
        except urllib.error.HTTPError as e:
            # An HTTPError is itself a readable response object; treat it
            # like a normal response so callers can inspect the status code.
            response = e
        try:
            content = response.read()
            response_headers = response.headers
            status = response.code
        finally:
            # Release the underlying connection even if reading fails.
            response.close()
        content_type = response_headers['Content-Type']
        if content_type is not None and content_type.startswith('text'):
            content = content.decode('utf-8')
        return {'content': content, 'code': status, 'headers': response_headers}

    def guest_decode_image(self, html_content):
        """Extract the preview image URL, title and illustrator from HTML.

        The first ``IMAGE_PATTERN`` match is used. Its ``alt`` text is split
        at the last ``/``: the part before becomes ``'title'`` and the part
        after becomes ``'illustrator'`` (presumably the page renders the alt
        text as "title/illustrator" -- verify against the live markup).
        When the alt text has no ``/``, the whole text is the title and the
        illustrator is an empty string.

        Returns:
            A dict with keys ``'image'``, ``'title'`` and ``'illustrator'``,
            or ``False`` when no image is found.
        """
        match = self.IMAGE_PATTERN.search(html_content)
        if match is None:
            return False
        # Drop the leading 'src="' and the closing '"' of the alt attribute.
        attributes = match.group(0)[len('src="'):-1]
        image_url, _, alt_text = attributes.partition('" alt="')
        title, separator, illustrator = alt_text.rpartition('/')
        if not separator:
            title, illustrator = alt_text, ''
        return {'image': image_url, 'title': title, 'illustrator': illustrator}

    def guest_decode_tags(self, html_content):
        """Extract the list of tag names from HTML.

        Returns:
            A list of tag strings (possibly empty when the tag list contains
            no ``class="text"`` links), or ``False`` when the tag list itself
            is not found.
        """
        tag_list = self.TAGS_PATTERN.search(html_content)
        if tag_list is None:
            return False
        prefix_len = len('class="text">')
        suffix_len = len('</a>')
        return [
            link[prefix_len:-suffix_len]
            for link in self.TAG_PATTERN.findall(tag_list.group(0))
        ]

    def guest_get_pixiv(self, pid):
        """Fetch illustration ``pid`` and return its scraped metadata.

        Returns:
            The dict produced by ``guest_decode_image`` with an extra
            ``'tags'`` key holding the result of ``guest_decode_tags``.

        Raises:
            GetWebContentError: If the response status is not 200, or the
                image or tag list cannot be located in the page.
        """
        url = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id={}'.format(pid)
        http_result = self.get_web_content(url)
        if http_result['code'] != 200:
            raise GetWebContentError(
                http_result,
                'unexpected HTTP status {}'.format(http_result['code']))
        http_content = http_result['content']
        result = self.guest_decode_image(http_content)
        if result is False:
            raise GetWebContentError(http_result, 'image not found in page')
        tags = self.guest_decode_tags(http_content)
        if tags is False:
            raise GetWebContentError(http_result, 'tag list not found in page')
        result['tags'] = tags
        return result
class GetWebContentError(Exception):
    """Raised when a page cannot be fetched or parsed as expected.

    Attributes:
        HTTP_RESULT: The object passed as ``http_result`` (the callers in
            this file pass the dict returned by ``get_web_content``).
        message: Human-readable description of the failure.
    """

    def __init__(self, http_result, message=None):
        """Store the HTTP result and initialise the exception message.

        Args:
            http_result: The result associated with the failure.
            message: Optional description. When omitted, a short default
                naming the HTTP status (if available) is used, so the full
                page body is not dumped into ``str(exc)``.
        """
        if message is None:
            code = http_result.get('code') if isinstance(http_result, dict) else None
            message = 'failed to get usable web content (HTTP status: {})'.format(code)
        super().__init__(message)
        self.HTTP_RESULT = http_result
        self.message = message
if __name__ == '__main__':
    # Demo run: fetch one hard-coded illustration id and print the result.
    spider = PixivSpider()
    print(spider.guest_get_pixiv(67807764))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment