Skip to content

Instantly share code, notes, and snippets.

@MisakaMikoto-35c5
Created June 24, 2018 04:03
Show Gist options
  • Save MisakaMikoto-35c5/3fdcc26bf05795cfcd4f34ddb99568bd to your computer and use it in GitHub Desktop.
Save MisakaMikoto-35c5/3fdcc26bf05795cfcd4f34ddb99568bd to your computer and use it in GitHub Desktop.
import re
import urllib
from urllib import request
class PixivSpider:
    """Scrape basic metadata of a Pixiv illustration page.

    The class sends only a browser-like ``User-Agent`` (plus any headers the
    caller supplies), fetches the ``member_illust.php`` page for an
    illustration id and extracts the preview image URL, title, illustrator
    and tag names with regular expressions.
    """

    # Matches the preview <img> attributes: src="<600x600 master URL>" alt="<text>".
    # The alt text is later split on its last '/' into title and illustrator.
    IMAGE_PATTERN = re.compile(
        r'src="https?://[\w\d\-\_\.]{4,255}/\w/600x600/img-master/img/'
        r'\d{4}/\d{2}/\d{2}/\d{2}/\d{2}/\d{2}/\d+_p0_master1200\.jpg" alt="[^"]+"'
    )
    # Matches the whole tag list container.
    TAGS_PATTERN = re.compile(r'<ul class="inline-list"><li class="tag">.+</li></ul>')
    # Matches one tag link inside the container; the tag name sits between
    # the 13-character prefix 'class="text">' and the 4-character suffix '</a>'.
    TAG_PATTERN = re.compile(r'class="text">[^<>]+</a>')
    DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

    def get_web_content(self, url, post_data=None, headers=None):
        """Fetch *url* and return its body, status code and headers.

        An ``urllib.error.HTTPError`` is not propagated: the error object is
        read like a normal response, so callers must inspect ``'code'``.

        Args:
            url: Address to request.
            post_data: Optional request body passed to ``urlopen`` as ``data``.
            headers: Optional mapping of extra request headers; entries
                override the default ``User-Agent``.

        Returns:
            A dict with keys ``'content'`` (``str`` when the Content-Type
            starts with ``text``, otherwise the raw ``bytes``), ``'code'``
            and ``'headers'``.
        """
        request_headers = {'User-Agent': self.DEFAULT_USER_AGENT}
        if headers is not None:
            request_headers.update(headers)
        req = urllib.request.Request(url, headers=request_headers)
        try:
            response = urllib.request.urlopen(req, data=post_data)
        except urllib.error.HTTPError as error:
            # HTTPError can be read like a response (body, code, headers).
            response = error
        try:
            content = response.read()
            response_headers = response.headers
            status_code = response.code
        finally:
            # Always release the underlying connection.
            response.close()
        content_type = response_headers['Content-Type']
        if content_type is not None and content_type.startswith('text'):
            # Honour the charset declared by the server, defaulting to UTF-8.
            charset = response_headers.get_content_charset() or 'utf-8'
            try:
                content = content.decode(charset)
            except LookupError:
                # Unknown charset name: fall back to the historical default.
                content = content.decode('utf-8')
        return {'content': content, 'code': status_code, 'headers': response_headers}

    def guest_decode_image(self, html_content):
        """Extract image URL, title and illustrator from page HTML.

        Args:
            html_content: HTML text of the illustration page.

        Returns:
            A dict with keys ``'image'``, ``'title'`` and ``'illustrator'``,
            or ``False`` when *html_content* is not a string or contains no
            matching image tag.
        """
        if not isinstance(html_content, str):
            return False
        matches = self.IMAGE_PATTERN.findall(html_content)
        # findall() returns an empty list (never None) when nothing matches.
        if not matches:
            return False
        # Drop the leading 'src="' and the closing quote of the alt attribute.
        attributes = matches[0][5:-1]
        image_url, alt_text = attributes.split('" alt="', 1)
        # The alt text is '<title>/<illustrator>'; the title may contain '/'.
        title, separator, illustrator = alt_text.rpartition('/')
        if not separator:
            # No '/' at all: keep the whole text as the title.
            title, illustrator = alt_text, ''
        return {'image': image_url, 'title': title, 'illustrator': illustrator}

    def guest_decode_tags(self, html_content):
        """Extract the tag names from page HTML.

        Args:
            html_content: HTML text of the illustration page.

        Returns:
            A list of tag name strings (possibly empty when the tag list
            holds no tag links), or ``False`` when *html_content* is not a
            string or contains no tag list.
        """
        if not isinstance(html_content, str):
            return False
        containers = self.TAGS_PATTERN.findall(html_content)
        # findall() returns an empty list (never None) when nothing matches.
        if not containers:
            return False
        tag_links = self.TAG_PATTERN.findall(containers[0])
        # Strip 'class="text">' (13 chars) and '</a>' (4 chars) from each match.
        return [link[13:-4] for link in tag_links]

    def guest_get_pixiv(self, pid):
        """Fetch and parse the page of illustration *pid*.

        Args:
            pid: Illustration id inserted into the ``illust_id`` query field.

        Returns:
            A dict with keys ``'image'``, ``'title'``, ``'illustrator'`` and
            ``'tags'``.

        Raises:
            GetWebContentError: If the response status is not 200, or the
                image or the tag list cannot be found in the page.
        """
        url = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id={}'.format(pid)
        http_result = self.get_web_content(url)
        if http_result['code'] != 200:
            raise GetWebContentError(
                http_result,
                'Unexpected HTTP status {}'.format(http_result['code']))
        page = http_result['content']
        result = self.guest_decode_image(page)
        if result is False:
            raise GetWebContentError(http_result, 'Illustration image not found in page')
        tags = self.guest_decode_tags(page)
        # Identity check: an empty tag list is a valid result, not a failure.
        if tags is False:
            raise GetWebContentError(http_result, 'Tag list not found in page')
        result['tags'] = tags
        return result
class GetWebContentError(Exception):
    """Raised when a fetched page is unusable (bad status or unparsable).

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, http_result, message=None):
        """Store the HTTP result and initialise the exception message.

        Args:
            http_result: The value describing the failed fetch; callers in
                this file pass a dict with ``'content'``, ``'code'`` and
                ``'headers'`` keys.
            message: Optional human-readable description. When omitted, a
                default mentioning the status code (if available) is used.
        """
        if message is None:
            code = http_result.get('code') if isinstance(http_result, dict) else None
            message = 'Failed to get usable web content (HTTP status: {})'.format(code)
        # Pass the message on so str(error) and tracebacks are informative.
        super().__init__(message)
        # Full HTTP result, kept under its original attribute name.
        self.HTTP_RESULT = http_result
        # Human-readable description of the failure.
        self.message = message
if __name__ == '__main__':
    # Script entry point: fetch one hard-coded illustration id and print
    # the parsed metadata dict.
    spider = PixivSpider()
    metadata = spider.guest_get_pixiv(67807764)
    print(metadata)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment