Created
December 17, 2017 01:48
-
-
Save ikegami-yukino/f5296b2ac19cf431aa90193359b18865 to your computer and use it in GitHub Desktop.
Pixiv小説のクロール
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Crawl pixiv novels for one tag and save each novel body as a text file.

The script logs in through the pixiv account form, walks the tag search
result pages (ordered by date), opens every novel listed on each page and
writes its text, with whitespace removed, to ``<title>.txt`` in the current
working directory.
"""
import re
from urllib.parse import quote

from robobrowser import RoboBrowser

PIXIV_BASE_URL = 'https://www.pixiv.net'
TAG = '巴マミ'
MAX_PAGE = 190

# Compiled once because it is applied to every novel body in the loop below.
WHITESPACE_RE = re.compile(r'\s|\n| ')

browser = RoboBrowser(parser='lxml', history=True)

# Log in. 'USERNAME' / 'PASSWORD' are placeholders to be replaced with real
# credentials before running.
browser.open('https://accounts.pixiv.net/login')
form = browser.get_forms('form', class_='')[0]
form['pixiv_id'] = 'USERNAME'
form['password'] = 'PASSWORD'
browser.submit_form(form)

# NOTE(review): the page index starts at 0 here; pixiv result pages look
# 1-based, so p=0 may duplicate the first page -- confirm before changing.
for i in range(MAX_PAGE):
    print(i)
    # Percent-encode the tag so characters such as '&', '#' or spaces cannot
    # break the query string.
    browser.open(
        PIXIV_BASE_URL
        + '/novel/tags.php?tag={}&order=date&p={}'.format(quote(TAG, safe=''), i)
    )
    novel_items = browser.find(class_='novel-items')
    if novel_items is None:
        # No result list on this page: stop paging.
        break
    # Loop over the novels on this page.
    for novel in novel_items.find_all(class_='_novel-item'):
        link = novel.find('h1').find('a')
        browser.open(PIXIV_BASE_URL + link['href'])
        # Novel information. '/' is replaced because the title becomes a
        # file name.
        title = link.text.replace('/', '_')
        body = browser.find('textarea', id='novel_text')
        if body is None:
            # The novel page has no text area; skip it rather than aborting
            # the whole crawl with an AttributeError.
            print('skipped (no novel text): ' + title)
            continue
        text = WHITESPACE_RE.sub('', body.text)
        print(title)
        # Explicit UTF-8 so the Japanese text is written correctly whatever
        # the platform's default locale encoding is.
        with open(title + '.txt', 'w', encoding='utf-8') as fd:
            fd.write(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment