笔趣阁《弃少归来张恒》 crawler (Scrapy spider)
# -*- coding: utf-8 -*-
import re
import json

import scrapy
from scrapy.crawler import CrawlerProcess


class BiqugesoSpider(scrapy.Spider):
    # custom_settings = {}
    name = 'biqugeso'
    allowed_domains = ['www.biqugeso.com']
    CHAPTER_URL = 'https://www.biqugeso.com/book/{book}/{chapter}.html'
    BOOK_ID = '45724'

    def start_requests(self):
        # Crawl the chapter catalog (table of contents)
        index_url = 'https://www.biqugeso.com/book/{}/'.format(self.BOOK_ID)
        req = scrapy.Request(
            index_url,
            callback=self.parse_catalog
        )
        yield req

    def parse_catalog(self, response):
        '''Parse the chapter catalog'''
        chapter_list = []
        for a in response.css('#list-chapterAll').css('dd > a'):
            name = a.xpath('./@title').extract_first()
            # Zero-pad the chapter number so chapter names sort correctly as strings
            name = re.sub(
                r'第(\d+)章',
                lambda mat: '第{}章'.format(mat.group(1).rjust(5, '0')),
                name
            )
            chapter_list.append({
                'name': name,
                'chapter': a.xpath('./@href').extract_first().split('.')[0]
            })
        # Crawl part 1 of each chapter
        for chapter in chapter_list:
            url_part1 = self.CHAPTER_URL.format(**{
                'book': self.BOOK_ID,
                'chapter': chapter['chapter']
            })
            req = scrapy.Request(
                url_part1,
                callback=self.parse_chapter_part1
            )
            req.meta['chapter_info'] = chapter
            yield req

    def parse_chapter(self, response):
        '''Parse the content of one chapter page'''
        part_title = ''.join(response.css('.readTitle').xpath(
            './descendant-or-self::*/text()'
        ).extract())
        # Extract the chapter text, stripping the site's advertising line and extra whitespace
        part_content = ''.join(response.css('#htmlContent').xpath(
            './descendant-or-self::*/text()[not(ancestor::p) and not(ancestor::script)]'
        ).extract()).replace(
            '请记住【笔趣阁 m.biqugeso.com】,第一时间更新,无弹窗,免费读!', ''
        ).replace('\r\n\r\n\xa0\xa0\xa0\xa0', '\r\n').strip('>').strip()
        return {
            'part_title': part_title,
            'part_content': part_content,
            'part_url': response.url
        }

    def parse_chapter_part1(self, response):
        '''Parse chapter content (part 1)'''
        # Crawl part 2 of the chapter
        chapter_info = response.meta['chapter_info']
        url_part2 = self.CHAPTER_URL.format(**{
            'book': self.BOOK_ID,
            'chapter': chapter_info['chapter'] + '_2'
        })
        req = scrapy.Request(
            url_part2,
            callback=self.parse_chapter_part2
        )
        req.meta['chapter_info'] = chapter_info
        req.meta['part1_data'] = self.parse_chapter(response)
        yield req

    def parse_chapter_part2(self, response):
        '''Parse chapter content (part 2)'''
        chapter_info = response.meta['chapter_info']
        part1_data = response.meta['part1_data']
        part2_data = self.parse_chapter(response)
        chapter_info['part1'] = part1_data
        chapter_info['part2'] = part2_data
        # Yield the assembled chapter item
        yield chapter_info


if __name__ == '__main__':
    # Scrapy settings
    settings = {
        'ROBOTSTXT_OBEY': False,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'DOWNLOAD_DELAY': 0.2,
        'DUPEFILTER_DEBUG': True,
        'COOKIES_DEBUG': True,
        'CONCURRENT_REQUESTS': 4,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 1,
        'LOG_LEVEL': 'DEBUG',
        'FEED_EXPORT_ENCODING': 'utf-8'
    }
    # Export results as a JSON feed
    # from: https://stackoverflow.com/questions/23574636/scrapy-from-script-output-in-json/33005001#33005001
    settings.update({
        'FEED_FORMAT': 'json',
        'FEED_URI': '{}-result.json'.format(BiqugesoSpider.name)
    })
    # Run the spider from a single script
    # from: https://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script/31374345#31374345
    process = CrawlerProcess(settings)
    process.crawl(BiqugesoSpider)
    process.start()
    # Read the exported JSON back
    with open(settings['FEED_URI'], 'rt', encoding='utf-8') as rf:
        content = json.load(rf)
    # Sort chapters by their zero-padded name
    content.sort(key=lambda item: item['name'], reverse=False)
    # Write the sorted result back
    with open(settings['FEED_URI'], 'wt', encoding='utf-8') as wf:
        json.dump(content, wf, ensure_ascii=False, indent=4)
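As a follow-up, here is a minimal sketch of stitching the exported feed back into a single plain-text file for reading. It assumes the 'biqugeso-result.json' feed produced by the run above and the item fields ('name', 'part1', 'part2', 'part_content') yielded by the spider; the output filename 'biqugeso-result.txt' is arbitrary and not part of the original script.

# -*- coding: utf-8 -*-
import json

# Assumes the spider above has already written 'biqugeso-result.json'
with open('biqugeso-result.json', 'rt', encoding='utf-8') as rf:
    chapters = json.load(rf)

# Concatenate both parts of every chapter into one plain-text novel file
with open('biqugeso-result.txt', 'wt', encoding='utf-8') as wf:
    for chapter in chapters:
        wf.write(chapter['name'] + '\n\n')
        wf.write(chapter['part1']['part_content'].strip() + '\n')
        wf.write(chapter['part2']['part_content'].strip() + '\n\n')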