
@gsw945
Last active May 29, 2019 05:50
Crawler for the web novel 《弃少归来张恒》 on 笔趣阁 (biqugeso.com)
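The spider below first crawls the book's table of contents, then fetches each chapter in two parts (the site splits every chapter across a {chapter}.html page and a {chapter}_2.html page), merges both parts into one item, exports everything as a JSON feed, and finally re-sorts the feed by the zero-padded chapter name.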
# -*- coding: utf-8 -*-
import re
import json

import scrapy
from scrapy.crawler import CrawlerProcess


class BiqugesoSpider(scrapy.Spider):
    # custom_settings = {}
    name = 'biqugeso'
    allowed_domains = ['www.biqugeso.com']
    # The site splits every chapter across {chapter}.html and {chapter}_2.html
    CHAPTER_URL = 'https://www.biqugeso.com/book/{book}/{chapter}.html'
    BOOK_ID = '45724'
    def start_requests(self):
        # Crawl the table of contents first
        index_url = 'https://www.biqugeso.com/book/{}/'.format(self.BOOK_ID)
        req = scrapy.Request(
            index_url,
            callback=self.parse_catalog
        )
        yield req
    def parse_catalog(self, response):
        '''Parse the table of contents.'''
        chapter_list = []
        for a in response.css('#list-chapterAll').css('dd > a'):
            name = a.xpath('./@title').extract_first()
            # Zero-pad the chapter number in "第N章" to five digits so that
            # the plain string sort at the end yields reading order
            name = re.sub(
                r'第(\d+)章',
                lambda mat: '第{}章'.format(mat.group(1).rjust(5, '0')),
                name
            )
            chapter_list.append({
                'name': name,
                'chapter': a.xpath('./@href').extract_first().split('.')[0]
            })
        # Crawl the first part of every chapter
        for chapter in chapter_list:
            url_part1 = self.CHAPTER_URL.format(**{
                'book': self.BOOK_ID,
                'chapter': chapter['chapter']
            })
            req = scrapy.Request(
                url_part1,
                callback=self.parse_chapter_part1
            )
            req.meta['chapter_info'] = chapter
            yield req
    def parse_chapter(self, response):
        '''Extract the title and body text of one chapter page.'''
        part_title = ''.join(response.css('.readTitle').xpath(
            './descendant-or-self::*/text()'
        ).extract())
        # Collect the text nodes, strip the site's self-promotion line,
        # and normalize the paragraph spacing
        part_content = ''.join(response.css('#htmlContent').xpath(
            './descendant-or-self::*/text()[not(ancestor::p) and not(ancestor::script)]'
        ).extract()).replace(
            '请记住【笔趣阁 m.biqugeso.com】,第一时间更新,无弹窗,免费读!', ''
        ).replace('\r\n\r\n\xa0\xa0\xa0\xa0', '\r\n').strip('>').strip()
        return {
            'part_title': part_title,
            'part_content': part_content,
            'part_url': response.url
        }
    def parse_chapter_part1(self, response):
        '''Parse a chapter page (first part), then request the second part.'''
        chapter_info = response.meta['chapter_info']
        url_part2 = self.CHAPTER_URL.format(**{
            'book': self.BOOK_ID,
            'chapter': chapter_info['chapter'] + '_2'
        })
        req = scrapy.Request(
            url_part2,
            callback=self.parse_chapter_part2
        )
        # Carry the first part's data along to the second-part callback
        req.meta['chapter_info'] = chapter_info
        req.meta['part1_data'] = self.parse_chapter(response)
        yield req
    def parse_chapter_part2(self, response):
        '''Parse a chapter page (second part) and yield the merged item.'''
        chapter_info = response.meta['chapter_info']
        part1_data = response.meta['part1_data']
        part2_data = self.parse_chapter(response)
        chapter_info['part1'] = part1_data
        chapter_info['part2'] = part2_data
        # Yield the assembled chapter
        yield chapter_info
if __name__ == '__main__':
    # Scrapy settings
    settings = {
        'ROBOTSTXT_OBEY': False,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'DOWNLOAD_DELAY': 0.2,
        'DUPEFILTER_DEBUG': True,
        'COOKIES_DEBUG': True,
        'CONCURRENT_REQUESTS': 4,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 1,
        'LOG_LEVEL': 'DEBUG',
        'FEED_EXPORT_ENCODING': 'utf-8'
    }
    # Export the scraped items as a JSON feed
    # from: https://stackoverflow.com/questions/23574636/scrapy-from-script-output-in-json/33005001#33005001
    settings.update({
        'FEED_FORMAT': 'json',
        'FEED_URI': '{}-result.json'.format(BiqugesoSpider.name)
    })
    # Run the spider from a single script
    # from: https://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script/31374345#31374345
    process = CrawlerProcess(settings)
    process.crawl(BiqugesoSpider)
    process.start()
    # Read the exported feed back in
    with open(settings['FEED_URI'], 'rt', encoding='utf-8') as rf:
        content = json.load(rf)
    # Sort chapters by the zero-padded name
    content.sort(key=lambda item: item['name'], reverse=False)
    # Write the sorted result back, pretty-printed
    with open(settings['FEED_URI'], 'wt', encoding='utf-8') as wf:
        json.dump(content, wf, ensure_ascii=False, indent=4)
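For reference, the zero-padding applied in parse_catalog is what makes the plain string sort at the end equivalent to sorting by chapter number. A minimal standalone sketch of that behavior (not part of the gist itself, but using the same transform):

import re

def pad(name):
    # Same transform as parse_catalog: left-pad the chapter number to 5 digits
    return re.sub(
        r'第(\d+)章',
        lambda mat: '第{}章'.format(mat.group(1).rjust(5, '0')),
        name
    )

names = ['第10章 c', '第2章 b', '第1章 a']
print(sorted(names))                   # unpadded: '第10章 c' sorts before '第1章 a'
print(sorted(pad(n) for n in names))   # padded: 第00001章, 第00002章, 第00010章

Usage note: save the spider as a single .py file and run it with the Python interpreter; no Scrapy project is required because CrawlerProcess boots the crawler itself, and the sorted output lands in biqugeso-result.json next to the script. The APIs match the Scrapy releases current when this gist was last touched (mid-2019, around Scrapy 1.6); on newer Scrapy, extract_first() is usually written get(), and the FEED_FORMAT/FEED_URI pair was superseded by the FEEDS setting in 2.1.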