
@gsw945
Last active May 29, 2019 05:50
Crawler for the web novel 《弃少归来张恒》 on 笔趣阁 (biqugeso.com)
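The spider below first crawls the book's table of contents, then fetches each chapter in two parts (the site splits every chapter across a {chapter}.html page and a {chapter}_2.html page), merges both parts into one item, exports everything as a JSON feed, and finally re-sorts the feed by the zero-padded chapter name.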
# -*- coding: utf-8 -*-
import re
import json

import scrapy
from scrapy.crawler import CrawlerProcess


class BiqugesoSpider(scrapy.Spider):
    # custom_settings = {}
    name = 'biqugeso'
    allowed_domains = ['www.biqugeso.com']
    # The site splits every chapter across {chapter}.html and {chapter}_2.html
    CHAPTER_URL = 'https://www.biqugeso.com/book/{book}/{chapter}.html'
    BOOK_ID = '45724'
    def start_requests(self):
        # Crawl the table of contents first
        index_url = 'https://www.biqugeso.com/book/{}/'.format(self.BOOK_ID)
        req = scrapy.Request(
            index_url,
            callback=self.parse_catalog
        )
        yield req
    def parse_catalog(self, response):
        '''Parse the table of contents.'''
        chapter_list = []
        for a in response.css('#list-chapterAll').css('dd > a'):
            name = a.xpath('./@title').extract_first()
            # Zero-pad the chapter number in "第N章" to five digits so that
            # the plain string sort at the end yields reading order
            name = re.sub(
                r'第(\d+)章',
                lambda mat: '第{}章'.format(mat.group(1).rjust(5, '0')),
                name
            )
            chapter_list.append({
                'name': name,
                'chapter': a.xpath('./@href').extract_first().split('.')[0]
            })
        # Crawl the first part of every chapter
        for chapter in chapter_list:
            url_part1 = self.CHAPTER_URL.format(**{
                'book': self.BOOK_ID,
                'chapter': chapter['chapter']
            })
            req = scrapy.Request(
                url_part1,
                callback=self.parse_chapter_part1
            )
            req.meta['chapter_info'] = chapter
            yield req
    def parse_chapter(self, response):
        '''Extract the title and body text of one chapter page.'''
        part_title = ''.join(response.css('.readTitle').xpath(
            './descendant-or-self::*/text()'
        ).extract())
        # Collect the text nodes, strip the site's self-promotion line,
        # and normalize the paragraph spacing
        part_content = ''.join(response.css('#htmlContent').xpath(
            './descendant-or-self::*/text()[not(ancestor::p) and not(ancestor::script)]'
        ).extract()).replace(
            '请记住【笔趣阁 m.biqugeso.com】,第一时间更新,无弹窗,免费读!', ''
        ).replace('\r\n\r\n\xa0\xa0\xa0\xa0', '\r\n').strip('>').strip()
        return {
            'part_title': part_title,
            'part_content': part_content,
            'part_url': response.url
        }
    def parse_chapter_part1(self, response):
        '''Parse a chapter page (first part), then request the second part.'''
        chapter_info = response.meta['chapter_info']
        url_part2 = self.CHAPTER_URL.format(**{
            'book': self.BOOK_ID,
            'chapter': chapter_info['chapter'] + '_2'
        })
        req = scrapy.Request(
            url_part2,
            callback=self.parse_chapter_part2
        )
        # Carry the first part's data along to the second-part callback
        req.meta['chapter_info'] = chapter_info
        req.meta['part1_data'] = self.parse_chapter(response)
        yield req
    def parse_chapter_part2(self, response):
        '''Parse a chapter page (second part) and yield the merged item.'''
        chapter_info = response.meta['chapter_info']
        part1_data = response.meta['part1_data']
        part2_data = self.parse_chapter(response)
        chapter_info['part1'] = part1_data
        chapter_info['part2'] = part2_data
        # Yield the assembled chapter
        yield chapter_info
if __name__ == '__main__':
    # Scrapy settings
    settings = {
        'ROBOTSTXT_OBEY': False,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'DOWNLOAD_DELAY': 0.2,
        'DUPEFILTER_DEBUG': True,
        'COOKIES_DEBUG': True,
        'CONCURRENT_REQUESTS': 4,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 1,
        'LOG_LEVEL': 'DEBUG',
        'FEED_EXPORT_ENCODING': 'utf-8'
    }
    # Export the scraped items as a JSON feed
    # from: https://stackoverflow.com/questions/23574636/scrapy-from-script-output-in-json/33005001#33005001
    settings.update({
        'FEED_FORMAT': 'json',
        'FEED_URI': '{}-result.json'.format(BiqugesoSpider.name)
    })
    # Run the spider from a single script
    # from: https://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script/31374345#31374345
    process = CrawlerProcess(settings)
    process.crawl(BiqugesoSpider)
    process.start()
    # Read the exported feed back in
    with open(settings['FEED_URI'], 'rt', encoding='utf-8') as rf:
        content = json.load(rf)
    # Sort chapters by the zero-padded name
    content.sort(key=lambda item: item['name'], reverse=False)
    # Write the sorted result back, pretty-printed
    with open(settings['FEED_URI'], 'wt', encoding='utf-8') as wf:
        json.dump(content, wf, ensure_ascii=False, indent=4)
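For reference, the zero-padding applied in parse_catalog is what makes the plain string sort at the end equivalent to sorting by chapter number. A minimal standalone sketch of that behavior (not part of the gist itself, but using the same transform):

import re

def pad(name):
    # Same transform as parse_catalog: left-pad the chapter number to 5 digits
    return re.sub(
        r'第(\d+)章',
        lambda mat: '第{}章'.format(mat.group(1).rjust(5, '0')),
        name
    )

names = ['第10章 c', '第2章 b', '第1章 a']
print(sorted(names))                   # unpadded: '第10章 c' sorts before '第1章 a'
print(sorted(pad(n) for n in names))   # padded: 第00001章, 第00002章, 第00010章

Usage note: save the spider as a single .py file and run it with the Python interpreter; no Scrapy project is required because CrawlerProcess boots the crawler itself, and the sorted output lands in biqugeso-result.json next to the script. The APIs match the Scrapy releases current when this gist was last touched (mid-2019, around Scrapy 1.6); on newer Scrapy, extract_first() is usually written get(), and the FEED_FORMAT/FEED_URI pair was superseded by the FEEDS setting in 2.1.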