Self-contained minimum example script to run scrapy
import json

from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor


# define an item class
class DmozItem(Item):
    title = Field()
    link = Field()
    desc = Field()


# define an item loader with input and output processors
class DmozItemLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()

    desc_out = Join()


# define a pipeline
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('items.jl', 'wb')

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item


# define a spider
class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            loader = DmozItemLoader(DmozItem(), selector=sel, response=response)
            loader.add_xpath('title', 'a/text()')
            loader.add_xpath('link', 'a/@href')
            loader.add_xpath('desc', 'text()')
            yield loader.load_item()


# callback fired when the spider is closed
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # collect/log stats?

    # stop the reactor
    reactor.stop()


# instantiate settings and provide a custom configuration
settings = Settings()
settings.set('ITEM_PIPELINES', {
    '__main__.JsonWriterPipeline': 100
})

# instantiate a crawler passing in settings
crawler = Crawler(settings)

# instantiate a spider
spider = DmozSpider()

# configure signals
crawler.signals.connect(callback, signal=signals.spider_closed)

# configure and start the crawler
crawler.configure()
crawler.crawl(spider)
crawler.start()

# start logging
log.start()

# start the reactor (blocks execution)
reactor.run()
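The spider_closed callback in the script above only grabs the stats dict before shutting the reactor down. Here's a small sketch of what you might actually do with it; 'item_scraped_count' is one of Scrapy's standard stat keys, and the print formatting is only illustrative:

# possible fleshed-out version of the callback: report a couple of the
# standard crawl stats before stopping the reactor
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()
    print("Spider %r closed (%s): %s items scraped" % (
        spider.name, reason, stats.get('item_scraped_count', 0)))
    reactor.stop()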
This was exactly what I was looking for!!! I've been wanting to transition my Scrapy projects to stand-alone for a while now. Thanks!!!
Here's how I ended up doing the settings:
settings = Settings({
    # pipelines start with the project/module name, so replace that with __main__
    'ITEM_PIPELINES': {
        '__main__.WriterPipeline': 100,
    },
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }
})
process = CrawlerProcess(settings)

# you can run 30 of these at once if you want, e.g.:
# process.crawl(CustomSpider)
# process.crawl(CustomSpider)  ... and so on, up to 30 times
process.crawl(CustomSpider)
process.start()
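To make the "run 30 of these at once" comment concrete: CrawlerProcess queues every crawl() call and runs them all in the same Twisted reactor once start() is called. A minimal sketch (CustomSpider and the start_url keyword are hypothetical names, not part of the gist):

process = CrawlerProcess(settings)

# queue several crawls; they all run concurrently in a single reactor
for url in ["http://example.com/a", "http://example.com/b"]:
    process.crawl(CustomSpider, start_url=url)  # extra kwargs are passed to the spider

process.start()  # blocks until every queued crawl has finished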
You could also use this:

process = CrawlerProcess({
    'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 100},
})
process.crawl(DmozSpider)
process.start()

No need for a callback or for starting/stopping the reactor manually.
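Worth noting for anyone copying this today: the gist targets the pre-1.0 API (scrapy.contrib, scrapy.log, Python 2's unicode). A rough sketch of the same standalone script against a newer Scrapy, using the CrawlerProcess approach from this thread and the import paths I'd expect on 1.x+ (treat it as a sketch, not verified against every release):

import json

from scrapy import Spider, Item, Field
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, TakeFirst


class DmozItem(Item):
    title = Field()
    link = Field()
    desc = Field()


class DmozItemLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)  # unicode.strip on Python 2
    default_output_processor = TakeFirst()
    desc_out = Join()


class JsonWriterPipeline(object):
    def open_spider(self, spider):
        self.file = open('items.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item)) + "\n")
        return item


class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        for sel in response.xpath('//ul/li'):
            loader = DmozItemLoader(DmozItem(), selector=sel, response=response)
            loader.add_xpath('title', 'a/text()')
            loader.add_xpath('link', 'a/@href')
            loader.add_xpath('desc', 'text()')
            yield loader.load_item()


process = CrawlerProcess({
    'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 100},
})
process.crawl(DmozSpider)
process.start()  # CrawlerProcess manages the Twisted reactor and logging itself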