Last active
January 16, 2020 14:19
-
-
Save epigos/0185af79626ba6e00077 to your computer and use it in GitHub Desktop.
MongoDB pipeline for Scrapy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from twisted.internet.threads import deferToThread | |
#from scrapy.utils.serialize import ScrapyJSONEncoder | |
from pymongo import MongoClient | |
from pymongo.errors import DuplicateKeyError | |
from scrapy.exceptions import DropItem | |
class MongoPipeline(object):
    """Scrapy item pipeline that stores scraped items in MongoDB.

    A unique index on ``title`` makes the server reject duplicate
    items, which are then dropped via :class:`~scrapy.exceptions.DropItem`.
    """

    def __init__(self, host, port):
        """Connect to MongoDB and prepare the target collection.

        :param host: MongoDB server host (from ``MONGO_DB_HOST``).
        :param port: MongoDB server port (from ``MONGO_DB_PORT``).
        """
        # Bug fix: the original called MongoClient() with no arguments,
        # silently ignoring the configured host/port.
        client = MongoClient(host, port)
        db = client.demo  # mongodb database
        self.demo_items = db.items  # mongodb database collection
        # Assuming the title field is unique to drop duplicates.
        # ensure_index() was deprecated in pymongo 3 and removed in 4;
        # create_index() is the supported, idempotent equivalent.
        self.demo_items.create_index([('title', 1)], unique=True)

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor Scrapy invokes with the project settings."""
        host = settings.get('MONGO_DB_HOST', '127.0.0.1')
        port = settings.get('MONGO_DB_PORT', 27017)
        return cls(host, port)

    def process_item(self, item, spider):
        """Save the item in a worker thread and return the result as a Deferred."""
        return deferToThread(self._process_item, item)

    def _process_item(self, item):
        """Insert the item into MongoDB.

        Returns the item on success, ``None`` if the insert reported
        failure, and raises ``DropItem`` on a duplicate title.
        """
        data = {
            'title': item.get('title', ''),
            'link': item.get('link', ''),
            'content': item.get('content', ''),
        }
        try:
            # insert() was removed in modern pymongo; insert_one() returns
            # an InsertOneResult, which is truthy on success.
            if self.demo_items.insert_one(data):
                return item
            return None
        except DuplicateKeyError:
            raise DropItem("Item already exists %s" % item)
The pipeline object requires a classmethod
called from_settings
. So add it to your pipeline object and run it.
Add this to your settings.py
file
MONGO_DB_HOST = '127.0.0.1'
MONGO_DB_PORT = 27017
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this to your project's
settings.py