Last active
March 14, 2024 23:35
-
-
Save tzermias/6982723 to your computer and use it in GitHub Desktop.
Scrapy MySQL pipeline. Just a mirror of the asynchronous MySQL pipeline.
Copy-paste it directly to pipelines.py. Database credentials are stored in settings.py. Based on http://snipplr.com/view/66986/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import MySQLdb.cursors

from twisted.enterprise import adbapi

from scrapy import log
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher

# Project settings, loaded once at import time; the pipeline below reads its
# database connection parameters (DB_HOST, DB_USER, ...) from this object.
SETTINGS = get_project_settings()
class MySQLPipeline(object):
    """Scrapy item pipeline that inserts records into MySQL via Twisted adbapi.

    Connection parameters are read from the module-level ``SETTINGS`` object
    (keys ``DB_HOST``, ``DB_USER``, ``DB_PASSWD``, ``DB_PORT`` and ``DB_DB``).
    Inserts are submitted through ``ConnectionPool.runInteraction`` and
    ``process_item`` returns without waiting for them to complete.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline, handing it the crawler's stats collector."""
        return cls(crawler.stats)

    def __init__(self, stats):
        """Open the connection pool and register the shutdown handler.

        ``stats`` is the stats collector whose ``inc_value`` is called for
        every successful insert.
        """
        # Instantiate DB
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        # Close the pool when the spider_closed signal fires.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Cleanup function, called after crawling has finished to close
        open objects.

        Closes the ConnectionPool.
        """
        self.dbpool.close()

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and return the item unchanged.

        The interaction's result is not awaited; failures are routed to
        ``_handle_error`` and do not affect the returned item.
        """
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        """Execute the INSERT on transaction ``tx`` and count affected rows.

        NOTE(review): the statement is a fixed placeholder and does not use
        ``item``; replace it with a parameterized query for the real schema.
        """
        result = tx.execute(
            """ INSERT INTO table VALUES (1,2,3)"""
        )
        # The DB-API leaves execute()'s return value unspecified, so guard
        # against None before comparing (None > 0 raises on Python 3).
        if result and result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        """Errback for failed interactions: log the failure ``e``."""
        log.err(e)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Add these to your settings.py file.

# Database settings: the MySQL connection parameters looked up by the
# pipeline (host, port, credentials and database name).
DB_HOST = 'localhost'
DB_PORT = 3306
DB_USER = 'user'
DB_PASSWD = 'password'
DB_DB = 'database'
Agreed with wmadaus
He actually wrote them as part of the pipeline class (look at the calls), but the indentation was wrong.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Shouldn't _insert_record and _handle_error be part of the pipeline class?