@dapangmao
Created February 1, 2016 20:50

Minimal Scrapy

from __future__ import print_function

import os
import subprocess

from pymongo import MongoClient

# Start from a clean 'dmoz' collection in the local 'test' database
db = MongoClient('localhost', 27017)['test']
if 'dmoz' in db.collection_names():
    db.drop_collection("dmoz")

def tree(startpath):
    """Print the .py files under startpath as an indented directory tree."""
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            if f.endswith('.py'):
                print('{}{}'.format(subindent, f))
tree('.')
./
    __init__.py
    pipelines.py
    settings.py
    spiders/
        __init__.py
        dmoz.py
# %load settings.py
# Scrapy settings for dirbot project

SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DOWNLOAD_HANDLERS = {'s3': None}
ITEM_PIPELINES = {'dirbot.pipelines.FilterWordsPipeline': 1}
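For the scrapy list / scrapy crawl commands used further down to find these settings, the project root (one level above this package) also needs a scrapy.cfg, which the tree above would not show since it only prints .py files. A minimal sketch, assuming the package is named dirbot as in SPIDER_MODULES:

[settings]
default = dirbot.settings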
# %load pipelines.py
from scrapy.exceptions import DropItem
from pymongo import MongoClient
import logging

client = MongoClient('localhost', 27017)
db = client['test']
collection = db.dmoz

class FilterWordsPipeline(object):
    """A pipeline for filtering out items which contain certain words in their
    description"""

    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode(item['description']).lower():
                raise DropItem("Contains forbidden word: %s" % word)
        else:
            # for/else: runs only when no forbidden word was found
            collection.insert(item)
            logging.info('inserted!')
        return item
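As a quick sanity check of the filter logic (a hypothetical snippet, assuming the dirbot package is importable from the current directory), an item whose description contains one of the filtered words should raise DropItem before anything is written to MongoDB:

from scrapy.exceptions import DropItem
from dirbot.pipelines import FilterWordsPipeline

pipe = FilterWordsPipeline()
try:
    pipe.process_item({'name': 'x', 'url': 'y',
                       'description': 'All about politics'}, spider=None)
except DropItem as e:
    print(e)  # Contains forbidden word: politics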
# %load spiders/dmoz.py
from scrapy.spiders import Spider
from scrapy.selector import Selector

class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')

        for site in sites:
            item = {}
            item['name'] = site.xpath('a/text()').extract()[0]
            item['url'] = site.xpath('a/@href').extract()[0]
            item['description'] = site.xpath('text()').re(r'-\s[^\n]*\r')[0]
            yield item
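Instead of shelling out to the scrapy binary as below, the crawl could also be started from Python itself. A minimal sketch, assuming it runs from the project root so that get_project_settings() picks up dirbot/settings.py (note the Twisted reactor can only be started once per process, which is one reason the subprocess route is handy in a notebook):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('dmoz')  # spider name as declared by DmozSpider.name
process.start()        # blocks until the crawl finishes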
cd dirbot
/Users/home/projects/dirbot/dirbot
!scrapy list
/bin/sh: scrapy: command not found
subprocess.check_output(["/Users/home/anaconda2/bin/scrapy", "list"])
'dmoz\n'
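The "scrapy: command not found" above is only a PATH issue in the notebook's shell; one workaround (a sketch, assuming the same anaconda2 location used in the subprocess calls) is to prepend that bin directory so the bare !scrapy calls work as well:

os.environ['PATH'] = '/Users/home/anaconda2/bin:' + os.environ['PATH']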
!ls
__init__.py   __init__.pyc  pipelines.py  pipelines.pyc settings.py   settings.pyc  spiders
subprocess.check_output(["/Users/home/anaconda2/bin/scrapy", "crawl", "domz"])
---------------------------------------------------------------------------

CalledProcessError                        Traceback (most recent call last)

<ipython-input-51-cea91ee744a3> in <module>()
----> 1 subprocess.check_output(["/Users/home/anaconda2/bin/scrapy", "crawl", "domz"])


/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/subprocess.pyc in check_output(*popenargs, **kwargs)
    571         if cmd is None:
    572             cmd = popenargs[0]
--> 573         raise CalledProcessError(retcode, cmd, output=output)
    574     return output
    575 


CalledProcessError: Command '['/Users/home/anaconda2/bin/scrapy', 'crawl', 'domz']' returned non-zero exit status 1
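The non-zero exit status comes from the misspelled spider name ("domz" instead of "dmoz"); with the name that scrapy list reported, the same call should presumably run the crawl that fills the collection queried below:

subprocess.check_output(["/Users/home/anaconda2/bin/scrapy", "crawl", "dmoz"])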
for x in list(db.dmoz.find({}, {"_id": 0}).limit(5)):
    print(x)
{u'url': u'http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html', u'name': u'Core Python Programming', u'description': u'- By Wesley J. Chun; Prentice Hall PTR, 2001, ISBN 0130260363. For experienced developers to improve extant skills; professional level examples. Starts by introducing syntax, objects, error handling, functions, classes, built-ins. [Prentice Hall]\r'}
{u'url': u'http://www.brpreiss.com/books/opus7/html/book.html', u'name': u'Data Structures and Algorithms with Object-Oriented Design Patterns in Python', u'description': u'- The primary goal of this book is to promote object-oriented design using Python and to illustrate the use of the emerging object-oriented design patterns.\r'}
{u'url': u'http://www.diveintopython.net/', u'name': u'Dive Into Python 3', u'description': u'- By Mark Pilgrim, Guide to Python 3  and its differences from Python 2. Each chapter starts with a real code sample and explains it fully. Has a comprehensive appendix of all the syntactic and semantic changes in Python 3\r'}
{u'url': u'http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/', u'name': u'Foundations of Python Network Programming', u'description': u'- This book covers a wide range of topics. From raw TCP and UDP to encryption with TSL, and then to HTTP, SMTP, POP, IMAP, and ssh. It gives you a good understanding of each field and how to do everything on the network with Python.\r'}
{u'url': u'http://www.techbooksforfree.com/perlpython.shtml', u'name': u'Free Python books', u'description': u'- Free Python books and tutorials.\r'}
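From here the collection can be queried like any other; for example, a hypothetical follow-up that counts how many scraped descriptions mention "free" (case-insensitive):

db.dmoz.find({'description': {'$regex': 'free', '$options': 'i'}}).count()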