from __future__ import print_function

import os
import subprocess

from pymongo import MongoClient

db = MongoClient('localhost', 27017)['test']

# start from a clean collection so repeated runs do not pile up duplicates
if 'dmoz' in db.collection_names():
    db.drop_collection("dmoz")
def tree(startpath):
    """Print a simple directory tree, listing only the .py files."""
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            if f.endswith('py'):
                print('{}{}'.format(subindent, f))

tree('.')
./
    __init__.py
    pipelines.py
    settings.py
    spiders/
        __init__.py
        dmoz.py
# %load settings.py
# Scrapy settings for dirbot project
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DOWNLOAD_HANDLERS = {'s3': None}
ITEM_PIPELINES = {'dirbot.pipelines.FilterWordsPipeline': 1}
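The pipeline below hard-codes its MongoDB connection. An alternative (not part of the original dirbot code) is to keep the connection details in settings.py and let Scrapy hand them to the pipeline through its from_crawler hook. The MONGO_URI / MONGO_DATABASE setting names and the MongoPipeline class are placeholders; a rough sketch:

# settings.py (hypothetical additions)
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'test'

# pipelines.py (settings-driven variant)
import pymongo

class MongoPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, so the pipeline
        # can pull its configuration out of settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'test'),
        )

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db.dmoz.insert(dict(item))
        return item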
# %load pipelines.py
from scrapy.exceptions import DropItem
from pymongo import MongoClient
import logging

client = MongoClient('localhost', 27017)
db = client['test']
collection = db.dmoz


class FilterWordsPipeline(object):
    """A pipeline for filtering out items which contain certain words in their
    description"""

    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode(item['description']).lower():
                raise DropItem("Contains forbidden word: %s" % word)
        # no forbidden word found: store the item and pass it along
        collection.insert(item)
        logging.info('inserted!')
        return item
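collection.insert is deprecated in PyMongo 3+. On a newer driver the same method could be written as below (a sketch, not part of the original dirbot code):

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode(item['description']).lower():
                raise DropItem("Contains forbidden word: %s" % word)
        # insert_one replaces the deprecated insert() in PyMongo 3+
        collection.insert_one(dict(item))
        return item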
# %load spiders/dmoz.py
from scrapy.spiders import Spider
from scrapy.selector import Selector


class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """
        The lines below define a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        for site in sites:
            item = {}
            item['name'] = site.xpath('a/text()').extract()[0]
            item['url'] = site.xpath('a/@href').extract()[0]
            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')[0]
            yield item
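In more recent Scrapy versions the explicit Selector is unnecessary; the same parse logic could be sketched directly against the response (an alternative, not the dirbot original):

    def parse(self, response):
        # response.xpath() works directly on the response in Scrapy 1.0+,
        # and extract_first()/re_first() avoid the bare [0] indexing
        for site in response.xpath('//ul[@class="directory-url"]/li'):
            yield {
                'name': site.xpath('a/text()').extract_first(),
                'url': site.xpath('a/@href').extract_first(),
                'description': site.xpath('text()').re_first('-\s[^\n]*\\r'),
            }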
/Users/home/projects/dirbot/dirbot
/bin/sh: scrapy: command not found
subprocess.check_output(["/Users/home/anaconda2/bin/scrapy", "list"])
'dmoz\n'
__init__.py    __init__.pyc    pipelines.py    pipelines.pyc    settings.py    settings.pyc    spiders
subprocess.check_output(["/Users/home/anaconda2/bin/scrapy", "crawl", "domz"])
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
<ipython-input-51-cea91ee744a3> in <module>()
----> 1 subprocess.check_output(["/Users/home/anaconda2/bin/scrapy", "crawl", "domz"])
/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/subprocess.pyc in check_output(*popenargs, **kwargs)
571 if cmd is None:
572 cmd = popenargs[0]
--> 573 raise CalledProcessError(retcode, cmd, output=output)
574 return output
575
CalledProcessError: Command '['/Users/home/anaconda2/bin/scrapy', 'crawl', 'domz']' returned non-zero exit status 1
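The non-zero exit status is just the misspelled spider name ('domz' instead of 'dmoz'); the collection queried below was filled by rerunning the crawl with the name that scrapy list reported:

subprocess.check_output(["/Users/home/anaconda2/bin/scrapy", "crawl", "dmoz"])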
for x in list(db.dmoz.find({}, {"_id": 0}).limit(5)):
print(x)
{u'url': u'http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html', u'name': u'Core Python Programming', u'description': u'- By Wesley J. Chun; Prentice Hall PTR, 2001, ISBN 0130260363. For experienced developers to improve extant skills; professional level examples. Starts by introducing syntax, objects, error handling, functions, classes, built-ins. [Prentice Hall]\r'}
{u'url': u'http://www.brpreiss.com/books/opus7/html/book.html', u'name': u'Data Structures and Algorithms with Object-Oriented Design Patterns in Python', u'description': u'- The primary goal of this book is to promote object-oriented design using Python and to illustrate the use of the emerging object-oriented design patterns.\r'}
{u'url': u'http://www.diveintopython.net/', u'name': u'Dive Into Python 3', u'description': u'- By Mark Pilgrim, Guide to Python 3 and its differences from Python 2. Each chapter starts with a real code sample and explains it fully. Has a comprehensive appendix of all the syntactic and semantic changes in Python 3\r'}
{u'url': u'http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/', u'name': u'Foundations of Python Network Programming', u'description': u'- This book covers a wide range of topics. From raw TCP and UDP to encryption with TSL, and then to HTTP, SMTP, POP, IMAP, and ssh. It gives you a good understanding of each field and how to do everything on the network with Python.\r'}
{u'url': u'http://www.techbooksforfree.com/perlpython.shtml', u'name': u'Free Python books', u'description': u'- Free Python books and tutorials.\r'}
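From here the scraped entries can be queried like any other MongoDB collection; for example, a document count and a simple keyword filter (this query is only an illustration, not part of the original notebook):

print(db.dmoz.count())  # total number of scraped entries
for x in db.dmoz.find({"name": {"$regex": "Python"}}, {"_id": 0, "name": 1}).limit(3):
    print(x['name'])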