Skip to content

Instantly share code, notes, and snippets.

@nelix
Forked from qingfeng/sp.py
Created July 9, 2009 19:23
Show Gist options
  • Save nelix/143921 to your computer and use it in GitHub Desktop.
Save nelix/143921 to your computer and use it in GitHub Desktop.
import re

from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.xpath.selector import HtmlXPathSelector
from scrapy.item import ScrapedItem
def safecn(i):
    """Convert a decimal code point to its character, best-effort.

    Args:
        i: The code point, normally a string of decimal digits such as
            ``"20013"``; anything ``int()`` accepts works as well.

    Returns:
        The one-character string for that code point, or ``i`` itself,
        unchanged, when it cannot be interpreted as a valid code point.
    """
    # ``unichr`` only exists on Python 2; ``chr`` is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    try:
        return to_char(int(i))
    except (TypeError, ValueError, OverflowError):
        # Not a number, or a number outside the range of valid code points:
        # hand the input back untouched rather than failing the whole decode.
        return i
# Decimal numeric character reference, e.g. "&#20013;".
_NUMERIC_ENTITY = re.compile(r"&#(\d+);")


def cn(s):
    """Decode decimal numeric character references (``&#NNNN;``) in ``s``.

    Only complete ``&#NNNN;`` references are replaced. All other text,
    including literal semicolons and bare digit runs, is left as it is.
    Hexadecimal references (``&#xNNNN;``) are not decoded.

    Args:
        s: The text to decode; surrounding whitespace is stripped first.

    Returns:
        The stripped text with every valid decimal reference replaced by
        the character it denotes. References whose number is not a valid
        code point are kept verbatim.
    """
    def _decode(match):
        digits = match.group(1)
        decoded = safecn(digits)
        # safecn returns its input unchanged when the number is not a valid
        # code point; keep the original reference text in that case.
        if decoded == digits:
            return match.group(0)
        return decoded

    return _NUMERIC_ENTITY.sub(_decode, s.strip())
class ZaojiaoSpider(CrawlSpider):
    """Crawl the pregnancy section of zaojiao.com and scrape ``.html`` pages.

    Crawling starts at ``start_urls``. The single rule hands every response
    for a link whose URL matches ``\\.html`` to :meth:`parse_item`.
    """

    domain_name = "zaojiao.com"
    start_urls = [
        "http://www.zaojiao.com/pregnancy/",
    ]
    rules = (
        # Raw string so the regex escape reaches the extractor untouched.
        Rule(SgmlLinkExtractor(allow=(r'\.html',)), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build one scraped item from an article page.

        Args:
            response: The downloaded page for a URL matched by ``rules``.

        Returns:
            A one-element list holding the item (``title``, ``body``,
            ``url``, ``uuid``), or an empty list when the page has no
            ``<h1>`` text or no ``<div id="content">``.
        """
        # The second positional argument of log.msg is the log level, so the
        # URL has to be formatted into the message itself.
        log.msg("response.url %s" % response.url)
        hxs = HtmlXPathSelector(response)
        titles = hxs.x('//h1/text()').extract()
        bodies = hxs.x('//div[@id="content"]').extract()
        if not titles or not bodies:
            # Not an article page: skip it instead of raising IndexError.
            log.msg("skipping %s: no <h1> or content div found" % response.url)
            return []

        item = ScrapedItem()
        item.title = cn(titles[0])
        item.body = cn(bodies[0])
        item.url = response.url
        # NOTE(review): hash() of a string is only stable across runs when
        # hash randomization is off; confirm that is acceptable for an id.
        item.uuid = hash(response.url)
        return [item]
# Module-level spider instance, created at import time.
# NOTE(review): presumably this Scrapy version discovers spiders through the
# SPIDER name - confirm against the Scrapy release in use.
SPIDER = ZaojiaoSpider()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment