chihchun · January 6, 2011 15:52
diff --git a/nextmedia.py b/nextmedia.py
 import re

 from scrapy.selector import HtmlXPathSelector
 from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
 from scrapy.contrib.spiders import CrawlSpider, Rule
 from linkeddata.items import LinkeddataItem

 class NextmediaSpider(CrawlSpider):
    """Spider that logs the summary and sections of tw.nextmedia.com articles.

    Pages whose URL matches ``applenews/article/art_id/`` are handed to
    ``parse_content``; links found on those pages are not followed.
    """

    # NOTE(review): no start_urls is defined in this class -- confirm where
    # the crawl is seeded.
    name = 'nextmedia'
    allowed_domains = ['tw.nextmedia.com']

    rules = (
        Rule(
            SgmlLinkExtractor(allow=r'applenews/article/art_id/'),
            callback='parse_content',
            follow=False,
        ),
    )

    def parse_content(self, response):
        """Log the summary and every (section title, section text) pair.

        Nothing is returned; the extracted values are only written to the
        spider log. Missing elements are reported in the log instead of
        raising ``IndexError``, so one malformed page does not abort the
        callback.
        """
        hxs = HtmlXPathSelector(response)

        # A page may have no <p class='summary'>; indexing [0] blindly would
        # raise IndexError and skip the section logging below.
        summaries = hxs.select("//p[@class='summary']/text()").extract()
        if summaries:
            self.log("Summary: %s" % summaries[0].strip())
        else:
            self.log("Summary not found: %s" % response.url)

        titles = hxs.select("//h2[@class='article_title']/text()").extract()
        # No /text() here: the whole <p> element is extracted, so the logged
        # section text includes its markup.
        texts = hxs.select("//p[@class='article_text']").extract()

        if len(titles) != len(texts):
            self.log("Section count mismatch: %d titles, %d texts"
                     % (len(titles), len(texts)))

        # zip() pairs titles with texts positionally and stops at the shorter
        # list, so an unequal count can no longer index out of range.
        for title, text in zip(titles, texts):
            self.log("Section Title: %s" % title.strip())
            self.log(text)
	import re

	from scrapy.selector import HtmlXPathSelector
	from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
	from scrapy.contrib.spiders import CrawlSpider, Rule
	from linkeddata.items import LinkeddataItem

	class NextmediaSpider(CrawlSpider):
	    """Spider that logs the summary and sections of tw.nextmedia.com articles.

	    Pages whose URL matches ``applenews/article/art_id/`` are handed to
	    ``parse_content``; links found on those pages are not followed.
	    """

	    # NOTE(review): no start_urls is defined in this class -- confirm where
	    # the crawl is seeded.
	    name = 'nextmedia'
	    allowed_domains = ['tw.nextmedia.com']

	    rules = (
	        Rule(
	            SgmlLinkExtractor(allow=r'applenews/article/art_id/'),
	            callback='parse_content',
	            follow=False,
	        ),
	    )

	    def parse_content(self, response):
	        """Log the summary and every (section title, section text) pair.

	        Nothing is returned; the extracted values are only written to the
	        spider log. Missing elements are reported in the log instead of
	        raising ``IndexError``, so one malformed page does not abort the
	        callback.
	        """
	        hxs = HtmlXPathSelector(response)

	        # A page may have no <p class='summary'>; indexing [0] blindly would
	        # raise IndexError and skip the section logging below.
	        summaries = hxs.select("//p[@class='summary']/text()").extract()
	        if summaries:
	            self.log("Summary: %s" % summaries[0].strip())
	        else:
	            self.log("Summary not found: %s" % response.url)

	        titles = hxs.select("//h2[@class='article_title']/text()").extract()
	        # No /text() here: the whole <p> element is extracted, so the logged
	        # section text includes its markup.
	        texts = hxs.select("//p[@class='article_text']").extract()

	        if len(titles) != len(texts):
	            self.log("Section count mismatch: %d titles, %d texts"
	                     % (len(titles), len(texts)))

	        # zip() pairs titles with texts positionally and stops at the shorter
	        # list, so an unequal count can no longer index out of range.
	        for title, text in zip(titles, texts):
	            self.log("Section Title: %s" % title.strip())
	            self.log(text)