Skip to content

Instantly share code, notes, and snippets.

@chihchun
Created January 6, 2011 15:52
Show Gist options
  • Save chihchun/768050 to your computer and use it in GitHub Desktop.
Save chihchun/768050 to your computer and use it in GitHub Desktop.
spider to parse tw.nextmedia.com
import re
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from linkeddata.items import LinkeddataItem
class NextmediaSpider(CrawlSpider):
    """Crawl tw.nextmedia.com and log the contents of article pages.

    Links whose URL matches ``applenews/article/art_id/`` are handed to
    :meth:`parse_content`; links found on those article pages are not
    followed any further (``follow=False``).
    """

    name = 'nextmedia'
    allowed_domains = ['tw.nextmedia.com']
    rules = (
        Rule(
            SgmlLinkExtractor(allow=r'applenews/article/art_id/'),
            callback='parse_content',
            follow=False,
        ),
    )

    def parse_content(self, response):
        """Log the summary and every (section title, section text) pair.

        :param response: the downloaded article page.

        Nothing is returned; the extracted data is only written to the
        spider log.
        """
        hxs = HtmlXPathSelector(response)

        # An article may have no <p class="summary">; indexing [0] blindly
        # would raise IndexError and abort the callback before the sections
        # below are logged.
        summaries = hxs.select("//p[@class='summary']/text()").extract()
        if summaries:
            self.log("Summary: %s" % summaries[0].strip())
        else:
            self.log("No summary found in %s" % response.url)

        titles = hxs.select("//h2[@class='article_title']/text()").extract()
        texts = hxs.select("//p[@class='article_text']").extract()

        # zip() stops at the shorter list, so a page with more titles than
        # text paragraphs no longer raises IndexError.
        for title, text in zip(titles, texts):
            self.log("Section Title: %s" % title.strip())
            self.log(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment