Last active
August 29, 2015 14:13
-
-
Save dmiro/eeeaaf7015389f1f0dde to your computer and use it in GitHub Desktop.
scrapy (scraping) + whoosh (indexer) example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # http://stackoverflow.com/questions/23921986/web-scraping-without-knowledge-of-page-structure | |
# Module author; the value must be a quoted string to be valid Python.
_Author = "Farsheed Ashouri"
| import os | |
| import sys | |
| import re | |
| ## Spider libraries | |
| from scrapy.spider import BaseSpider | |
| from scrapy.selector import Selector | |
| from main.items import MainItem | |
| from scrapy.http import Request | |
| from urlparse import urljoin | |
| ## indexer libraries | |
| from whoosh.index import create_in, open_dir | |
| from whoosh.fields import * | |
| ## html to text conversion module | |
| import nltk | |
def open_writer(index_dir="indexdir"):
    """Return a Whoosh writer for the index stored in ``index_dir``.

    On first use the directory is created and a new index is built with two
    stored TEXT fields, ``title`` and ``content``. On later calls the
    existing index is reopened instead.

    :param index_dir: directory that holds the index (default "indexdir").
    :returns: the result of ``ix.writer()``; callers are expected to finish
        it with ``commit()``.
    """
    # Function-local import: the file header only brings in create_in and
    # open_dir from whoosh.index.
    from whoosh.index import exists_in

    if not os.path.isdir(index_dir):
        os.makedirs(index_dir)

    # Test for an actual index, not just the directory: a directory left
    # behind without an index in it would otherwise make open_dir() fail.
    if exists_in(index_dir):
        ix = open_dir(index_dir)
    else:
        schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
        ix = create_in(index_dir, schema)
    return ix.writer()
class Main(BaseSpider):
    """Spider that walks English Wikipedia articles and indexes their text.

    Starting from ``start_urls``, every response is parsed for its title and
    main content; the text is added to the Whoosh index returned by
    ``open_writer()`` and each article link found on the page is scheduled
    for crawling with this same callback.
    """

    name = "main"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = ["http://en.wikipedia.org/wiki/Snakes"]

    # Followable article URLs. The part after /wiki/ must not contain ":",
    # which excludes pages such as /wiki/Talk:Company. Compiled once here
    # instead of being re-evaluated for every link of every page.
    _article_url_re = re.compile(r'http://en\.wikipedia\.org/wiki/[^:]+$')

    def parse(self, response):
        """Index one page and yield follow-up requests plus a ``MainItem``.

        :param response: the Scrapy response for the fetched page.
        :returns: a generator yielding one ``Request`` per distinct article
            link on the page, then a ``MainItem`` with ``title`` and
            ``links``. Pages without a title yield nothing.
        """
        sel = Selector(response)

        titles = sel.xpath('//div[@id="content"]//h1[@id="firstHeading"]//span/text()').extract()
        if not titles:
            # Nothing to index. Bail out before an index writer is opened so
            # no uncommitted writer is left behind.
            return
        title = titles[0]

        contents = sel.xpath('//body/div[@id="content"]').extract()
        links = sel.xpath('//a/@href').extract()

        # Links already scheduled from *this* page (de-duplicates within a
        # single response only).
        crawled_links = set()
        for link in links:
            # Resolve relative hrefs against the page URL before filtering.
            url = urljoin(response.url, link)
            if url not in crawled_links and self._article_url_re.match(url):
                crawled_links.add(url)
                yield Request(url, self.parse)

        # Single-argument parenthesised print emits the same text on
        # Python 2 and Python 3.
        print('*' * 80)
        print('crawled: %s | it has %s links.' % (title, len(links)))
        print('*' * 80)

        # Only the text of the content block is stored. A page without the
        # content div is indexed with empty content rather than crashing.
        # NOTE(review): nltk.clean_html is believed to exist only in
        # NLTK 2.x (later releases raise NotImplementedError) - confirm the
        # installed version.
        text = nltk.clean_html(contents[0]) if contents else u''

        # Open the writer as late as possible and let the context manager
        # commit on success or cancel on error, so it is always released.
        with open_writer() as writer:
            writer.add_document(title=title, content=text)

        item = MainItem()
        item["title"] = title
        item["links"] = list(crawled_links)
        yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment