dmiro · August 29, 2015 14:13
diff --git a/main.py b/main.py
 # http://stackoverflow.com/questions/23921986/web-scraping-without-knowledge-of-page-structure
 _Author = Farsheed Ashouri
 import os
 import sys
 import re
 ## Spider libraries
 from scrapy.spider import BaseSpider
 from scrapy.selector import Selector
 from main.items import MainItem
 from scrapy.http import Request
 from urlparse import urljoin
 ## indexer libraries
 from whoosh.index import create_in, open_dir
 from whoosh.fields import *
 ## html to text conversion module
 import nltk

 def open_writer():
    """Return a Whoosh writer for the index stored in ``indexdir``.

    The directory is created when missing.  A new index with stored
    ``title`` and ``content`` TEXT fields is created when the directory
    does not contain an index yet; otherwise the existing index is opened.

    Returns:
        The object returned by ``ix.writer()`` for that index.
    """
    # Local import: the module header only imports create_in / open_dir.
    from whoosh.index import exists_in

    index_dir = "indexdir"
    if not os.path.isdir(index_dir):
        os.mkdir(index_dir)

    # Test for an actual index rather than just the directory, so a
    # directory that exists but is empty no longer breaks open_dir().
    if exists_in(index_dir):
        ix = open_dir(index_dir)
    else:
        schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
        ix = create_in(index_dir, schema)
    return ix.writer()

 class Main(BaseSpider):
    """Spider that walks English Wikipedia articles and indexes their text.

    Each response is parsed for its heading and main content block.  The
    content is added to the index returned by ``open_writer`` and every
    article link found on the page is scheduled with this same callback.
    """

    name = "main"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = ["http://en.wikipedia.org/wiki/Snakes"]

    def parse(self, response):
        """Index one article page and follow its article links.

        Args:
            response: the downloaded page handed over by Scrapy.

        Yields:
            A ``Request`` for every distinct article URL linked from the
            page, followed by one ``MainItem`` carrying the page title and
            the list of URLs that were requested.  Pages without a title
            heading yield nothing.
        """
        sel = Selector(response)

        titles = sel.xpath('//div[@id="content"]//h1[@id="firstHeading"]//span/text()').extract()
        if not titles:
            # Not an article page: nothing to index, nothing to follow.
            return
        title = titles[0]

        contents = sel.xpath('//body/div[@id="content"]').extract()
        # Fall back to empty text so a page without the content div no
        # longer hits an unbound ``content`` further down.
        content = contents[0] if contents else u""

        links = sel.xpath('//a/@href').extract()

        # Article URLs only: anything with ":" after /wiki/ (e.g.
        # /wiki/Talk:Company) is skipped.  Dots are escaped and https is
        # accepted in addition to http.
        article_url = re.compile(r'https?://en\.wikipedia\.org/wiki/[^:]+$')

        # URLs already requested from *this* page (avoids duplicate requests
        # when the same link appears several times in the document).
        crawledLinks = set()
        for link in links:
            url = urljoin(response.url, link)
            if url not in crawledLinks and article_url.match(url):
                crawledLinks.add(url)
                yield Request(url, self.parse)

        item = MainItem()
        item["title"] = title
        # Parenthesised single-argument prints emit the same text on
        # Python 2 and Python 3.
        print('*' * 80)
        print('crawled: %s | it has %s links.' % (title, len(links)))
        print('*' * 80)
        item["links"] = list(crawledLinks)

        # Only the text of the content block is stored.
        # NOTE(review): nltk.clean_html is reportedly unavailable in NLTK 3.x
        # -- confirm the installed version still provides it.
        text = nltk.clean_html(content)

        # Open the writer only now, so it is not left open across the yields
        # above or on the early return.  Used as a context manager so the
        # document is committed on success and the writer released on error.
        with open_writer() as writer:
            writer.add_document(title=title, content=text)
        yield item
	# http://stackoverflow.com/questions/23921986/web-scraping-without-knowledge-of-page-structure
	_Author = Farsheed Ashouri
	import os
	import sys
	import re
	## Spider libraries
	from scrapy.spider import BaseSpider
	from scrapy.selector import Selector
	from main.items import MainItem
	from scrapy.http import Request
	from urlparse import urljoin
	## indexer libraries
	from whoosh.index import create_in, open_dir
	from whoosh.fields import *
	## html to text conversion module
	import nltk

	def open_writer():
	    """Return a Whoosh writer for the index stored in ``indexdir``.

	    The directory is created when missing.  A new index with stored
	    ``title`` and ``content`` TEXT fields is created when the directory
	    does not contain an index yet; otherwise the existing index is opened.

	    Returns:
	        The object returned by ``ix.writer()`` for that index.
	    """
	    # Local import: the module header only imports create_in / open_dir.
	    from whoosh.index import exists_in

	    index_dir = "indexdir"
	    if not os.path.isdir(index_dir):
	        os.mkdir(index_dir)

	    # Test for an actual index rather than just the directory, so a
	    # directory that exists but is empty no longer breaks open_dir().
	    if exists_in(index_dir):
	        ix = open_dir(index_dir)
	    else:
	        schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
	        ix = create_in(index_dir, schema)
	    return ix.writer()

	class Main(BaseSpider):
	    """Spider that walks English Wikipedia articles and indexes their text.

	    Each response is parsed for its heading and main content block.  The
	    content is added to the index returned by ``open_writer`` and every
	    article link found on the page is scheduled with this same callback.
	    """

	    name = "main"
	    allowed_domains = ["en.wikipedia.org"]
	    start_urls = ["http://en.wikipedia.org/wiki/Snakes"]

	    def parse(self, response):
	        """Index one article page and follow its article links.

	        Args:
	            response: the downloaded page handed over by Scrapy.

	        Yields:
	            A ``Request`` for every distinct article URL linked from the
	            page, followed by one ``MainItem`` carrying the page title and
	            the list of URLs that were requested.  Pages without a title
	            heading yield nothing.
	        """
	        sel = Selector(response)

	        titles = sel.xpath('//div[@id="content"]//h1[@id="firstHeading"]//span/text()').extract()
	        if not titles:
	            # Not an article page: nothing to index, nothing to follow.
	            return
	        title = titles[0]

	        contents = sel.xpath('//body/div[@id="content"]').extract()
	        # Fall back to empty text so a page without the content div does
	        # not hit an unbound ``content`` further down.
	        content = contents[0] if contents else u""

	        links = sel.xpath('//a/@href').extract()

	        # Article URLs only: anything with ":" after /wiki/ (e.g.
	        # /wiki/Talk:Company) is skipped.  Dots are escaped and https is
	        # accepted in addition to http.
	        article_url = re.compile(r'https?://en\.wikipedia\.org/wiki/[^:]+$')

	        # URLs already requested from *this* page (avoids duplicate requests
	        # when the same link appears several times in the document).
	        crawledLinks = set()
	        for link in links:
	            url = urljoin(response.url, link)
	            if url not in crawledLinks and article_url.match(url):
	                crawledLinks.add(url)
	                yield Request(url, self.parse)

	        item = MainItem()
	        item["title"] = title
	        # Parenthesised single-argument prints emit the same text on
	        # Python 2 and Python 3.
	        print('*' * 80)
	        print('crawled: %s | it has %s links.' % (title, len(links)))
	        print('*' * 80)
	        item["links"] = list(crawledLinks)

	        # Only the text of the content block is stored.
	        # NOTE(review): nltk.clean_html is reportedly unavailable in NLTK 3.x
	        # -- confirm the installed version still provides it.
	        text = nltk.clean_html(content)

	        # Open the writer only now, so it is not left open across the yields
	        # above or on the early return.  Used as a context manager so the
	        # document is committed on success and the writer released on error.
	        with open_writer() as writer:
	            writer.add_document(title=title, content=text)
	        yield item
No results found