December 30, 2012 16:32
diff --git a/wiki2couch.py b/wiki2couch.py
 # Imports a Wikipedia XML dump into a CouchDB database
 # ... and makes it searchable with Apache Lucene (via couchdb-lucene)

 # REQUIREMENTS:
 # - CouchDB
 # - couchdb-lucene (https://github.com/rnewson/couchdb-lucene)
 # - couchdb-python (http://code.google.com/p/couchdb-python/)

 # Run CouchDB, start this script, run couchdb-lucene while script is running
 # Trigger Indexing by Sample Query:
 #   http://127.0.0.1:5984/_fti/local/simple/_design/search/by_text
 #   ?q=information&limit=10



 #import xml.etree.ElementTree as etree # for python-builtin
 from lxml import etree # when LXML installed
 import couchdb
 import string

 #WIKI   = "./Wikipedia.Articles.2012.xml"
 # Path of the XML dump that the importer at the bottom of this file reads.
 WIKI = "./simplewiki-20121220-pages-articles.xml"
 # Name of the CouchDB database that receives the articles.
 DB = "simple" # "wiki"

 server = couchdb.Server("http://127.0.0.1:5984/")
 # Start every run from an empty database: drop any previous copy, then
 # recreate it.  The only failure expected here is "database does not exist"
 # (first run); anything else, such as missing permissions, should surface
 # instead of being silently swallowed.
 try:
     server.delete(DB)
 except couchdb.ResourceNotFound:
     pass
 server.create(DB)
 couch = server[DB]

 # ------------------------------------------------------------------------------
 # LUCENE SETUP

 # Design document "_design/search" with one full-text index per article
 # field.  The two index functions differ only in the field they add, so both
 # are rendered from a single template (whitespace kept exactly as stored).
 _INDEX_TEMPLATE = """
                function(doc) {
                    var ret = new Document();
                    ret.add(doc.%s);
                    return ret
                }
                """

 _fulltext = dict(
     ("by_" + field, {"index": _INDEX_TEMPLATE % field})
     for field in ("title", "text")
 )
 couch.save({"_id": "_design/search", "fulltext": _fulltext})

 # ------------------------------------------------------------------------------
 # XML IMPORTER

 class WikiParser(object):
     """Parser target that saves every article of a wiki XML dump.

     An instance is handed to ``etree.XMLParser(target=...)``; the parser then
     calls ``start``/``data``/``end`` for each element.  Character data of
     elements whose tag ends in ``title`` or ``text`` is collected, and when a
     ``text`` element closes, the pair is saved as one document whose ``_id``
     is the stripped title.  Empty texts and redirect stubs are skipped.

     Args:
         db: Object with a ``save(doc)`` method that receives the documents.
             Defaults to the module-level ``couch`` database, which is looked
             up only when a document is actually saved.
     """

     def __init__(self, db=None):
         self.db = db
         # Initialise all state up front so that no callback can hit a
         # missing attribute, whatever element the document starts with.
         self.title = ""
         self.text = ""
         self.is_title = False
         self.is_text = False

     @staticmethod
     def _is_redirect(text):
         """Return True if *text* is a redirect stub rather than an article."""
         # MediaWiki matches the redirect keyword case-insensitively, so
         # "#redirect [[X]]" has to be skipped just like "#REDIRECT [[X]]".
         return text.lstrip().upper().startswith('#REDIRECT')

     def start(self, tag, attrib):
         """Begin capturing if *tag* is a text or title element."""
         # The flags are mutually exclusive and both are assigned on every
         # start, so a stale flag can never route article text into the title.
         self.is_text = tag.endswith('text')
         self.is_title = not self.is_text and tag.endswith('title')
         if self.is_text:
             self.text = ""
         elif self.is_title:
             self.title = ""

     def end(self, tag):
         """Stop capturing; when a text element closes, save the article."""
         self.is_title = self.is_text = False
         if not tag.endswith('text'):
             return
         if not self.text or self._is_redirect(self.text):
             return

         title = self.title.strip()
         doc = {'_id': title, 'title': title, 'text': self.text.strip()}
         db = couch if self.db is None else self.db
         try:
             db.save(doc)
         except Exception as exc:
             # Best effort: one article that cannot be stored must not abort
             # the whole import, but the failure should be visible.
             import logging
             logging.warning("could not save article %r: %s", title, exc)

     def data(self, data):
         """Append character *data* to whichever field is being captured."""
         if self.is_title:
             self.title += data
         elif self.is_text:
             self.text += data

     def close(self):
         """Called by the parser at the end of input; nothing to finalise."""
         pass

 # Feed the dump to the XML parser; the WikiParser target receives the
 # element events and stores the articles.  ``open`` replaces the
 # Python-2-only ``file`` builtin, and binary mode leaves decoding to the
 # XML parser.
 with open(WIKI, "rb") as wiki:
     parser = etree.XMLParser(target=WikiParser())
     etree.parse(wiki, parser)
	# Imports a Wikipedia XML dump into a CouchDB database
	# ... and makes it searchable with Apache Lucene (via couchdb-lucene)

	# REQUIREMENTS:
	# - CouchDB
	# - couchdb-lucene (https://github.com/rnewson/couchdb-lucene)
	# - couchdb-python (http://code.google.com/p/couchdb-python/)

	# Run CouchDB, start this script, run couchdb-lucene while script is running
	# Trigger Indexing by Sample Query:
	# http://127.0.0.1:5984/_fti/local/simple/_design/search/by_text
	# ?q=information&limit=10



	#import xml.etree.ElementTree as etree # for python-builtin
	from lxml import etree # when LXML installed
	import couchdb
	import string

	#WIKI = "./Wikipedia.Articles.2012.xml"
	# Path of the XML dump that the importer at the bottom of this file reads.
	WIKI = "./simplewiki-20121220-pages-articles.xml"
	# Name of the CouchDB database that receives the articles.
	DB = "simple" # "wiki"

	server = couchdb.Server("http://127.0.0.1:5984/")
	# Start every run from an empty database: drop any previous copy, then
	# recreate it.  The only failure expected here is "database does not exist"
	# (first run); anything else, such as missing permissions, should surface
	# instead of being silently swallowed.
	try:
	    server.delete(DB)
	except couchdb.ResourceNotFound:
	    pass
	server.create(DB)
	couch = server[DB]

	# ------------------------------------------------------------------------------
	# LUCENE SETUP

	# Design document "_design/search" with one full-text index per article
	# field.  The two index functions differ only in the field they add, so both
	# are rendered from a single template.  The template is spelled with
	# explicit escapes so the stored whitespace stays exactly as before.
	_INDEX_TEMPLATE = (
	    "\n"
	    "\tfunction(doc) {\n"
	    "\tvar ret = new Document();\n"
	    "\tret.add(doc.%s);\n"
	    "\treturn ret\n"
	    "\t}\n"
	    "\t"
	)

	_fulltext = dict(
	    ("by_" + field, {"index": _INDEX_TEMPLATE % field})
	    for field in ("title", "text")
	)
	couch.save({"_id": "_design/search", "fulltext": _fulltext})

	# ------------------------------------------------------------------------------
	# XML IMPORTER

	class WikiParser(object):
	    """Parser target that saves every article of a wiki XML dump.

	    An instance is handed to ``etree.XMLParser(target=...)``; the parser then
	    calls ``start``/``data``/``end`` for each element.  Character data of
	    elements whose tag ends in ``title`` or ``text`` is collected, and when a
	    ``text`` element closes, the pair is saved as one document whose ``_id``
	    is the stripped title.  Empty texts and redirect stubs are skipped.

	    Args:
	        db: Object with a ``save(doc)`` method that receives the documents.
	            Defaults to the module-level ``couch`` database, which is looked
	            up only when a document is actually saved.
	    """

	    def __init__(self, db=None):
	        self.db = db
	        # Initialise all state up front so that no callback can hit a
	        # missing attribute, whatever element the document starts with.
	        self.title = ""
	        self.text = ""
	        self.is_title = False
	        self.is_text = False

	    @staticmethod
	    def _is_redirect(text):
	        """Return True if *text* is a redirect stub rather than an article."""
	        # MediaWiki matches the redirect keyword case-insensitively, so
	        # "#redirect [[X]]" has to be skipped just like "#REDIRECT [[X]]".
	        return text.lstrip().upper().startswith('#REDIRECT')

	    def start(self, tag, attrib):
	        """Begin capturing if *tag* is a text or title element."""
	        # The flags are mutually exclusive and both are assigned on every
	        # start, so a stale flag can never route article text into the title.
	        self.is_text = tag.endswith('text')
	        self.is_title = not self.is_text and tag.endswith('title')
	        if self.is_text:
	            self.text = ""
	        elif self.is_title:
	            self.title = ""

	    def end(self, tag):
	        """Stop capturing; when a text element closes, save the article."""
	        self.is_title = self.is_text = False
	        if not tag.endswith('text'):
	            return
	        if not self.text or self._is_redirect(self.text):
	            return

	        title = self.title.strip()
	        doc = {'_id': title, 'title': title, 'text': self.text.strip()}
	        db = couch if self.db is None else self.db
	        try:
	            db.save(doc)
	        except Exception as exc:
	            # Best effort: one article that cannot be stored must not abort
	            # the whole import, but the failure should be visible.
	            import logging
	            logging.warning("could not save article %r: %s", title, exc)

	    def data(self, data):
	        """Append character *data* to whichever field is being captured."""
	        if self.is_title:
	            self.title += data
	        elif self.is_text:
	            self.text += data

	    def close(self):
	        """Called by the parser at the end of input; nothing to finalise."""
	        pass

	# Feed the dump to the XML parser; the WikiParser target receives the
	# element events and stores the articles.  ``open`` replaces the
	# Python-2-only ``file`` builtin, and binary mode leaves decoding to the
	# XML parser.
	with open(WIKI, "rb") as wiki:
	    parser = etree.XMLParser(target=WikiParser())
	    etree.parse(wiki, parser)