Created
December 30, 2012 16:32
-
-
Save anonymous/4413657 to your computer and use it in GitHub Desktop.
Imports Wikipedia (given as article dump) into a CouchDB and adds couchdb-lucene search capabilities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports a Wikipedia article dump into a CouchDB database
# ... and makes it searchable with Apache Lucene (via couchdb-lucene).
# REQUIREMENTS:
# - CouchDB
# - couchdb-lucene (https://github.com/rnewson/couchdb-lucene)
# - couchdb-python (http://code.google.com/p/couchdb-python/)
# - lxml (or switch the import below to the built-in xml.etree.ElementTree)
# USAGE: run CouchDB, start this script, then run couchdb-lucene while the
# script is still running.
# Trigger indexing with a sample query:
# http://127.0.0.1:5984/_fti/local/simple/_design/search/by_text
# ?q=information&limit=10
import logging
import string

import couchdb
#import xml.etree.ElementTree as etree # for python-builtin
from lxml import etree # when LXML installed
# Path of the Wikipedia article dump that gets imported.
#WIKI = "./Wikipedia.Articles.2012.xml"
WIKI = "./simplewiki-20121220-pages-articles.xml"
# Name of the CouchDB database that receives the articles.
DB = "simple" # "wiki"

server = couchdb.Server("http://127.0.0.1:5984/")

# Always start from an empty database: drop the result of a previous run,
# then create the database again.
try:
    server.delete(DB)
except couchdb.http.ResourceNotFound:
    # Nothing to drop on the first run. Any other failure (server not
    # reachable, missing permissions, ...) is a real problem and is allowed
    # to propagate instead of being silently swallowed.
    pass
server.create(DB)
couch = server[DB]
# ------------------------------------------------------------------------------ | |
# LUCENE SETUP | |
# JavaScript source of the two full-text index functions. Each one wraps a
# single field of a stored document (its title or its text) in a new Document
# and returns it.
_INDEX_BY_TITLE_JS = """
function(doc) {
var ret = new Document();
ret.add(doc.title);
return ret
}
"""

_INDEX_BY_TEXT_JS = """
function(doc) {
var ret = new Document();
ret.add(doc.text);
return ret
}
"""

# Design document "_design/search": its "fulltext" section publishes the two
# index functions under the names "by_title" and "by_text".
_search_design = {"_id": "_design/search"}
_search_design["fulltext"] = {
    "by_title": {"index": _INDEX_BY_TITLE_JS},
    "by_text": {"index": _INDEX_BY_TEXT_JS},
}
couch.save(_search_design)
# ------------------------------------------------------------------------------ | |
# XML IMPORTER | |
class WikiParser(object):
    """Parser target that stores each article of the dump as a CouchDB document.

    An instance is handed to ``etree.XMLParser(target=...)``; the parser then
    calls ``start``/``data``/``end`` while it reads the XML. The character
    data of every element whose tag ends with ``title`` or ``text`` is
    collected, and when a ``text`` element closes, a document
    ``{'_id': title, 'title': title, 'text': text}`` is saved.

    Args:
        db: Object with a ``save(doc)`` method that receives the documents.
            When omitted, the module-level ``couch`` database is used.
    """

    def __init__(self, db=None):
        self.db = db
        # Initialise all parser state up front so that data()/end() are safe
        # to call on a fresh instance.
        self.is_title = False
        self.is_text = False
        self.title = ""
        self.text = ""

    def start(self, tag, attrib):
        """Begin collecting character data for a ``title`` or ``text`` element.

        Tags are matched by suffix -- presumably because the parser reports
        them namespace-qualified (``{namespace}text``); confirm against the
        dump format before tightening the match.
        """
        if tag.endswith('text'):
            self.is_text = True
            self.text = ""
        elif tag.endswith('title'):
            self.is_title = True
            self.title = ""
        else:
            self.is_title = self.is_text = False

    def end(self, tag):
        """Save the collected article when a ``text`` element closes.

        Empty texts and texts starting with ``#REDIRECT`` are skipped. A
        failing save is logged and the import continues with the next
        article (best effort), but it is no longer silently discarded.
        """
        if tag.endswith('text') and self.text and \
                not self.text.startswith('#REDIRECT'):
            title = self.title.strip()
            doc = {'_id': title, 'title': title, 'text': self.text.strip()}
            try:
                target = couch if self.db is None else self.db
                target.save(doc)
            except Exception as exc:
                # `Exception` rather than a bare `except`, so Ctrl-C and
                # SystemExit still stop a long-running import.
                logging.getLogger(__name__).warning(
                    "could not store article %r: %s", title, exc)
        self.is_title = self.is_text = False

    def data(self, data):
        """Append a chunk of character data to the element being collected."""
        if self.is_title:
            self.title += data
        elif self.is_text:
            self.text += data

    def close(self):
        """Called by the parser at the end of the input; nothing to finalise."""
        pass
# Stream the dump through the parser; WikiParser saves each article as its
# <text> element closes.
# `open` replaces the Python-2-only `file` builtin. Binary mode hands the raw
# bytes to the XML parser, which then does the decoding itself.
with open(WIKI, "rb") as wiki:
    parser = etree.XMLParser(target=WikiParser())
    etree.parse(wiki, parser)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment