""" | |
Parse the wikipedia dump into tiny parts. | |
""" | |
import sys, string | |
from xml.sax import saxutils, handler, make_parser | |
# --- The coroutines that feed CouchDB

def storer(database_name='wikipedia-de'):
    """Coroutine that receives lists of documents and bulk-inserts them into CouchDB."""
    from couchdb import client
    server = client.Server('http://localhost:5984/')
    if database_name not in server:
        print "Creating database %s" % database_name
        db = server.create(database_name)
    else:
        db = server[database_name]
    while True:
        docs = (yield)
        print "Bulk inserting %r documents ... " % len(docs),
        db.update(docs)
        print "done"
def grouper(storer, count=10000):
    """Coroutine that collects single pages and flushes them to the storer in batches."""
    bulk = []
    try:
        while True:
            page = (yield)
            print u"Adding %r" % page["title"]
            bulk.append(page)
            if len(bulk) == count:
                storer.send(bulk)
                bulk = []
    finally:
        # Make sure any leftover documents are stored when the coroutine is closed
        if bulk:
            storer.send(bulk)
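
# For reference, a <page> element in a MediaWiki XML export roughly follows the
# nesting matched in WikiParser.characters() below (an illustrative sketch, not
# copied from a real dump):
#
#   <mediawiki>
#     <siteinfo>...</siteinfo>
#     <page>
#       <title>Some article</title>
#       <id>42</id>
#       <restrictions>edit=sysop</restrictions>
#       <revision>
#         <id>4711</id>
#         <timestamp>2010-01-11T15:58:00Z</timestamp>
#         <contributor><username>SomeUser</username><id>7</id></contributor>
#         <minor />
#         <comment>some edit comment</comment>
#         <text>The wiki markup of the article ...</text>
#       </revision>
#     </page>
#   </mediawiki>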
# --- The ContentHandler

class WikiParser(handler.ContentHandler):
    """SAX handler that builds a dict per <page> element and sends it to a coroutine."""

    def __init__(self, storer):
        handler.ContentHandler.__init__(self)
        self.storer = storer
        self.path = []      # stack of element names below the <mediawiki> root
        self.page = dict()  # document currently being assembled

    # ContentHandler methods
    def startElement(self, name, attrs):
        if name == "mediawiki":
            return
        self.path.append(name)
        if name == "page":
            self.page = {'type': 'http://types.moinz.de/wikipedia/Article'}
        if name == "revision":
            self.page['revision'] = {}
        if name == "minor":
            self.page['revision']['minor'] = True
        if name == "contributor":
            self.page['revision']['contributor'] = {}
    def endElement(self, name):
        if name == "mediawiki":
            # The root element was never pushed onto the path, so don't pop for it.
            return
        self.path.pop()
        if name == "page":  # end of </page>
            self.storer.send(self.page)

    def _append(self, target, key, content):
        # SAX may deliver the text of one element in several chunks,
        # so concatenate instead of overwriting.
        target[key] = target.get(key, u'') + content

    def characters(self, content):
        # Metadata
        if self.path == ["page", "title"]:
            self._append(self.page, 'title', content)
        elif self.path == ["page", "id"]:
            self._append(self.page, 'dump-id', content)
        elif self.path == ["page", "restrictions"]:
            self._append(self.page, 'restrictions', content)
        elif self.path == ["page", "revision", "id"]:
            self._append(self.page['revision'], 'id', content)
        elif self.path == ["page", "revision", "text"]:
            self._append(self.page['revision'], 'text', content)
        elif self.path == ["page", "revision", "timestamp"]:
            self._append(self.page['revision'], 'timestamp', content)
        elif self.path == ["page", "revision", "comment"]:
            self._append(self.page['revision'], 'comment', content)
        # Contributor
        elif self.path == ["page", "revision", "contributor", "username"]:
            self._append(self.page['revision']['contributor'], 'username', content)
        elif self.path == ["page", "revision", "contributor", "ip"]:
            self._append(self.page['revision']['contributor'], 'ip', content)
        elif self.path == ["page", "revision", "contributor", "id"]:
            self._append(self.page['revision']['contributor'], 'id', content)
        # Rest
        elif content.strip() == "" or (self.path and self.path[0] == "siteinfo"):
            pass
        else:
            print "Unknown tag content for path %s: %s" % (self.path, content)

    # def ignorableWhitespace(self, content):
    #     self._out.write(content)
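
# Sketch of the document built for one <page> (the field names are exactly those
# assigned above; the values here are made up for illustration):
#
#   {'type': 'http://types.moinz.de/wikipedia/Article',
#    'title': u'Some article',
#    'dump-id': u'42',
#    'restrictions': u'edit=sysop',
#    'revision': {'id': u'4711',
#                 'timestamp': u'2010-01-11T15:58:00Z',
#                 'minor': True,
#                 'comment': u'some edit comment',
#                 'contributor': {'username': u'SomeUser', 'id': u'7'},
#                 'text': u'The wiki markup of the article ...'}}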
# --- The main program

# The storer performs bulk inserts of the articles into CouchDB
s = storer()
s.next()  # advance to the first yield so the coroutine can receive batches

# The grouper collects incoming documents into a list and sends that list on to the storer
g = grouper(s, 10000)
g.next()  # get the grouper ready

parser = make_parser()
parser.setContentHandler(WikiParser(g))
parser.parse(sys.argv[1])

# Closing the grouper runs its finally clause, which flushes any leftover documents
g.close()
s.close()
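
# After the run, the import can be checked with the couchdb client, e.g.
# (a rough sketch, assuming a local CouchDB and the default database name):
#
#   >>> from couchdb import client
#   >>> db = client.Server('http://localhost:5984/')['wikipedia-de']
#   >>> len(db)   # number of stored documents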