Skip to content

Instantly share code, notes, and snippets.

@maedoc
Created August 22, 2013 16:14
Show Gist options
  • Save maedoc/6309374 to your computer and use it in GitHub Desktop.
Save maedoc/6309374 to your computer and use it in GitHub Desktop.
# coding: utf-8
import xml.dom.minidom as md
import sys; sys.path.append('/usr/local/lib/python2.7/dist-packages')
import whoosh
from whoosh.index import create_in
from whoosh.fields import *
# Parse the posts dump and collect the element children of the root node
# (one element per post row).
posts = md.parse('Posts.xml')
# documentElement is the root element even when the document starts with a
# comment or doctype node, which childNodes[0] would pick up instead.
# Filtering on ELEMENT_NODE (rather than "not Text") also drops comment
# nodes, which have no getAttribute() and would break the loops below.
nontext = [n for n in posts.documentElement.childNodes
           if n.nodeType == n.ELEMENT_NODE]
# Show the attributes of the first row as a quick look at the data layout;
# skipped when the dump contains no rows.
if nontext:
    for k, v in nontext[0].attributes.items():
        print('%s %s' % (k, v))
# Index schema for a post. Only id, title and tags are declared with
# stored=True; body is passed as the bare TEXT field type.
post_schema = Schema(
    body=TEXT,
    id=ID(stored=True),
    title=TEXT(stored=True),
    tags=TEXT(stored=True),
)
# Create the index in the current directory and add one document per post.
ix = create_in("./", post_schema)
writer = ix.writer()
for p in nontext:
    # Schema field names are lower-case ('body', 'id', ...) while the XML
    # attributes are capitalised ('Body', 'Id', ...), hence .title().
    # add_document() receives the fields as keyword arguments, so doc must
    # be a dict keyed by field name; a set of values cannot be **-unpacked.
    doc = {
        k: unicode(p.getAttribute(k.title()))
        for k in post_schema.names()
    }
    writer.add_document(**doc)
    # Progress output: the Id of the post just added.
    print(p.getAttribute('Id'))
# Commit once, after every post has been added.
writer.commit()
from whoosh.qparser import QueryParser as QP
def dq(term):
    """Run a query against the "body" field of the module-level index.

    term: query string, parsed with a QueryParser built on ix.schema.

    Returns whatever searcher.search() returns for the parsed query.
    """
    query = QP("body", ix.schema).parse(term)
    with ix.searcher() as searcher:
        # NOTE(review): the searcher is closed when this block exits, so the
        # returned results outlive it -- confirm callers only need what is
        # still usable after close (e.g. for timing the query).
        return searcher.search(query)
# %timeit dq('javascript')
#
# 1 - 10 ms per query
# quick word frequency analysis
from collections import Counter
# Count how often each whitespace-separated token occurs across all post
# bodies.
wf = Counter()
for p in nontext:
    wf.update(p.getAttribute('Body').split())
from pylab import *
# Plot the token counts, most frequent first, on log-log axes.
ranked_counts = [count for _word, count in wf.most_common()]
loglog(ranked_counts)
show()
# power law ?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment