Last active
April 12, 2019 09:33
-
-
Save jorgehatccrma/c90fcd4c873eb8bf8933e99a20c0819e to your computer and use it in GitHub Desktop.
Lucene + Jython
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This simple Jython script shows how to use Apache Lucene directly,
importing its Java classes as if they were Python modules.
""" | |
# your usual Python imports | |
import sys | |
from contextlib import contextmanager | |
# Register the Lucene jars on the module search path so that the
# org.apache.lucene imports below can be resolved.
jars = [
    "lucene-7.1.0/core/lucene-core-7.1.0.jar",
    "lucene-7.1.0/queryparser/lucene-queryparser-7.1.0.jar",
]
sys.path.extend(jars)
# Now that the jars are on the path, we can import Java code as if it
# were regular Python!
from org.apache.lucene.analysis.standard import StandardAnalyzer | |
from org.apache.lucene.document import Document | |
from org.apache.lucene.document import Field | |
from org.apache.lucene.document import StringField | |
from org.apache.lucene.document import TextField | |
from org.apache.lucene.index import DirectoryReader | |
from org.apache.lucene.index import IndexWriter | |
from org.apache.lucene.index import IndexWriterConfig | |
from org.apache.lucene.queryparser.classic import QueryParser | |
from org.apache.lucene.search import IndexSearcher | |
from org.apache.lucene.store import RAMDirectory | |
class closing(object):
    """
    Context manager that makes Lucene's closeable classes usable in a
    ``with`` statement.

    Entering the block yields ``thing`` itself; leaving it (normally or
    through an exception) calls ``thing.close()``.
    """

    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing is returned, so an exception raised inside the block
        # keeps propagating after the resource has been closed.
        self.thing.close()
def make_index(analyzer, docs=None):
    """
    Create an in-memory inverted index to power the search.

    ``analyzer`` is the Lucene analyzer handed to the index writer
    configuration. ``docs`` is an optional iterable of ``(title, isbn)``
    string pairs to index; when omitted, a small sample catalogue of four
    books is used.

    Returns the ``RAMDirectory`` that holds the finished index.
    """
    if docs is None:
        # sample catalogue used when the caller supplies no documents
        docs = (
            ("Lucene in Action", "193398817"),
            ("Lucene for Dummies", "55320055Z"),
            ("Managing Gigabytes", "55063554A"),
            ("The Art of Computer Science", "9900333X"),
        )

    def add_doc(w, title, isbn):
        """ Utility to add "documents" to the index. """
        doc = Document()
        doc.add(TextField("title", title, Field.Store.YES))
        # use a string field for isbn because we don't
        # want it tokenized
        doc.add(StringField("isbn", isbn, Field.Store.YES))
        w.addDocument(doc)

    # create the index
    index = RAMDirectory()
    config = IndexWriterConfig(analyzer)
    # the writer is closed when the block exits, before the index is returned
    with closing(IndexWriter(index, config)) as w:
        for title, isbn in docs:
            add_doc(w, title, isbn)
    return index
def query(querystr, index, analyzer, hits_per_page=10):
    """
    Search for the `querystr` in the index and print the matches.

    ``querystr`` is parsed with Lucene's classic query parser, ``index``
    is the directory to search and ``analyzer`` is the analyzer given to
    the parser. ``hits_per_page`` is the maximum number of hits requested
    from the searcher (10 by default).

    Nothing is returned: the hit count and one line per hit (rank, isbn,
    title) are printed.
    """
    # the "title" arg specifies the default field to use
    # when no field is explicitly specified in the query.
    q = QueryParser("title", analyzer).parse(querystr)

    with closing(DirectoryReader.open(index)) as reader:
        searcher = IndexSearcher(reader)
        hits = searcher.search(q, hits_per_page).scoreDocs

        # display results (needs reader to be open)
        print("Found {:d} hits.".format(len(hits)))
        for rank, hit in enumerate(hits, 1):
            d = searcher.doc(hit.doc)
            print("{:d}. {}\t{}".format(rank, d.get("isbn"), d.get("title")))
if __name__ == "__main__":
    # A single analyzer instance is shared by indexing and searching so
    # that both sides tokenize text the same way.
    analyzer = StandardAnalyzer()

    # build the index that will be searched
    index = make_index(analyzer)

    # the first command-line argument is the query; fall back to "lucene"
    if len(sys.argv) > 1:
        querystr = sys.argv[1]
    else:
        querystr = "lucene"

    # run the search and print the hits
    query(querystr, index, analyzer)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment