Skip to content

Instantly share code, notes, and snippets.

@mindis
Created November 30, 2016 11:43
Show Gist options
  • Save mindis/9ff1790026f03d5299060ee1afaec1e5 to your computer and use it in GitHub Desktop.
Save mindis/9ff1790026f03d5299060ee1afaec1e5 to your computer and use it in GitHub Desktop.
# Requires PyLucene; install it from http://lucene.apache.org/pylucene/
import sys
import lucene
import os
from java.io import File
from java.nio.file import Paths
from itertools import izip
from lucene import JavaError
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.document import Document, Field, TextField, FieldType
from org.apache.lucene.search import FuzzyQuery, MultiTermQuery, IndexSearcher
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader, FieldInfo, IndexOptions,MultiReader, Term
from org.apache.lucene.store import RAMDirectory, SimpleFSDirectory
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.search.spans import SpanNearQuery, SpanQuery, SpanTermQuery, SpanMultiTermQueryWrapper
from org.apache.lucene.queryparser.classic import MultiFieldQueryParser, QueryParser
# The JVM is started first; the Lucene objects below are created afterwards.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# Index location on disk: the parent directory of the absolute path below.
# A RAMDirectory could be used instead to keep the index in memory.
storeDir = os.path.dirname(os.path.abspath("/my/path"))
store = SimpleFSDirectory(Paths.get(storeDir))

# Writer: whitespace analyzer, opened in CREATE mode.
analyzer = WhitespaceAnalyzer()
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)

# Field settings used for every indexed document: tokenized, not stored,
# indexed with documents, term frequencies and positions.
t2 = FieldType()
t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
t2.setTokenized(True)
t2.setStored(False)
def addDoc(text, writer):
    """Add `text` to the index as a new single-field document.

    The text is placed in the field named "field", configured with the
    module-level field type `t2`.

    Args:
        text: The string content to index.
        writer: The index writer; only its addDocument() method is called.
    """
    doc = Document()
    doc.add(Field("field", text, t2))
    writer.addDocument(doc)
# add your logic to add documents to index
try:
    for text in ("bananas loose", "organic bananas", "kids bananas"):
        addDoc(text, writer)
    writer.commit()
finally:
    # Close the writer even if adding or committing fails, so it is never
    # left open.
    writer.close()

# Open a searcher over the index that was just written.
searcher = IndexSearcher(DirectoryReader.open(store))
# Simple query: two fuzzy-matched terms combined into one span-near query.
clauses = [
    SpanMultiTermQueryWrapper(FuzzyQuery(Term("field", "b1na3as"))),
    SpanMultiTermQueryWrapper(FuzzyQuery(Term("field", "k12s"))),
]
# Arguments: clauses, slop of 50, inOrder=False.
query = SpanNearQuery(clauses, 50, False)
# Ask for the single best hit only.
hits = searcher.search(query, 1).scoreDocs
print(hits)
# Automated query: build one fuzzy span clause per whitespace-separated term
# of the input string, then combine them exactly as in the simple query.
ocr_terms = "b1n3nas l12se".split()
ocr_query = [
    SpanMultiTermQueryWrapper(FuzzyQuery(Term("field", term)))
    for term in ocr_terms
]
# Arguments: clauses, slop of 50, inOrder=False.
query = SpanNearQuery(ocr_query, 50, False)
# Ask for the single best hit only.
hits = searcher.search(query, 1).scoreDocs
print(hits)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment