Created
November 30, 2016 11:43
-
-
Save mindis/9ff1790026f03d5299060ee1afaec1e5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install PyLucene from http://lucene.apache.org/pylucene/
import sys | |
import lucene | |
import os | |
from java.io import File | |
from java.nio.file import Paths | |
from itertools import izip | |
from lucene import JavaError | |
from org.apache.lucene.analysis.standard import StandardAnalyzer | |
from org.apache.lucene.analysis.core import WhitespaceAnalyzer | |
from org.apache.lucene.document import Document, Field, TextField, FieldType | |
from org.apache.lucene.search import FuzzyQuery, MultiTermQuery, IndexSearcher | |
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader, FieldInfo, IndexOptions,MultiReader, Term | |
from org.apache.lucene.store import RAMDirectory, SimpleFSDirectory | |
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer | |
from org.apache.lucene.search.spans import SpanNearQuery, SpanQuery, SpanTermQuery, SpanMultiTermQueryWrapper | |
from org.apache.lucene.queryparser.classic import MultiFieldQueryParser, QueryParser | |
# Start the JVM (in headless mode) before any Lucene class is used.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# On-disk index location; RAMDirectory could be used instead to keep the
# index in memory.
# NOTE: os.path.dirname() drops the last path component, so with the
# placeholder "/my/path" the index is stored in "/my".
storeDir = os.path.dirname(os.path.abspath("/my/path"))
store = SimpleFSDirectory(Paths.get(storeDir))

# Field settings: the text is tokenized and indexed with term frequencies
# and positions (the span queries used for searching rely on positions),
# but the original text is not stored in the index.
t2 = FieldType()
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

# Tokens are split on whitespace only. OpenMode.CREATE builds a fresh index,
# replacing any index that already exists in the directory.
analyzer = WhitespaceAnalyzer()
config = IndexWriterConfig(analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
def addDoc(text, writer):
    """Index ``text`` as one new document.

    The text is added under the field name "field" using the module-level
    field type ``t2``.

    :param text: the content to index.
    :param writer: the IndexWriter the document is added to.
    """
    doc = Document()
    doc.add(Field("field", text, t2))
    writer.addDocument(doc)
# add your logic to add documents to index
try:
    for text in ("bananas loose", "organic bananas", "kids bananas"):
        addDoc(text, writer)
    writer.commit()
except Exception:
    # Indexing failed: discard uncommitted changes. rollback() also closes
    # the writer, so the index write lock is not left held.
    writer.rollback()
    raise
else:
    writer.close()
searcher = IndexSearcher(DirectoryReader.open(store))

# query in simple way: one fuzzy clause per (misread) word, combined into a
# span-near query with slop 50 and no ordering requirement.
clauses = [
    SpanMultiTermQueryWrapper(FuzzyQuery(Term("field", "b1na3as"))),
    SpanMultiTermQueryWrapper(FuzzyQuery(Term("field", "k12s"))),
]
query = SpanNearQuery(clauses, 50, False)
# Only the single best-scoring hit is requested.
hits = searcher.search(query, 1).scoreDocs
print(hits)
# search query automated: split the OCR text on whitespace and wrap every
# word in a fuzzy span clause.
ocr_terms = "b1n3nas l12se".split()
ocr_query = [
    SpanMultiTermQueryWrapper(FuzzyQuery(Term("field", term)))
    for term in ocr_terms
]
# Same slop (50) and unordered matching as the hand-built query.
query = SpanNearQuery(ocr_query, 50, False)
# Only the single best-scoring hit is requested.
hits = searcher.search(query, 1).scoreDocs
print(hits)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment