# alexstorer, August 24, 2012
# For PyLucene!
# This file demonstrates how to...
# 1) Make a new analyzer
# 2) Make a new filter
# 3) Apply an analyzer chain to a string (via a query)
# 4) Include phrases as tokens
from lucene import *
class AnalyzerUtils(object):

    def main(cls, argv):
        print "SimpleAnalyzer"
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")
        print "\n----"
        print "StandardAnalyzer"
        cls.displayTokensWithFullDetails(StandardAnalyzer(Version.LUCENE_CURRENT),
                                         "I'll e-mail you at [email protected]")

    def setPositionIncrement(cls, source, posIncr):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        attr.setPositionIncrement(posIncr)

    def getPositionIncrement(cls, source):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        return attr.getPositionIncrement()

    def setTerm(cls, source, term):
        attr = source.addAttribute(TermAttribute.class_)
        attr.setTermBuffer(term)

    def getTerm(cls, source):
        attr = source.addAttribute(TermAttribute.class_)
        return attr.term()

    def setType(cls, source, type):
        attr = source.addAttribute(TypeAttribute.class_)
        attr.setType(type)

    def getType(cls, source):
        attr = source.addAttribute(TypeAttribute.class_)
        return attr.type()
    def displayTokens(cls, analyzer, text):
        tokenStream = analyzer.tokenStream("contents", StringReader(text))
        term = tokenStream.addAttribute(TermAttribute.class_)
        while tokenStream.incrementToken():
            print "[%s]" % (term.term()),

    def displayTokensWithPositions(cls, analyzer, text):
        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" % (position),
            print "[%s]" % (term.term()),
        print

    def displayTokensWithFullDetails(cls, analyzer, text):
        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        offset = stream.addAttribute(OffsetAttribute.class_)
        type = stream.addAttribute(TypeAttribute.class_)
        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" % (position),
            print "[%s:%d->%d:%s]" % (term.term(),
                                      offset.startOffset(),
                                      offset.endOffset(),
                                      type.type()),
        print
    def assertAnalyzesTo(cls, analyzer, input, outputs):
        stream = analyzer.tokenStream("field", StringReader(input))
        termAttr = stream.addAttribute(TermAttribute.class_)
        for output in outputs:
            if not stream.incrementToken():
                raise AssertionError, 'stream.incrementToken()'
            if output != termAttr.term():
                raise AssertionError, 'output == termAttr.term()'
        if stream.incrementToken():
            raise AssertionError, 'not stream.incrementToken()'
        stream.close()
    main = classmethod(main)
    setPositionIncrement = classmethod(setPositionIncrement)
    getPositionIncrement = classmethod(getPositionIncrement)
    setTerm = classmethod(setTerm)
    getTerm = classmethod(getTerm)
    setType = classmethod(setType)
    getType = classmethod(getType)
    displayTokens = classmethod(displayTokens)
    displayTokensWithPositions = classmethod(displayTokensWithPositions)
    displayTokensWithFullDetails = classmethod(displayTokensWithFullDetails)
    assertAnalyzesTo = classmethod(assertAnalyzesTo)
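
# A quick, untested sketch of how the helpers above might be used to inspect
# an analyzer chain (AnalyzerUtils.main does the same for two stock analyzers):
#
#   AnalyzerUtils.displayTokensWithFullDetails(
#       StandardAnalyzer(Version.LUCENE_CURRENT), "buffalo wings over boston")
#
# which prints each token with its position, start/end offsets and type.
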
class PhraseFilter(PythonTokenFilter):
    '''
    PhraseFilter is a TokenFilter that adds in phrases (as tokens) that match
    user-defined phrases. You can then use these tokens when exporting a TDM.
    '''
    TOKEN_TYPE_SYNONYM = "SYNONYM"
    TOKEN_TYPE_PHRASE = "PHRASE"

    def __init__(self, inStream, allPhrases):
        super(PhraseFilter, self).__init__(inStream)
        self.synonymStack = []
        self.termAttr = self.addAttribute(TermAttribute.class_)
        self.save = inStream.cloneAttributes()
        self.inStream = inStream
        # store each phrase as a reversed list of words, so that phrase[0] is
        # the last word, phrase[1] the one before it, and so on
        revSplitPhrases = []
        for p in allPhrases:
            psplit = p.split()
            psplit.reverse()
            revSplitPhrases.append(psplit)
        self.allPhrases = revSplitPhrases
        self.lag1 = ""
        self.lag2 = ""
        self.phraseStack = []
    def incrementToken(self):
        # emit any queued phrase tokens before pulling more from the input
        if len(self.phraseStack) > 0:
            syn = self.phraseStack.pop()
            self.restoreState(syn)
            return True
        if not self.inStream.incrementToken():
            return False
        for phrase in self.allPhrases:
            addPhrase = False
            lag0 = self.termAttr.term()
            print "checking: ", self.termAttr.term()
            print phrase
            if len(phrase) == 2:
                if self.lag1 == phrase[1] and lag0 == phrase[0]:
                    print "matched!"
                    addPhrase = True
            if len(phrase) == 3:
                if self.lag2 == phrase[2] and self.lag1 == phrase[1] and lag0 == phrase[0]:
                    print "matched!"
                    addPhrase = True
            if addPhrase:
                # copy before reversing, so the stored (reversed) phrase is
                # not mutated and can still match later tokens
                rPhrase = list(phrase)
                rPhrase.reverse()
                self.addPhrase(" ".join(rPhrase))
        self.lag2 = self.lag1
        self.lag1 = self.termAttr.term()
        print "lag1: ", self.lag1
        print "lag2: ", self.lag2
        return True
    def addPhrase(self, arg):
        print "adding phrase", arg
        current = self.captureState()
        self.save.restoreState(current)
        AnalyzerUtils.setTerm(self.save, arg)
        AnalyzerUtils.setType(self.save, self.TOKEN_TYPE_PHRASE)
        AnalyzerUtils.setPositionIncrement(self.save, 0)
        self.phraseStack.append(self.save.captureState())
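
# Rough sketch of the intended behaviour (assumed, not verified here): with
# allPhrases = ["buffalo wing"], a token stream containing "... buffalo wing ..."
# should emit an extra token "buffalo wing" (type PHRASE, position increment 0)
# right after "wing", so the phrase token shares that word's position.
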
class PorterStemmerAnalyzer(PythonAnalyzer):

    def setPhrases(self, myPhrases):
        self.myPhrases = myPhrases

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        result = StandardFilter(result)
        #result = ShingleFilter(result)
        #result.setOutputUnigrams(True)
        result = LowerCaseFilter(result)
        result = PorterStemFilter(result)
        result = PhraseFilter(result, self.myPhrases)
        result = StopFilter(True, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return result
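
# Note (my reading of the chain above, not verified): PhraseFilter sits after
# PorterStemFilter, so the user phrases must be stemmed first (see stemPhrases
# below), and before StopFilter, so a stop word like "over" can still complete
# a phrase such as "wing over boston" before it is dropped from the stream.
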
class FooAnalyzer(PythonAnalyzer):

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        #result = StandardFilter(result)
        #result = ShingleFilter(result)
        #result.setOutputUnigrams(True)
        #result = LowerCaseFilter(result)
        #result = PorterStemFilter(result)
        #result = StopFilter(True, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return result
class QueryAnalyzer(PythonAnalyzer):

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        result = StandardFilter(result)
        result = LowerCaseFilter(result)
        result = PorterStemFilter(result)
        return result
if __name__ == '__main__':
    initVM()

    def stemPhrases(allPhrases, analyzer):
        '''
        We need to Porter-stem the phrases in the query so that they will
        match the stemmed versions in the index. Added benefit: if you are
        looking for the exact phrase 'buffalo wing' you will also get
        'buffalo wings'. Bug or feature? You decide!
        Sample call: stemPhrases(allPhrases, QueryAnalyzer)
        '''
        stemmedPhrases = []
        for p in allPhrases:
            query = QueryParser(Version.LUCENE_CURRENT, "removeme",
                                analyzer(Version.LUCENE_CURRENT)).parse('"' + p + '"')
            stemmedQuery = query.toString()
            stemmedPhrase = stemmedQuery.replace('removeme:', '').replace('"', '')
            stemmedPhrases.append(stemmedPhrase)
        return stemmedPhrases
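
    # For example (expected, not tested here): stemPhrases(["buffalo wings"],
    # QueryAnalyzer) should return ["buffalo wing"], since the PorterStemFilter
    # in QueryAnalyzer strips the plural "s".
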
    directory = RAMDirectory()

    allPhrases = ["buffalo wings", "sweet deals", "wings over boston"]
    stemmedPhrases = stemPhrases(allPhrases, QueryAnalyzer)

    psa = PorterStemmerAnalyzer(Version.LUCENE_CURRENT)
    psa.setPhrases(stemmedPhrases)

    iwriter = IndexWriter(directory,
                          psa,
                          True, IndexWriter.MaxFieldLength.LIMITED)

    ts = ["The buffalo wings over boston fly like an eagle",
          "The buffalo wings are better in Buffalo",
          "What kind of buffalo has wings?",
          "this bernhard is the text to be index text",
          "put some text in the buffer",
          "this claudia is the text to be index"]

    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES,
                      Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)

    iwriter.optimize()
    iwriter.close()
    ireader = IndexReader.open(directory, True)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

    # Print out all of the tokens, so that we can verify that
    # our analysis chain is successful
    for i, (t, f) in enumerate(zip(tpv.getTerms(), tpv.getTermFrequencies())):
        print 'term: %s' % t
        print ' freq: %i' % f
        try:
            print ' pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print ' no pos'
        try:
            print ' off: ' + \
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tpv.getOffsets(i)])
        except:
            print ' no offsets'
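
    # What to expect (hedged, not verified output): besides the unigram terms,
    # document 0 ("The buffalo wings over boston fly like an eagle") should
    # show the added phrase tokens "buffalo wing" and "wing over boston" in its
    # term vector, at the same positions as the words that completed them.
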
    # searcher = IndexSearcher(directory, True)
    # reader = IndexReader.open(directory, True)
    # analyzer = psa
    # command = 'wings'
    # query = QueryParser(Version.LUCENE_CURRENT, "fieldname",
    #                     analyzer).parse(command)
    # print "query:", query
    # scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)
    # for scoreDoc in scoreDocs:
    #     doc = searcher.doc(scoreDoc.doc)
    #     vector = reader.getTermFreqVector(scoreDoc.doc, "fieldname")
    #     print vector.getTerms()