# alexstorer, August 24, 2012
# For PyLucene!
# This file demonstrates how to...
# 1) Make a new analyzer
# 2) Make a new filter
# 3) Apply an analyzer chain to a string (via a query)
# 4) Include phrases as tokens
from lucene import *
class AnalyzerUtils(object):

    def main(cls, argv):
        print "SimpleAnalyzer"
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")
        print "\n----"
        print "StandardAnalyzer"
        cls.displayTokensWithFullDetails(StandardAnalyzer(Version.LUCENE_CURRENT),
                                         "I'll e-mail you at [email protected]")

    def setPositionIncrement(cls, source, posIncr):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        attr.setPositionIncrement(posIncr)

    def getPositionIncrement(cls, source):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        return attr.getPositionIncrement()

    def setTerm(cls, source, term):
        attr = source.addAttribute(TermAttribute.class_)
        attr.setTermBuffer(term)

    def getTerm(cls, source):
        attr = source.addAttribute(TermAttribute.class_)
        return attr.term()

    def setType(cls, source, type):
        attr = source.addAttribute(TypeAttribute.class_)
        attr.setType(type)

    def getType(cls, source):
        attr = source.addAttribute(TypeAttribute.class_)
        return attr.type()
    def displayTokens(cls, analyzer, text):
        tokenStream = analyzer.tokenStream("contents", StringReader(text))
        term = tokenStream.addAttribute(TermAttribute.class_)
        while tokenStream.incrementToken():
            print "[%s]" % (term.term()),

    def displayTokensWithPositions(cls, analyzer, text):
        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" % (position),
            print "[%s]" % (term.term()),
        print

    def displayTokensWithFullDetails(cls, analyzer, text):
        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        offset = stream.addAttribute(OffsetAttribute.class_)
        type = stream.addAttribute(TypeAttribute.class_)
        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" % (position),
            print "[%s:%d->%d:%s]" % (term.term(),
                                      offset.startOffset(),
                                      offset.endOffset(),
                                      type.type()),
        print
    def assertAnalyzesTo(cls, analyzer, input, outputs):
        stream = analyzer.tokenStream("field", StringReader(input))
        termAttr = stream.addAttribute(TermAttribute.class_)
        for output in outputs:
            if not stream.incrementToken():
                raise AssertionError, 'stream.incrementToken()'
            if output != termAttr.term():
                raise AssertionError, 'output == termAttr.term()'
        if stream.incrementToken():
            raise AssertionError, 'not stream.incrementToken()'
        stream.close()
    main = classmethod(main)
    setPositionIncrement = classmethod(setPositionIncrement)
    getPositionIncrement = classmethod(getPositionIncrement)
    setTerm = classmethod(setTerm)
    getTerm = classmethod(getTerm)
    setType = classmethod(setType)
    getType = classmethod(getType)
    displayTokens = classmethod(displayTokens)
    displayTokensWithPositions = classmethod(displayTokensWithPositions)
    displayTokensWithFullDetails = classmethod(displayTokensWithFullDetails)
    assertAnalyzesTo = classmethod(assertAnalyzesTo)
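
# A quick, untested sketch of how the helpers above might be used to inspect
# an analyzer chain (AnalyzerUtils.main does the same for two stock analyzers):
#
#   AnalyzerUtils.displayTokensWithFullDetails(
#       StandardAnalyzer(Version.LUCENE_CURRENT), "buffalo wings over boston")
#
# which prints each token with its position, start/end offsets and type.
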
class PhraseFilter(PythonTokenFilter):
    '''
    PhraseFilter is a TokenFilter that adds in phrases (as tokens) that match
    user-defined phrases. You can then use these tokens when exporting a TDM.
    '''
    TOKEN_TYPE_SYNONYM = "SYNONYM"
    TOKEN_TYPE_PHRASE = "PHRASE"

    def __init__(self, inStream, allPhrases):
        super(PhraseFilter, self).__init__(inStream)
        self.synonymStack = []
        self.termAttr = self.addAttribute(TermAttribute.class_)
        self.save = inStream.cloneAttributes()
        self.inStream = inStream
        # store each phrase as a reversed list of words, so that phrase[0] is
        # the last word, phrase[1] the one before it, and so on
        revSplitPhrases = []
        for p in allPhrases:
            psplit = p.split()
            psplit.reverse()
            revSplitPhrases.append(psplit)
        self.allPhrases = revSplitPhrases
        self.lag1 = ""
        self.lag2 = ""
        self.phraseStack = []
    def incrementToken(self):
        # emit any queued phrase tokens before pulling more from the input
        if len(self.phraseStack) > 0:
            syn = self.phraseStack.pop()
            self.restoreState(syn)
            return True
        if not self.inStream.incrementToken():
            return False
        for phrase in self.allPhrases:
            addPhrase = False
            lag0 = self.termAttr.term()
            print "checking: ", self.termAttr.term()
            print phrase
            if len(phrase) == 2:
                if self.lag1 == phrase[1] and lag0 == phrase[0]:
                    print "matched!"
                    addPhrase = True
            if len(phrase) == 3:
                if self.lag2 == phrase[2] and self.lag1 == phrase[1] and lag0 == phrase[0]:
                    print "matched!"
                    addPhrase = True
            if addPhrase:
                # copy before reversing, so the stored (reversed) phrase is
                # not mutated and can still match later tokens
                rPhrase = list(phrase)
                rPhrase.reverse()
                self.addPhrase(" ".join(rPhrase))
        self.lag2 = self.lag1
        self.lag1 = self.termAttr.term()
        print "lag1: ", self.lag1
        print "lag2: ", self.lag2
        return True
    def addPhrase(self, arg):
        print "adding phrase", arg
        current = self.captureState()
        self.save.restoreState(current)
        AnalyzerUtils.setTerm(self.save, arg)
        AnalyzerUtils.setType(self.save, self.TOKEN_TYPE_PHRASE)
        AnalyzerUtils.setPositionIncrement(self.save, 0)
        self.phraseStack.append(self.save.captureState())
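
# Rough sketch of the intended behaviour (assumed, not verified here): with
# allPhrases = ["buffalo wing"], a token stream containing "... buffalo wing ..."
# should emit an extra token "buffalo wing" (type PHRASE, position increment 0)
# right after "wing", so the phrase token shares that word's position.
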
class PorterStemmerAnalyzer(PythonAnalyzer):

    def setPhrases(self, myPhrases):
        self.myPhrases = myPhrases

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        result = StandardFilter(result)
        #result = ShingleFilter(result)
        #result.setOutputUnigrams(True)
        result = LowerCaseFilter(result)
        result = PorterStemFilter(result)
        result = PhraseFilter(result, self.myPhrases)
        result = StopFilter(True, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return result
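
# Note (my reading of the chain above, not verified): PhraseFilter sits after
# PorterStemFilter, so the user phrases must be stemmed first (see stemPhrases
# below), and before StopFilter, so a stop word like "over" can still complete
# a phrase such as "wing over boston" before it is dropped from the stream.
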
class FooAnalyzer(PythonAnalyzer):

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        #result = StandardFilter(result)
        #result = ShingleFilter(result)
        #result.setOutputUnigrams(True)
        #result = LowerCaseFilter(result)
        #result = PorterStemFilter(result)
        #result = StopFilter(True, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return result
class QueryAnalyzer(PythonAnalyzer):

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        result = StandardFilter(result)
        result = LowerCaseFilter(result)
        result = PorterStemFilter(result)
        return result
if __name__ == '__main__':
    initVM()

    def stemPhrases(allPhrases, analyzer):
        '''
        We need to Porter-stem the phrases in the query so that they will
        match the stemmed versions in the index. Added benefit: if you are
        looking for the exact phrase 'buffalo wing' you will also get
        'buffalo wings'. Bug or feature? You decide!
        Sample call: stemPhrases(allPhrases, QueryAnalyzer)
        '''
        stemmedPhrases = []
        for p in allPhrases:
            query = QueryParser(Version.LUCENE_CURRENT, "removeme",
                                analyzer(Version.LUCENE_CURRENT)).parse('"' + p + '"')
            stemmedQuery = query.toString()
            stemmedPhrase = stemmedQuery.replace('removeme:', '').replace('"', '')
            stemmedPhrases.append(stemmedPhrase)
        return stemmedPhrases
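
    # For example (expected, not tested here): stemPhrases(["buffalo wings"],
    # QueryAnalyzer) should return ["buffalo wing"], since the PorterStemFilter
    # in QueryAnalyzer strips the plural "s".
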
    directory = RAMDirectory()

    allPhrases = ["buffalo wings", "sweet deals", "wings over boston"]
    stemmedPhrases = stemPhrases(allPhrases, QueryAnalyzer)

    psa = PorterStemmerAnalyzer(Version.LUCENE_CURRENT)
    psa.setPhrases(stemmedPhrases)

    iwriter = IndexWriter(directory,
                          psa,
                          True, IndexWriter.MaxFieldLength.LIMITED)

    ts = ["The buffalo wings over boston fly like an eagle",
          "The buffalo wings are better in Buffalo",
          "What kind of buffalo has wings?",
          "this bernhard is the text to be index text",
          "put some text in the buffer",
          "this claudia is the text to be index"]

    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES,
                      Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)

    iwriter.optimize()
    iwriter.close()
    ireader = IndexReader.open(directory, True)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

    # Print out all of the tokens, so that we can verify that
    # our analysis chain is successful
    for i, (t, f) in enumerate(zip(tpv.getTerms(), tpv.getTermFrequencies())):
        print 'term: %s' % t
        print ' freq: %i' % f
        try:
            print ' pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print ' no pos'
        try:
            print ' off: ' + \
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tpv.getOffsets(i)])
        except:
            print ' no offsets'
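
    # What to expect (hedged, not verified output): besides the unigram terms,
    # document 0 ("The buffalo wings over boston fly like an eagle") should
    # show the added phrase tokens "buffalo wing" and "wing over boston" in its
    # term vector, at the same positions as the words that completed them.
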
    # searcher = IndexSearcher(directory, True)
    # reader = IndexReader.open(directory, True)
    # analyzer = psa
    # command = 'wings'
    # query = QueryParser(Version.LUCENE_CURRENT, "fieldname",
    #                     analyzer).parse(command)
    # print "query:", query
    # scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)
    # for scoreDoc in scoreDocs:
    #     doc = searcher.doc(scoreDoc.doc)
    #     vector = reader.getTermFreqVector(scoreDoc.doc, "fieldname")
    #     print vector.getTerms()