For PyLucene! This file demonstrates how to: 1) make a new analyzer, 2) make a new filter, 3) apply an analyzer chain to a string (via a query), and 4) include phrases as tokens. A short sketch of how to drive the phrase-aware analyzer by hand follows the listing.
# This file demonstrates how to...
# 1) Make a new analyzer
# 2) Make a new filter
# 3) Apply an analyzer chain to a string (via a query)
# 4) Include phrases as tokens

from lucene import *


class AnalyzerUtils(object):

    def main(cls, argv):
        print "SimpleAnalyzer"
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")
        print "\n----"
        print "StandardAnalyzer"
        cls.displayTokensWithFullDetails(StandardAnalyzer(Version.LUCENE_CURRENT),
                                         "I'll e-mail you at [email protected]")

    def setPositionIncrement(cls, source, posIncr):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        attr.setPositionIncrement(posIncr)

    def getPositionIncrement(cls, source):
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        return attr.getPositionIncrement()

    def setTerm(cls, source, term):
        attr = source.addAttribute(TermAttribute.class_)
        attr.setTermBuffer(term)

    def getTerm(cls, source):
        attr = source.addAttribute(TermAttribute.class_)
        return attr.term()

    def setType(cls, source, type):
        attr = source.addAttribute(TypeAttribute.class_)
        attr.setType(type)

    def getType(cls, source):
        attr = source.addAttribute(TypeAttribute.class_)
        return attr.type()

    def displayTokens(cls, analyzer, text):
        tokenStream = analyzer.tokenStream("contents", StringReader(text))
        term = tokenStream.addAttribute(TermAttribute.class_)
        while tokenStream.incrementToken():
            print "[%s]" % (term.term()),

    def displayTokensWithPositions(cls, analyzer, text):
        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" % (position),
            print "[%s]" % (term.term()),

    def displayTokensWithFullDetails(cls, analyzer, text):
        stream = analyzer.tokenStream("contents", StringReader(text))
        term = stream.addAttribute(TermAttribute.class_)
        posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
        offset = stream.addAttribute(OffsetAttribute.class_)
        type = stream.addAttribute(TypeAttribute.class_)
        position = 0
        while stream.incrementToken():
            increment = posIncr.getPositionIncrement()
            if increment > 0:
                position = position + increment
                print "\n%d:" % (position),
            print "[%s:%d->%d:%s]" % (term.term(),
                                      offset.startOffset(),
                                      offset.endOffset(),
                                      type.type()),

    def assertAnalyzesTo(cls, analyzer, input, outputs):
        stream = analyzer.tokenStream("field", StringReader(input))
        termAttr = stream.addAttribute(TermAttribute.class_)
        for output in outputs:
            if not stream.incrementToken():
                raise AssertionError, 'stream.incrementToken()'
            if output != termAttr.term():
                raise AssertionError, 'output == termAttr.term()'
        if stream.incrementToken():
            raise AssertionError, 'not stream.incrementToken()'
        stream.close()

    main = classmethod(main)
    setPositionIncrement = classmethod(setPositionIncrement)
    getPositionIncrement = classmethod(getPositionIncrement)
    setTerm = classmethod(setTerm)
    getTerm = classmethod(getTerm)
    setType = classmethod(setType)
    getType = classmethod(getType)
    displayTokens = classmethod(displayTokens)
    displayTokensWithPositions = classmethod(displayTokensWithPositions)
    displayTokensWithFullDetails = classmethod(displayTokensWithFullDetails)
    assertAnalyzesTo = classmethod(assertAnalyzesTo)
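

# PhraseFilter (defined next) reuses the AnalyzerUtils attribute helpers above
# to rewrite the term text, token type, and position increment on a cloned
# token state before pushing an injected phrase token back into the stream.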
class PhraseFilter(PythonTokenFilter):
    '''
    PhraseFilter is a TokenFilter that adds in phrases (as tokens) that match
    user-defined phrases. You can then use these tokens when exporting a TDM.
    '''
    TOKEN_TYPE_SYNONYM = "SYNONYM"
    TOKEN_TYPE_PHRASE = "PHRASE"

    def __init__(self, inStream, allPhrases):
        super(PhraseFilter, self).__init__(inStream)
        self.synonymStack = []
        self.termAttr = self.addAttribute(TermAttribute.class_)
        self.save = inStream.cloneAttributes()
        self.inStream = inStream
        revSplitPhrases = []
        for p in allPhrases:
            psplit = p.split()
            psplit.reverse()
            revSplitPhrases.append(psplit)
        self.allPhrases = revSplitPhrases
        self.lag1 = ""
        self.lag2 = ""
        self.phraseStack = []
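
    # Each phrase is stored split into words and reversed, so matching can
    # compare the newest token first: with lag1 holding the previous term and
    # lag2 the one before that, a two-word phrase matches when
    # (current, lag1) == (phrase[0], phrase[1]), and a three-word phrase when
    # (current, lag1, lag2) lines up the same way.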
    def incrementToken(self):
        # emit any queued phrase tokens before pulling more from the stream
        if len(self.phraseStack) > 0:
            syn = self.phraseStack.pop()
            self.restoreState(syn)
            return True

        if not self.inStream.incrementToken():
            return False

        for phrase in self.allPhrases:
            addPhrase = False
            lag0 = self.termAttr.term()
            print "checking: ", self.termAttr.term()
            print phrase
            if len(phrase) == 2:
                if self.lag1 == phrase[1] and lag0 == phrase[0]:
                    print "matched!"
                    addPhrase = True
            if len(phrase) == 3:
                if self.lag2 == phrase[2] and self.lag1 == phrase[1] and lag0 == phrase[0]:
                    print "matched!"
                    addPhrase = True
            if addPhrase:
                # copy before reversing so the stored (reversed) phrase is not
                # mutated and can still match later occurrences
                rPhrase = list(phrase)
                rPhrase.reverse()
                self.addPhrase(" ".join(rPhrase))

        self.lag2 = self.lag1
        self.lag1 = self.termAttr.term()
        print "lag1: ", self.lag1
        print "lag2: ", self.lag2
        return True

    def addPhrase(self, arg):
        print "adding phrase", arg
        # capture the current token's state, overwrite its term and type, and
        # give it a position increment of 0 so the phrase token sits at the
        # same position as the word that completed it
        current = self.captureState()
        self.save.restoreState(current)
        AnalyzerUtils.setTerm(self.save, arg)
        AnalyzerUtils.setType(self.save, self.TOKEN_TYPE_PHRASE)
        AnalyzerUtils.setPositionIncrement(self.save, 0)
        self.phraseStack.append(self.save.captureState())


class PorterStemmerAnalyzer(PythonAnalyzer):

    def setPhrases(self, myPhrases):
        self.myPhrases = myPhrases

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        result = StandardFilter(result)
        #result = ShingleFilter(result)
        #result.setOutputUnigrams(True)
        result = LowerCaseFilter(result)
        result = PorterStemFilter(result)
        # PhraseFilter runs before StopFilter, so it still sees stop words such
        # as "over" and can detect a phrase like "wings over boston"; the
        # injected phrase token itself is not a stop word, so it survives.
        result = PhraseFilter(result, self.myPhrases)
        result = StopFilter(True, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return result


class FooAnalyzer(PythonAnalyzer):

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        #result = StandardFilter(result)
        #result = ShingleFilter(result)
        #result.setOutputUnigrams(True)
        #result = LowerCaseFilter(result)
        #result = PorterStemFilter(result)
        #result = StopFilter(True, result, StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return result


class QueryAnalyzer(PythonAnalyzer):

    def tokenStream(self, fieldName, reader):
        result = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        result = StandardFilter(result)
        result = LowerCaseFilter(result)
        result = PorterStemFilter(result)
        return result
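

# QueryAnalyzer applies the same tokenize / lowercase / Porter-stem chain as
# PorterStemmerAnalyzer (minus the phrase and stop filters), so phrases stemmed
# through it in stemPhrases() below line up with the terms produced at index time.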


if __name__ == '__main__':
    initVM()

    def stemPhrases(allPhrases, analyzer):
        '''
        We need to Porter-stem the phrases in the query so that they will
        match the porterized versions. Added benefit: if you are looking
        for the exact phrase 'buffalo wing' you will also get 'buffalo wings'.
        Bug or feature? You decide!

        sample call: stemPhrases(allPhrases, QueryAnalyzer)
        '''
        stemmedPhrases = []
        for p in allPhrases:
            query = QueryParser(Version.LUCENE_CURRENT, "removeme",
                                analyzer(Version.LUCENE_CURRENT)).parse('"' + p + '"')
            stemmedQuery = query.toString()
            stemmedPhrase = stemmedQuery.replace('removeme:', '').replace('"', '')
            stemmedPhrases.append(stemmedPhrase)
        return stemmedPhrases
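
    # e.g. with the phrases below, stemPhrases should return something close to
    # ["buffalo wing", "sweet deal", "wing over boston"] (the exact strings
    # depend on the Porter stemmer).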

    directory = RAMDirectory()

    allPhrases = ["buffalo wings", "sweet deals", "wings over boston"]
    stemmedPhrases = stemPhrases(allPhrases, QueryAnalyzer)

    psa = PorterStemmerAnalyzer(Version.LUCENE_CURRENT)
    psa.setPhrases(stemmedPhrases)

    iwriter = IndexWriter(directory,
                          psa,
                          True, IndexWriter.MaxFieldLength.LIMITED)

    ts = ["The buffalo wings over boston fly like an eagle",
          "The buffalo wings are better in Buffalo",
          "What kind of buffalo has wings?",
          "this bernhard is the text to be index text",
          "put some text in the buffer",
          "this claudia is the text to be index"]

    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES,
                      Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)

    iwriter.optimize()
    iwriter.close()

    ireader = IndexReader.open(directory, True)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

    # Print out all of the tokens, so that we can verify that
    # our analysis chain is successful
    for (t, f, i) in zip(tpv.getTerms(), tpv.getTermFrequencies(), xrange(100000)):
        print 'term: %s' % t
        print ' freq: %i' % f
        try:
            print ' pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print ' no pos'
        try:
            print ' off: ' + \
                str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                     for o in tpv.getOffsets(i)])
        except:
            print ' no offsets'

    # searcher = IndexSearcher(directory, True)
    # reader = IndexReader.open(directory, True)
    # analyzer = psa
    # command = 'wings'
    # query = QueryParser(Version.LUCENE_CURRENT, "fieldname",
    #                     analyzer).parse(command)
    # print "query:", query
    # scoreDocs = searcher.search(query, reader.maxDoc()).scoreDocs
    # print "%s total matching documents." % len(scoreDocs)
    # for scoreDoc in scoreDocs:
    #     doc = searcher.doc(scoreDoc.doc)
    #     vector = reader.getTermFreqVector(scoreDoc.doc, "fieldname")
    #     print vector.getTerms()
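
To poke at the phrase injection without building an index, something like the following should work after initVM() has been called (a minimal sketch reusing the classes above; the pre-stemmed phrase string is an assumption about what stemPhrases would produce):

analyzer = PorterStemmerAnalyzer(Version.LUCENE_CURRENT)
analyzer.setPhrases(["buffalo wing"])  # already Porter-stemmed, as stemPhrases would produce
# The injected "buffalo wing" token should show up at the same position as
# "wing", since PhraseFilter emits it with a position increment of 0.
AnalyzerUtils.displayTokensWithPositions(analyzer,
                                         "The buffalo wings are better in Buffalo")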