Created
April 19, 2012 02:37
-
-
Save arjones/2417963 to your computer and use it in GitHub Desktop.
Playing with Lucene QueryParser and Query classes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// sbt dependencies: scalaz 6 (provides the <---> Levenshtein-distance operator
// used for fuzzy matching) and Lucene 3.5 core + analyzers.
libraryDependencies += "org.scalaz" %% "scalaz-core" % "6.0.3"

libraryDependencies += "org.apache.lucene" % "lucene-core" % "3.5.0"

libraryDependencies += "org.apache.lucene" % "lucene-analyzers" % "3.5.0"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.queryParser.QueryParser | |
import org.apache.lucene.search._ | |
import org.apache.lucene.analysis._ | |
import org.apache.lucene.analysis.en.EnglishAnalyzer | |
import org.apache.lucene.analysis.tokenattributes._ | |
import org.apache.lucene.util.Version.LUCENE_35 | |
/** Evaluates Lucene queries directly against raw strings, without building an
  * index. The text is run through `analyzer` and the resulting token stream is
  * matched structurally against the parsed [[Query]] tree.
  *
  * @param analyzer the Lucene analyzer used both for parsing queries and for
  *                 tokenizing the text being matched
  */
case class TextQuery(analyzer: Analyzer) {
  // The field name is irrelevant here: matching is done against analyzed
  // tokens, not against indexed fields, so any placeholder works.
  val parser = new QueryParser(LUCENE_35, "dummyfield", analyzer)

  /** Parses a Lucene query string (e.g. `"a AND b"`, `"fuzzy~"`, `"ca??"`)
    * into a [[QueryMatcher]]. The parsed query is echoed to stdout for
    * debugging.
    */
  def fromString(searchTerms: String) = {
    val q = parser.parse(searchTerms)
    printf("%nquery[%s]%n", q)
    QueryMatcher(q)
  }

  /** Transforms a string into the vector of terms produced by `analyzer`
    * (lowercasing, stemming, stop-word removal, ... depending on the analyzer).
    */
  private def analyzedString(s: String): Vector[String] = {
    val reader = new java.io.StringReader(s)
    val stream = analyzer.tokenStream(null, reader)
    // CharTermAttribute is mutated in place on each incrementToken() call,
    // hence the toString snapshot inside the loop below.
    val attr = stream.getAttribute(
      classOf[CharTermAttribute]).asInstanceOf[CharTermAttribute]
    stream.reset()
    val res = new collection.immutable.VectorBuilder[String]
    while (stream.incrementToken()) {
      res += attr.toString
    }
    stream.end()
    stream.close()
    res.result()
  }

  /** Matches an already-parsed [[Query]] against arbitrary text. */
  case class QueryMatcher(query: Query) {
    /** @return true when `text`, after analysis, satisfies `query`.
      * @throws RuntimeException for query types that require an index or a
      *         notion of fields (MultiPhrase, ranges) or are otherwise
      *         unsupported.
      */
    def match_?(text: String): Boolean = {
      assert(query.toString != "", "empty query")
      val tokenized = analyzedString(text)
      query match {
        case q: TermQuery =>
          tokenized.contains(q.getTerm.text)
        case q: FuzzyQuery =>
          import scalaz._; import Scalaz._
          val term = q.getTerm.text.toArray
          tokenized.exists { t =>
            // <---> is scalaz's Levenshtein edit distance; normalize by the
            // query term length to get a similarity in [0, 1].
            val diff = (t.toArray <---> term).toFloat / term.length
            1 - diff > q.getMinSimilarity
          }
        case q: BooleanQuery =>
          import org.apache.lucene.search.BooleanClause.Occur._
          val clauses = q.getClauses
          val must = clauses.filter(_.getOccur == MUST).forall { c =>
            QueryMatcher(c.getQuery).match_?(text)
          }
          val mustNot = clauses.filter(_.getOccur == MUST_NOT).forall { c =>
            !QueryMatcher(c.getQuery).match_?(text)
          }
          // SHOULD is vacuously satisfied when absent; otherwise at least one
          // optional clause must match.
          val should = clauses.filter(_.getOccur == SHOULD) match {
            case cs if cs.isEmpty => true
            case cs => cs.exists(c => QueryMatcher(c.getQuery).match_?(text))
          }
          must && mustNot && should
        case q: WildcardQuery =>
          // Lucene wildcard semantics: '?' = exactly one character,
          // '*' = zero or more characters (the original used ".+", which
          // wrongly required at least one character for '*').
          val pattern = "^" + q.getTerm.text.replaceAll("\\?", ".").replaceAll("\\*", ".*") + "$"
          tokenized.exists(_.matches(pattern))
        case q: PrefixQuery =>
          tokenized.exists(_.startsWith(q.getPrefix.text))
        case q: PhraseQuery =>
          // Phrase match = the analyzed phrase appears as a contiguous
          // subsequence of the analyzed text.
          val phrase = q.getTerms.map(_.text)
          tokenized containsSlice phrase
        case q: MultiPhraseQuery =>
          sys.error("not supported - requires lucene index")
        case q: TermRangeQuery =>
          sys.error("not supported - requires notion of field")
        case q: NumericRangeQuery[_] =>
          sys.error("not supported")
        case _ =>
          // Fail loudly on any other query type. (The original had a dead
          // `false` value immediately before this call, so the arm already
          // always threw — the literal was unreachable as a result.)
          sys.error("not supported " + query.getClass)
      }
    }
  }
}
/** Smoke tests for [[TextQuery]], exercising boolean, wildcard, prefix,
  * phrase, and fuzzy queries against a whitespace analyzer, plus one stemmed
  * match via the English analyzer.
  */
object L {
  // Explicit `: Unit =` — procedure syntax (`def main(...) { ... }`) is
  // deprecated and removed in later Scala versions.
  def main(args: Array[String]): Unit = {
    val analyzer = new WhitespaceAnalyzer(LUCENE_35)
    val textQuery = TextQuery(analyzer)
    // Boolean composition: a clause and its negation — the phrase side wins.
    assert(textQuery.fromString(""""to be" OR (NOT "to be")""").match_?("to be"))
    // Pure negation: matches any text that lacks the term.
    assert(textQuery.fromString("-test").match_?("some string of text"))
    assert(textQuery.fromString("NOT test").match_?("some string of text"))
    // Conjunction is order-independent.
    assert(textQuery.fromString("a AND b").match_?("b a"))
    assert(!textQuery.fromString("a AND b").match_?("a c"))
    // Nested boolean trees with negated disjunctions.
    assert(!textQuery.fromString("a AND (b AND NOT (c OR d))").match_?("d a b"))
    assert(!textQuery.fromString("a AND (b AND NOT (c OR d))").match_?("b"))
    assert(textQuery.fromString("a AND (b AND NOT (c OR d))").match_?("a b cdefg"))
    // Trailing '*' parses as a PrefixQuery.
    assert(textQuery.fromString("a*").match_?("adf"))
    // QueryParser does not support wildcard at beginning of word
    assert(textQuery.fromString("ca??").match_?("cart"))
    assert(!textQuery.fromString("ca??").match_?("carts"))
    assert(!textQuery.fromString("ca??").match_?("cam"))
    // Interior '*' parses as a WildcardQuery.
    assert(textQuery.fromString("m*d").match_?("mood"))
    // Fuzzy (edit-distance based) matching.
    assert(textQuery.fromString("fuzzy~").match_?("fuzzer"))
    assert(!textQuery.fromString("fred~").match_?("fuzzer"))
    // Stemming analyzer: "lilly" matches the plural "Lillies".
    assert(TextQuery(new EnglishAnalyzer(LUCENE_35))
      .fromString("lilly").match_?("Lillies"))
  }
}
// vim:set ts=2 sw=2 et: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment