Skip to content

Instantly share code, notes, and snippets.

@arjones
Created April 19, 2012 02:37
Show Gist options
  • Save arjones/2417963 to your computer and use it in GitHub Desktop.
Playing with Lucene QueryParser and Query classes
// sbt dependencies: scalaz provides the Levenshtein distance used for fuzzy
// matching; Lucene 3.5 provides the query parser and text analyzers.
libraryDependencies += "org.scalaz" %% "scalaz-core" % "6.0.3"
libraryDependencies += "org.apache.lucene" % "lucene-core" % "3.5.0"
libraryDependencies += "org.apache.lucene" % "lucene-analyzers" % "3.5.0"
import org.apache.lucene.queryParser.QueryParser
import org.apache.lucene.search._
import org.apache.lucene.analysis._
import org.apache.lucene.analysis.en.EnglishAnalyzer
import org.apache.lucene.analysis.tokenattributes._
import org.apache.lucene.util.Version.LUCENE_35
/**
 * Wraps a Lucene [[Analyzer]] and turns query strings into matchers that are
 * evaluated directly against analyzed text, without building a Lucene index.
 */
case class TextQuery(analyzer: Analyzer) {

  // The field name is irrelevant here: matching happens against analyzed
  // tokens, never against an indexed field.
  val parser = new QueryParser(LUCENE_35, "dummyfield", analyzer)

  /**
   * Parses a Lucene query string (e.g. `a AND b`, `ca??`, `fuzzy~`) and
   * returns a [[QueryMatcher]] for it. Prints the parsed query for debugging.
   */
  def fromString(searchTerms: String) = {
    val q = parser.parse(searchTerms)
    printf("%nquery[%s]%n", q)
    QueryMatcher(q)
  }

  // Transform a string into a vector of analyzed terms.
  private def analyzedString(s: String): Vector[String] = {
    val reader = new java.io.StringReader(s)
    // Field name is unused by the analyzers exercised here (null is tolerated).
    val stream = analyzer.tokenStream(null, reader)
    val attr = stream.getAttribute(
      classOf[CharTermAttribute]).asInstanceOf[CharTermAttribute]
    // Lucene TokenStream contract: reset, incrementToken loop, end, close.
    stream.reset()
    val res = new collection.immutable.VectorBuilder[String]
    while (stream.incrementToken()) {
      res += attr.toString
    }
    stream.end()
    stream.close()
    res.result()
  }

  /**
   * Evaluates a parsed [[Query]] against a piece of text by analyzing the
   * text with the enclosing analyzer and interpreting the query tree.
   */
  case class QueryMatcher(query: Query) {
    /** Returns true when `text` satisfies the query. */
    def match_?(text: String): Boolean = {
      assert(query.toString != "", "empty query")
      val tokenized = analyzedString(text)
      query match {
        case q: TermQuery =>
          tokenized.contains(q.getTerm.text)
        case q: FuzzyQuery =>
          // Levenshtein edit distance via scalaz's <---> operator, normalized
          // by term length and compared against the query's minimum similarity.
          import scalaz._; import Scalaz._
          val term = q.getTerm.text.toArray
          tokenized.exists { t =>
            val diff = (t.toArray <---> term).toFloat / term.length
            1 - diff > q.getMinSimilarity
          }
        case q: BooleanQuery =>
          import org.apache.lucene.search.BooleanClause.Occur._
          val clauses = q.getClauses
          // MUST: every required clause matches (vacuously true when none).
          val must = clauses.filter(_.getOccur == MUST).forall { c =>
            QueryMatcher(c.getQuery).match_?(text)
          }
          // MUST_NOT: no prohibited clause matches.
          val mustNot = clauses.filter(_.getOccur == MUST_NOT).forall { c =>
            !QueryMatcher(c.getQuery).match_?(text)
          }
          // SHOULD: vacuously true when absent, otherwise at least one matches.
          val should = clauses.filter(_.getOccur == SHOULD) match {
            case cs if cs.isEmpty => true
            case cs => cs.exists(c => QueryMatcher(c.getQuery).match_?(text))
          }
          must && mustNot && should
        case q: WildcardQuery =>
          // Translate Lucene wildcards into a full-match regex: ? -> one char,
          // * -> one or more chars.
          // NOTE(review): Lucene's * means *zero* or more, so `m*d` here will
          // not match "md" as real Lucene would — confirm whether intentional.
          val regex = "^" + q.getTerm.text.replaceAll("\\?", ".").replaceAll("\\*", ".+") + "$"
          tokenized.exists(_.matches(regex))
        case q: PrefixQuery =>
          tokenized.exists(_.startsWith(q.getPrefix.text))
        case q: PhraseQuery =>
          // Phrase matches when its terms appear consecutively, in order.
          val phrase = q.getTerms.map(_.text)
          tokenized containsSlice phrase
        case q: MultiPhraseQuery =>
          sys.error("not supported - requires lucene index")
        case q: TermRangeQuery =>
          sys.error("not supported - requires notion of field")
        case q: NumericRangeQuery[_] =>
          sys.error("not supported")
        case _ =>
          // BUG FIX: the original wrote `false` followed by sys.error, so the
          // `false` was dead code and the error was always thrown. The dead
          // expression is removed; unknown query types still fail loudly.
          sys.error("not supported " + query.getClass)
      }
    }
  }
}
object L {
  /**
   * Smoke tests: exercises each supported query type (boolean, wildcard,
   * prefix, fuzzy, phrase) against in-memory text with a whitespace analyzer,
   * plus one stemming check with the English analyzer.
   */
  def main(args: Array[String]): Unit = { // explicit Unit: procedure syntax is deprecated
    val analyzer = new WhitespaceAnalyzer(LUCENE_35)
    val textQuery = TextQuery(analyzer)
    assert(textQuery.fromString(""""to be" OR (NOT "to be")""").match_?("to be"))
    assert(textQuery.fromString("-test").match_?("some string of text"))
    assert(textQuery.fromString("NOT test").match_?("some string of text"))
    assert(textQuery.fromString("a AND b").match_?("b a"))
    assert(!textQuery.fromString("a AND b").match_?("a c"))
    assert(!textQuery.fromString("a AND (b AND NOT (c OR d))").match_?("d a b"))
    assert(!textQuery.fromString("a AND (b AND NOT (c OR d))").match_?("b"))
    assert(textQuery.fromString("a AND (b AND NOT (c OR d))").match_?("a b cdefg"))
    assert(textQuery.fromString("a*").match_?("adf"))
    // QueryParser does not support wildcard at beginning of word
    assert(textQuery.fromString("ca??").match_?("cart"))
    assert(!textQuery.fromString("ca??").match_?("carts"))
    assert(!textQuery.fromString("ca??").match_?("cam"))
    assert(textQuery.fromString("m*d").match_?("mood"))
    assert(textQuery.fromString("fuzzy~").match_?("fuzzer"))
    assert(!textQuery.fromString("fred~").match_?("fuzzer"))
    // English analyzer stems both query and text: "lilly" matches "Lillies".
    assert(TextQuery(new EnglishAnalyzer(LUCENE_35))
      .fromString("lilly").match_?("Lillies"))
  }
}
// vim:set ts=2 sw=2 et:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment