-
-
Save eliasah/5ad6b48fb60351339db36acac17d1963 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.LinkedList; | |
import java.util.List; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.en.EnglishAnalyzer; | |
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
import org.apache.lucene.analysis.util.CharArraySet; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.TextField; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.index.IndexWriterConfig; | |
import org.apache.lucene.queryparser.classic.ParseException; | |
import org.apache.lucene.queryparser.classic.QueryParser; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.ScoreDoc; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.search.similarities.BM25Similarity; | |
import org.apache.lucene.search.similarities.ClassicSimilarity; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.lucene.document.Field; | |
public class testIndexer { | |
Directory corpus = new RAMDirectory(); | |
String analyzer = "bm25"; | |
boolean stopwords = false; | |
boolean stemmer = true; | |
public static void main(String[] args) throws IOException { | |
List<String> docs = new ArrayList<String>(); | |
docs.add("This is a document"); | |
docs.add("Search for this document, it's longer and has more document."); | |
docs.add("This one is longer still but it only has the one document contained within it's bounds."); | |
testIndexer indexer = new testIndexer(); | |
IndexWriterConfig cf = indexer.index(docs); | |
//Testing standard search (for "document") | |
List<String> results = indexer.search("document", cf); | |
for (String result : results) { | |
System.out.println(result); | |
} | |
//Testing stopword search | |
results = indexer.search("this", cf); | |
for (String result : results) { | |
System.out.println(result); | |
} | |
} | |
public IndexWriterConfig index(List<String> docs) throws IOException | |
{ | |
Analyzer analyz; | |
IndexWriterConfig config; | |
if (analyzer.equals("vsm") && stopwords && stemmer) | |
{ | |
//VSM cosine similarity with TFIDF + stopwords + stemmer | |
CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet(); | |
analyz = new EnglishAnalyzer(stopWords); | |
config = new IndexWriterConfig(analyz); | |
config.setSimilarity(new ClassicSimilarity()); | |
} | |
else if (analyzer.equals("vsm") && !stopwords && stemmer) | |
{ | |
//VSM cosine similarity with TFIDF - stopwords + stemmer | |
analyz = new EnglishAnalyzer(CharArraySet.EMPTY_SET); | |
config = new IndexWriterConfig(analyz); | |
config.setSimilarity(new ClassicSimilarity()); | |
} | |
else if (analyzer.equals("vsm") && stopwords && !stemmer) | |
{ | |
//VSM cosine similarity with TFIDF - stopwords - stemmer | |
CharArraySet stopWords = StandardAnalyzer.STOP_WORDS_SET; | |
analyz = new StandardAnalyzer(stopWords); | |
config = new IndexWriterConfig(analyz); | |
config.setSimilarity(new ClassicSimilarity()); | |
} | |
else if (analyzer.equals("bm25") && stopwords && stemmer) | |
{ | |
//Analyzer + stopwords + stemmer | |
CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet(); | |
analyz = new EnglishAnalyzer(stopWords); | |
config = new IndexWriterConfig(analyz); | |
//BM25 ranking method | |
config.setSimilarity(new BM25Similarity()); | |
} | |
else if (analyzer.equals("bm25") && !stopwords && stemmer) | |
{ | |
//Analyzer - stopwords + stemmer | |
analyz = new EnglishAnalyzer(CharArraySet.EMPTY_SET); | |
config = new IndexWriterConfig(analyz); | |
//BM25 ranking method | |
config.setSimilarity(new BM25Similarity()); | |
} | |
else if (analyzer.equals("bm25") && stopwords && !stemmer) | |
{ | |
//Analyzer + stopwords - stemmer | |
CharArraySet stopWords = StandardAnalyzer.STOP_WORDS_SET; | |
analyz = new StandardAnalyzer(stopWords); | |
config = new IndexWriterConfig(analyz); | |
//BM25 ranking method | |
config.setSimilarity(new BM25Similarity()); | |
} | |
else | |
{ | |
//some default | |
analyz = new StandardAnalyzer(); | |
config = new IndexWriterConfig(analyz); | |
config.setSimilarity(new ClassicSimilarity()); | |
} | |
IndexWriter w = new IndexWriter(corpus, config); | |
int doccount = 0; | |
for (String doc1 : docs) { | |
Document doc = new Document(); | |
doc.add(new TextField("title", "doc" + doccount, Field.Store.YES)); | |
doc.add(new TextField("text", doc1, Field.Store.YES)); | |
w.addDocument(doc); | |
doccount++; | |
} | |
w.close(); | |
return config; | |
} | |
public List<String> search(String searchQuery, IndexWriterConfig cf) throws IOException { | |
QueryParser qp = new QueryParser("text", cf.getAnalyzer()); | |
Query stemmedQuery = null; | |
try { | |
stemmedQuery = qp.parse(searchQuery); | |
} catch (ParseException e) { | |
e.printStackTrace(); | |
} | |
IndexReader reader = DirectoryReader.open(corpus); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
searcher.setSimilarity(cf.getSimilarity()); | |
TopDocs docs = searcher.search(stemmedQuery, 10); | |
ScoreDoc[] scored = docs.scoreDocs; | |
List<String> results = new LinkedList<String>(); | |
for (ScoreDoc aDoc : scored) { | |
Document d = searcher.doc(aDoc.doc); | |
results.add("+ " + d.get("title") + " | text: " + d.get("text") + " | score: " + aDoc.score); | |
} | |
return results; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment