Created
December 1, 2015 22:16
-
-
Save washingtonsoares/3317cebee9aa96a00c8b to your computer and use it in GitHub Desktop.
Apache Lucene
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Code inspired by: https://lucene.apache.org/core/quickstart.html
// Adapted for Lucene 5
package pratica15; | |
import java.io.*;
import java.nio.charset.StandardCharsets;

// lucene-5.3.1/analysis/common/lucene-analyzers-common-5.3.1.jar
import org.apache.lucene.analysis.standard.StandardAnalyzer;
// lucene-5.3.1/queryparser/lucene-queryparser-5.3.1.jar
import org.apache.lucene.queryparser.classic.*;
// lucene-5.3.1/core/lucene-core-5.3.1.jar
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;

// export CLASSPATH=".:lucene-5.3.1/analysis/common/lucene-analyzers-common-5.3.1.jar:lucene-5.3.1/core/lucene-core-5.3.1.jar:lucene-5.3.1/queryparser/lucene-queryparser-5.3.1.jar"
public class Ufucene { | |
public static void main(String args[]) throws Exception { | |
// Analisador para processamento e constru��o de vocabul�rio: | |
StandardAnalyzer analyzer = new StandardAnalyzer(); | |
// Tamb�m � poss�vel passar lista de stopwords com: | |
//StandardAnalyzer analyzer = new StandardAnalyzer(new FileReader("stopwords.txt")); | |
// Defini��o do local e tipo de armazenamento do �ndice invertido: | |
// RAMDirectory armazena �ndice invertido em mem�ria principal | |
Directory index = new RAMDirectory(); | |
// Tamb�m � poss�vel guardar em disco, aqui no diret�rio "indiceinvertido/": | |
// Directory index = new SimpleFSDirectory(FileSystems.getDefault().getPath(".","indiceinvertido/")); | |
// Configura��o de um �ndice invertido: | |
IndexWriterConfig config = new IndexWriterConfig(analyzer); | |
System.out.println("carregando arquivo..."); | |
BufferedReader r = new BufferedReader(new FileReader(new File("corpus.txt"))); | |
IndexWriter w = new IndexWriter(index, config); | |
while(r.ready()){ | |
String titulo = r.readLine(); | |
String url = r.readLine(); | |
String conteudo = r.readLine(); | |
addDoc(w, titulo, url, conteudo); | |
} | |
w.close(); | |
// Receber e preparar para atender a uma consulta: | |
String textoConsulta = args.length > 0 ? args[0] : "brazil"; | |
Query q = new MultiFieldQueryParser(new String[] {"title","content"}, analyzer).parse(textoConsulta); | |
// Prepara��o para acessar o �ndice: | |
IndexReader reader = DirectoryReader.open(index); | |
IndexSearcher searcher = new IndexSearcher(reader); | |
// Definir configura��o do tipo de resposta a consultas: | |
int numeroDeResultados= 10; | |
TopScoreDocCollector collector = TopScoreDocCollector.create(numeroDeResultados); | |
// Efetua a consulta | |
searcher.search(q, collector); | |
// Coleta a resposta a uma consulta | |
ScoreDoc[] hits = collector.topDocs().scoreDocs; | |
System.out.println("Achou " + hits.length + " resultados."); | |
// Exibe o resultado � consulta: | |
for(int i=0;i<hits.length;++i) { | |
int docId = hits[i].doc; | |
Document d = searcher.doc(docId); | |
System.out.println((i + 1) + ". " + d.get("url") + "\t" + d.get("title")); | |
} | |
} | |
private static void addDoc(IndexWriter w, String title, String url,String content) throws IOException { | |
Document doc = new Document(); | |
doc.add(new TextField("title", title, Field.Store.YES)); // TextField � tokenizado | |
doc.add(new StringField("url", url, Field.Store.YES)); // StringField N�O � tokenizado | |
doc.add(new TextField("content",content,Field.Store.YES));//conteudo do documento tokenizado | |
w.addDocument(doc); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment