Skip to content

Instantly share code, notes, and snippets.

@washingtonsoares
Created December 1, 2015 22:16
Show Gist options
  • Save washingtonsoares/3317cebee9aa96a00c8b to your computer and use it in GitHub Desktop.
Save washingtonsoares/3317cebee9aa96a00c8b to your computer and use it in GitHub Desktop.
Apache Lucene
// Code inspired by: https://lucene.apache.org/core/quickstart.html
// Adapted for Lucene 5
package pratica15;
import java.io.*;
// lucene-5.3.1/analysis/common/lucene-analyzers-common-5.3.1.jar
import org.apache.lucene.analysis.standard.StandardAnalyzer;
// lucene-5.3.1/queryparser/lucene-queryparser-5.3.1.jar
import org.apache.lucene.queryparser.classic.*;
// lucene-5.3.1/core/lucene-core-5.3.1.jar
import org.apache.lucene.document.* ;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
// export CLASSPATH=".:lucene-5.3.1/analysis/common/lucene-analyzers-common-5.3.1.jar:lucene-5.3.1/core/lucene-core-5.3.1.jar:lucene-5.3.1/queryparser/lucene-queryparser-5.3.1.jar"
/**
 * Small Lucene 5 demo: builds an in-memory inverted index from
 * {@code corpus.txt} and runs a single query against it.
 *
 * <p>The corpus file is read as consecutive groups of three lines:
 * title, URL and content. The query text is taken from the first
 * command-line argument, falling back to {@code "brazil"} when no
 * argument is given. The query is matched against the {@code title} and
 * {@code content} fields, and up to ten hits are printed as
 * {@code <rank>. <url>\t<title>}.
 */
public class Ufucene {

    /**
     * Indexes the corpus, runs the query and prints the hits.
     *
     * @param args optional; {@code args[0]} is the query text
     * @throws Exception if the corpus cannot be read, the index cannot be
     *                   written or read, or the query cannot be parsed
     */
    public static void main(String args[]) throws Exception {
        // Analyzer used both to build the vocabulary at index time and to
        // process the query text at search time.
        // A stop-word list can also be supplied, e.g.:
        //   new StandardAnalyzer(new FileReader("stopwords.txt"))
        //
        // RAMDirectory keeps the inverted index in main memory.
        // It can also be stored on disk, e.g. under "indiceinvertido/":
        //   new SimpleFSDirectory(FileSystems.getDefault().getPath(".", "indiceinvertido/"))
        //
        // try-with-resources guarantees every closeable is released even
        // when indexing or searching fails.
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
             Directory index = new RAMDirectory()) {

            // Configuration of the inverted index.
            IndexWriterConfig config = new IndexWriterConfig(analyzer);

            System.out.println("carregando arquivo...");
            // The writer is closed at the end of this inner block, i.e.
            // before the index is opened for reading below.
            try (BufferedReader r = new BufferedReader(new FileReader(new File("corpus.txt")));
                 IndexWriter w = new IndexWriter(index, config)) {
                indexCorpus(r, w);
            }

            // Receive the query text and prepare the query.
            String textoConsulta = args.length > 0 ? args[0] : "brazil";
            Query q = new MultiFieldQueryParser(new String[] {"title", "content"}, analyzer)
                    .parse(textoConsulta);

            // Open the index for searching.
            try (IndexReader reader = DirectoryReader.open(index)) {
                IndexSearcher searcher = new IndexSearcher(reader);

                // Maximum number of hits to collect for the query.
                int numeroDeResultados = 10;
                TopScoreDocCollector collector = TopScoreDocCollector.create(numeroDeResultados);

                // Run the query.
                searcher.search(q, collector);

                // Collect the hits.
                ScoreDoc[] hits = collector.topDocs().scoreDocs;
                System.out.println("Achou " + hits.length + " resultados.");

                // Print the hits, numbered from 1.
                for (int i = 0; i < hits.length; ++i) {
                    int docId = hits[i].doc;
                    Document d = searcher.doc(docId);
                    System.out.println((i + 1) + ". " + d.get("url") + "\t" + d.get("title"));
                }
            }
        }
    }

    /**
     * Reads title/URL/content triples from {@code r} and adds one document
     * per triple to {@code w}.
     *
     * <p>Reading stops at end of input. If the input ends in the middle of
     * a triple, the incomplete record is skipped and a warning is written
     * to standard error, instead of passing {@code null} to a field
     * constructor.
     *
     * @param r source of the corpus lines
     * @param w writer that receives the documents
     * @throws IOException if reading the corpus or writing the index fails
     */
    private static void indexCorpus(BufferedReader r, IndexWriter w) throws IOException {
        String titulo;
        // readLine() returning null is the end-of-input signal.
        while ((titulo = r.readLine()) != null) {
            String url = r.readLine();
            String conteudo = r.readLine();
            if (url == null || conteudo == null) {
                System.err.println("Aviso: registro incompleto no final de corpus.txt ignorado.");
                break;
            }
            addDoc(w, titulo, url, conteudo);
        }
    }

    /**
     * Builds a document with the fields {@code title}, {@code url} and
     * {@code content} (all stored) and adds it to the index.
     *
     * @param w       writer that receives the document
     * @param title   document title; indexed as a tokenized text field
     * @param url     document URL; indexed as a single, non-tokenized value
     * @param content document body; indexed as a tokenized text field
     * @throws IOException if the document cannot be added to the index
     */
    private static void addDoc(IndexWriter w, String title, String url, String content) throws IOException {
        Document doc = new Document();
        doc.add(new TextField("title", title, Field.Store.YES));     // TextField is tokenized
        doc.add(new StringField("url", url, Field.Store.YES));       // StringField is NOT tokenized
        doc.add(new TextField("content", content, Field.Store.YES)); // document content, tokenized
        w.addDocument(doc);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment