|
package com.github.horitaku1124.java_searcher; |
|
|
|
import java.io.BufferedReader; |
|
import java.io.IOException; |
|
import java.io.InputStream; |
|
import java.io.InputStreamReader; |
|
import java.nio.charset.StandardCharsets; |
|
import java.nio.file.FileVisitResult; |
|
import java.nio.file.Files; |
|
import java.nio.file.Path; |
|
import java.nio.file.Paths; |
|
import java.nio.file.SimpleFileVisitor; |
|
import java.nio.file.attribute.BasicFileAttributes; |
|
import java.util.Date; |
|
|
|
import org.apache.lucene.analysis.Analyzer; |
|
import org.apache.lucene.analysis.standard.StandardAnalyzer; |
|
import org.apache.lucene.document.LongPoint; |
|
import org.apache.lucene.document.Document; |
|
import org.apache.lucene.document.Field; |
|
import org.apache.lucene.document.StringField; |
|
import org.apache.lucene.document.TextField; |
|
import org.apache.lucene.index.IndexWriter; |
|
import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
|
import org.apache.lucene.index.IndexWriterConfig; |
|
import org.apache.lucene.index.Term; |
|
import org.apache.lucene.store.Directory; |
|
import org.apache.lucene.store.FSDirectory; |
|
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; |
|
|
|
|
|
public class IndexFiles { |
|
private IndexFiles() {} |
|
|
|
/** Index all text files under a directory. */ |
|
public static void main(String[] args) { |
|
String usage = "java org.apache.lucene.demo.IndexFiles" |
|
+ " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" |
|
+ "This indexes the documents in DOCS_PATH, creating a Lucene index" |
|
+ "in INDEX_PATH that can be searched with SearchFiles"; |
|
String indexPath = "index"; |
|
String docsPath = null; |
|
boolean create = true; |
|
for(int i=0;i<args.length;i++) { |
|
if ("-index".equals(args[i])) { |
|
indexPath = args[i+1]; |
|
i++; |
|
} else if ("-docs".equals(args[i])) { |
|
docsPath = args[i+1]; |
|
i++; |
|
} else if ("-update".equals(args[i])) { |
|
create = false; |
|
} |
|
} |
|
|
|
if (docsPath == null) { |
|
System.err.println("Usage: " + usage); |
|
System.exit(1); |
|
} |
|
|
|
final Path docDir = Paths.get(docsPath); |
|
if (!Files.isReadable(docDir)) { |
|
System.out.println("Document directory '" +docDir.toAbsolutePath()+ "' does not exist or is not readable, please check the path"); |
|
System.exit(1); |
|
} |
|
|
|
Date start = new Date(); |
|
try { |
|
System.out.println("Indexing to directory '" + indexPath + "'..."); |
|
|
|
Directory dir = FSDirectory.open(Paths.get(indexPath)); |
|
Analyzer analyzer = new StandardAnalyzer(); |
|
IndexWriterConfig iwc = new IndexWriterConfig(analyzer); |
|
|
|
if (create) { |
|
// Create a new index in the directory, removing any |
|
// previously indexed documents: |
|
iwc.setOpenMode(OpenMode.CREATE); |
|
} else { |
|
// Add new documents to an existing index: |
|
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); |
|
} |
|
|
|
// Optional: for better indexing performance, if you |
|
// are indexing many documents, increase the RAM |
|
// buffer. But if you do this, increase the max heap |
|
// size to the JVM (eg add -Xmx512m or -Xmx1g): |
|
// |
|
// iwc.setRAMBufferSizeMB(256.0); |
|
|
|
IndexWriter writer = new IndexWriter(dir, iwc); |
|
indexDocs(writer, docDir); |
|
|
|
// NOTE: if you want to maximize search performance, |
|
// you can optionally call forceMerge here. This can be |
|
// a terribly costly operation, so generally it's only |
|
// worth it when your index is relatively static (ie |
|
// you're done adding documents to it): |
|
// |
|
// writer.forceMerge(1); |
|
|
|
writer.close(); |
|
|
|
Date end = new Date(); |
|
System.out.println(end.getTime() - start.getTime() + " total milliseconds"); |
|
|
|
} catch (IOException e) { |
|
e.printStackTrace(); |
|
} |
|
} |
|
|
|
/** |
|
* Indexes the given file using the given writer, or if a directory is given, |
|
* recurses over files and directories found under the given directory. |
|
* |
|
* NOTE: This method indexes one document per input file. This is slow. For good |
|
* throughput, put multiple documents into your input file(s). An example of this is |
|
* in the benchmark module, which can create "line doc" files, one document per line, |
|
* using the |
|
* <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" |
|
* >WriteLineDocTask</a>. |
|
* |
|
* @param writer Writer to the index where the given file/dir info will be stored |
|
* @param path The file to index, or the directory to recurse into to find files to index |
|
* @throws IOException If there is a low-level I/O error |
|
*/ |
|
static void indexDocs(final IndexWriter writer, Path path) throws IOException { |
|
if (Files.isDirectory(path)) { |
|
Files.walkFileTree(path, new SimpleFileVisitor<Path>() { |
|
@Override |
|
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { |
|
try { |
|
String fileName = file.toAbsolutePath().toString(); |
|
if (fileName.endsWith(".xls") || fileName.endsWith(".xlsx")) { |
|
indexDoc_xls(writer, file, attrs.lastModifiedTime().toMillis()); |
|
} |
|
} catch (Exception e) { |
|
e.printStackTrace(); |
|
// don't index files that can't be read. |
|
} |
|
return FileVisitResult.CONTINUE; |
|
} |
|
}); |
|
} else { |
|
indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis()); |
|
} |
|
} |
|
|
|
static void indexDoc_xls(IndexWriter writer, Path file, long lastModified) throws IOException, InvalidFormatException { |
|
String content = DumpExcel.readFile(file.toString()); |
|
|
|
// make a new, empty document |
|
Document doc = new Document(); |
|
|
|
// Add the path of the file as a field named "path". Use a |
|
// field that is indexed (i.e. searchable), but don't tokenize |
|
// the field into separate words and don't index term frequency |
|
// or positional information: |
|
Field pathField = new StringField("path", file.toString(), Field.Store.YES); |
|
doc.add(pathField); |
|
|
|
// Add the last modified date of the file a field named "modified". |
|
// Use a LongPoint that is indexed (i.e. efficiently filterable with |
|
// PointRangeQuery). This indexes to milli-second resolution, which |
|
// is often too fine. You could instead create a number based on |
|
// year/month/day/hour/minutes/seconds, down the resolution you require. |
|
// For example the long value 2011021714 would mean |
|
// February 17, 2011, 2-3 PM. |
|
doc.add(new LongPoint("modified", lastModified)); |
|
|
|
// Add the contents of the file to a field named "contents". Specify a Reader, |
|
// so that the text of the file is tokenized and indexed, but not stored. |
|
// Note that FileReader expects the file to be in UTF-8 encoding. |
|
// If that's not the case searching for special characters will fail. |
|
doc.add(new TextField("contents", content, Field.Store.YES)); |
|
|
|
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { |
|
// New index, so we just add the document (no old document can be there): |
|
System.out.println("adding " + file); |
|
writer.addDocument(doc); |
|
} else { |
|
// Existing index (an old copy of this document may have been indexed) so |
|
// we use updateDocument instead to replace the old one matching the exact |
|
// path, if present: |
|
System.out.println("updating " + file); |
|
writer.updateDocument(new Term("path", file.toString()), doc); |
|
} |
|
} |
|
/** Indexes a single document */ |
|
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { |
|
try (InputStream stream = Files.newInputStream(file)) { |
|
// make a new, empty document |
|
Document doc = new Document(); |
|
|
|
// Add the path of the file as a field named "path". Use a |
|
// field that is indexed (i.e. searchable), but don't tokenize |
|
// the field into separate words and don't index term frequency |
|
// or positional information: |
|
Field pathField = new StringField("path", file.toString(), Field.Store.YES); |
|
doc.add(pathField); |
|
|
|
// Add the last modified date of the file a field named "modified". |
|
// Use a LongPoint that is indexed (i.e. efficiently filterable with |
|
// PointRangeQuery). This indexes to milli-second resolution, which |
|
// is often too fine. You could instead create a number based on |
|
// year/month/day/hour/minutes/seconds, down the resolution you require. |
|
// For example the long value 2011021714 would mean |
|
// February 17, 2011, 2-3 PM. |
|
doc.add(new LongPoint("modified", lastModified)); |
|
|
|
// Add the contents of the file to a field named "contents". Specify a Reader, |
|
// so that the text of the file is tokenized and indexed, but not stored. |
|
// Note that FileReader expects the file to be in UTF-8 encoding. |
|
// If that's not the case searching for special characters will fail. |
|
doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); |
|
|
|
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { |
|
// New index, so we just add the document (no old document can be there): |
|
System.out.println("adding " + file); |
|
writer.addDocument(doc); |
|
} else { |
|
// Existing index (an old copy of this document may have been indexed) so |
|
// we use updateDocument instead to replace the old one matching the exact |
|
// path, if present: |
|
System.out.println("updating " + file); |
|
writer.updateDocument(new Term("path", file.toString()), doc); |
|
} |
|
} |
|
} |
|
} |