% java -cp ~/tmp/elasticsearch-2.2.1/lib/lucene-core-5.4.1.jar:. DiskUsage ~/tmp/elasticsearch-2.2.1/data/elasticsearch/nodes/0/indices/wiki/0/index
analyzing... (using /var/folders/2n/4k8wl8bx2h9_2ys5cg4jbsp00000gn/T/4022249571793658223 for temporary storage)
total disk:         19,021,096
stored fields:      10,165,878
term vectors:                0
norms:                   2,147
docvalues:           1,712,937
postings:            7,136,954

         field           total      terms dict        postings       proximity       docvalues       % with dv             features
         =====           =====      ==========        ========       =========       =========        ========             ========
          text       5,250,627       1,273,904         935,941       3,040,782               0            0.0%      positions norms
          link       3,449,421       1,681,688          74,584              77       1,693,072           99.9%      docs sorted_set
      category          96,464          50,748          15,694          30,022               0            0.0%      positions norms
         title          35,982          17,958             111              78          17,835          100.0%      docs sorted_set
          _uid           8,192           8,005             110              77               0            0.0%                 docs
  _field_names           2,725             437           2,211              77               0            0.0%                 docs
         _type           1,684             327             738              77             542          100.0%      docs sorted_set
      redirect           1,416             280             735              77             324          100.0%  docs sorted_numeric
          stub           1,130             277             452              77             324          100.0%  docs sorted_numeric
disambiguation           1,085             277             407              77             324          100.0%  docs sorted_numeric
       special             973             279             293              77             324          100.0%  docs sorted_numeric
      _version             192               0               0               0             192          100.0%              numeric
       _source               0               0               0               0               0            0.0%
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene54.Lucene54Codec;
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
/**
 * Does ugly hacks to print out a disk usage analysis of a Lucene index.
 * <p>
 * You need a Lucene 5.x core jar, then do this:
 *   javac -cp /path/to/lucene-core.jar DiskUsage.java
 *   java -cp /path/to/lucene-core.jar:. DiskUsage /elasticsearch/data/elasticsearch/nodes/0/indices/whatever/0/index
 */
public class DiskUsage {
  public static void main(String args[]) throws Exception {
    if (args.length != 1) {
      System.err.println("java [-Djava.io.tmpdir=/scratch] [-Dmode=BEST_COMPRESSION] -cp lucene-core.jar:./build DiskUsage <path to lucene index>");
      System.exit(1);
    }

    IndexWriterConfig conf = new IndexWriterConfig(null);
    conf.setOpenMode(OpenMode.CREATE);
    // force codec to write per-field filenames.
    conf.setCodec(new Lucene54Codec(Mode.valueOf(System.getProperty("mode", "BEST_SPEED"))) {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return new Lucene50PostingsFormat();
      }

      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        return new Lucene54DocValuesFormat();
      }
    });
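    // The overrides above pin every field to a known postings/doc-values format, so the
    // scratch index is written with per-field suffixed files that analyzeFields() below
    // can attribute back to individual fields.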
    Path tmp = Files.createTempDirectory(null);
    System.err.println("analyzing... (using " + tmp + " for temporary storage)");
    try (Directory dir = FSDirectory.open(Paths.get(args[0]));
         DirectoryReader reader = DirectoryReader.open(dir);
         Directory scratch = FSDirectory.open(tmp);
         IndexWriter writer = new IndexWriter(scratch, conf)) {
      CodecReader inputs[] = new CodecReader[reader.leaves().size()];
      for (int i = 0; i < inputs.length; i++) {
        inputs[i] = (CodecReader) reader.leaves().get(i).reader();
      }
      writer.addIndexes(inputs);
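      // addIndexes() rewrites all input segments into the scratch directory as a single
      // segment, so the per-field sizes measured below reflect one fully merged segment.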
      try (DirectoryReader newReader = DirectoryReader.open(writer, false)) {
        assert newReader.leaves().size() == 1;
        SegmentReader sr = (SegmentReader) newReader.leaves().get(0).reader();
        report(sr, analyzeFields(sr));
      }
    } finally {
      IOUtils.rm(tmp);
    }
  }
  /** Returns the codec suffix from this file name, or null if there is no suffix. */
  public static String parseSuffix(String filename) {
    if (!filename.startsWith("_")) {
      return null;
    }
    String parts[] = IndexFileNames.stripExtension(filename).substring(1).split("_");
    // 4 cases:
    //   segment.ext
    //   segment_gen.ext
    //   segment_codec_suffix.ext
    //   segment_gen_codec_suffix.ext
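    // Illustrative example (exact names vary by codec): a per-field doc values file such
    // as "_0_Lucene54_0.dvd" strips to "0_Lucene54_0", which splits into 3 parts and
    // yields the per-field suffix "0".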
    if (parts.length == 3) {
      return parts[2];
    } else if (parts.length == 4) {
      return parts[3];
    } else {
      return null;
    }
  }
  static class FieldStats implements Comparable<FieldStats> {
    final String name;
    long termsBytes;
    long postingsBytes;
    long proxBytes;
    long dvBytes;
    int docCountWithField;

    FieldStats(String name) {
      this.name = name;
    }

    long totalBytes() {
      return termsBytes + postingsBytes + proxBytes + dvBytes;
    }

    @Override
    public int compareTo(FieldStats o) {
      // reverse order
      int cmp = Long.compare(o.totalBytes(), totalBytes());
      if (cmp == 0) {
        cmp = name.compareTo(o.name);
      }
      return cmp;
    }
  }
  static Set<FieldStats> analyzeFields(SegmentReader reader) throws Exception {
    Map<String,FieldStats> stats = new HashMap<>();
    Map<String,String> dvSuffixes = new HashMap<>();
    Map<String,String> postingsSuffixes = new HashMap<>();
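    // These maps record, for each per-field codec suffix, which field it belongs to, so
    // that suffixed files found in the directory below can be charged to that field.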
    for (FieldInfo field : reader.getFieldInfos()) {
      FieldStats fieldStats = new FieldStats(field.name);
      stats.put(field.name, fieldStats);
      Map<String,String> attributes = field.attributes();
      if (attributes != null) {
        String postingsSuffix = attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY);
        if (postingsSuffix != null) {
          postingsSuffixes.put(postingsSuffix, field.name);
        }
        String dvSuffix = attributes.get(PerFieldDocValuesFormat.PER_FIELD_SUFFIX_KEY);
        if (dvSuffix != null) {
          dvSuffixes.put(dvSuffix, field.name);
        }
      }
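      // Count how many documents have a doc value for this field; report() turns this
      // into the "% with dv" column.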
      Bits docsWithField = reader.getDocsWithField(field.name);
      if (docsWithField != null) {
        int count = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
          if (docsWithField.get(docID)) {
            count++;
          }
        }
        fieldStats.docCountWithField = count;
      }
    }
    Directory directory = reader.directory();
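    // Walk every file in the merged segment and charge per-field suffixed files to the
    // right bucket: .tim/.tip = terms dictionary, .doc = postings, .pos/.pay = proximity
    // (positions/payloads), .dvd/.dvm = doc values.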
    for (String file : directory.listAll()) {
      String suffix = parseSuffix(file);
      long bytes = directory.fileLength(file);
      if (suffix != null) {
        switch (IndexFileNames.getExtension(file)) {
          case "dvd":
          case "dvm":
            stats.get(dvSuffixes.get(suffix)).dvBytes += bytes;
            break;
          case "tim":
          case "tip":
            stats.get(postingsSuffixes.get(suffix)).termsBytes += bytes;
            break;
          case "doc":
            stats.get(postingsSuffixes.get(suffix)).postingsBytes += bytes;
            break;
          case "pos":
          case "pay":
            stats.get(postingsSuffixes.get(suffix)).proxBytes += bytes;
            break;
          default:
            throw new AssertionError("unexpected suffixed file: " + file);
        }
      }
    }
    return new TreeSet<FieldStats>(stats.values());
  }
  static void report(SegmentReader reader, Set<FieldStats> stats) throws Exception {
    long totalSize = 0;
    long storeSize = 0;
    long vectorSize = 0;
    long normsSize = 0;
    long dvsSize = 0;
    long postingsSize = 0;
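    // Whole-index totals, bucketed by extension: .fdt/.fdx = stored fields,
    // .tvx/.tvd = term vectors, .nvd/.nvm = norms, .dvd/.dvm = doc values,
    // .tim/.tip/.doc/.pos/.pay = postings; anything else counts only toward the total.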
    for (String file : reader.directory().listAll()) {
      long size = reader.directory().fileLength(file);
      totalSize += size;
      String extension = IndexFileNames.getExtension(file);
      if (extension != null) {
        switch (extension) {
          case "fdt":
          case "fdx":
            storeSize += size;
            break;
          case "tvx":
          case "tvd":
            vectorSize += size;
            break;
          case "nvd":
          case "nvm":
            normsSize += size;
            break;
          case "dvd":
          case "dvm":
            dvsSize += size;
            break;
          case "tim":
          case "tip":
          case "doc":
          case "pos":
          case "pay":
            postingsSize += size;
            break;
        }
      }
    }
    DecimalFormat df = new DecimalFormat("#,##0");
    System.out.printf("total disk: %15s\n", df.format(totalSize));
    System.out.printf("stored fields: %15s\n", df.format(storeSize));
    System.out.printf("term vectors: %15s\n", df.format(vectorSize));
    System.out.printf("norms: %15s\n", df.format(normsSize));
    System.out.printf("docvalues: %15s\n", df.format(dvsSize));
    System.out.printf("postings: %15s\n", df.format(postingsSize));
    System.out.println();

    int maxFieldNameLength = 0;
    for (FieldStats field : stats) {
      maxFieldNameLength = Math.max(maxFieldNameLength, field.name.length());
    }
    // Make sure we format to enough room for the max field length:
    String fieldNameFormat = "%" + maxFieldNameLength + "s";
    System.out.printf(fieldNameFormat + " %15s %15s %15s %15s %15s %15s %20s\n",
        "field", "total", "terms dict", "postings", "proximity", "docvalues", "% with dv", "features");
    System.out.printf(fieldNameFormat + " %15s %15s %15s %15s %15s %15s %20s\n",
        "=====", "=====", "==========", "========", "=========", "=========", "========", "========");
    for (FieldStats field : stats) {
      System.out.printf(fieldNameFormat + " %15s %15s %15s %15s %15s %14.1f%% %20s\n",
          field.name,
          df.format(field.totalBytes()),
          df.format(field.termsBytes),
          df.format(field.postingsBytes),
          df.format(field.proxBytes),
          df.format(field.dvBytes),
          (100.0 * field.docCountWithField) / reader.maxDoc(),
          features(reader.getFieldInfos().fieldInfo(field.name)));
    }
  }
  static String features(FieldInfo fi) {
    StringBuilder sb = new StringBuilder();
    IndexOptions options = fi.getIndexOptions();
    if (options != IndexOptions.NONE) {
      String words[] = options.toString().split("_");
      sb.append(words[words.length - 1].toLowerCase());
      sb.append(" ");
    }
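    // e.g. IndexOptions.DOCS_AND_FREQS_AND_POSITIONS becomes "positions" here, which is
    // why the report's "features" column shows entries like "positions norms" or "docs".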
    if (fi.hasPayloads()) {
      sb.append("payloads ");
    }
    if (fi.hasNorms()) {
      sb.append("norms ");
    }
    if (fi.hasVectors()) {
      sb.append("vectors ");
    }
    DocValuesType dvType = fi.getDocValuesType();
    if (dvType != DocValuesType.NONE) {
      sb.append(dvType.toString().toLowerCase());
    }
    return sb.toString().trim();
  }
}