Skip to content

Instantly share code, notes, and snippets.

@masayuki038
Created July 22, 2017 15:12
Show Gist options
  • Save masayuki038/6c8847da0eed2f5ca34662a6badf52c1 to your computer and use it in GitHub Desktop.
Save masayuki038/6c8847da0eed2f5ca34662a6badf52c1 to your computer and use it in GitHub Desktop.
SampleParquetReader.java
package net.wrap_trap.parquet_sample3;

import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.*;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
/**
 * Diagnostic utility that dumps the structure of a Parquet file to stdout:
 * row-group (block) metadata, column-chunk metadata, and per-column page
 * details (encodings, dictionary info, value counts).
 */
public class SampleParquetReader {

    public static void main(String[] args) throws Exception {
        execute("D:\\development\\repository\\git\\drill\\sample-data\\nationsMF\\nationsMF.parquet");
    }

    /**
     * Reads the footer of the Parquet file at {@code path} and dumps its
     * metadata and page-level details.
     *
     * @param path filesystem path of the Parquet file to inspect
     * @throws IOException if the footer or row groups cannot be read
     */
    protected static void execute(String path) throws IOException {
        Configuration conf = new Configuration();
        Path inPath = new Path(path);
        ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath);
        MessageType schema = metaData.getFileMetaData().getSchema();
        dump(conf, metaData, schema, inPath);
    }

    /**
     * Prints all block/column-chunk metadata, then walks every row group once
     * and dumps page details for each column.
     *
     * @param conf     Hadoop configuration used to open the file
     * @param metaData parsed Parquet footer
     * @param schema   file schema (source of the column descriptors)
     * @param inPath   path of the Parquet file
     * @throws IOException if reading a row group fails
     */
    protected static void dump(Configuration conf, ParquetMetadata metaData, MessageType schema, Path inPath) throws IOException {
        List<BlockMetaData> blocks = metaData.getBlocks();
        List<ColumnDescriptor> columns = schema.getColumns();

        // First pass: metadata only. No reader needed for this.
        for (BlockMetaData block : blocks) {
            System.out.println(ToStringBuilder.reflectionToString(block));
            System.out.println();
            for (ColumnChunkMetaData columnChunkMetadata : block.getColumns()) {
                System.out.println(ToStringBuilder.reflectionToString(columnChunkMetadata));
            }
        }

        // Second pass: open ONE reader and read each row group a single time.
        // The original constructed a new, never-closed ParquetFileReader inside
        // the column-chunk loop and re-read every row group per chunk — a
        // resource leak plus redundant O(blocks * chunks) file scans.
        try (ParquetFileReader reader = new ParquetFileReader(conf, inPath, blocks, columns)) {
            PageReadStore store = reader.readNextRowGroup();
            while (store != null) {
                for (ColumnDescriptor column : columns) {
                    dumpColumn(store, column);
                }
                System.out.println();
                store = reader.readNextRowGroup();
            }
        }
    }

    /** Dumps dictionary and data-page details for one column within one row group. */
    private static void dumpColumn(PageReadStore store, ColumnDescriptor column) {
        PageReader pageReader = store.getPageReader(column);
        long valueCount = pageReader.getTotalValueCount();
        // getPath() returns String[]; Arrays.toString avoids the useless
        // "[Ljava.lang.String;@..." output of the bare %s conversion.
        // (Also fixes the "maDefinitionLevel" label typo.)
        System.out.println(String.format(
            "column: %s, valueCount: %d, maxRepetitionLevel: %d, maxDefinitionLevel: %d",
            Arrays.toString(column.getPath()), valueCount,
            column.getMaxRepetitionLevel(), column.getMaxDefinitionLevel()));

        DictionaryPage dict = pageReader.readDictionaryPage();
        if (dict != null) {
            System.out.println("Dictionary Size: " + dict.getDictionarySize());
            System.out.println("Dictionary Encoding: " + dict.getEncoding());
        }

        DataPage page = pageReader.readPage();
        for (long count = 0L; page != null; count++) {
            System.out.println("page: " + count);
            // Visitor distinguishes the v1/v2 data-page layouts; only v1
            // carries separate repetition/definition-level encodings.
            page.accept(new DataPage.Visitor<Void>() {
                @Override
                public Void visit(DataPageV1 pageV1) {
                    System.out.println("Repetition Level Encoding: " + pageV1.getRlEncoding());
                    System.out.println("Definition Level Encoding: " + pageV1.getDlEncoding());
                    System.out.println("Value Encoding: " + pageV1.getValueEncoding());
                    return null;
                }

                @Override
                public Void visit(DataPageV2 pageV2) {
                    System.out.println("Value Encoding: " + pageV2.getDataEncoding());
                    return null;
                }
            });
            System.out.println("Uncompressed Size: " + page.getUncompressedSize());
            System.out.println("Value Count: " + page.getValueCount());
            page = pageReader.readPage();
            System.out.println();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment