Created
July 22, 2017 15:38
-
-
Save masayuki038/4025be686800da424a97aad4324e65c8 to your computer and use it in GitHub Desktop.
SampleParquetReader2.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package net.wrap_trap.parquet_sample3; | |
import org.apache.commons.lang3.builder.ToStringBuilder; | |
import org.apache.hadoop.conf.Configuration; | |
import org.apache.hadoop.fs.Path; | |
import org.apache.parquet.column.ColumnDescriptor; | |
import org.apache.parquet.column.ColumnReader; | |
import org.apache.parquet.column.impl.ColumnReadStoreImpl; | |
import org.apache.parquet.column.impl.ColumnReaderImpl; | |
import org.apache.parquet.column.page.*; | |
import org.apache.parquet.hadoop.ParquetFileReader; | |
import org.apache.parquet.hadoop.metadata.BlockMetaData; | |
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; | |
import org.apache.parquet.hadoop.metadata.ParquetMetadata; | |
import org.apache.parquet.io.api.Binary; | |
import org.apache.parquet.io.api.Converter; | |
import org.apache.parquet.io.api.GroupConverter; | |
import org.apache.parquet.io.api.PrimitiveConverter; | |
import org.apache.parquet.schema.MessageType; | |
import java.io.IOException; | |
import java.math.BigInteger; | |
import java.nio.CharBuffer; | |
import java.nio.charset.Charset; | |
import java.nio.charset.CharsetDecoder; | |
import java.util.List; | |
public class SampleParquetReader2 { | |
private static final Charset UTF8 = Charset.forName("UTF-8"); | |
private static final CharsetDecoder UTF8_DECODER = UTF8.newDecoder(); | |
public static void main(String[] args) throws Exception { | |
execute("D:\\development\\repository\\git\\drill\\sample-data\\nationsMF\\nationsMF.parquet"); | |
} | |
protected static void execute(String path) throws IOException { | |
Configuration conf = new Configuration(); | |
Path inPath = new Path(path); | |
ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath); | |
MessageType schema = metaData.getFileMetaData().getSchema(); | |
dump(conf, metaData, schema, inPath); | |
} | |
protected static void dump(Configuration conf, ParquetMetadata metaData, MessageType schema, Path inPath) throws IOException { | |
List<BlockMetaData> blocks = metaData.getBlocks(); | |
List<ColumnDescriptor> columns = schema.getColumns(); | |
ParquetFileReader reader = null; | |
for (ColumnDescriptor column : columns) { | |
System.out.println(ToStringBuilder.reflectionToString(column)); | |
reader = new ParquetFileReader(conf, inPath, blocks, columns); | |
PageReadStore store = reader.readNextRowGroup(); | |
while (store != null) { | |
ColumnReadStoreImpl columnReadStoreImpl = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema, ""); | |
int maxDefinitionLevel = column.getMaxDefinitionLevel(); | |
ColumnReader columnReader = columnReadStoreImpl.getColumnReader(column); | |
for (long i = 0L, e = columnReader.getTotalValueCount(); i < e; ++i) { | |
int repetitionLevel = columnReader.getCurrentRepetitionLevel(); | |
int definitionLevel = columnReader.getCurrentDefinitionLevel(); | |
if (definitionLevel == maxDefinitionLevel) { | |
switch (column.getType()) { | |
case BINARY: | |
System.out.println(binaryToString(columnReader.getBinary())); | |
break; | |
case BOOLEAN: | |
System.out.println(columnReader.getBoolean()); | |
break; | |
case DOUBLE: | |
System.out.println(columnReader.getDouble()); | |
break; | |
case FLOAT: | |
System.out.println(columnReader.getFloat()); | |
break; | |
case INT32: | |
System.out.println(columnReader.getInteger()); | |
break; | |
case INT64: | |
System.out.println(columnReader.getLong()); | |
break; | |
case INT96: | |
System.out.println(binaryToBigInteger(columnReader.getBinary())); | |
break; | |
case FIXED_LEN_BYTE_ARRAY: | |
System.out.println(binaryToString(columnReader.getBinary())); | |
break; | |
} | |
} else { | |
System.out.println("<null>"); | |
} | |
columnReader.consume(); | |
} | |
System.out.println(); | |
store = reader.readNextRowGroup(); | |
} | |
} | |
} | |
public static String binaryToString(Binary value) { | |
byte[] data = value.getBytes(); | |
if (data == null) return null; | |
try { | |
CharBuffer buffer = UTF8_DECODER.decode(value.toByteBuffer()); | |
return buffer.toString(); | |
} catch (Throwable th) { | |
} | |
return "<bytes...>"; | |
} | |
public static BigInteger binaryToBigInteger(Binary value) { | |
byte[] data = value.getBytes(); | |
if (data == null) return null; | |
return new BigInteger(data); | |
} | |
private static final class DumpGroupConverter extends GroupConverter { | |
@Override public void start() { } | |
@Override public void end() { } | |
@Override public Converter getConverter(int fieldIndex) { return new DumpConverter(); } | |
} | |
private static final class DumpConverter extends PrimitiveConverter { | |
@Override public GroupConverter asGroupConverter() { return new DumpGroupConverter(); } | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment