Created
November 20, 2017 18:57
-
-
Save animeshtrivedi/904ea99153fc205c029095a490eee21c to your computer and use it in GitHub Desktop.
Example of parquet vectorized reading
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// http://www.jofre.de/?p=1459
package de.jofre.test; | |
import java.io.IOException; | |
import org.apache.hadoop.conf.Configuration; | |
import org.apache.hadoop.fs.Path; | |
import org.apache.parquet.column.page.PageReadStore; | |
import org.apache.parquet.example.data.Group; | |
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter; | |
import org.apache.parquet.format.converter.ParquetMetadataConverter; | |
import org.apache.parquet.hadoop.ParquetFileReader; | |
import org.apache.parquet.hadoop.metadata.ParquetMetadata; | |
import org.apache.parquet.io.ColumnIOFactory; | |
import org.apache.parquet.io.MessageColumnIO; | |
import org.apache.parquet.io.RecordReader; | |
import org.apache.parquet.schema.MessageType; | |
import org.apache.parquet.schema.Type; | |
public class Main { | |
private static Path path = new Path("file:\\C:\\myfile.snappy.parquet"); | |
private static void printGroup(Group g) { | |
int fieldCount = g.getType().getFieldCount(); | |
for (int field = 0; field < fieldCount; field++) { | |
int valueCount = g.getFieldRepetitionCount(field); | |
Type fieldType = g.getType().getType(field); | |
String fieldName = fieldType.getName(); | |
for (int index = 0; index < valueCount; index++) { | |
if (fieldType.isPrimitive()) { | |
System.out.println(fieldName + " " + g.getValueToString(field, index)); | |
} | |
} | |
} | |
System.out.println(""); | |
} | |
public static void main(String[] args) throws IllegalArgumentException { | |
Configuration conf = new Configuration(); | |
try { | |
ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER); | |
MessageType schema = readFooter.getFileMetaData().getSchema(); | |
ParquetFileReader r = new ParquetFileReader(conf, path, readFooter); | |
PageReadStore pages = null; | |
try { | |
while (null != (pages = r.readNextRowGroup())) { | |
final long rows = pages.getRowCount(); | |
System.out.println("Number of rows: " + rows); | |
final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema); | |
final RecordReader recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema)); | |
for (int i = 0; i < rows; i++) { | |
final Group g = recordReader.read(); | |
printGroup(g); | |
// TODO Compare to System.out.println(g); | |
} | |
} | |
} finally { | |
r.close(); | |
} | |
} catch (IOException e) { | |
System.out.println("Error reading parquet file."); | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment