/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc;

import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.testing.TestingConnectorSession;
import com.google.common.collect.ImmutableMap;
import io.airlift.units.DataSize;
import org.joda.time.DateTimeZone;

import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;

import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static com.facebook.presto.spi.type.VarcharType.VARCHAR;
import static io.airlift.units.DataSize.Unit.MEGABYTE;

@SuppressWarnings("UseOfSystemOutOrSystemErr")
public final class ScanOrcFile
{
    public static void main(String... args)
            throws Exception
    {
        File file = new File(args[0]);
        FileOrcDataSource orcDataSource = new FileOrcDataSource(file, new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE));
        OrcReader orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE));

        //
        // Set your column types here
        //
        Map<Integer, Type> columnTypes = ImmutableMap.<Integer, Type>builder()
                .put(0, BIGINT)
                .put(1, DOUBLE)
                .put(2, VARCHAR)
                .build();

        OrcRecordReader recordReader = orcReader.createRecordReader(columnTypes, OrcPredicate.TRUE, DateTimeZone.getDefault());

        long rows = 0;
        for (int batchSize = recordReader.nextBatch(); batchSize > 0; batchSize = recordReader.nextBatch()) {
            rows += batchSize;
            readBatch(columnTypes, recordReader);
        }
        System.out.println();
        System.out.println("rows: " + rows);
    }

    private static void readBatch(Map<Integer, Type> columnTypes, OrcRecordReader recordReader)
            throws IOException
    {
        for (Entry<Integer, Type> entry : columnTypes.entrySet()) {
            Block block = recordReader.readBlock(entry.getValue(), entry.getKey());
            System.out.print(entry.getValue().getObjectValue(TestingConnectorSession.SESSION, block, 0));
            System.out.print(",");
        }
        System.out.println();
    }
}
Hi Dain,
Thanks for the example code. It's been very helpful for getting started with ORC.
I wonder if you could explain how to get lazy materialization to work? I adapted the above code into a simple example below. After reading the blocks, I found their class to be FixedWidthBlock instead of LazyFixedWidthBlock, which IIUC means the blocks are not lazily loaded. Do you know how I can turn on lazy materialization? My rough guess at how this is supposed to work is sketched after the code below.
Thanks.
import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Writer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions;
import org.joda.time.DateTimeZone;
import com.facebook.presto.orc.FileOrcDataSource;
import com.facebook.presto.orc.OrcDataSource;
import com.facebook.presto.orc.OrcReader;
import com.facebook.presto.orc.OrcRecordReader;
import com.facebook.presto.orc.TupleDomainOrcPredicate;
import com.facebook.presto.orc.memory.AggregatedMemoryContext;
import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;
import com.google.common.collect.ImmutableMap;
import io.airlift.units.DataSize;
import io.airlift.units.DataSize.Unit;
public class TestOrc {
    private static final String ORC_FILE = "/tmp/test.orc";

    private static class TestStruct {
        public final long key;
        public final double value;

        public TestStruct(long key, double value) {
            this.key = key;
            this.value = value;
        }
    }

    private static void write() throws IOException {
        ObjectInspector inspector = ObjectInspectorFactory
                .getReflectionObjectInspector(
                        TestStruct.class,
                        ObjectInspectorOptions.JAVA);
        OrcFile.WriterOptions options = OrcFile.writerOptions(new Configuration())
                .inspector(inspector);
        Writer writer = OrcFile.createWriter(new Path(ORC_FILE), options);
        for (int i = 0; i < 10000; ++i) {
            writer.addRow(new TestStruct(i, i * 2));
        }
        writer.close();
    }

    private static void read() throws IOException {
        OrcDataSource source = new FileOrcDataSource(
                new File(ORC_FILE),
                new DataSize(1, Unit.MEGABYTE),
                new DataSize(8, Unit.MEGABYTE),
                new DataSize(8, Unit.MEGABYTE));
        OrcReader reader = new OrcReader(
                source,
                new OrcMetadataReader(),
                new DataSize(1, Unit.MEGABYTE),
                new DataSize(8, Unit.MEGABYTE));
        Map<Integer, Type> columns = ImmutableMap.<Integer, Type>builder()
                .put(0, BIGINT)
                .put(1, DOUBLE)
                .build();
        OrcRecordReader recordReader = reader.createRecordReader(
                columns, TupleDomainOrcPredicate.TRUE, DateTimeZone.UTC,
                new AggregatedMemoryContext());
        long rows = 0;
        for (int batchSize = recordReader.nextBatch(); batchSize > 0;
                batchSize = recordReader.nextBatch()) {
            rows += batchSize;
            for (Entry<Integer, Type> entry : columns.entrySet()) {
                Block block = recordReader.readBlock(entry.getValue(), entry.getKey());
                System.out.println(block.getClass());
            }
        }
        System.out.println(rows);
    }

    public static void main(String[] args) throws IOException {
        write();
        read();
    }
}
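In case it clarifies what I'm after: my rough understanding is that OrcRecordReader.readBlock itself always returns eager blocks, and that laziness comes from wrapping each column in a lazy block whose loader only calls readBlock on first access, similar to what the Hive connector's OrcPageSource does. Below is a minimal sketch of that idea; the class names and signatures (LazyBlock, LazyBlockLoader, setBlock) are assumptions from my reading of the SPI and may not match the version I'm on, so please correct me if this is off.

import java.io.IOException;
import java.io.UncheckedIOException;
import com.facebook.presto.orc.OrcRecordReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.block.LazyBlock;
import com.facebook.presto.spi.block.LazyBlockLoader;
import com.facebook.presto.spi.type.Type;

public class LazyScanSketch {
    // Loader that defers the actual column read until the block is first touched.
    private static final class OrcBlockLoader implements LazyBlockLoader<LazyBlock> {
        private final OrcRecordReader recordReader;
        private final int columnIndex;
        private final Type type;
        private boolean loaded;

        private OrcBlockLoader(OrcRecordReader recordReader, int columnIndex, Type type) {
            this.recordReader = recordReader;
            this.columnIndex = columnIndex;
            this.type = type;
        }

        @Override
        public void load(LazyBlock lazyBlock) {
            if (loaded) {
                return;
            }
            try {
                // The column is only decoded here, when a consumer accesses the block.
                Block block = recordReader.readBlock(type, columnIndex);
                lazyBlock.setBlock(block);
            }
            catch (IOException e) {
                throw new UncheckedIOException(e);
            }
            loaded = true;
        }
    }

    // Instead of calling readBlock eagerly for every column of every batch,
    // hand out a lazy wrapper per column.
    static Block lazyColumn(OrcRecordReader recordReader, int batchSize, int columnIndex, Type type) {
        return new LazyBlock(batchSize, new OrcBlockLoader(recordReader, columnIndex, type));
    }
}

Is wrapping readBlock like this the intended way to get lazy materialization outside the Hive connector, or is there a switch on OrcRecordReader itself that I'm missing?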
Hi Dain,
Thanks for the above code.
I'm using it to scan an ORC file, but I'm running into this issue:
Exception in thread "main" java.lang.IllegalArgumentException: Unsupported type: VARCHAR
at com.facebook.presto.orc.reader.StreamReaders.createStreamReader(StreamReaders.java:57)
at com.facebook.presto.orc.OrcRecordReader.createStreamReaders(OrcRecordReader.java:385)
at com.facebook.presto.orc.OrcRecordReader.<init>(OrcRecordReader.java:175)
at com.facebook.presto.orc.OrcReader.createRecordReader(OrcReader.java:193)
at com.facebook.presto.orc.OrcReader.createRecordReader(OrcReader.java:168)
at com.facebook.presto.orc.ScanOrcFile.main(ScanOrcFile.java:58)
Could you please let me know what the problem is?
The ORC file I'm trying to read was created with Hive. By the way, I'm using version 0.126.